Merge commit 'origin/gallium-0.1' into gallium-0.2
[mesa.git] / src / gallium / auxiliary / rtasm / rtasm_ppc_spe.c
1 /*
2 * (C) Copyright IBM Corporation 2008
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /**
26 * \file
27 * Real-time assembly generation interface for Cell B.E. SPEs.
28 *
29 * \author Ian Romanick <idr@us.ibm.com>
30 * \author Brian Paul
31 */
32
33
34 #include <stdio.h>
35 #include "pipe/p_compiler.h"
36 #include "util/u_memory.h"
37 #include "rtasm_ppc_spe.h"
38
39
40 #ifdef GALLIUM_CELL
41 /**
42 * SPE instruction types
43 *
44 * There are 6 primary instruction encodings used on the Cell's SPEs. Each of
45 * the following unions encodes one type.
46 *
47 * \bug
48 * If, at some point, we start generating SPE code from a little-endian host
49 * these unions will not work.
50 */
51 /*@{*/
52 /**
53 * Encode one output register with two input registers
54 */
union spe_inst_RR {
   uint32_t bits;      /* the raw 32-bit instruction word */
   struct {
      unsigned op:11;  /* opcode */
      unsigned rB:7;   /* second source register */
      unsigned rA:7;   /* first source register */
      unsigned rT:7;   /* target (destination) register */
   } inst;
};


/**
 * Encode one output register with three input registers
 */
union spe_inst_RRR {
   uint32_t bits;
   struct {
      unsigned op:4;   /* short opcode; rT moves to the high field here */
      unsigned rT:7;   /* target register */
      unsigned rB:7;   /* second source register */
      unsigned rA:7;   /* first source register */
      unsigned rC:7;   /* third source register */
   } inst;
};


/**
 * Encode one output register with one input reg. and a 7-bit signed immed
 */
union spe_inst_RI7 {
   uint32_t bits;
   struct {
      unsigned op:11;  /* opcode */
      unsigned i7:7;   /* 7-bit signed immediate */
      unsigned rA:7;   /* source register */
      unsigned rT:7;   /* target register */
   } inst;
};


/**
 * Encode one output register with one input reg. and an 8-bit signed immed
 */
union spe_inst_RI8 {
   uint32_t bits;
   struct {
      unsigned op:10;  /* opcode */
      unsigned i8:8;   /* 8-bit immediate (scale factor for converts) */
      unsigned rA:7;   /* source register */
      unsigned rT:7;   /* target register */
   } inst;
};


/**
 * Encode one output register with one input reg. and a 10-bit signed immed
 */
union spe_inst_RI10 {
   uint32_t bits;
   struct {
      unsigned op:8;   /* opcode */
      unsigned i10:10; /* 10-bit signed immediate */
      unsigned rA:7;   /* source register */
      unsigned rT:7;   /* target register */
   } inst;
};


/**
 * Encode one output register with a 16-bit signed immediate
 */
union spe_inst_RI16 {
   uint32_t bits;
   struct {
      unsigned op:9;   /* opcode */
      unsigned i16:16; /* 16-bit signed immediate */
      unsigned rT:7;   /* target register */
   } inst;
};


/**
 * Encode one output register with a 18-bit signed immediate
 */
union spe_inst_RI18 {
   uint32_t bits;
   struct {
      unsigned op:7;   /* opcode */
      unsigned i18:18; /* 18-bit immediate (ila loads unsigned 18 bits) */
      unsigned rT:7;   /* target register */
   } inst;
};
147 /*@}*/
148
149
150 static void
151 indent(const struct spe_function *p)
152 {
153 int i;
154 for (i = 0; i < p->indent; i++) {
155 putchar(' ');
156 }
157 }
158
159
/**
 * Strip the "spe_" prefix from an emitter function name (as produced by
 * __FUNCTION__) so the printed mnemonic matches the real instruction name.
 * The original unconditionally returned longname + 4, which reads past the
 * end of any name shorter than four characters and garbles names without
 * the prefix; now we only skip the prefix when it is actually present.
 */
static const char *
rem_prefix(const char *longname)
{
   if (longname[0] == 's' && longname[1] == 'p' &&
       longname[2] == 'e' && longname[3] == '_')
      return longname + 4;
   return longname;
}
165
166
167 static const char *
168 reg_name(int reg)
169 {
170 switch (reg) {
171 case SPE_REG_SP:
172 return "$sp";
173 case SPE_REG_RA:
174 return "$lr";
175 default:
176 {
177 /* cycle through four buffers to handle multiple calls per printf */
178 static char buf[4][10];
179 static int b = 0;
180 b = (b + 1) % 4;
181 sprintf(buf[b], "$%d", reg);
182 return buf[b];
183 }
184 }
185 }
186
187
188 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
189 unsigned rA, unsigned rB, const char *name)
190 {
191 union spe_inst_RR inst;
192 inst.inst.op = op;
193 inst.inst.rB = rB;
194 inst.inst.rA = rA;
195 inst.inst.rT = rT;
196 p->store[p->num_inst++] = inst.bits;
197 assert(p->num_inst <= p->max_inst);
198 if (p->print) {
199 indent(p);
200 printf("%s\t%s, %s, %s\n",
201 rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB));
202 }
203 }
204
205
206 static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
207 unsigned rA, unsigned rB, unsigned rC, const char *name)
208 {
209 union spe_inst_RRR inst;
210 inst.inst.op = op;
211 inst.inst.rT = rT;
212 inst.inst.rB = rB;
213 inst.inst.rA = rA;
214 inst.inst.rC = rC;
215 p->store[p->num_inst++] = inst.bits;
216 assert(p->num_inst <= p->max_inst);
217 if (p->print) {
218 indent(p);
219 printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
220 reg_name(rA), reg_name(rB), reg_name(rC));
221 }
222 }
223
224
225 static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
226 unsigned rA, int imm, const char *name)
227 {
228 union spe_inst_RI7 inst;
229 inst.inst.op = op;
230 inst.inst.i7 = imm;
231 inst.inst.rA = rA;
232 inst.inst.rT = rT;
233 p->store[p->num_inst++] = inst.bits;
234 assert(p->num_inst <= p->max_inst);
235 if (p->print) {
236 indent(p);
237 printf("%s\t%s, %s, 0x%x\n",
238 rem_prefix(name), reg_name(rT), reg_name(rA), imm);
239 }
240 }
241
242
243
244 static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
245 unsigned rA, int imm, const char *name)
246 {
247 union spe_inst_RI8 inst;
248 inst.inst.op = op;
249 inst.inst.i8 = imm;
250 inst.inst.rA = rA;
251 inst.inst.rT = rT;
252 p->store[p->num_inst++] = inst.bits;
253 assert(p->num_inst <= p->max_inst);
254 if (p->print) {
255 indent(p);
256 printf("%s\t%s, %s, 0x%x\n",
257 rem_prefix(name), reg_name(rT), reg_name(rA), imm);
258 }
259 }
260
261
262
263 static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
264 unsigned rA, int imm, const char *name)
265 {
266 union spe_inst_RI10 inst;
267 inst.inst.op = op;
268 inst.inst.i10 = imm;
269 inst.inst.rA = rA;
270 inst.inst.rT = rT;
271 p->store[p->num_inst++] = inst.bits;
272 assert(p->num_inst <= p->max_inst);
273 if (p->print) {
274 indent(p);
275 printf("%s\t%s, %s, 0x%x\n",
276 rem_prefix(name), reg_name(rT), reg_name(rA), imm);
277 }
278 }
279
280
281 /** As above, but do range checking on signed immediate value */
/** Range-checked wrapper around emit_RI10() for signed 10-bit immediates. */
static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
                       unsigned rA, int imm, const char *name)
{
   /* legal signed 10-bit range is [-512, 511] */
   assert(imm >= -512 && imm <= 511);
   emit_RI10(p, op, rT, rA, imm, name);
}
289
290
291 static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
292 int imm, const char *name)
293 {
294 union spe_inst_RI16 inst;
295 inst.inst.op = op;
296 inst.inst.i16 = imm;
297 inst.inst.rT = rT;
298 p->store[p->num_inst++] = inst.bits;
299 assert(p->num_inst <= p->max_inst);
300 if (p->print) {
301 indent(p);
302 printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
303 }
304 }
305
306
307 static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
308 int imm, const char *name)
309 {
310 union spe_inst_RI18 inst;
311 inst.inst.op = op;
312 inst.inst.i18 = imm;
313 inst.inst.rT = rT;
314 p->store[p->num_inst++] = inst.bits;
315 assert(p->num_inst <= p->max_inst);
316 if (p->print) {
317 indent(p);
318 printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
319 }
320 }
321
322
323
324
/* The EMIT_* macros below supply function-definition bodies for the public
 * emitters.  rtasm_ppc_spe.h is included a second time at the end of this
 * block; presumably the header lists one EMIT_* entry per SPE instruction
 * (an X-macro table), so the re-include expands into the full set of
 * spe_* emitter function definitions — confirm against rtasm_ppc_spe.h.
 */

/* no operands beyond the target register */
#define EMIT_(_name, _op) \
void _name (struct spe_function *p, unsigned rT) \
{ \
   emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \
}

/* target + one source register */
#define EMIT_R(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA) \
{ \
   emit_RR(p, _op, rT, rA, 0, __FUNCTION__); \
}

/* target + two source registers */
#define EMIT_RR(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
{ \
   emit_RR(p, _op, rT, rA, rB, __FUNCTION__); \
}

/* target + three source registers */
#define EMIT_RRR(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
{ \
   emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__); \
}

/* target + source + 7-bit immediate */
#define EMIT_RI7(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
{ \
   emit_RI7(p, _op, rT, rA, imm, __FUNCTION__); \
}

/* target + source + 8-bit immediate; the encoded value is (bias - imm),
 * NOTE(review): presumably the scale-factor bias used by the convert
 * instructions — confirm against the EMIT_RI8 users in rtasm_ppc_spe.h
 */
#define EMIT_RI8(_name, _op, bias) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
{ \
   emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__); \
}

/* target + source + 10-bit immediate (unchecked) */
#define EMIT_RI10(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
{ \
   emit_RI10(p, _op, rT, rA, imm, __FUNCTION__); \
}

/* target + source + 10-bit immediate (range-checked) */
#define EMIT_RI10s(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
{ \
   emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__); \
}

/* target + 16-bit immediate */
#define EMIT_RI16(_name, _op) \
void _name (struct spe_function *p, unsigned rT, int imm) \
{ \
   emit_RI16(p, _op, rT, imm, __FUNCTION__); \
}

/* target + 18-bit immediate */
#define EMIT_RI18(_name, _op) \
void _name (struct spe_function *p, unsigned rT, int imm) \
{ \
   emit_RI18(p, _op, rT, imm, __FUNCTION__); \
}

/* 16-bit immediate only (rT field forced to 0) */
#define EMIT_I16(_name, _op) \
void _name (struct spe_function *p, int imm) \
{ \
   emit_RI16(p, _op, 0, imm, __FUNCTION__); \
}

/* second include: expands the header's EMIT_* table into definitions */
#include "rtasm_ppc_spe.h"
392
393
394
395 /**
396 * Initialize an spe_function.
397 * \param code_size size of instruction buffer to allocate, in bytes.
398 */
399 void spe_init_func(struct spe_function *p, unsigned code_size)
400 {
401 unsigned int i;
402
403 p->store = align_malloc(code_size, 16);
404 p->num_inst = 0;
405 p->max_inst = code_size / SPE_INST_SIZE;
406
407 p->set_count = 0;
408 memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
409
410 /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
411 */
412 p->regs[0] = p->regs[1] = p->regs[2] = 1;
413 for (i = 80; i <= 127; i++) {
414 p->regs[i] = 1;
415 }
416
417 p->print = false;
418 p->indent = 0;
419 }
420
421
422 void spe_release_func(struct spe_function *p)
423 {
424 assert(p->num_inst <= p->max_inst);
425 if (p->store != NULL) {
426 align_free(p->store);
427 }
428 p->store = NULL;
429 }
430
431
432 /** Return current code size in bytes. */
433 unsigned spe_code_size(const struct spe_function *p)
434 {
435 return p->num_inst * SPE_INST_SIZE;
436 }
437
438
439 /**
440 * Allocate a SPE register.
441 * \return register index or -1 if none left.
442 */
443 int spe_allocate_available_register(struct spe_function *p)
444 {
445 unsigned i;
446 for (i = 0; i < SPE_NUM_REGS; i++) {
447 if (p->regs[i] == 0) {
448 p->regs[i] = 1;
449 return i;
450 }
451 }
452
453 return -1;
454 }
455
456
457 /**
458 * Mark the given SPE register as "allocated".
459 */
460 int spe_allocate_register(struct spe_function *p, int reg)
461 {
462 assert(reg < SPE_NUM_REGS);
463 assert(p->regs[reg] == 0);
464 p->regs[reg] = 1;
465 return reg;
466 }
467
468
469 /**
470 * Mark the given SPE register as "unallocated". Note that this should
471 * only be used on registers allocated in the current register set; an
472 * assertion will fail if an attempt is made to deallocate a register
473 * allocated in an earlier register set.
474 */
475 void spe_release_register(struct spe_function *p, int reg)
476 {
477 assert(reg < SPE_NUM_REGS);
478 assert(p->regs[reg] == 1);
479
480 p->regs[reg] = 0;
481 }
482
483 /**
484 * Start a new set of registers. This can be called if
485 * it will be difficult later to determine exactly what
486 * registers were actually allocated during a code generation
487 * sequence, and you really just want to deallocate all of them.
488 */
489 void spe_allocate_register_set(struct spe_function *p)
490 {
491 unsigned int i;
492
493 /* Keep track of the set count. If it ever wraps around to 0,
494 * we're in trouble.
495 */
496 p->set_count++;
497 assert(p->set_count > 0);
498
499 /* Increment the allocation count of all registers currently
500 * allocated. Then any registers that are allocated in this set
501 * will be the only ones with a count of 1; they'll all be released
502 * when the register set is released.
503 */
504 for (i = 0; i < SPE_NUM_REGS; i++) {
505 if (p->regs[i] > 0)
506 p->regs[i]++;
507 }
508 }
509
510 void spe_release_register_set(struct spe_function *p)
511 {
512 unsigned int i;
513
514 /* If the set count drops below zero, we're in trouble. */
515 assert(p->set_count > 0);
516 p->set_count--;
517
518 /* Drop the allocation level of all registers. Any allocated
519 * during this register set will drop to 0 and then become
520 * available.
521 */
522 for (i = 0; i < SPE_NUM_REGS; i++) {
523 if (p->regs[i] > 0)
524 p->regs[i]--;
525 }
526 }
527
528
/**
 * Report which caller-visible registers are currently marked in-use.
 * \param used  output array, filled with the indices of used registers;
 *              must have room for at least 78 entries
 * \return number of entries written to \c used
 *
 * NOTE(review): the scan starts at register 2, which spe_init_func()
 * always marks as reserved, so reg 2 is always reported as used —
 * confirm that callers expect this.
 */
unsigned
spe_get_registers_used(const struct spe_function *p, ubyte used[])
{
   unsigned i, num = 0;
   /* only count registers in the range available to callers */
   for (i = 2; i < 80; i++) {
      if (p->regs[i]) {
         used[num++] = i;
      }
   }
   return num;
}
541
542
/** Enable/disable printing of generated instructions (see p->print). */
void
spe_print_code(struct spe_function *p, boolean enable)
{
   p->print = enable;
}
548
549
/** Adjust printing indentation by \c spaces (may be negative to outdent). */
void
spe_indent(struct spe_function *p, int spaces)
{
   p->indent += spaces;
}
555
556
557 void
558 spe_comment(struct spe_function *p, int rel_indent, const char *s)
559 {
560 if (p->print) {
561 p->indent += rel_indent;
562 indent(p);
563 p->indent -= rel_indent;
564 printf("# %s\n", s);
565 }
566 }
567
568
569 /**
570 * Load quad word.
571 * NOTE: offset is in bytes and the least significant 4 bits must be zero!
572 */
573 void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
574 {
575 const boolean pSave = p->print;
576
577 /* offset must be a multiple of 16 */
578 assert(offset % 16 == 0);
579 /* offset must fit in 10-bit signed int field, after shifting */
580 assert((offset >> 4) <= 511);
581 assert((offset >> 4) >= -512);
582
583 p->print = FALSE;
584 emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd");
585 p->print = pSave;
586
587 if (p->print) {
588 indent(p);
589 printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
590 }
591 }
592
593
594 /**
595 * Store quad word.
596 * NOTE: offset is in bytes and the least significant 4 bits must be zero!
597 */
598 void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
599 {
600 const boolean pSave = p->print;
601
602 /* offset must be a multiple of 16 */
603 assert(offset % 16 == 0);
604 /* offset must fit in 10-bit signed int field, after shifting */
605 assert((offset >> 4) <= 511);
606 assert((offset >> 4) >= -512);
607
608 p->print = FALSE;
609 emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd");
610 p->print = pSave;
611
612 if (p->print) {
613 indent(p);
614 printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
615 }
616 }
617
618
/**
 * For branch instructions:
 * \param d if 1, disable interrupts if branch is taken
 * \param e if 1, enable interrupts if branch is taken
 * If d and e are both zero, don't change interrupt status (right?)
 *
 * The d/e flags occupy bits 5 and 4 of the RI7 immediate field in all
 * of the emitters below.
 */

/** Branch Indirect to address in rA */
void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
{
   emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
}

/** Interrupt Return */
void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
{
   emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
}

/** Branch indirect and set link on external data */
void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
                int e)
{
   emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}

/** Branch indirect and set link.  Save PC in rT, jump to rA. */
void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
              int e)
{
   emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}

/** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
{
   emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}

/** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
{
   emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}

/** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
{
   emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}

/** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
{
   emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
675
676
677 /* Hint-for-branch instructions
678 */
679 #if 0
680 hbr;
681 hbra;
682 hbrr;
683 #endif
684
685
686 /* Control instructions
687 */
688 #if 0
689 stop;
690 EMIT_RR (spe_stopd, 0x140);
691 EMIT_ (spe_lnop, 0x001);
692 EMIT_ (spe_nop, 0x201);
693 sync;
694 EMIT_ (spe_dsync, 0x003);
695 EMIT_R (spe_mfspr, 0x00c);
696 EMIT_R (spe_mtspr, 0x10c);
697 #endif
698
699
700 /**
701 ** Helper / "macro" instructions.
702 ** Use somewhat verbose names as a reminder that these aren't native
703 ** SPE instructions.
704 **/
705
706
/**
 * Load a float constant into rT, using a single-instruction shortcut
 * for the common constants 0.0, 0.5, 1.0 and -1.0.
 */
void
spe_load_float(struct spe_function *p, unsigned rT, float x)
{
   if (x == 0.0f) {
      spe_il(p, rT, 0x0);
   }
   else if (x == 0.5f) {
      spe_ilhu(p, rT, 0x3f00);
   }
   else if (x == 1.0f) {
      spe_ilhu(p, rT, 0x3f80);
   }
   else if (x == -1.0f) {
      spe_ilhu(p, rT, 0xbf80);
   }
   else {
      /* general case: type-pun the float through a union and load the
       * two halfwords with ilhu + iohl
       */
      union {
         float f;
         unsigned u;
      } bits = { x };
      spe_ilhu(p, rT, bits.u >> 16);
      spe_iohl(p, rT, bits.u & 0xffff);
   }
}
732
733
/**
 * Load a signed integer constant into rT: one il if it fits 16 signed
 * bits, otherwise ilhu plus an optional iohl for the low halfword.
 */
void
spe_load_int(struct spe_function *p, unsigned rT, int i)
{
   if (i < -32768 || i > 32767) {
      spe_ilhu(p, rT, i >> 16);
      if (i & 0xffff)
         spe_iohl(p, rT, i & 0xffff);
   }
   else {
      spe_il(p, rT, i);
   }
}
746
/**
 * Load an unsigned 32-bit constant into rT using as few instructions as
 * possible (one in the common cases, two in the general case).
 */
void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
{
   /* If the whole value is in the lower 18 bits, use ila, which
    * doesn't sign-extend.  Otherwise, if the two halfwords of
    * the constant are identical, use ilh.  Otherwise, if every byte of
    * the desired value is 0x00 or 0xff, we can use Form Select Mask for
    * Bytes Immediate (fsmbi) to load the value in a single instruction.
    * Otherwise, in the general case, we have to use ilhu followed by iohl.
    */
   if ((ui & 0x0003ffff) == ui) {
      /* fits in 18 unsigned bits */
      spe_ila(p, rT, ui);
   }
   else if ((ui >> 16) == (ui & 0xffff)) {
      /* both halfwords identical */
      spe_ilh(p, rT, ui & 0xffff);
   }
   else if (
      ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
      ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
      ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
      ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
   ) {
      /* every byte is 0x00 or 0xff */
      unsigned int mask = 0;
      /* fsmbi duplicates each bit in the given mask eight times,
       * using a 16-bit value to initialize a 16-byte quadword.
       * Each 4-bit nybble of the mask corresponds to a full word
       * of the result; look at the value and figure out the mask
       * (replicated for each word in the quadword), and then
       * form the "select mask" to get the value.
       */
      if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
      if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
      if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
      if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
      spe_fsmbi(p, rT, mask);
   }
   else {
      /* The general case: this usually uses two instructions, but
       * may use only one if the low-order 16 bits of each word are 0.
       */
      spe_ilhu(p, rT, ui >> 16);
      if (ui & 0xffff)
         spe_iohl(p, rT, ui & 0xffff);
   }
}
791
792 /**
793 * This function is constructed identically to spe_xor_uint() below.
794 * Changes to one should be made in the other.
795 */
/**
 * Emit rT = rA AND a 32-bit constant, using the shortest sequence.
 * \param ui  the constant to AND with
 * This function is constructed identically to spe_xor_uint();
 * changes to one should be made in the other.
 */
void
spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
{
   /* If we can, emit a single instruction, either And Byte Immediate
    * (which uses the same constant across each byte), And Halfword Immediate
    * (which sign-extends a 10-bit immediate to 16 bits and uses that
    * across each halfword), or And Word Immediate (which sign-extends
    * a 10-bit immediate to 32 bits).
    *
    * Otherwise, we'll need to use a temporary register.
    */
   unsigned int tmp;

   /* If the upper 23 bits are all 0s or all 1s, sign extension
    * will work and we can use And Word Immediate
    */
   tmp = ui & 0xfffffe00;
   if (tmp == 0xfffffe00 || tmp == 0) {
      spe_andi(p, rT, rA, ui & 0x000003ff);
      return;
   }

   /* If the ui field is symmetric along halfword boundaries and
    * the upper 7 bits of each halfword are all 0s or 1s, we
    * can use And Halfword Immediate
    */
   tmp = ui & 0xfe00fe00;
   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
      spe_andhi(p, rT, rA, ui & 0x000003ff);
      return;
   }

   /* If the ui field is symmetric in each byte, then we can use
    * the And Byte Immediate instruction.
    */
   tmp = ui & 0x000000ff;
   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
      spe_andbi(p, rT, rA, tmp);
      return;
   }

   /* Otherwise, we'll have to use a temporary register: load the
    * constant, AND it in, then free the register again.
    */
   unsigned int tmp_reg = spe_allocate_available_register(p);
   spe_load_uint(p, tmp_reg, ui);
   spe_and(p, rT, rA, tmp_reg);
   spe_release_register(p, tmp_reg);
}
843
844
845 /**
846 * This function is constructed identically to spe_and_uint() above.
847 * Changes to one should be made in the other.
848 */
/**
 * Emit rT = rA XOR a 32-bit constant, using the shortest sequence.
 * \param ui  the constant to XOR with
 * This function is constructed identically to spe_and_uint();
 * changes to one should be made in the other.
 */
void
spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
{
   /* If we can, emit a single instruction, either Exclusive Or Byte
    * Immediate (which uses the same constant across each byte), Exclusive
    * Or Halfword Immediate (which sign-extends a 10-bit immediate to
    * 16 bits and uses that across each halfword), or Exclusive Or Word
    * Immediate (which sign-extends a 10-bit immediate to 32 bits).
    *
    * Otherwise, we'll need to use a temporary register.
    */
   unsigned int tmp;

   /* If the upper 23 bits are all 0s or all 1s, sign extension
    * will work and we can use Exclusive Or Word Immediate
    */
   tmp = ui & 0xfffffe00;
   if (tmp == 0xfffffe00 || tmp == 0) {
      spe_xori(p, rT, rA, ui & 0x000003ff);
      return;
   }

   /* If the ui field is symmetric along halfword boundaries and
    * the upper 7 bits of each halfword are all 0s or 1s, we
    * can use Exclusive Or Halfword Immediate
    */
   tmp = ui & 0xfe00fe00;
   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
      spe_xorhi(p, rT, rA, ui & 0x000003ff);
      return;
   }

   /* If the ui field is symmetric in each byte, then we can use
    * the Exclusive Or Byte Immediate instruction.
    */
   tmp = ui & 0x000000ff;
   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
      spe_xorbi(p, rT, rA, tmp);
      return;
   }

   /* Otherwise, we'll have to use a temporary register: load the
    * constant, XOR it in, then free the register again.
    */
   unsigned int tmp_reg = spe_allocate_available_register(p);
   spe_load_uint(p, tmp_reg, ui);
   spe_xor(p, rT, rA, tmp_reg);
   spe_release_register(p, tmp_reg);
}
896
/**
 * Emit rT = (rA == ui) per word.  Uses Compare Equal Word Immediate when
 * the constant fits the instruction, otherwise loads it into a scratch
 * register first.
 */
void
spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
{
   if (ui <= 0x000001ff) {
      /* 9 bits or less: fits the sign-extended immediate of ceqi */
      spe_ceqi(p, rT, rA, ui);
   }
   else {
      /* too wide: materialize the constant, compare, free the scratch */
      unsigned int tmp_reg = spe_allocate_available_register(p);
      spe_load_uint(p, tmp_reg, ui);
      spe_ceq(p, rT, rA, tmp_reg);
      spe_release_register(p, tmp_reg);
   }
}
914
/**
 * Emit rT = (rA > ui) per word, unsigned.  Uses Compare Logical Greater
 * Than Word Immediate when the constant fits, otherwise loads it into a
 * scratch register first.
 *
 * Bug fix: the immediate field of clgti is 10 bits but SIGN-extended, so
 * constants 0x200-0x3ff would have been encoded as huge values
 * (0xfffffexx).  Only 9-bit constants are safe, exactly as in
 * spe_compare_equal_uint().
 */
void
spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
{
   /* If the comparison value is 9 bits or less, it survives the
    * sign-extension of clgti's 10-bit immediate field.
    */
   if ((ui & 0x000001ff) == ui) {
      spe_clgti(p, rT, rA, ui);
   }
   /* Otherwise, we're going to have to load a word first. */
   else {
      unsigned int tmp_reg = spe_allocate_available_register(p);
      spe_load_uint(p, tmp_reg, ui);
      spe_clgt(p, rT, rA, tmp_reg);
      spe_release_register(p, tmp_reg);
   }
}
932
/**
 * Replicate word 0 of rA across all four words of rT.
 *
 * Bug fix: the original loaded the shuffle pattern into rT before reading
 * rA, so calling it with rT == rA clobbered the source.  Use a scratch
 * register for the pattern in that case.
 */
void
spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
{
   /* the pattern register must not alias the source */
   unsigned pat = (rT == rA)
      ? (unsigned) spe_allocate_available_register(p) : rT;

   /* 0x00010203 selects bytes 0-3 (word 0) for every result word */
   spe_ila(p, pat, 0x00010203);
   spe_shufb(p, rT, rA, rA, pat);

   if (pat != rT)
      spe_release_register(p, pat);
}
940
941
/** rT = ~rA (bitwise complement), via nor rT, rA, rA. */
void
spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
{
   spe_nor(p, rT, rA, rA);
}
947
948
949 void
950 spe_move(struct spe_function *p, unsigned rT, unsigned rA)
951 {
952 /* Use different instructions depending on the instruction address
953 * to take advantage of the dual pipelines.
954 */
955 if (p->num_inst & 1)
956 spe_shlqbyi(p, rT, rA, 0); /* odd pipe */
957 else
958 spe_ori(p, rT, rA, 0); /* even pipe */
959 }
960
961
/** rT = 0, via the classic xor-with-self idiom. */
void
spe_zero(struct spe_function *p, unsigned rT)
{
   spe_xor(p, rT, rT, rT);
}
967
968
/**
 * Replicate the given word (0..3) of rA across all four words of rT.
 *
 * Bug fixes for rT == rA aliasing: the original loaded the shuffle
 * pattern into rT before reading rA in both branches, clobbering the
 * source.  For word 0 we now use a scratch pattern register when needed;
 * for the other words we rotate rA into the scratch register BEFORE
 * overwriting rT with the pattern.
 */
void
spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
{
   assert(word >= 0);
   assert(word <= 3);

   if (word == 0) {
      /* pattern register must not alias the source */
      int tmp1 = (rT == rA)
         ? spe_allocate_available_register(p) : (int) rT;
      /* 66051 == 0x00010203: select bytes 0-3 (word 0) per result word */
      spe_ila(p, tmp1, 66051);
      spe_shufb(p, rT, rA, rA, tmp1);
      if (tmp1 != (int) rT)
         spe_release_register(p, tmp1);
   }
   else {
      /* XXX review this, we may not need the rotqbyi instruction */
      int tmp1 = rT;
      int tmp2 = spe_allocate_available_register(p);

      /* rotate the desired word into position BEFORE tmp1 (== rT,
       * possibly == rA) is overwritten with the shuffle pattern
       */
      spe_rotqbyi(p, tmp2, rA, 4 * word);
      spe_ila(p, tmp1, 66051);
      spe_shufb(p, rT, tmp2, tmp2, tmp1);

      spe_release_register(p, tmp2);
   }
}
992
993 /**
994 * For each 32-bit float element of rA and rB, choose the smaller of the
995 * two, compositing them into the rT register.
996 *
997 * The Float Compare Greater Than (fcgt) instruction will put 1s into
998 * compare_reg where rA > rB, and 0s where rA <= rB.
999 *
1000 * Then the Select Bits (selb) instruction will take bits from rA where
1001 * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA
1002 * where rA <= rB and from rB where rB > rA, which is exactly the
1003 * "min" operation.
1004 *
1005 * The compare_reg could in many cases be the same as rT, unless
1006 * rT == rA || rt == rB. But since this is common in constructions
1007 * like "x = min(x, a)", we always allocate a new register to be safe.
1008 */
/** Per-element float minimum: rT[i] = min(rA[i], rB[i]).
 *  See the block comment above for why a fresh compare register is
 *  always allocated.
 */
void
spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
{
   const unsigned int gt_mask = spe_allocate_available_register(p);

   spe_fcgt(p, gt_mask, rA, rB);       /* gt_mask = (rA > rB) */
   /* take rA where mask is 0 (rA <= rB), rB where mask is 1 */
   spe_selb(p, rT, rA, rB, gt_mask);

   spe_release_register(p, gt_mask);
}
1017
1018 /**
1019 * For each 32-bit float element of rA and rB, choose the greater of the
1020 * two, compositing them into the rT register.
1021 *
1022 * The logic is similar to that of spe_float_min() above; the only
1023 * difference is that the registers on spe_selb() have been reversed,
1024 * so that the larger of the two is selected instead of the smaller.
1025 */
/** Per-element float maximum: rT[i] = max(rA[i], rB[i]).
 *  Identical to spe_float_min() except the selb source operands are
 *  swapped so the larger element is selected.
 */
void
spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
{
   const unsigned int gt_mask = spe_allocate_available_register(p);

   spe_fcgt(p, gt_mask, rA, rB);       /* gt_mask = (rA > rB) */
   /* take rB where mask is 0, rA where mask is 1 */
   spe_selb(p, rT, rB, rA, gt_mask);

   spe_release_register(p, gt_mask);
}
1034
1035 #endif /* GALLIUM_CELL */