Merge commit 'origin/gallium-0.1' into gallium-0.2
[mesa.git] / src / gallium / drivers / cell / ppu / cell_gen_fp.c
1 /**************************************************************************
2 *
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29
30 /**
31 * Generate SPU fragment program/shader code.
32 *
33 * Note that we generate SOA-style code here. So each TGSI instruction
34 * operates on four pixels (and is translated into four SPU instructions,
35 * generally speaking).
36 *
37 * \author Brian Paul
38 */
39
40 #include <math.h>
41 #include "pipe/p_defines.h"
42 #include "pipe/p_state.h"
43 #include "pipe/p_shader_tokens.h"
44 #include "tgsi/tgsi_parse.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_exec.h"
47 #include "tgsi/tgsi_dump.h"
48 #include "rtasm/rtasm_ppc_spe.h"
49 #include "util/u_memory.h"
50 #include "cell_context.h"
51 #include "cell_gen_fp.h"
52
53
/** Number of TGSI temporaries tracked (first dimension of codegen::temp_regs) */
#define MAX_TEMPS 16
/** Number of TGSI immediates tracked (first dimension of codegen::imm_regs) */
#define MAX_IMMED 8

/* Channel indices; passed as the 'channel' argument of get_src_reg() etc. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3
61
/**
 * Context needed during code generation.
 *
 * Holds the SPE register assignments for the TGSI register files, the
 * lazily-created constant/mask registers and the bookkeeping for the
 * per-instruction intermediate temporaries.
 */
struct codegen
{
   struct cell_context *cell;
   int inputs_reg;      /**< 1st function parameter */
   int outputs_reg;     /**< 2nd function parameter */
   int constants_reg;   /**< 3rd function parameter */
   int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */
   int imm_regs[MAX_IMMED][4];  /**< maps TGSI immediates to SPE registers */

   int num_imm;  /**< number of immediates */

   int one_reg;         /**< register containing {1.0, 1.0, 1.0, 1.0};
                         *   allocated on demand by get_const_one_reg(),
                         *   which treats a value <= 0 as "not allocated yet" */

   /** Per-instruction temps / intermediate temps.
    * Filled by get_itemp(), emptied by free_itemps().
    */
   int num_itemps;
   int itemps[10];

   /** Current IF/ELSE/ENDIF nesting level */
   int if_nesting;
   /** Index of execution mask register (<= 0 means not allocated yet,
    * see get_exec_mask_reg())
    */
   int exec_mask_reg;

   /** Stack frame size.
    * NOTE(review): previously documented as "in words", but
    * emit_prologue()/emit_epilogue() hand it unscaled to spe_stqd()/spe_ai()
    * as the $sp offset, which looks like a byte count -- confirm.
    */
   int frame_size;

   struct spe_function *f;
   boolean error;
};
92
93
94 /**
95 * Allocate an intermediate temporary register.
96 */
97 static int
98 get_itemp(struct codegen *gen)
99 {
100 int t = spe_allocate_available_register(gen->f);
101 assert(gen->num_itemps < Elements(gen->itemps));
102 gen->itemps[gen->num_itemps++] = t;
103 return t;
104 }
105
106 /**
107 * Free all intermediate temporary registers. To be called after each
108 * instruction has been emitted.
109 */
110 static void
111 free_itemps(struct codegen *gen)
112 {
113 int i;
114 for (i = 0; i < gen->num_itemps; i++) {
115 spe_release_register(gen->f, gen->itemps[i]);
116 }
117 gen->num_itemps = 0;
118 }
119
120
121 /**
122 * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}.
123 * The register is allocated and initialized upon the first call.
124 */
125 static int
126 get_const_one_reg(struct codegen *gen)
127 {
128 if (gen->one_reg <= 0) {
129 gen->one_reg = spe_allocate_available_register(gen->f);
130
131 spe_indent(gen->f, 4);
132 spe_comment(gen->f, -4, "INIT CONSTANT 1.0:");
133
134 /* one = {1.0, 1.0, 1.0, 1.0} */
135 spe_load_float(gen->f, gen->one_reg, 1.0f);
136
137 spe_indent(gen->f, -4);
138 }
139
140 return gen->one_reg;
141 }
142
143
144 /**
145 * Return index of the pixel execution mask.
146 * The register is allocated an initialized upon the first call.
147 *
148 * The pixel execution mask controls which pixels in a quad are
149 * modified, according to surrounding conditionals, loops, etc.
150 */
151 static int
152 get_exec_mask_reg(struct codegen *gen)
153 {
154 if (gen->exec_mask_reg <= 0) {
155 gen->exec_mask_reg = spe_allocate_available_register(gen->f);
156
157 spe_indent(gen->f, 4);
158 spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:");
159
160 /* exec_mask = {~0, ~0, ~0, ~0} */
161 spe_load_int(gen->f, gen->exec_mask_reg, ~0);
162
163 spe_indent(gen->f, -4);
164 }
165
166 return gen->exec_mask_reg;
167 }
168
169
170 /**
171 * Return the index of the SPU temporary containing the named TGSI
172 * source register. If the TGSI register is a TGSI_FILE_TEMPORARY we
173 * just return the corresponding SPE register. If the TGIS register
174 * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register
175 * and emit an SPE load instruction.
176 */
177 static int
178 get_src_reg(struct codegen *gen,
179 int channel,
180 const struct tgsi_full_src_register *src)
181 {
182 int reg = -1;
183 int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
184 boolean reg_is_itemp = FALSE;
185 uint sign_op;
186
187 assert(swizzle >= TGSI_SWIZZLE_X);
188 assert(swizzle <= TGSI_EXTSWIZZLE_ONE);
189
190 if (swizzle == TGSI_EXTSWIZZLE_ONE) {
191 /* Load const one float and early out */
192 reg = get_const_one_reg(gen);
193 }
194 else if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
195 /* Load const zero float and early out */
196 reg = get_itemp(gen);
197 spe_xor(gen->f, reg, reg, reg);
198 }
199 else {
200 assert(swizzle < 4);
201
202 switch (src->SrcRegister.File) {
203 case TGSI_FILE_TEMPORARY:
204 reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
205 break;
206 case TGSI_FILE_INPUT:
207 {
208 /* offset is measured in quadwords, not bytes */
209 int offset = src->SrcRegister.Index * 4 + swizzle;
210 reg = get_itemp(gen);
211 reg_is_itemp = TRUE;
212 /* Load: reg = memory[(machine_reg) + offset] */
213 spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16);
214 }
215 break;
216 case TGSI_FILE_IMMEDIATE:
217 reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
218 break;
219 case TGSI_FILE_CONSTANT:
220 {
221 /* offset is measured in quadwords, not bytes */
222 int offset = src->SrcRegister.Index * 4 + swizzle;
223 reg = get_itemp(gen);
224 reg_is_itemp = TRUE;
225 /* Load: reg = memory[(machine_reg) + offset] */
226 spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
227 }
228 break;
229 case TGSI_FILE_SAMPLER:
230 {
231 reg = 3; /* XXX total hack */
232 }
233 break;
234 default:
235 assert(0);
236 }
237 }
238
239 /*
240 * Handle absolute value, negate or set-negative of src register.
241 */
242 sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
243 if (sign_op != TGSI_UTIL_SIGN_KEEP) {
244 /*
245 * All sign ops are done by manipulating bit 31, the IEEE float sign bit.
246 */
247 const int bit31mask_reg = get_itemp(gen);
248 int result_reg;
249
250 if (reg_is_itemp) {
251 /* re-use 'reg' for the result */
252 result_reg = reg;
253 }
254 else {
255 /* alloc a new reg for the result */
256 result_reg = get_itemp(gen);
257 }
258
259 /* mask with bit 31 set, the rest cleared */
260 spe_load_int(gen->f, bit31mask_reg, (1 << 31));
261
262 if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
263 spe_andc(gen->f, result_reg, reg, bit31mask_reg);
264 }
265 else if (sign_op == TGSI_UTIL_SIGN_SET) {
266 spe_and(gen->f, result_reg, reg, bit31mask_reg);
267 }
268 else {
269 assert(sign_op == TGSI_UTIL_SIGN_TOGGLE);
270 spe_xor(gen->f, result_reg, reg, bit31mask_reg);
271 }
272
273 reg = result_reg;
274 }
275
276 return reg;
277 }
278
279
280 /**
281 * Return the index of an SPE register to use for the given TGSI register.
282 * If the TGSI register is TGSI_FILE_TEMPORARAY, the index of the
283 * corresponding SPE register is returned. If the TGSI register is
284 * TGSI_FILE_OUTPUT we allocate an intermediate temporary register.
285 * See store_dest_reg() below...
286 */
287 static int
288 get_dst_reg(struct codegen *gen,
289 int channel,
290 const struct tgsi_full_dst_register *dest)
291 {
292 int reg = -1;
293
294 switch (dest->DstRegister.File) {
295 case TGSI_FILE_TEMPORARY:
296 if (gen->if_nesting > 0)
297 reg = get_itemp(gen);
298 else
299 reg = gen->temp_regs[dest->DstRegister.Index][channel];
300 break;
301 case TGSI_FILE_OUTPUT:
302 reg = get_itemp(gen);
303 break;
304 default:
305 assert(0);
306 }
307
308 return reg;
309 }
310
311
312 /**
313 * When a TGSI instruction is writing to an output register, this
314 * function emits the SPE store instruction to store the value_reg.
315 * \param value_reg the SPE register containing the value to store.
316 * This would have been returned by get_dst_reg().
317 */
318 static void
319 store_dest_reg(struct codegen *gen,
320 int value_reg, int channel,
321 const struct tgsi_full_dst_register *dest)
322 {
323 switch (dest->DstRegister.File) {
324 case TGSI_FILE_TEMPORARY:
325 if (gen->if_nesting > 0) {
326 int d_reg = gen->temp_regs[dest->DstRegister.Index][channel];
327 int exec_reg = get_exec_mask_reg(gen);
328 /* Mix d with new value according to exec mask:
329 * d[i] = mask_reg[i] ? value_reg : d_reg
330 */
331 spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg);
332 }
333 else {
334 /* we're not inside a condition or loop: do nothing special */
335
336 }
337 break;
338 case TGSI_FILE_OUTPUT:
339 {
340 /* offset is measured in quadwords, not bytes */
341 int offset = dest->DstRegister.Index * 4 + channel;
342 if (gen->if_nesting > 0) {
343 int exec_reg = get_exec_mask_reg(gen);
344 int curval_reg = get_itemp(gen);
345 /* First read the current value from memory:
346 * Load: curval = memory[(machine_reg) + offset]
347 */
348 spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
349 /* Mix curval with newvalue according to exec mask:
350 * d[i] = mask_reg[i] ? value_reg : d_reg
351 */
352 spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
353 /* Store: memory[(machine_reg) + offset] = curval */
354 spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
355 }
356 else {
357 /* Store: memory[(machine_reg) + offset] = reg */
358 spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16);
359 }
360 }
361 break;
362 default:
363 assert(0);
364 }
365 }
366
367
368
369 static void
370 emit_prologue(struct codegen *gen)
371 {
372 gen->frame_size = 256+128; /* XXX temporary */
373
374 spe_comment(gen->f, -4, "Function prologue:");
375
376 /* save $lr on stack # stqd $lr,16($sp) */
377 spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
378
379 /* save stack pointer # stqd $sp,-frameSize($sp) */
380 spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
381
382 /* adjust stack pointer # ai $sp,$sp,-frameSize */
383 spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
384 }
385
386
387 static void
388 emit_epilogue(struct codegen *gen)
389 {
390 spe_comment(gen->f, -4, "Function epilogue:");
391
392 /* restore stack pointer # ai $sp,$sp,frameSize */
393 spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
394
395 /* restore $lr # lqd $lr,16($sp) */
396 spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
397
398 /* return from function call */
399 spe_bi(gen->f, SPE_REG_RA, 0, 0);
400 }
401
402
403 static boolean
404 emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
405 {
406 int ch, src_reg[4], dst_reg[4];
407 spe_comment(gen->f, -4, "MOV:");
408 for (ch = 0; ch < 4; ch++) {
409 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
410 src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
411 dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
412 }
413 }
414 for (ch = 0; ch < 4; ch++) {
415 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
416 /* XXX we don't always need to actually emit a mov instruction here */
417 spe_move(gen->f, dst_reg[ch], src_reg[ch]);
418 store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
419 free_itemps(gen);
420 }
421 }
422 return true;
423 }
424
425 /**
426 * Emit addition instructions. Recall that a single TGSI_OPCODE_ADD
427 * becomes (up to) four SPU "fa" instructions because we're doing SOA
428 * processing.
429 */
430 static boolean
431 emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
432 {
433 int ch, s1_reg[4], s2_reg[4], d_reg[4];
434
435 spe_comment(gen->f, -4, "ADD:");
436 /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */
437 for (ch = 0; ch < 4; ch++) {
438 /* If the dest R, G, B or A writemask is enabled... */
439 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
440 s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
441 s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
442 d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
443 }
444 }
445 /* Loop over Red/Green/Blue/Alpha channels, do the add, store results */
446 for (ch = 0; ch < 4; ch++) {
447 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
448 /* Emit actual SPE instruction: d = s1 + s2 */
449 spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
450 /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
451 store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
452 /* Free any intermediate temps we allocated */
453 free_itemps(gen);
454 }
455 }
456 return true;
457 }
458
459 /**
460 * Emit subtract. See emit_ADD for comments.
461 */
462 static boolean
463 emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
464 {
465 int ch, s1_reg[4], s2_reg[4], d_reg[4];
466 spe_comment(gen->f, -4, "SUB:");
467 for (ch = 0; ch < 4; ch++) {
468 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
469 s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
470 s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
471 d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
472 }
473 }
474 for (ch = 0; ch < 4; ch++) {
475 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
476 /* d = s1 - s2 */
477 spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
478 store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
479 free_itemps(gen);
480 }
481 }
482 return true;
483 }
484
485 /**
486 * Emit multiply add. See emit_ADD for comments.
487 */
488 static boolean
489 emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
490 {
491 int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4];
492 spe_comment(gen->f, -4, "MAD:");
493 for (ch = 0; ch < 4; ch++) {
494 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
495 s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
496 s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
497 s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
498 d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
499 }
500 }
501 for (ch = 0; ch < 4; ch++) {
502 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
503 /* d = s1 * s2 + s3 */
504 spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]);
505 store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
506 free_itemps(gen);
507 }
508 }
509 return true;
510 }
511
512
513 /**
514 * Emit linear interpolate. See emit_ADD for comments.
515 */
516 static boolean
517 emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
518 {
519 int ch;
520 spe_comment(gen->f, -4, "LERP:");
521 for (ch = 0; ch < 4; ch++) {
522 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
523 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
524 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
525 int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
526 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
527 /* d = s3 + s1(s2 - s3) */
528 spe_fs(gen->f, d_reg, s2_reg, s3_reg);
529 spe_fma(gen->f, d_reg, d_reg, s1_reg, s3_reg);
530 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
531 free_itemps(gen);
532 }
533 }
534 return true;
535 }
536
537 /**
538 * Emit multiply. See emit_ADD for comments.
539 */
540 static boolean
541 emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
542 {
543 int ch, s1_reg[4], s2_reg[4], d_reg[4];
544 spe_comment(gen->f, -4, "MUL:");
545 for (ch = 0; ch < 4; ch++) {
546 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
547 s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
548 s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
549 d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
550 }
551 }
552 for (ch = 0; ch < 4; ch++) {
553 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
554 /* d = s1 * s2 */
555 spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
556 store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
557 free_itemps(gen);
558 }
559 }
560 return true;
561 }
562
563 /**
564 * Emit reciprocal. See emit_ADD for comments.
565 */
566 static boolean
567 emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst)
568 {
569 int ch;
570 spe_comment(gen->f, -4, "RCP:");
571 for (ch = 0; ch < 4; ch++) {
572 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
573 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
574 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
575 /* d = 1/s1 */
576 spe_frest(gen->f, d_reg, s1_reg);
577 spe_fi(gen->f, d_reg, s1_reg, d_reg);
578 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
579 free_itemps(gen);
580 }
581 }
582 return true;
583 }
584
585 /**
586 * Emit reciprocal sqrt. See emit_ADD for comments.
587 */
588 static boolean
589 emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
590 {
591 int ch;
592 spe_comment(gen->f, -4, "RSQ:");
593 for (ch = 0; ch < 4; ch++) {
594 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
595 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
596 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
597 /* d = 1/s1 */
598 spe_frsqest(gen->f, d_reg, s1_reg);
599 spe_fi(gen->f, d_reg, s1_reg, d_reg);
600 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
601 free_itemps(gen);
602 }
603 }
604 return true;
605 }
606
607 /**
608 * Emit absolute value. See emit_ADD for comments.
609 */
610 static boolean
611 emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
612 {
613 int ch;
614 spe_comment(gen->f, -4, "ABS:");
615 for (ch = 0; ch < 4; ch++) {
616 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
617 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
618 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
619 const int bit31mask_reg = get_itemp(gen);
620
621 /* mask with bit 31 set, the rest cleared */
622 spe_load_int(gen->f, bit31mask_reg, (1 << 31));
623
624 /* d = sign bit cleared in s1 */
625 spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
626
627 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
628 free_itemps(gen);
629 }
630 }
631 return true;
632 }
633
634 /**
635 * Emit 3 component dot product. See emit_ADD for comments.
636 */
637 static boolean
638 emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
639 {
640 int ch;
641 int s1x_reg, s1y_reg, s1z_reg;
642 int s2x_reg, s2y_reg, s2z_reg;
643 int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
644
645 spe_comment(gen->f, -4, "DP3:");
646
647 s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
648 s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
649 s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
650 s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
651 s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
652 s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
653
654 /* t0 = x0 * x1 */
655 spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg);
656
657 /* t1 = y0 * y1 */
658 spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg);
659
660 /* t0 = z0 * z1 + t0 */
661 spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg);
662
663 /* t0 = t0 + t1 */
664 spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
665
666 for (ch = 0; ch < 4; ch++) {
667 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
668 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
669 spe_move(gen->f, d_reg, t0_reg);
670 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
671 }
672 }
673
674 free_itemps(gen);
675 return true;
676 }
677
678 /**
679 * Emit 4 component dot product. See emit_ADD for comments.
680 */
681 static boolean
682 emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
683 {
684 int ch;
685 spe_comment(gen->f, -4, "DP4:");
686
687 int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
688 int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
689 int tmp_reg = get_itemp(gen);
690
691 /* t = x0 * x1 */
692 spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
693
694 s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
695 s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
696 /* t = y0 * y1 + t */
697 spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
698
699 s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
700 s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
701 /* t = z0 * z1 + t */
702 spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
703
704 s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
705 s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
706 /* t = w0 * w1 + t */
707 spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
708
709 for (ch = 0; ch < 4; ch++) {
710 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
711 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
712 spe_move(gen->f, d_reg, tmp_reg);
713 store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
714 }
715 }
716
717 free_itemps(gen);
718 return true;
719 }
720
721 /**
722 * Emit homogeneous dot product. See emit_ADD for comments.
723 */
724 static boolean
725 emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
726 {
727 int ch;
728 spe_comment(gen->f, -4, "DPH:");
729
730 int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
731 int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
732 int tmp_reg = get_itemp(gen);
733
734 /* t = x0 * x1 */
735 spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
736
737 s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
738 s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
739 /* t = y0 * y1 + t */
740 spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
741
742 s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
743 s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
744 /* t = z0 * z1 + t */
745 spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
746
747 s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
748 /* t = w1 + t */
749 spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg);
750
751 for (ch = 0; ch < 4; ch++) {
752 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
753 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
754 spe_move(gen->f, d_reg, tmp_reg);
755 store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
756 }
757 }
758
759 free_itemps(gen);
760 return true;
761 }
762
763 /**
764 * Emit cross product. See emit_ADD for comments.
765 */
766 static boolean
767 emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst)
768 {
769 spe_comment(gen->f, -4, "XPD:");
770
771 int s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
772 int s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
773 int tmp_reg = get_itemp(gen);
774
775 /* t = z0 * y1 */
776 spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
777
778 s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
779 s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
780 /* t = y0 * z1 - t */
781 spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
782
783 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_X)) {
784 store_dest_reg(gen, tmp_reg, CHAN_X, &inst->FullDstRegisters[0]);
785 }
786
787 s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
788 s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
789 /* t = x0 * z1 */
790 spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
791
792 s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
793 s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
794 /* t = z0 * x1 - t */
795 spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
796
797 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Y)) {
798 store_dest_reg(gen, tmp_reg, CHAN_Y, &inst->FullDstRegisters[0]);
799 }
800
801 s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
802 s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
803 /* t = y0 * x1 */
804 spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
805
806 s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
807 s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
808 /* t = x0 * y1 - t */
809 spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
810
811 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Z)) {
812 store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->FullDstRegisters[0]);
813 }
814
815 free_itemps(gen);
816 return true;
817 }
818
819 /**
820 * Emit set-if-greater-than.
821 * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
822 * the result but OpenGL/TGSI needs 0.0 and 1.0 results.
823 * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND.
824 */
825 static boolean
826 emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
827 {
828 int ch;
829
830 spe_comment(gen->f, -4, "SGT:");
831
832 for (ch = 0; ch < 4; ch++) {
833 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
834 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
835 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
836 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
837
838 /* d = (s1 > s2) */
839 spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
840
841 /* convert d from 0x0/0xffffffff to 0.0/1.0 */
842 /* d = d & one_reg */
843 spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
844
845 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
846 free_itemps(gen);
847 }
848 }
849
850 return true;
851 }
852
853 /**
854 * Emit set-if_less-then. See emit_SGT for comments.
855 */
856 static boolean
857 emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst)
858 {
859 int ch;
860
861 spe_comment(gen->f, -4, "SLT:");
862
863 for (ch = 0; ch < 4; ch++) {
864 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
865 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
866 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
867 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
868
869 /* d = (s1 < s2) */
870 spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
871
872 /* convert d from 0x0/0xffffffff to 0.0/1.0 */
873 /* d = d & one_reg */
874 spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
875
876 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
877 free_itemps(gen);
878 }
879 }
880
881 return true;
882 }
883
884 /**
885 * Emit set-if_greater-then-or-equal. See emit_SGT for comments.
886 */
887 static boolean
888 emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst)
889 {
890 int ch;
891
892 spe_comment(gen->f, -4, "SGE:");
893
894 for (ch = 0; ch < 4; ch++) {
895 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
896 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
897 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
898 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
899
900 /* d = (s1 >= s2) */
901 spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
902
903 /* convert d from 0x0/0xffffffff to 0.0/1.0 */
904 /* d = ~d & one_reg */
905 spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
906
907 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
908 free_itemps(gen);
909 }
910 }
911
912 return true;
913 }
914
915 /**
916 * Emit set-if_less-then-or-equal. See emit_SGT for comments.
917 */
918 static boolean
919 emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst)
920 {
921 int ch;
922
923 spe_comment(gen->f, -4, "SLE:");
924
925 for (ch = 0; ch < 4; ch++) {
926 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
927 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
928 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
929 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
930
931 /* d = (s1 <= s2) */
932 spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
933
934 /* convert d from 0x0/0xffffffff to 0.0/1.0 */
935 /* d = ~d & one_reg */
936 spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
937
938 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
939 free_itemps(gen);
940 }
941 }
942
943 return true;
944 }
945
946 /**
947 * Emit set-if_equal. See emit_SGT for comments.
948 */
949 static boolean
950 emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
951 {
952 int ch;
953
954 spe_comment(gen->f, -4, "SEQ:");
955
956 for (ch = 0; ch < 4; ch++) {
957 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
958 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
959 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
960 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
961
962 /* d = (s1 == s2) */
963 spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
964
965 /* convert d from 0x0/0xffffffff to 0.0/1.0 */
966 /* d = d & one_reg */
967 spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
968
969 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
970 free_itemps(gen);
971 }
972 }
973
974 return true;
975 }
976
977 /**
978 * Emit set-if_not_equal. See emit_SGT for comments.
979 */
980 static boolean
981 emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst)
982 {
983 int ch;
984
985 spe_comment(gen->f, -4, "SNE:");
986
987 for (ch = 0; ch < 4; ch++) {
988 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
989 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
990 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
991 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
992
993 /* d = (s1 != s2) */
994 spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
995 spe_nor(gen->f, d_reg, d_reg, d_reg);
996
997 /* convert d from 0x0/0xffffffff to 0.0/1.0 */
998 /* d = d & one_reg */
999 spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
1000
1001 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
1002 free_itemps(gen);
1003 }
1004 }
1005
1006 return true;
1007 }
1008
1009 /**
1010 * Emit compare. See emit_SGT for comments.
1011 */
1012 static boolean
1013 emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
1014 {
1015 int ch;
1016
1017 spe_comment(gen->f, -4, "CMP:");
1018
1019 for (ch = 0; ch < 4; ch++) {
1020 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
1021 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
1022 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
1023 int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
1024 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
1025 int zero_reg = get_itemp(gen);
1026
1027 spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
1028
1029 /* d = (s1 < 0) ? s2 : s3 */
1030 spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
1031 spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg);
1032
1033 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
1034 free_itemps(gen);
1035 }
1036 }
1037
1038 return true;
1039 }
1040
1041 /**
1042 * Emit trunc.
1043 * Convert float to signed int
1044 * Convert signed int to float
1045 */
1046 static boolean
1047 emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst)
1048 {
1049 int ch;
1050
1051 spe_comment(gen->f, -4, "TRUNC:");
1052
1053 for (ch = 0; ch < 4; ch++) {
1054 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
1055 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
1056 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
1057
1058 /* Convert float to int */
1059 spe_cflts(gen->f, d_reg, s1_reg, 0);
1060
1061 /* Convert int to float */
1062 spe_csflt(gen->f, d_reg, d_reg, 0);
1063
1064 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
1065 free_itemps(gen);
1066 }
1067 }
1068
1069 return true;
1070 }
1071
1072 /**
1073 * Emit floor.
1074 * If negative int subtract one
1075 * Convert float to signed int
1076 * Convert signed int to float
1077 */
1078 static boolean
1079 emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
1080 {
1081 int ch;
1082
1083 spe_comment(gen->f, -4, "FLR:");
1084
1085 int zero_reg = get_itemp(gen);
1086 spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
1087
1088 for (ch = 0; ch < 4; ch++) {
1089 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
1090 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
1091 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
1092 int tmp_reg = get_itemp(gen);
1093
1094 /* If negative, subtract 1.0 */
1095 spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
1096 spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
1097 spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
1098
1099 /* Convert float to int */
1100 spe_cflts(gen->f, d_reg, d_reg, 0);
1101
1102 /* Convert int to float */
1103 spe_csflt(gen->f, d_reg, d_reg, 0);
1104
1105 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
1106 free_itemps(gen);
1107 }
1108 }
1109
1110 return true;
1111 }
1112
1113 /**
1114 * Emit frac.
1115 * Input - FLR(Input)
1116 */
1117 static boolean
1118 emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
1119 {
1120 int ch;
1121
1122 spe_comment(gen->f, -4, "FLR:");
1123
1124 int zero_reg = get_itemp(gen);
1125 spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
1126
1127 for (ch = 0; ch < 4; ch++) {
1128 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
1129 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
1130 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
1131 int tmp_reg = get_itemp(gen);
1132
1133 /* If negative, subtract 1.0 */
1134 spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
1135 spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
1136 spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
1137
1138 /* Convert float to int */
1139 spe_cflts(gen->f, d_reg, d_reg, 0);
1140
1141 /* Convert int to float */
1142 spe_csflt(gen->f, d_reg, d_reg, 0);
1143
1144 /* d = s1 - FLR(s1) */
1145 spe_fs(gen->f, d_reg, s1_reg, d_reg);
1146
1147 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
1148 free_itemps(gen);
1149 }
1150 }
1151
1152 return true;
1153 }
1154
1155
#if 0
/**
 * Debug helper (compiled out): print the index, name and address of every
 * entry in the context's SPU function table.
 */
static void
print_functions(struct cell_context *cell)
{
   struct cell_spu_function_info *table = &cell->spu_functions;
   uint idx = 0;

   while (idx < table->num) {
      printf("SPU func %u: %s at %u\n",
             idx, table->names[idx], table->addrs[idx]);
      idx++;
   }
}
#endif
1168
1169
1170 static uint
1171 lookup_function(struct cell_context *cell, const char *funcname)
1172 {
1173 const struct cell_spu_function_info *funcs = &cell->spu_functions;
1174 uint i, addr = 0;
1175 for (i = 0; i < funcs->num; i++) {
1176 if (strcmp(funcs->names[i], funcname) == 0) {
1177 addr = funcs->addrs[i];
1178 }
1179 }
1180 assert(addr && "spu function not found");
1181 return addr / 4; /* discard 2 least significant bits */
1182 }
1183
1184
1185 /**
1186 * Emit code to call a SPU function.
1187 * Used to implement instructions like SIN/COS/POW/TEX/etc.
1188 */
1189 static boolean
1190 emit_function_call(struct codegen *gen,
1191 const struct tgsi_full_instruction *inst,
1192 char *funcname, uint num_args)
1193 {
1194 const uint addr = lookup_function(gen->cell, funcname);
1195 char comment[100];
1196 int ch;
1197
1198 assert(num_args <= 3);
1199
1200 snprintf(comment, sizeof(comment), "CALL %s:", funcname);
1201 spe_comment(gen->f, -4, comment);
1202
1203 for (ch = 0; ch < 4; ch++) {
1204 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
1205 int s_regs[3], d_reg;
1206 ubyte usedRegs[SPE_NUM_REGS];
1207 uint a, i, numUsed;
1208
1209 for (a = 0; a < num_args; a++) {
1210 s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
1211 }
1212 d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
1213
1214 numUsed = spe_get_registers_used(gen->f, usedRegs);
1215 assert(numUsed < gen->frame_size / 16 - 32);
1216
1217 /* save registers to stack */
1218 for (i = 0; i < numUsed; i++) {
1219 uint reg = usedRegs[i];
1220 int offset = 2 + i;
1221 spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
1222 }
1223
1224 /* setup function arguments */
1225 for (a = 0; a < num_args; a++) {
1226 spe_move(gen->f, 3 + a, s_regs[a]);
1227 }
1228
1229 /* branch to function, save return addr */
1230 spe_brasl(gen->f, SPE_REG_RA, addr);
1231
1232 /* save function's return value */
1233 spe_move(gen->f, d_reg, 3);
1234
1235 /* restore registers from stack */
1236 for (i = 0; i < numUsed; i++) {
1237 uint reg = usedRegs[i];
1238 if (reg != d_reg) {
1239 int offset = 2 + i;
1240 spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
1241 }
1242 }
1243
1244 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
1245 free_itemps(gen);
1246 }
1247 }
1248
1249 return true;
1250 }
1251
1252
1253 static boolean
1254 emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
1255 {
1256 const uint addr = lookup_function(gen->cell, "spu_txp");
1257 int ch;
1258 int coord_regs[4], d_regs[4];
1259
1260 spe_comment(gen->f, -4, "CALL txp:");
1261
1262 /* get src/dst reg info */
1263 for (ch = 0; ch < 4; ch++) {
1264 coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
1265 d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
1266 }
1267
1268 {
1269 ubyte usedRegs[SPE_NUM_REGS];
1270 uint i, numUsed;
1271
1272 numUsed = spe_get_registers_used(gen->f, usedRegs);
1273 assert(numUsed < gen->frame_size / 16 - 32);
1274
1275 /* save registers to stack */
1276 for (i = 0; i < numUsed; i++) {
1277 uint reg = usedRegs[i];
1278 int offset = 2 + i;
1279 spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
1280 }
1281
1282 /* setup function arguments */
1283 for (i = 0; i < 4; i++) {
1284 spe_move(gen->f, 3 + i, coord_regs[i]);
1285 }
1286
1287 /* branch to function, save return addr */
1288 spe_brasl(gen->f, SPE_REG_RA, addr);
1289
1290 /* save function's return values (four pixel's colors) */
1291 for (i = 0; i < 4; i++) {
1292 spe_move(gen->f, d_regs[i], 3 + i);
1293 }
1294
1295 /* restore registers from stack */
1296 for (i = 0; i < numUsed; i++) {
1297 uint reg = usedRegs[i];
1298 if (reg != d_regs[0] &&
1299 reg != d_regs[1] &&
1300 reg != d_regs[2] &&
1301 reg != d_regs[3]) {
1302 int offset = 2 + i;
1303 spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
1304 }
1305 }
1306 }
1307
1308 for (ch = 0; ch < 4; ch++) {
1309 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
1310 store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]);
1311 free_itemps(gen);
1312 }
1313 }
1314
1315 return TRUE;
1316 }
1317
1318
1319 /**
1320 * Emit max. See emit_SGT for comments.
1321 */
1322 static boolean
1323 emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
1324 {
1325 int ch;
1326
1327 spe_comment(gen->f, -4, "MAX:");
1328
1329 for (ch = 0; ch < 4; ch++) {
1330 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
1331 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
1332 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
1333 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
1334 int tmp_reg = get_itemp(gen);
1335
1336 /* d = (s1 > s2) ? s1 : s2 */
1337 spe_fcgt(gen->f, tmp_reg, s1_reg, s2_reg);
1338 spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
1339
1340 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
1341 free_itemps(gen);
1342 }
1343 }
1344
1345 return true;
1346 }
1347
1348 /**
1349 * Emit max. See emit_SGT for comments.
1350 */
1351 static boolean
1352 emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
1353 {
1354 int ch;
1355
1356 spe_comment(gen->f, -4, "MIN:");
1357
1358 for (ch = 0; ch < 4; ch++) {
1359 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
1360 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
1361 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
1362 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
1363 int tmp_reg = get_itemp(gen);
1364
1365 /* d = (s2 > s1) ? s1 : s2 */
1366 spe_fcgt(gen->f, tmp_reg, s2_reg, s1_reg);
1367 spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
1368
1369 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
1370 free_itemps(gen);
1371 }
1372 }
1373
1374 return true;
1375 }
1376
1377 static boolean
1378 emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
1379 {
1380 const int channel = 0;
1381 const int exec_reg = get_exec_mask_reg(gen);
1382
1383 spe_comment(gen->f, -4, "IF:");
1384
1385 /* update execution mask with the predicate register */
1386 int tmp_reg = get_itemp(gen);
1387 int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]);
1388
1389 /* tmp = (s1_reg == 0) */
1390 spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
1391 /* tmp = !tmp */
1392 spe_complement(gen->f, tmp_reg, tmp_reg);
1393 /* exec_mask = exec_mask & tmp */
1394 spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
1395
1396 gen->if_nesting++;
1397
1398 free_itemps(gen);
1399
1400 return true;
1401 }
1402
1403
1404 static boolean
1405 emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
1406 {
1407 const int exec_reg = get_exec_mask_reg(gen);
1408
1409 spe_comment(gen->f, -4, "ELSE:");
1410
1411 /* exec_mask = !exec_mask */
1412 spe_complement(gen->f, exec_reg, exec_reg);
1413
1414 return true;
1415 }
1416
1417
1418 static boolean
1419 emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
1420 {
1421 const int exec_reg = get_exec_mask_reg(gen);
1422
1423 spe_comment(gen->f, -4, "ENDIF:");
1424
1425 /* XXX todo: pop execution mask */
1426
1427 spe_load_int(gen->f, exec_reg, ~0x0);
1428
1429 gen->if_nesting--;
1430 return true;
1431 }
1432
1433
1434 static boolean
1435 emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
1436 boolean ddx)
1437 {
1438 int ch;
1439
1440 spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:");
1441
1442 for (ch = 0; ch < 4; ch++) {
1443 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
1444 int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
1445 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
1446
1447 int t1_reg = get_itemp(gen);
1448 int t2_reg = get_itemp(gen);
1449
1450 spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */
1451 if (ddx) {
1452 spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */
1453 }
1454 else {
1455 spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */
1456 }
1457 spe_fs(gen->f, d_reg, t2_reg, t1_reg);
1458
1459 free_itemps(gen);
1460 }
1461 }
1462
1463 return true;
1464 }
1465
1466
1467
1468
1469 /**
1470 * Emit END instruction.
1471 * We just return from the shader function at this point.
1472 *
1473 * Note that there may be more code after this that would be
1474 * called by TGSI_OPCODE_CALL.
1475 */
1476 static boolean
1477 emit_END(struct codegen *gen)
1478 {
1479 spe_comment(gen->f, -4, "END:");
1480 emit_epilogue(gen);
1481 return true;
1482 }
1483
1484
1485 /**
1486 * Emit code for the given instruction. Just a big switch stmt.
1487 */
1488 static boolean
1489 emit_instruction(struct codegen *gen,
1490 const struct tgsi_full_instruction *inst)
1491 {
1492 switch (inst->Instruction.Opcode) {
1493 case TGSI_OPCODE_MOV:
1494 case TGSI_OPCODE_SWZ:
1495 return emit_MOV(gen, inst);
1496 case TGSI_OPCODE_MUL:
1497 return emit_MUL(gen, inst);
1498 case TGSI_OPCODE_ADD:
1499 return emit_ADD(gen, inst);
1500 case TGSI_OPCODE_SUB:
1501 return emit_SUB(gen, inst);
1502 case TGSI_OPCODE_MAD:
1503 return emit_MAD(gen, inst);
1504 case TGSI_OPCODE_LERP:
1505 return emit_LERP(gen, inst);
1506 case TGSI_OPCODE_DP3:
1507 return emit_DP3(gen, inst);
1508 case TGSI_OPCODE_DP4:
1509 return emit_DP4(gen, inst);
1510 case TGSI_OPCODE_DPH:
1511 return emit_DPH(gen, inst);
1512 case TGSI_OPCODE_XPD:
1513 return emit_XPD(gen, inst);
1514 case TGSI_OPCODE_RCP:
1515 return emit_RCP(gen, inst);
1516 case TGSI_OPCODE_RSQ:
1517 return emit_RSQ(gen, inst);
1518 case TGSI_OPCODE_ABS:
1519 return emit_ABS(gen, inst);
1520 case TGSI_OPCODE_SGT:
1521 return emit_SGT(gen, inst);
1522 case TGSI_OPCODE_SLT:
1523 return emit_SLT(gen, inst);
1524 case TGSI_OPCODE_SGE:
1525 return emit_SGE(gen, inst);
1526 case TGSI_OPCODE_SLE:
1527 return emit_SLE(gen, inst);
1528 case TGSI_OPCODE_SEQ:
1529 return emit_SEQ(gen, inst);
1530 case TGSI_OPCODE_SNE:
1531 return emit_SNE(gen, inst);
1532 case TGSI_OPCODE_CMP:
1533 return emit_CMP(gen, inst);
1534 case TGSI_OPCODE_MAX:
1535 return emit_MAX(gen, inst);
1536 case TGSI_OPCODE_MIN:
1537 return emit_MIN(gen, inst);
1538 case TGSI_OPCODE_TRUNC:
1539 return emit_TRUNC(gen, inst);
1540 case TGSI_OPCODE_FLR:
1541 return emit_FLR(gen, inst);
1542 case TGSI_OPCODE_FRC:
1543 return emit_FRC(gen, inst);
1544 case TGSI_OPCODE_END:
1545 return emit_END(gen);
1546
1547 case TGSI_OPCODE_COS:
1548 return emit_function_call(gen, inst, "spu_cos", 1);
1549 case TGSI_OPCODE_SIN:
1550 return emit_function_call(gen, inst, "spu_sin", 1);
1551 case TGSI_OPCODE_POW:
1552 return emit_function_call(gen, inst, "spu_pow", 2);
1553 case TGSI_OPCODE_EXPBASE2:
1554 return emit_function_call(gen, inst, "spu_exp2", 1);
1555 case TGSI_OPCODE_LOGBASE2:
1556 return emit_function_call(gen, inst, "spu_log2", 1);
1557 case TGSI_OPCODE_TEX:
1558 /* fall-through for now */
1559 case TGSI_OPCODE_TXD:
1560 /* fall-through for now */
1561 case TGSI_OPCODE_TXP:
1562 return emit_TXP(gen, inst);
1563
1564 case TGSI_OPCODE_IF:
1565 return emit_IF(gen, inst);
1566 case TGSI_OPCODE_ELSE:
1567 return emit_ELSE(gen, inst);
1568 case TGSI_OPCODE_ENDIF:
1569 return emit_ENDIF(gen, inst);
1570
1571 case TGSI_OPCODE_DDX:
1572 return emit_DDX_DDY(gen, inst, true);
1573 case TGSI_OPCODE_DDY:
1574 return emit_DDX_DDY(gen, inst, false);
1575
1576 /* XXX lots more cases to do... */
1577
1578 default:
1579 fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n",
1580 inst->Instruction.Opcode);
1581 return false;
1582 }
1583
1584 return true;
1585 }
1586
1587
1588
1589 /**
1590 * Emit code for a TGSI immediate value (vector of four floats).
1591 * This involves register allocation and initialization.
1592 * XXX the initialization should be done by a "prepare" stage, not
1593 * per quad execution!
1594 */
1595 static boolean
1596 emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
1597 {
1598 int ch;
1599
1600 assert(gen->num_imm < MAX_TEMPS);
1601
1602 spe_comment(gen->f, -4, "IMMEDIATE:");
1603
1604 for (ch = 0; ch < 4; ch++) {
1605 float val = immed->u.ImmediateFloat32[ch].Float;
1606
1607 if (ch > 0 && val == immed->u.ImmediateFloat32[ch - 1].Float) {
1608 /* re-use previous register */
1609 gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1];
1610 }
1611 else {
1612 int reg = spe_allocate_available_register(gen->f);
1613
1614 if (reg < 0)
1615 return false;
1616
1617 /* update immediate map */
1618 gen->imm_regs[gen->num_imm][ch] = reg;
1619
1620 /* emit initializer instruction */
1621 spe_load_float(gen->f, reg, val);
1622 }
1623 }
1624
1625 gen->num_imm++;
1626
1627 return true;
1628 }
1629
1630
1631
1632 /**
1633 * Emit "code" for a TGSI declaration.
1634 * We only care about TGSI TEMPORARY register declarations at this time.
1635 * For each TGSI TEMPORARY we allocate four SPE registers.
1636 */
1637 static boolean
1638 emit_declaration(struct cell_context *cell,
1639 struct codegen *gen, const struct tgsi_full_declaration *decl)
1640 {
1641 int i, ch;
1642
1643 switch (decl->Declaration.File) {
1644 case TGSI_FILE_TEMPORARY:
1645 for (i = decl->DeclarationRange.First;
1646 i <= decl->DeclarationRange.Last;
1647 i++) {
1648 assert(i < MAX_TEMPS);
1649 for (ch = 0; ch < 4; ch++) {
1650 gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
1651 if (gen->temp_regs[i][ch] < 0)
1652 return false; /* out of regs */
1653 }
1654
1655 /* XXX if we run out of SPE registers, we need to spill
1656 * to SPU memory. someday...
1657 */
1658
1659 {
1660 char buf[100];
1661 sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i,
1662 gen->temp_regs[i][0], gen->temp_regs[i][1],
1663 gen->temp_regs[i][2], gen->temp_regs[i][3]);
1664 spe_comment(gen->f, -4, buf);
1665 }
1666 }
1667 break;
1668 default:
1669 ; /* ignore */
1670 }
1671
1672 return true;
1673 }
1674
1675
1676
1677 /**
1678 * Translate TGSI shader code to SPE instructions. This is done when
1679 * the state tracker gives us a new shader (via pipe->create_fs_state()).
1680 *
1681 * \param cell the rendering context (in)
1682 * \param tokens the TGSI shader (in)
1683 * \param f the generated function (out)
1684 */
1685 boolean
1686 cell_gen_fragment_program(struct cell_context *cell,
1687 const struct tgsi_token *tokens,
1688 struct spe_function *f)
1689 {
1690 struct tgsi_parse_context parse;
1691 struct codegen gen;
1692
1693 memset(&gen, 0, sizeof(gen));
1694 gen.cell = cell;
1695 gen.f = f;
1696
1697 /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
1698 gen.inputs_reg = 3; /* pointer to inputs array */
1699 gen.outputs_reg = 4; /* pointer to outputs array */
1700 gen.constants_reg = 5; /* pointer to constants array */
1701
1702 spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
1703 spe_allocate_register(f, gen.inputs_reg);
1704 spe_allocate_register(f, gen.outputs_reg);
1705 spe_allocate_register(f, gen.constants_reg);
1706
1707 if (cell->debug_flags & CELL_DEBUG_ASM) {
1708 spe_print_code(f, true);
1709 spe_indent(f, 8);
1710 printf("Begin %s\n", __FUNCTION__);
1711 tgsi_dump(tokens, 0);
1712 }
1713
1714 tgsi_parse_init(&parse, tokens);
1715
1716 emit_prologue(&gen);
1717
1718 while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
1719 tgsi_parse_token(&parse);
1720
1721 switch (parse.FullToken.Token.Type) {
1722 case TGSI_TOKEN_TYPE_IMMEDIATE:
1723 if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
1724 gen.error = true;
1725 break;
1726
1727 case TGSI_TOKEN_TYPE_DECLARATION:
1728 if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration))
1729 gen.error = true;
1730 break;
1731
1732 case TGSI_TOKEN_TYPE_INSTRUCTION:
1733 if (!emit_instruction(&gen, &parse.FullToken.FullInstruction))
1734 gen.error = true;
1735 break;
1736
1737 default:
1738 assert(0);
1739 }
1740 }
1741
1742 if (gen.error) {
1743 /* terminate the SPE code */
1744 return emit_END(&gen);
1745 }
1746
1747 if (cell->debug_flags & CELL_DEBUG_ASM) {
1748 printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
1749 printf("End %s\n", __FUNCTION__);
1750 }
1751
1752 tgsi_parse_free( &parse );
1753
1754 return !gen.error;
1755 }