cell: export CELL_DEBUG=asm to dump SPU assembly code
[mesa.git] / src / gallium / drivers / cell / ppu / cell_gen_fp.c
1 /**************************************************************************
2 *
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29
30 /**
31 * Generate SPU fragment program/shader code.
32 *
33 * Note that we generate SOA-style code here. So each TGSI instruction
34 * operates on four pixels (and is translated into four SPU instructions,
35 * generally speaking).
36 *
37 * \author Brian Paul
38 */
39
40
41 #include "pipe/p_defines.h"
42 #include "pipe/p_state.h"
43 #include "pipe/p_shader_tokens.h"
44 #include "tgsi/tgsi_parse.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_exec.h"
47 #include "tgsi/tgsi_dump.h"
48 #include "rtasm/rtasm_ppc_spe.h"
49 #include "util/u_memory.h"
50 #include "cell_context.h"
51 #include "cell_gen_fp.h"
52
53
54 #define MAX_TEMPS 16
55 #define MAX_IMMED 8
56
57
58 /**
59 * Context needed during code generation.
60 */
61 struct codegen
62 {
63 int inputs_reg; /**< 1st function parameter */
64 int outputs_reg; /**< 2nd function parameter */
65 int constants_reg; /**< 3rd function parameter */
66 int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */
67 int imm_regs[MAX_IMMED][4]; /**< maps TGSI immediates to SPE registers */
68
69 int num_imm; /**< number of immediates */
70
71 int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */
72
73 /** Per-instruction temps / intermediate temps */
74 int num_itemps;
75 int itemps[4];
76
77 /** Current IF/ELSE/ENDIF nesting level */
78 int if_nesting;
79 /** Index of execution mask register */
80 int exec_mask_reg;
81
82 struct spe_function *f;
83 boolean error;
84 };
85
86
87 /**
88 * Allocate an intermediate temporary register.
89 */
90 static int
91 get_itemp(struct codegen *gen)
92 {
93 int t = spe_allocate_available_register(gen->f);
94 assert(gen->num_itemps < Elements(gen->itemps));
95 gen->itemps[gen->num_itemps++] = t;
96 return t;
97 }
98
99 /**
100 * Free all intermediate temporary registers. To be called after each
101 * instruction has been emitted.
102 */
103 static void
104 free_itemps(struct codegen *gen)
105 {
106 int i;
107 for (i = 0; i < gen->num_itemps; i++) {
108 spe_release_register(gen->f, gen->itemps[i]);
109 }
110 gen->num_itemps = 0;
111 }
112
113
114 /**
115 * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}.
116 * The register is allocated and initialized upon the first call.
117 */
118 static int
119 get_const_one_reg(struct codegen *gen)
120 {
121 if (gen->one_reg <= 0) {
122 gen->one_reg = spe_allocate_available_register(gen->f);
123
124 spe_indent(gen->f, 4);
125 spe_comment(gen->f, -4, "INIT CONSTANT 1.0:");
126
127 /* one = {1.0, 1.0, 1.0, 1.0} */
128 spe_load_float(gen->f, gen->one_reg, 1.0f);
129
130 spe_indent(gen->f, -4);
131 }
132
133 return gen->one_reg;
134 }
135
136
137 /**
138 * Return index of the pixel execution mask.
139 * The register is allocated an initialized upon the first call.
140 *
141 * The pixel execution mask controls which pixels in a quad are
142 * modified, according to surrounding conditionals, loops, etc.
143 */
144 static int
145 get_exec_mask_reg(struct codegen *gen)
146 {
147 if (gen->exec_mask_reg <= 0) {
148 gen->exec_mask_reg = spe_allocate_available_register(gen->f);
149
150 spe_indent(gen->f, 4);
151 spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:");
152
153 /* exec_mask = {~0, ~0, ~0, ~0} */
154 spe_load_int(gen->f, gen->exec_mask_reg, ~0);
155
156 spe_indent(gen->f, -4);
157 }
158
159 return gen->exec_mask_reg;
160 }
161
162
163 /**
164 * Return the index of the SPU temporary containing the named TGSI
165 * source register. If the TGSI register is a TGSI_FILE_TEMPORARY we
166 * just return the corresponding SPE register. If the TGIS register
167 * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register
168 * and emit an SPE load instruction.
169 */
170 static int
171 get_src_reg(struct codegen *gen,
172 int channel,
173 const struct tgsi_full_src_register *src)
174 {
175 int reg = -1;
176 int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
177 boolean reg_is_itemp = FALSE;
178 uint sign_op;
179
180 assert(swizzle >= 0);
181 assert(swizzle <= 3);
182
183 channel = swizzle;
184
185 switch (src->SrcRegister.File) {
186 case TGSI_FILE_TEMPORARY:
187 reg = gen->temp_regs[src->SrcRegister.Index][channel];
188 break;
189 case TGSI_FILE_INPUT:
190 {
191 /* offset is measured in quadwords, not bytes */
192 int offset = src->SrcRegister.Index * 4 + channel;
193 reg = get_itemp(gen);
194 reg_is_itemp = TRUE;
195 /* Load: reg = memory[(machine_reg) + offset] */
196 spe_lqd(gen->f, reg, gen->inputs_reg, offset);
197 }
198 break;
199 case TGSI_FILE_IMMEDIATE:
200 reg = gen->imm_regs[src->SrcRegister.Index][channel];
201 break;
202 case TGSI_FILE_CONSTANT:
203 /* xxx fall-through for now / fix */
204 default:
205 assert(0);
206 }
207
208 /*
209 * Handle absolute value, negate or set-negative of src register.
210 */
211 sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
212 if (sign_op != TGSI_UTIL_SIGN_KEEP) {
213 /*
214 * All sign ops are done by manipulating bit 31, the IEEE float sign bit.
215 */
216 const int bit31mask_reg = get_itemp(gen);
217 int result_reg;
218
219 if (reg_is_itemp) {
220 /* re-use 'reg' for the result */
221 result_reg = reg;
222 }
223 else {
224 /* alloc a new reg for the result */
225 result_reg = get_itemp(gen);
226 }
227
228 /* mask with bit 31 set, the rest cleared */
229 spe_load_int(gen->f, bit31mask_reg, (1 << 31));
230
231 if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
232 spe_andc(gen->f, result_reg, reg, bit31mask_reg);
233 }
234 else if (sign_op == TGSI_UTIL_SIGN_SET) {
235 spe_and(gen->f, result_reg, reg, bit31mask_reg);
236 }
237 else {
238 assert(sign_op == TGSI_UTIL_SIGN_TOGGLE);
239 spe_xor(gen->f, result_reg, reg, bit31mask_reg);
240 }
241
242 reg = result_reg;
243 }
244
245 return reg;
246 }
247
248
249 /**
250 * Return the index of an SPE register to use for the given TGSI register.
251 * If the TGSI register is TGSI_FILE_TEMPORARAY, the index of the
252 * corresponding SPE register is returned. If the TGSI register is
253 * TGSI_FILE_OUTPUT we allocate an intermediate temporary register.
254 * See store_dest_reg() below...
255 */
256 static int
257 get_dst_reg(struct codegen *gen,
258 int channel,
259 const struct tgsi_full_dst_register *dest)
260 {
261 int reg = -1;
262
263 switch (dest->DstRegister.File) {
264 case TGSI_FILE_TEMPORARY:
265 if (gen->if_nesting > 0)
266 reg = get_itemp(gen);
267 else
268 reg = gen->temp_regs[dest->DstRegister.Index][channel];
269 break;
270 case TGSI_FILE_OUTPUT:
271 reg = get_itemp(gen);
272 break;
273 default:
274 assert(0);
275 }
276
277 return reg;
278 }
279
280
281 /**
282 * When a TGSI instruction is writing to an output register, this
283 * function emits the SPE store instruction to store the value_reg.
284 * \param value_reg the SPE register containing the value to store.
285 * This would have been returned by get_dst_reg().
286 */
287 static void
288 store_dest_reg(struct codegen *gen,
289 int value_reg, int channel,
290 const struct tgsi_full_dst_register *dest)
291 {
292 switch (dest->DstRegister.File) {
293 case TGSI_FILE_TEMPORARY:
294 if (gen->if_nesting > 0) {
295 int d_reg = gen->temp_regs[dest->DstRegister.Index][channel];
296 int exec_reg = get_exec_mask_reg(gen);
297 /* Mix d with new value according to exec mask:
298 * d[i] = mask_reg[i] ? value_reg : d_reg
299 */
300 spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg);
301 }
302 else {
303 /* we're not inside a condition or loop: do nothing special */
304 }
305 break;
306 case TGSI_FILE_OUTPUT:
307 {
308 /* offset is measured in quadwords, not bytes */
309 int offset = dest->DstRegister.Index * 4 + channel;
310 if (gen->if_nesting > 0) {
311 int exec_reg = get_exec_mask_reg(gen);
312 int curval_reg = get_itemp(gen);
313 /* First read the current value from memory:
314 * Load: curval = memory[(machine_reg) + offset]
315 */
316 spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset);
317 /* Mix curval with newvalue according to exec mask:
318 * d[i] = mask_reg[i] ? value_reg : d_reg
319 */
320 spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
321 /* Store: memory[(machine_reg) + offset] = curval */
322 spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset);
323 }
324 else {
325 /* Store: memory[(machine_reg) + offset] = reg */
326 spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
327 }
328 }
329 break;
330 default:
331 assert(0);
332 }
333 }
334
335
336 static boolean
337 emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
338 {
339 int ch;
340 spe_comment(gen->f, -4, "MOV:");
341 for (ch = 0; ch < 4; ch++) {
342 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
343 int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
344 int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
345 /* XXX we don't always need to actually emit a mov instruction here */
346 spe_move(gen->f, dst_reg, src_reg);
347 store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
348 free_itemps(gen);
349 }
350 }
351 return true;
352 }
353
354
355
356 /**
357 * Emit addition instructions. Recall that a single TGSI_OPCODE_ADD
358 * becomes (up to) four SPU "fa" instructions because we're doing SOA
359 * processing.
360 */
361 static boolean
362 emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
363 {
364 int ch;
365 spe_comment(gen->f, -4, "ADD:");
366 /* Loop over Red/Green/Blue/Alpha channels */
367 for (ch = 0; ch < 4; ch++) {
368 /* If the dest R, G, B or A writemask is enabled... */
369 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
370 /* get indexes of the two src, one dest SPE registers */
371 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
372 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
373 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
374
375 /* Emit actual SPE instruction: d = s1 + s2 */
376 spe_fa(gen->f, d_reg, s1_reg, s2_reg);
377
378 /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
379 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
380 /* Free any intermediate temps we allocated */
381 free_itemps(gen);
382 }
383 }
384 return true;
385 }
386
387 /**
388 * Emit subtract. See emit_ADD for comments.
389 */
390 static boolean
391 emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
392 {
393 int ch;
394 spe_comment(gen->f, -4, "SUB:");
395 /* Loop over Red/Green/Blue/Alpha channels */
396 for (ch = 0; ch < 4; ch++) {
397 /* If the dest R, G, B or A writemask is enabled... */
398 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
399 /* get indexes of the two src, one dest SPE registers */
400 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
401 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
402 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
403
404 /* Emit actual SPE instruction: d = s1 - s2 */
405 spe_fs(gen->f, d_reg, s1_reg, s2_reg);
406
407 /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
408 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
409 /* Free any intermediate temps we allocated */
410 free_itemps(gen);
411 }
412 }
413 return true;
414 }
415
416 /**
417 * Emit multiply add. See emit_ADD for comments.
418 */
419 static boolean
420 emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
421 {
422 int ch;
423 spe_comment(gen->f, -4, "MAD:");
424 for (ch = 0; ch < 4; ch++) {
425 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
426 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
427 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
428 int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
429 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
430 /* d = s1 * s2 + s3 */
431 spe_fma(gen->f, d_reg, s1_reg, s2_reg, s3_reg);
432 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
433 free_itemps(gen);
434 }
435 }
436 return true;
437 }
438
439
440 /**
441 * Emit linear interpolate. See emit_ADD for comments.
442 */
443 static boolean
444 emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
445 {
446 int ch;
447 spe_comment(gen->f, -4, "LERP:");
448 for (ch = 0; ch < 4; ch++) {
449 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
450 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
451 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
452 int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
453 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
454 /* d = s3 + s1(s2 - s3) */
455 spe_fs(gen->f, d_reg, s2_reg, s3_reg);
456 spe_fm(gen->f, d_reg, d_reg, s1_reg);
457 spe_fa(gen->f, d_reg, d_reg, s3_reg);
458 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
459 free_itemps(gen);
460 }
461 }
462 return true;
463 }
464
465 /**
466 * Emit multiply. See emit_ADD for comments.
467 */
468 static boolean
469 emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
470 {
471 int ch;
472 spe_comment(gen->f, -4, "MUL:");
473 for (ch = 0; ch < 4; ch++) {
474 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
475 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
476 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
477 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
478 /* d = s1 * s2 */
479 spe_fm(gen->f, d_reg, s1_reg, s2_reg);
480 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
481 free_itemps(gen);
482 }
483 }
484 return true;
485 }
486
487 /**
488 * Emit absolute value. See emit_ADD for comments.
489 */
490 static boolean
491 emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
492 {
493 int ch;
494 spe_comment(gen->f, -4, "ABS:");
495 for (ch = 0; ch < 4; ch++) {
496 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
497 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
498 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
499 const int bit31mask_reg = get_itemp(gen);
500
501 /* mask with bit 31 set, the rest cleared */
502 spe_load_int(gen->f, bit31mask_reg, (1 << 31));
503
504 /* d = sign bit cleared in s1 */
505 spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
506
507 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
508 free_itemps(gen);
509 }
510 }
511 return true;
512 }
513
514 /**
515 * Emit set-if-greater-than.
516 * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
517 * the result but OpenGL/TGSI needs 0.0 and 1.0 results.
518 * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND.
519 */
520 static boolean
521 emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
522 {
523 int ch;
524
525 spe_comment(gen->f, -4, "SGT:");
526
527 for (ch = 0; ch < 4; ch++) {
528 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
529 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
530 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
531 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
532
533 /* d = (s1 > s2) */
534 spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
535
536 /* convert d from 0x0/0xffffffff to 0.0/1.0 */
537 /* d = d & one_reg */
538 spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
539
540 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
541 free_itemps(gen);
542 }
543 }
544
545 return true;
546 }
547
548 /**
549 * Emit set-if_less-then. See emit_SGT for comments.
550 */
551 static boolean
552 emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst)
553 {
554 int ch;
555
556 spe_comment(gen->f, -4, "SLT:");
557
558 for (ch = 0; ch < 4; ch++) {
559 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
560 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
561 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
562 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
563
564 /* d = (s1 < s2) */
565 spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
566
567 /* convert d from 0x0/0xffffffff to 0.0/1.0 */
568 /* d = d & one_reg */
569 spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
570
571 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
572 free_itemps(gen);
573 }
574 }
575
576 return true;
577 }
578
579 /**
580 * Emit set-if_equal. See emit_SGT for comments.
581 */
582 static boolean
583 emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
584 {
585 int ch;
586
587 spe_comment(gen->f, -4, "SEQ:");
588
589 for (ch = 0; ch < 4; ch++) {
590 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
591 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
592 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
593 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
594
595 /* d = (s1 == s2) */
596 spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
597
598 /* convert d from 0x0/0xffffffff to 0.0/1.0 */
599 /* d = d & one_reg */
600 spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
601
602 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
603 free_itemps(gen);
604 }
605 }
606
607 return true;
608 }
609
610 /**
611 * Emit set-if_not_equal. See emit_SGT for comments.
612 */
613 static boolean
614 emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst)
615 {
616 int ch;
617
618 spe_comment(gen->f, -4, "SNE:");
619
620 for (ch = 0; ch < 4; ch++) {
621 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
622 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
623 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
624 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
625
626 /* d = (s1 != s2) */
627 spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
628 spe_nor(gen->f, d_reg, d_reg, d_reg);
629
630 /* convert d from 0x0/0xffffffff to 0.0/1.0 */
631 /* d = d & one_reg */
632 spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
633
634 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
635 free_itemps(gen);
636 }
637 }
638
639 return true;
640 }
641
642 /**
643 * Emit max. See emit_SGT for comments.
644 */
645 static boolean
646 emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
647 {
648 int ch;
649
650 spe_comment(gen->f, -4, "MAX:");
651
652 for (ch = 0; ch < 4; ch++) {
653 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
654 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
655 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
656 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
657
658 /* d = (s1 > s2) ? s1 : s2 */
659 spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
660 spe_and(gen->f, d_reg, d_reg, s1_reg);
661 spe_nor(gen->f, d_reg, d_reg, d_reg);
662 spe_and(gen->f, d_reg, d_reg, s2_reg);
663
664 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
665 free_itemps(gen);
666 }
667 }
668
669 return true;
670 }
671
672 /**
673 * Emit max. See emit_SGT for comments.
674 */
675 static boolean
676 emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
677 {
678 int ch;
679
680 spe_comment(gen->f, -4, "MIN:");
681
682 for (ch = 0; ch < 4; ch++) {
683 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
684 int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
685 int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
686 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
687
688 /* d = (s1 < s2) ? s1 : s2 */
689 spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
690 spe_and(gen->f, d_reg, d_reg, s1_reg);
691 spe_nor(gen->f, d_reg, d_reg, d_reg);
692 spe_and(gen->f, d_reg, d_reg, s2_reg);
693
694 store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
695 free_itemps(gen);
696 }
697 }
698
699 return true;
700 }
701
702 static boolean
703 emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
704 {
705 const int channel = 0;
706 const int exec_reg = get_exec_mask_reg(gen);
707
708 spe_comment(gen->f, -4, "IF:");
709
710 /* update execution mask with the predicate register */
711 int tmp_reg = get_itemp(gen);
712 int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]);
713
714 /* tmp = (s1_reg == 0) */
715 spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
716 /* tmp = !tmp */
717 spe_complement(gen->f, tmp_reg);
718 /* exec_mask = exec_mask & tmp */
719 spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
720
721 gen->if_nesting++;
722
723 free_itemps(gen);
724
725 return true;
726 }
727
728
729 static boolean
730 emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
731 {
732 const int exec_reg = get_exec_mask_reg(gen);
733
734 spe_comment(gen->f, -4, "ELSE:");
735
736 /* exec_mask = !exec_mask */
737 spe_complement(gen->f, exec_reg);
738
739 return true;
740 }
741
742
743 static boolean
744 emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
745 {
746 const int exec_reg = get_exec_mask_reg(gen);
747
748 spe_comment(gen->f, -4, "ENDIF:");
749
750 /* XXX todo: pop execution mask */
751
752 spe_load_int(gen->f, exec_reg, ~0x0);
753
754 gen->if_nesting--;
755 return true;
756 }
757
758
759 static boolean
760 emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
761 boolean ddx)
762 {
763 int ch;
764
765 spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:");
766
767 for (ch = 0; ch < 4; ch++) {
768 if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
769 int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
770 int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
771
772 int t1_reg = get_itemp(gen);
773 int t2_reg = get_itemp(gen);
774
775 spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */
776 if (ddx) {
777 spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */
778 }
779 else {
780 spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */
781 }
782 spe_fs(gen->f, d_reg, t2_reg, t1_reg);
783
784 free_itemps(gen);
785 }
786 }
787
788 return true;
789 }
790
791
792
793
794 /**
795 * Emit END instruction.
796 * We just return from the shader function at this point.
797 *
798 * Note that there may be more code after this that would be
799 * called by TGSI_OPCODE_CALL.
800 */
801 static boolean
802 emit_END(struct codegen *gen)
803 {
804 spe_comment(gen->f, -4, "END:");
805 /* return from function call */
806 spe_bi(gen->f, SPE_REG_RA, 0, 0);
807 return true;
808 }
809
810
811 /**
812 * Emit code for the given instruction. Just a big switch stmt.
813 */
814 static boolean
815 emit_instruction(struct codegen *gen,
816 const struct tgsi_full_instruction *inst)
817 {
818 switch (inst->Instruction.Opcode) {
819 case TGSI_OPCODE_MOV:
820 return emit_MOV(gen, inst);
821 case TGSI_OPCODE_MUL:
822 return emit_MUL(gen, inst);
823 case TGSI_OPCODE_ADD:
824 return emit_ADD(gen, inst);
825 case TGSI_OPCODE_SUB:
826 return emit_SUB(gen, inst);
827 case TGSI_OPCODE_MAD:
828 return emit_MAD(gen, inst);
829 case TGSI_OPCODE_LERP:
830 return emit_LERP(gen, inst);
831 case TGSI_OPCODE_ABS:
832 return emit_ABS(gen, inst);
833 case TGSI_OPCODE_SGT:
834 return emit_SGT(gen, inst);
835 case TGSI_OPCODE_SLT:
836 return emit_SLT(gen, inst);
837 case TGSI_OPCODE_SEQ:
838 return emit_SEQ(gen, inst);
839 case TGSI_OPCODE_SNE:
840 return emit_SNE(gen, inst);
841 case TGSI_OPCODE_MAX:
842 return emit_MAX(gen, inst);
843 case TGSI_OPCODE_MIN:
844 return emit_MIN(gen, inst);
845 case TGSI_OPCODE_END:
846 return emit_END(gen);
847
848 case TGSI_OPCODE_IF:
849 return emit_IF(gen, inst);
850 case TGSI_OPCODE_ELSE:
851 return emit_ELSE(gen, inst);
852 case TGSI_OPCODE_ENDIF:
853 return emit_ENDIF(gen, inst);
854
855 case TGSI_OPCODE_DDX:
856 return emit_DDX_DDY(gen, inst, true);
857 case TGSI_OPCODE_DDY:
858 return emit_DDX_DDY(gen, inst, false);
859
860 /* XXX lots more cases to do... */
861
862 default:
863 fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n",
864 inst->Instruction.Opcode);
865 return false;
866 }
867
868 return true;
869 }
870
871
872
873 /**
874 * Emit code for a TGSI immediate value (vector of four floats).
875 * This involves register allocation and initialization.
876 * XXX the initialization should be done by a "prepare" stage, not
877 * per quad execution!
878 */
879 static boolean
880 emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
881 {
882 int ch;
883
884 assert(gen->num_imm < MAX_TEMPS);
885
886 spe_comment(gen->f, -4, "IMMEDIATE:");
887
888 for (ch = 0; ch < 4; ch++) {
889 float val = immed->u.ImmediateFloat32[ch].Float;
890 int reg = spe_allocate_available_register(gen->f);
891
892 if (reg < 0)
893 return false;
894
895 /* update immediate map */
896 gen->imm_regs[gen->num_imm][ch] = reg;
897
898 /* emit initializer instruction */
899 spe_load_float(gen->f, reg, val);
900 }
901
902 gen->num_imm++;
903
904 return true;
905 }
906
907
908
909 /**
910 * Emit "code" for a TGSI declaration.
911 * We only care about TGSI TEMPORARY register declarations at this time.
912 * For each TGSI TEMPORARY we allocate four SPE registers.
913 */
914 static boolean
915 emit_declaration(struct cell_context *cell,
916 struct codegen *gen, const struct tgsi_full_declaration *decl)
917 {
918 int i, ch;
919
920 switch (decl->Declaration.File) {
921 case TGSI_FILE_TEMPORARY:
922 if (cell->debug_flags & CELL_DEBUG_ASM) {
923 printf("Declare temp reg %d .. %d\n",
924 decl->DeclarationRange.First,
925 decl->DeclarationRange.Last);
926 }
927
928 for (i = decl->DeclarationRange.First;
929 i <= decl->DeclarationRange.Last;
930 i++) {
931 assert(i < MAX_TEMPS);
932 for (ch = 0; ch < 4; ch++) {
933 gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
934 if (gen->temp_regs[i][ch] < 0)
935 return false; /* out of regs */
936 }
937
938 /* XXX if we run out of SPE registers, we need to spill
939 * to SPU memory. someday...
940 */
941
942 if (cell->debug_flags & CELL_DEBUG_ASM) {
943 printf(" SPE regs: %d %d %d %d\n",
944 gen->temp_regs[i][0],
945 gen->temp_regs[i][1],
946 gen->temp_regs[i][2],
947 gen->temp_regs[i][3]);
948 }
949 }
950 break;
951 default:
952 ; /* ignore */
953 }
954
955 return true;
956 }
957
958
959 /**
960 * Translate TGSI shader code to SPE instructions. This is done when
961 * the state tracker gives us a new shader (via pipe->create_fs_state()).
962 *
963 * \param cell the rendering context (in)
964 * \param tokens the TGSI shader (in)
965 * \param f the generated function (out)
966 */
967 boolean
968 cell_gen_fragment_program(struct cell_context *cell,
969 const struct tgsi_token *tokens,
970 struct spe_function *f)
971 {
972 struct tgsi_parse_context parse;
973 struct codegen gen;
974
975 memset(&gen, 0, sizeof(gen));
976 gen.f = f;
977
978 /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
979 gen.inputs_reg = 3; /* pointer to inputs array */
980 gen.outputs_reg = 4; /* pointer to outputs array */
981 gen.constants_reg = 5; /* pointer to constants array */
982
983 spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
984 spe_allocate_register(f, gen.inputs_reg);
985 spe_allocate_register(f, gen.outputs_reg);
986 spe_allocate_register(f, gen.constants_reg);
987
988 if (cell->debug_flags & CELL_DEBUG_ASM) {
989 spe_print_code(f, true);
990 spe_indent(f, 8);
991 printf("Begin %s\n", __FUNCTION__);
992 tgsi_dump(tokens, 0);
993 }
994
995 tgsi_parse_init(&parse, tokens);
996
997 while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
998 tgsi_parse_token(&parse);
999
1000 switch (parse.FullToken.Token.Type) {
1001 case TGSI_TOKEN_TYPE_IMMEDIATE:
1002 if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
1003 gen.error = true;
1004 break;
1005
1006 case TGSI_TOKEN_TYPE_DECLARATION:
1007 if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration))
1008 gen.error = true;
1009 break;
1010
1011 case TGSI_TOKEN_TYPE_INSTRUCTION:
1012 if (!emit_instruction(&gen, &parse.FullToken.FullInstruction))
1013 gen.error = true;
1014 break;
1015
1016 default:
1017 assert(0);
1018 }
1019 }
1020
1021
1022 if (gen.error) {
1023 /* terminate the SPE code */
1024 return emit_END(&gen);
1025 }
1026
1027 if (cell->debug_flags & CELL_DEBUG_ASM) {
1028 printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
1029 printf("End %s\n", __FUNCTION__);
1030 }
1031
1032 tgsi_parse_free( &parse );
1033
1034 return !gen.error;
1035 }