1 /**************************************************************************
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
5 * Copyright 2009 VMware, Inc. All rights reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 **************************************************************************/
32 * Generate SPU fragment program/shader code.
34 * Note that we generate SOA-style code here. So each TGSI instruction
35 * operates on four pixels (and is translated into four SPU instructions,
36 * generally speaking).
42 #include "pipe/p_defines.h"
43 #include "pipe/p_state.h"
44 #include "pipe/p_shader_tokens.h"
45 #include "tgsi/tgsi_parse.h"
46 #include "tgsi/tgsi_util.h"
47 #include "tgsi/tgsi_exec.h"
48 #include "tgsi/tgsi_dump.h"
49 #include "rtasm/rtasm_ppc_spe.h"
50 #include "util/u_memory.h"
51 #include "cell_context.h"
52 #include "cell_gen_fp.h"
64 * Context needed during code generation.
68 struct cell_context
*cell
;
69 int inputs_reg
; /**< 1st function parameter */
70 int outputs_reg
; /**< 2nd function parameter */
71 int constants_reg
; /**< 3rd function parameter */
72 int temp_regs
[MAX_TEMPS
][4]; /**< maps TGSI temps to SPE registers */
73 int imm_regs
[MAX_IMMED
][4]; /**< maps TGSI immediates to SPE registers */
75 int num_imm
; /**< number of immediates */
77 int one_reg
; /**< register containing {1.0, 1.0, 1.0, 1.0} */
79 int addr_reg
; /**< address register, integer values */
81 /** Per-instruction temps / intermediate temps */
85 /** Current IF/ELSE/ENDIF nesting level */
87 /** Current BGNLOOP/ENDLOOP nesting level */
89 /** Location of start of current loop */
92 /** Index of if/conditional mask register */
94 /** Index of loop mask register */
97 /** Index of master execution mask register */
100 /** KIL mask: indicates which fragments have been killed */
103 int frame_size
; /**< Stack frame size, in words */
105 struct spe_function
*f
;
111 * Allocate an intermediate temporary register.
114 get_itemp(struct codegen
*gen
)
116 int t
= spe_allocate_available_register(gen
->f
);
117 assert(gen
->num_itemps
< Elements(gen
->itemps
));
118 gen
->itemps
[gen
->num_itemps
++] = t
;
123 * Free all intermediate temporary registers. To be called after each
124 * instruction has been emitted.
127 free_itemps(struct codegen
*gen
)
130 for (i
= 0; i
< gen
->num_itemps
; i
++) {
131 spe_release_register(gen
->f
, gen
->itemps
[i
]);
138 * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}.
139 * The register is allocated and initialized upon the first call.
142 get_const_one_reg(struct codegen
*gen
)
144 if (gen
->one_reg
<= 0) {
145 gen
->one_reg
= spe_allocate_available_register(gen
->f
);
147 spe_indent(gen
->f
, 4);
148 spe_comment(gen
->f
, -4, "init constant reg = 1.0:");
150 /* one = {1.0, 1.0, 1.0, 1.0} */
151 spe_load_float(gen
->f
, gen
->one_reg
, 1.0f
);
153 spe_indent(gen
->f
, -4);
161 * Return index of the address register.
162 * Used for indirect register loads/stores.
165 get_address_reg(struct codegen
*gen
)
167 if (gen
->addr_reg
<= 0) {
168 gen
->addr_reg
= spe_allocate_available_register(gen
->f
);
170 spe_indent(gen
->f
, 4);
171 spe_comment(gen
->f
, -4, "init address reg = 0:");
173 /* init addr = {0, 0, 0, 0} */
174 spe_zero(gen
->f
, gen
->addr_reg
);
176 spe_indent(gen
->f
, -4);
179 return gen
->addr_reg
;
184 * Return index of the master execution mask.
185 * The register is allocated an initialized upon the first call.
187 * The master execution mask controls which pixels in a quad are
188 * modified, according to surrounding conditionals, loops, etc.
191 get_exec_mask_reg(struct codegen
*gen
)
193 if (gen
->exec_mask_reg
<= 0) {
194 gen
->exec_mask_reg
= spe_allocate_available_register(gen
->f
);
196 /* XXX this may not be needed */
197 spe_comment(gen
->f
, 0*-4, "initialize master execution mask = ~0");
198 spe_load_int(gen
->f
, gen
->exec_mask_reg
, ~0);
201 return gen
->exec_mask_reg
;
205 /** Return index of the conditional (if/else) execution mask register */
207 get_cond_mask_reg(struct codegen
*gen
)
209 if (gen
->cond_mask_reg
<= 0) {
210 gen
->cond_mask_reg
= spe_allocate_available_register(gen
->f
);
213 return gen
->cond_mask_reg
;
217 /** Return index of the loop execution mask register */
219 get_loop_mask_reg(struct codegen
*gen
)
221 if (gen
->loop_mask_reg
<= 0) {
222 gen
->loop_mask_reg
= spe_allocate_available_register(gen
->f
);
225 return gen
->loop_mask_reg
;
231 is_register_src(struct codegen
*gen
, int channel
,
232 const struct tgsi_full_src_register
*src
)
234 int swizzle
= tgsi_util_get_full_src_register_extswizzle(src
, channel
);
235 int sign_op
= tgsi_util_get_full_src_register_sign_mode(src
, channel
);
237 if (swizzle
> TGSI_SWIZZLE_W
|| sign_op
!= TGSI_UTIL_SIGN_KEEP
) {
240 if (src
->SrcRegister
.File
== TGSI_FILE_TEMPORARY
||
241 src
->SrcRegister
.File
== TGSI_FILE_IMMEDIATE
) {
249 is_memory_dst(struct codegen
*gen
, int channel
,
250 const struct tgsi_full_dst_register
*dst
)
252 if (dst
->DstRegister
.File
== TGSI_FILE_OUTPUT
) {
262 * Return the index of the SPU temporary containing the named TGSI
263 * source register. If the TGSI register is a TGSI_FILE_TEMPORARY we
264 * just return the corresponding SPE register. If the TGIS register
265 * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register
266 * and emit an SPE load instruction.
269 get_src_reg(struct codegen
*gen
,
271 const struct tgsi_full_src_register
*src
)
274 int swizzle
= tgsi_util_get_full_src_register_extswizzle(src
, channel
);
275 boolean reg_is_itemp
= FALSE
;
278 assert(swizzle
>= TGSI_SWIZZLE_X
);
279 assert(swizzle
<= TGSI_EXTSWIZZLE_ONE
);
281 if (swizzle
== TGSI_EXTSWIZZLE_ONE
) {
282 /* Load const one float and early out */
283 reg
= get_const_one_reg(gen
);
285 else if (swizzle
== TGSI_EXTSWIZZLE_ZERO
) {
286 /* Load const zero float and early out */
287 reg
= get_itemp(gen
);
288 spe_xor(gen
->f
, reg
, reg
, reg
);
291 int index
= src
->SrcRegister
.Index
;
295 if (src
->SrcRegister
.Indirect
) {
299 switch (src
->SrcRegister
.File
) {
300 case TGSI_FILE_TEMPORARY
:
301 reg
= gen
->temp_regs
[index
][swizzle
];
303 case TGSI_FILE_INPUT
:
305 /* offset is measured in quadwords, not bytes */
306 int offset
= index
* 4 + swizzle
;
307 reg
= get_itemp(gen
);
309 /* Load: reg = memory[(machine_reg) + offset] */
310 spe_lqd(gen
->f
, reg
, gen
->inputs_reg
, offset
* 16);
313 case TGSI_FILE_IMMEDIATE
:
314 reg
= gen
->imm_regs
[index
][swizzle
];
316 case TGSI_FILE_CONSTANT
:
318 /* offset is measured in quadwords, not bytes */
319 int offset
= index
* 4 + swizzle
;
320 reg
= get_itemp(gen
);
322 /* Load: reg = memory[(machine_reg) + offset] */
323 spe_lqd(gen
->f
, reg
, gen
->constants_reg
, offset
* 16);
332 * Handle absolute value, negate or set-negative of src register.
334 sign_op
= tgsi_util_get_full_src_register_sign_mode(src
, channel
);
335 if (sign_op
!= TGSI_UTIL_SIGN_KEEP
) {
337 * All sign ops are done by manipulating bit 31, the IEEE float sign bit.
339 const int bit31mask_reg
= get_itemp(gen
);
343 /* re-use 'reg' for the result */
347 /* alloc a new reg for the result */
348 result_reg
= get_itemp(gen
);
351 /* mask with bit 31 set, the rest cleared */
352 spe_load_uint(gen
->f
, bit31mask_reg
, (1 << 31));
354 if (sign_op
== TGSI_UTIL_SIGN_CLEAR
) {
355 spe_andc(gen
->f
, result_reg
, reg
, bit31mask_reg
);
357 else if (sign_op
== TGSI_UTIL_SIGN_SET
) {
358 spe_and(gen
->f
, result_reg
, reg
, bit31mask_reg
);
361 assert(sign_op
== TGSI_UTIL_SIGN_TOGGLE
);
362 spe_xor(gen
->f
, result_reg
, reg
, bit31mask_reg
);
373 * Return the index of an SPE register to use for the given TGSI register.
374 * If the TGSI register is TGSI_FILE_TEMPORARAY, the index of the
375 * corresponding SPE register is returned. If the TGSI register is
376 * TGSI_FILE_OUTPUT we allocate an intermediate temporary register.
377 * See store_dest_reg() below...
380 get_dst_reg(struct codegen
*gen
,
382 const struct tgsi_full_dst_register
*dest
)
386 switch (dest
->DstRegister
.File
) {
387 case TGSI_FILE_TEMPORARY
:
388 if (gen
->if_nesting
> 0 || gen
->loop_nesting
> 0)
389 reg
= get_itemp(gen
);
391 reg
= gen
->temp_regs
[dest
->DstRegister
.Index
][channel
];
393 case TGSI_FILE_OUTPUT
:
394 reg
= get_itemp(gen
);
405 * When a TGSI instruction is writing to an output register, this
406 * function emits the SPE store instruction to store the value_reg.
407 * \param value_reg the SPE register containing the value to store.
408 * This would have been returned by get_dst_reg().
411 store_dest_reg(struct codegen
*gen
,
412 int value_reg
, int channel
,
413 const struct tgsi_full_dst_register
*dest
)
416 * XXX need to implement dst reg clamping/saturation
419 switch (inst
->Instruction
.Saturate
) {
422 case TGSI_SAT_ZERO_ONE
:
424 case TGSI_SAT_MINUS_PLUS_ONE
:
431 switch (dest
->DstRegister
.File
) {
432 case TGSI_FILE_TEMPORARY
:
433 if (gen
->if_nesting
> 0 || gen
->loop_nesting
> 0) {
434 int d_reg
= gen
->temp_regs
[dest
->DstRegister
.Index
][channel
];
435 int exec_reg
= get_exec_mask_reg(gen
);
436 /* Mix d with new value according to exec mask:
437 * d[i] = mask_reg[i] ? value_reg : d_reg
439 spe_selb(gen
->f
, d_reg
, d_reg
, value_reg
, exec_reg
);
442 /* we're not inside a condition or loop: do nothing special */
446 case TGSI_FILE_OUTPUT
:
448 /* offset is measured in quadwords, not bytes */
449 int offset
= dest
->DstRegister
.Index
* 4 + channel
;
450 if (gen
->if_nesting
> 0 || gen
->loop_nesting
> 0) {
451 int exec_reg
= get_exec_mask_reg(gen
);
452 int curval_reg
= get_itemp(gen
);
453 /* First read the current value from memory:
454 * Load: curval = memory[(machine_reg) + offset]
456 spe_lqd(gen
->f
, curval_reg
, gen
->outputs_reg
, offset
* 16);
457 /* Mix curval with newvalue according to exec mask:
458 * d[i] = mask_reg[i] ? value_reg : d_reg
460 spe_selb(gen
->f
, curval_reg
, curval_reg
, value_reg
, exec_reg
);
461 /* Store: memory[(machine_reg) + offset] = curval */
462 spe_stqd(gen
->f
, curval_reg
, gen
->outputs_reg
, offset
* 16);
465 /* Store: memory[(machine_reg) + offset] = reg */
466 spe_stqd(gen
->f
, value_reg
, gen
->outputs_reg
, offset
* 16);
478 emit_prologue(struct codegen
*gen
)
480 gen
->frame_size
= 1024; /* XXX temporary, should be dynamic */
482 spe_comment(gen
->f
, 0, "Function prologue:");
484 /* save $lr on stack # stqd $lr,16($sp) */
485 spe_stqd(gen
->f
, SPE_REG_RA
, SPE_REG_SP
, 16);
487 if (gen
->frame_size
>= 512) {
488 /* offset is too large for ai instruction */
489 int offset_reg
= spe_allocate_available_register(gen
->f
);
490 int sp_reg
= spe_allocate_available_register(gen
->f
);
491 /* offset = -framesize */
492 spe_load_int(gen
->f
, offset_reg
, -gen
->frame_size
);
494 spe_move(gen
->f
, sp_reg
, SPE_REG_SP
);
495 /* $sp = $sp + offset_reg */
496 spe_a(gen
->f
, SPE_REG_SP
, SPE_REG_SP
, offset_reg
);
497 /* save $sp in stack frame */
498 spe_stqd(gen
->f
, sp_reg
, SPE_REG_SP
, 0);
500 spe_release_register(gen
->f
, offset_reg
);
501 spe_release_register(gen
->f
, sp_reg
);
504 /* save stack pointer # stqd $sp,-frameSize($sp) */
505 spe_stqd(gen
->f
, SPE_REG_SP
, SPE_REG_SP
, -gen
->frame_size
);
507 /* adjust stack pointer # ai $sp,$sp,-frameSize */
508 spe_ai(gen
->f
, SPE_REG_SP
, SPE_REG_SP
, -gen
->frame_size
);
514 emit_epilogue(struct codegen
*gen
)
516 const int return_reg
= 3;
518 spe_comment(gen
->f
, 0, "Function epilogue:");
520 spe_comment(gen
->f
, 0, "return the killed mask");
521 if (gen
->kill_mask_reg
> 0) {
522 /* shader called KIL, return the "alive" mask */
523 spe_move(gen
->f
, return_reg
, gen
->kill_mask_reg
);
526 /* return {0,0,0,0} */
527 spe_load_uint(gen
->f
, return_reg
, 0);
530 spe_comment(gen
->f
, 0, "restore stack and return");
531 if (gen
->frame_size
>= 512) {
532 /* offset is too large for ai instruction */
533 int offset_reg
= spe_allocate_available_register(gen
->f
);
534 /* offset = framesize */
535 spe_load_int(gen
->f
, offset_reg
, gen
->frame_size
);
536 /* $sp = $sp + offset */
537 spe_a(gen
->f
, SPE_REG_SP
, SPE_REG_SP
, offset_reg
);
539 spe_release_register(gen
->f
, offset_reg
);
542 /* restore stack pointer # ai $sp,$sp,frameSize */
543 spe_ai(gen
->f
, SPE_REG_SP
, SPE_REG_SP
, gen
->frame_size
);
546 /* restore $lr # lqd $lr,16($sp) */
547 spe_lqd(gen
->f
, SPE_REG_RA
, SPE_REG_SP
, 16);
549 /* return from function call */
550 spe_bi(gen
->f
, SPE_REG_RA
, 0, 0);
/** Iterate 'ch' over the four channels (X/Y/Z/W) of the instruction's
 *  first destination, executing the following statement only for the
 *  channels enabled in the 4-bit WriteMask.
 */
#define FOR_EACH_ENABLED_CHANNEL(inst, ch) \
   for (ch = 0; ch < 4; ch++) \
      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch))
560 emit_ARL(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
562 int ch
= 0, src_reg
, addr_reg
;
564 src_reg
= get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
565 addr_reg
= get_address_reg(gen
);
567 /* convert float to int */
568 spe_cflts(gen
->f
, addr_reg
, src_reg
, 0);
577 emit_MOV(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
579 int ch
, src_reg
[4], dst_reg
[4];
581 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
582 src_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
583 dst_reg
[ch
] = get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
586 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
587 if (is_register_src(gen
, ch
, &inst
->FullSrcRegisters
[0]) &&
588 is_memory_dst(gen
, ch
, &inst
->FullDstRegisters
[0])) {
589 /* special-case: register to memory store */
590 store_dest_reg(gen
, src_reg
[ch
], ch
, &inst
->FullDstRegisters
[0]);
593 spe_move(gen
->f
, dst_reg
[ch
], src_reg
[ch
]);
594 store_dest_reg(gen
, dst_reg
[ch
], ch
, &inst
->FullDstRegisters
[0]);
604 * Emit binary operation
607 emit_binop(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
609 int ch
, s1_reg
[4], s2_reg
[4], d_reg
[4];
611 /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */
612 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
613 s1_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
614 s2_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[1]);
615 d_reg
[ch
] = get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
618 /* Loop over Red/Green/Blue/Alpha channels, do the op, store results */
619 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
620 /* Emit actual SPE instruction: d = s1 + s2 */
621 switch (inst
->Instruction
.Opcode
) {
622 case TGSI_OPCODE_ADD
:
623 spe_fa(gen
->f
, d_reg
[ch
], s1_reg
[ch
], s2_reg
[ch
]);
625 case TGSI_OPCODE_SUB
:
626 spe_fs(gen
->f
, d_reg
[ch
], s1_reg
[ch
], s2_reg
[ch
]);
628 case TGSI_OPCODE_MUL
:
629 spe_fm(gen
->f
, d_reg
[ch
], s1_reg
[ch
], s2_reg
[ch
]);
636 /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
637 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
638 store_dest_reg(gen
, d_reg
[ch
], ch
, &inst
->FullDstRegisters
[0]);
641 /* Free any intermediate temps we allocated */
649 * Emit multiply add. See emit_ADD for comments.
652 emit_MAD(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
654 int ch
, s1_reg
[4], s2_reg
[4], s3_reg
[4], d_reg
[4];
656 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
657 s1_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
658 s2_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[1]);
659 s3_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[2]);
660 d_reg
[ch
] = get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
662 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
663 spe_fma(gen
->f
, d_reg
[ch
], s1_reg
[ch
], s2_reg
[ch
], s3_reg
[ch
]);
665 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
666 store_dest_reg(gen
, d_reg
[ch
], ch
, &inst
->FullDstRegisters
[0]);
674 * Emit linear interpolate. See emit_ADD for comments.
677 emit_LERP(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
679 int ch
, s1_reg
[4], s2_reg
[4], s3_reg
[4], d_reg
[4], tmp_reg
[4];
681 /* setup/get src/dst/temp regs */
682 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
683 s1_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
684 s2_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[1]);
685 s3_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[2]);
686 d_reg
[ch
] = get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
687 tmp_reg
[ch
] = get_itemp(gen
);
690 /* d = s3 + s1(s2 - s3) */
691 /* do all subtracts, then all fma, then all stores to better pipeline */
692 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
693 spe_fs(gen
->f
, tmp_reg
[ch
], s2_reg
[ch
], s3_reg
[ch
]);
695 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
696 spe_fma(gen
->f
, d_reg
[ch
], tmp_reg
[ch
], s1_reg
[ch
], s3_reg
[ch
]);
698 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
699 store_dest_reg(gen
, d_reg
[ch
], ch
, &inst
->FullDstRegisters
[0]);
708 * Emit reciprocal or recip sqrt.
711 emit_RCP_RSQ(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
713 int ch
, s1_reg
[4], d_reg
[4], tmp_reg
[4];
715 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
716 s1_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
717 d_reg
[ch
] = get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
718 tmp_reg
[ch
] = get_itemp(gen
);
721 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
722 if (inst
->Instruction
.Opcode
== TGSI_OPCODE_RCP
) {
724 spe_frest(gen
->f
, tmp_reg
[ch
], s1_reg
[ch
]);
727 /* tmp = 1/sqrt(s1) */
728 spe_frsqest(gen
->f
, tmp_reg
[ch
], s1_reg
[ch
]);
732 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
733 /* d = float_interp(s1, tmp) */
734 spe_fi(gen
->f
, d_reg
[ch
], s1_reg
[ch
], tmp_reg
[ch
]);
737 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
738 store_dest_reg(gen
, d_reg
[ch
], ch
, &inst
->FullDstRegisters
[0]);
747 * Emit absolute value. See emit_ADD for comments.
750 emit_ABS(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
752 int ch
, s1_reg
[4], d_reg
[4];
753 const int bit31mask_reg
= get_itemp(gen
);
755 /* mask with bit 31 set, the rest cleared */
756 spe_load_uint(gen
->f
, bit31mask_reg
, (1 << 31));
758 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
759 s1_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
760 d_reg
[ch
] = get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
763 /* d = sign bit cleared in s1 */
764 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
765 spe_andc(gen
->f
, d_reg
[ch
], s1_reg
[ch
], bit31mask_reg
);
768 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
769 store_dest_reg(gen
, d_reg
[ch
], ch
, &inst
->FullDstRegisters
[0]);
777 * Emit 3 component dot product. See emit_ADD for comments.
780 emit_DP3(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
783 int s1x_reg
, s1y_reg
, s1z_reg
;
784 int s2x_reg
, s2y_reg
, s2z_reg
;
785 int t0_reg
= get_itemp(gen
), t1_reg
= get_itemp(gen
);
787 s1x_reg
= get_src_reg(gen
, CHAN_X
, &inst
->FullSrcRegisters
[0]);
788 s2x_reg
= get_src_reg(gen
, CHAN_X
, &inst
->FullSrcRegisters
[1]);
789 s1y_reg
= get_src_reg(gen
, CHAN_Y
, &inst
->FullSrcRegisters
[0]);
790 s2y_reg
= get_src_reg(gen
, CHAN_Y
, &inst
->FullSrcRegisters
[1]);
791 s1z_reg
= get_src_reg(gen
, CHAN_Z
, &inst
->FullSrcRegisters
[0]);
792 s2z_reg
= get_src_reg(gen
, CHAN_Z
, &inst
->FullSrcRegisters
[1]);
795 spe_fm(gen
->f
, t0_reg
, s1x_reg
, s2x_reg
);
798 spe_fm(gen
->f
, t1_reg
, s1y_reg
, s2y_reg
);
800 /* t0 = z0 * z1 + t0 */
801 spe_fma(gen
->f
, t0_reg
, s1z_reg
, s2z_reg
, t0_reg
);
804 spe_fa(gen
->f
, t0_reg
, t0_reg
, t1_reg
);
806 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
807 int d_reg
= get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
808 spe_move(gen
->f
, d_reg
, t0_reg
);
809 store_dest_reg(gen
, d_reg
, ch
, &inst
->FullDstRegisters
[0]);
817 * Emit 4 component dot product. See emit_ADD for comments.
820 emit_DP4(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
823 int s0x_reg
, s0y_reg
, s0z_reg
, s0w_reg
;
824 int s1x_reg
, s1y_reg
, s1z_reg
, s1w_reg
;
825 int t0_reg
= get_itemp(gen
), t1_reg
= get_itemp(gen
);
827 s0x_reg
= get_src_reg(gen
, CHAN_X
, &inst
->FullSrcRegisters
[0]);
828 s1x_reg
= get_src_reg(gen
, CHAN_X
, &inst
->FullSrcRegisters
[1]);
829 s0y_reg
= get_src_reg(gen
, CHAN_Y
, &inst
->FullSrcRegisters
[0]);
830 s1y_reg
= get_src_reg(gen
, CHAN_Y
, &inst
->FullSrcRegisters
[1]);
831 s0z_reg
= get_src_reg(gen
, CHAN_Z
, &inst
->FullSrcRegisters
[0]);
832 s1z_reg
= get_src_reg(gen
, CHAN_Z
, &inst
->FullSrcRegisters
[1]);
833 s0w_reg
= get_src_reg(gen
, CHAN_W
, &inst
->FullSrcRegisters
[0]);
834 s1w_reg
= get_src_reg(gen
, CHAN_W
, &inst
->FullSrcRegisters
[1]);
837 spe_fm(gen
->f
, t0_reg
, s0x_reg
, s1x_reg
);
840 spe_fm(gen
->f
, t1_reg
, s0y_reg
, s1y_reg
);
842 /* t0 = z0 * z1 + t0 */
843 spe_fma(gen
->f
, t0_reg
, s0z_reg
, s1z_reg
, t0_reg
);
845 /* t1 = w0 * w1 + t1 */
846 spe_fma(gen
->f
, t1_reg
, s0w_reg
, s1w_reg
, t1_reg
);
849 spe_fa(gen
->f
, t0_reg
, t0_reg
, t1_reg
);
851 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
852 int d_reg
= get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
853 spe_move(gen
->f
, d_reg
, t0_reg
);
854 store_dest_reg(gen
, d_reg
, ch
, &inst
->FullDstRegisters
[0]);
862 * Emit homogeneous dot product. See emit_ADD for comments.
865 emit_DPH(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
867 /* XXX rewrite this function to look more like DP3/DP4 */
869 int s1_reg
= get_src_reg(gen
, CHAN_X
, &inst
->FullSrcRegisters
[0]);
870 int s2_reg
= get_src_reg(gen
, CHAN_X
, &inst
->FullSrcRegisters
[1]);
871 int tmp_reg
= get_itemp(gen
);
874 spe_fm(gen
->f
, tmp_reg
, s1_reg
, s2_reg
);
876 s1_reg
= get_src_reg(gen
, CHAN_Y
, &inst
->FullSrcRegisters
[0]);
877 s2_reg
= get_src_reg(gen
, CHAN_Y
, &inst
->FullSrcRegisters
[1]);
878 /* t = y0 * y1 + t */
879 spe_fma(gen
->f
, tmp_reg
, s1_reg
, s2_reg
, tmp_reg
);
881 s1_reg
= get_src_reg(gen
, CHAN_Z
, &inst
->FullSrcRegisters
[0]);
882 s2_reg
= get_src_reg(gen
, CHAN_Z
, &inst
->FullSrcRegisters
[1]);
883 /* t = z0 * z1 + t */
884 spe_fma(gen
->f
, tmp_reg
, s1_reg
, s2_reg
, tmp_reg
);
886 s2_reg
= get_src_reg(gen
, CHAN_W
, &inst
->FullSrcRegisters
[1]);
888 spe_fa(gen
->f
, tmp_reg
, s2_reg
, tmp_reg
);
890 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
891 int d_reg
= get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
892 spe_move(gen
->f
, d_reg
, tmp_reg
);
893 store_dest_reg(gen
, tmp_reg
, ch
, &inst
->FullDstRegisters
[0]);
901 * Emit 3-component vector normalize.
904 emit_NRM3(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
908 int t0_reg
= get_itemp(gen
), t1_reg
= get_itemp(gen
);
910 src_reg
[0] = get_src_reg(gen
, CHAN_X
, &inst
->FullSrcRegisters
[0]);
911 src_reg
[1] = get_src_reg(gen
, CHAN_Y
, &inst
->FullSrcRegisters
[0]);
912 src_reg
[2] = get_src_reg(gen
, CHAN_Z
, &inst
->FullSrcRegisters
[0]);
915 spe_fm(gen
->f
, t0_reg
, src_reg
[0], src_reg
[0]);
918 spe_fm(gen
->f
, t1_reg
, src_reg
[1], src_reg
[1]);
920 /* t0 = z * z + t0 */
921 spe_fma(gen
->f
, t0_reg
, src_reg
[2], src_reg
[2], t0_reg
);
924 spe_fa(gen
->f
, t0_reg
, t0_reg
, t1_reg
);
926 /* t1 = 1.0 / sqrt(t0) */
927 spe_frsqest(gen
->f
, t1_reg
, t0_reg
);
928 spe_fi(gen
->f
, t1_reg
, t0_reg
, t1_reg
);
930 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
931 int d_reg
= get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
932 /* dst = src[ch] * t1 */
933 spe_fm(gen
->f
, d_reg
, src_reg
[ch
], t1_reg
);
934 store_dest_reg(gen
, d_reg
, ch
, &inst
->FullDstRegisters
[0]);
943 * Emit cross product. See emit_ADD for comments.
946 emit_XPD(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
948 int s1_reg
= get_src_reg(gen
, CHAN_Z
, &inst
->FullSrcRegisters
[0]);
949 int s2_reg
= get_src_reg(gen
, CHAN_Y
, &inst
->FullSrcRegisters
[1]);
950 int tmp_reg
= get_itemp(gen
);
953 spe_fm(gen
->f
, tmp_reg
, s1_reg
, s2_reg
);
955 s1_reg
= get_src_reg(gen
, CHAN_Y
, &inst
->FullSrcRegisters
[0]);
956 s2_reg
= get_src_reg(gen
, CHAN_Z
, &inst
->FullSrcRegisters
[1]);
957 /* t = y0 * z1 - t */
958 spe_fms(gen
->f
, tmp_reg
, s1_reg
, s2_reg
, tmp_reg
);
960 if (inst
->FullDstRegisters
[0].DstRegister
.WriteMask
& (1 << CHAN_X
)) {
961 store_dest_reg(gen
, tmp_reg
, CHAN_X
, &inst
->FullDstRegisters
[0]);
964 s1_reg
= get_src_reg(gen
, CHAN_X
, &inst
->FullSrcRegisters
[0]);
965 s2_reg
= get_src_reg(gen
, CHAN_Z
, &inst
->FullSrcRegisters
[1]);
967 spe_fm(gen
->f
, tmp_reg
, s1_reg
, s2_reg
);
969 s1_reg
= get_src_reg(gen
, CHAN_Z
, &inst
->FullSrcRegisters
[0]);
970 s2_reg
= get_src_reg(gen
, CHAN_X
, &inst
->FullSrcRegisters
[1]);
971 /* t = z0 * x1 - t */
972 spe_fms(gen
->f
, tmp_reg
, s1_reg
, s2_reg
, tmp_reg
);
974 if (inst
->FullDstRegisters
[0].DstRegister
.WriteMask
& (1 << CHAN_Y
)) {
975 store_dest_reg(gen
, tmp_reg
, CHAN_Y
, &inst
->FullDstRegisters
[0]);
978 s1_reg
= get_src_reg(gen
, CHAN_Y
, &inst
->FullSrcRegisters
[0]);
979 s2_reg
= get_src_reg(gen
, CHAN_X
, &inst
->FullSrcRegisters
[1]);
981 spe_fm(gen
->f
, tmp_reg
, s1_reg
, s2_reg
);
983 s1_reg
= get_src_reg(gen
, CHAN_X
, &inst
->FullSrcRegisters
[0]);
984 s2_reg
= get_src_reg(gen
, CHAN_Y
, &inst
->FullSrcRegisters
[1]);
985 /* t = x0 * y1 - t */
986 spe_fms(gen
->f
, tmp_reg
, s1_reg
, s2_reg
, tmp_reg
);
988 if (inst
->FullDstRegisters
[0].DstRegister
.WriteMask
& (1 << CHAN_Z
)) {
989 store_dest_reg(gen
, tmp_reg
, CHAN_Z
, &inst
->FullDstRegisters
[0]);
998 * Emit inequality instruction.
999 * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
1000 * the result but OpenGL/TGSI needs 0.0 and 1.0 results.
1001 * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND.
1004 emit_inequality(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1006 int ch
, s1_reg
[4], s2_reg
[4], d_reg
[4], one_reg
;
1007 bool complement
= FALSE
;
1009 one_reg
= get_const_one_reg(gen
);
1011 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1012 s1_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
1013 s2_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[1]);
1014 d_reg
[ch
] = get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
1017 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1018 switch (inst
->Instruction
.Opcode
) {
1019 case TGSI_OPCODE_SGT
:
1020 spe_fcgt(gen
->f
, d_reg
[ch
], s1_reg
[ch
], s2_reg
[ch
]);
1022 case TGSI_OPCODE_SLT
:
1023 spe_fcgt(gen
->f
, d_reg
[ch
], s2_reg
[ch
], s1_reg
[ch
]);
1025 case TGSI_OPCODE_SGE
:
1026 spe_fcgt(gen
->f
, d_reg
[ch
], s2_reg
[ch
], s1_reg
[ch
]);
1029 case TGSI_OPCODE_SLE
:
1030 spe_fcgt(gen
->f
, d_reg
[ch
], s1_reg
[ch
], s2_reg
[ch
]);
1033 case TGSI_OPCODE_SEQ
:
1034 spe_fceq(gen
->f
, d_reg
[ch
], s1_reg
[ch
], s2_reg
[ch
]);
1036 case TGSI_OPCODE_SNE
:
1037 spe_fceq(gen
->f
, d_reg
[ch
], s1_reg
[ch
], s2_reg
[ch
]);
1045 /* convert d from 0x0/0xffffffff to 0.0/1.0 */
1046 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1047 /* d = d & one_reg */
1049 spe_andc(gen
->f
, d_reg
[ch
], one_reg
, d_reg
[ch
]);
1051 spe_and(gen
->f
, d_reg
[ch
], one_reg
, d_reg
[ch
]);
1054 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1055 store_dest_reg(gen
, d_reg
[ch
], ch
, &inst
->FullDstRegisters
[0]);
1067 emit_CMP(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1071 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1072 int s1_reg
= get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
1073 int s2_reg
= get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[1]);
1074 int s3_reg
= get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[2]);
1075 int d_reg
= get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
1076 int zero_reg
= get_itemp(gen
);
1078 spe_zero(gen
->f
, zero_reg
);
1080 /* d = (s1 < 0) ? s2 : s3 */
1081 spe_fcgt(gen
->f
, d_reg
, zero_reg
, s1_reg
);
1082 spe_selb(gen
->f
, d_reg
, s3_reg
, s2_reg
, d_reg
);
1084 store_dest_reg(gen
, d_reg
, ch
, &inst
->FullDstRegisters
[0]);
1093 * Convert float to signed int
1094 * Convert signed int to float
1097 emit_TRUNC(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1099 int ch
, s1_reg
[4], d_reg
[4];
1101 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1102 s1_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
1103 d_reg
[ch
] = get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
1106 /* Convert float to int */
1107 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1108 spe_cflts(gen
->f
, d_reg
[ch
], s1_reg
[ch
], 0);
1111 /* Convert int to float */
1112 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1113 spe_csflt(gen
->f
, d_reg
[ch
], d_reg
[ch
], 0);
1116 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1117 store_dest_reg(gen
, d_reg
[ch
], ch
, &inst
->FullDstRegisters
[0]);
1127 * If negative int subtract one
1128 * Convert float to signed int
1129 * Convert signed int to float
1132 emit_FLR(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1134 int ch
, s1_reg
[4], d_reg
[4], tmp_reg
[4], zero_reg
, one_reg
;
1136 zero_reg
= get_itemp(gen
);
1137 spe_zero(gen
->f
, zero_reg
);
1138 one_reg
= get_const_one_reg(gen
);
1140 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1141 s1_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
1142 d_reg
[ch
] = get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
1143 tmp_reg
[ch
] = get_itemp(gen
);
1146 /* If negative, subtract 1.0 */
1147 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1148 spe_fcgt(gen
->f
, tmp_reg
[ch
], zero_reg
, s1_reg
[ch
]);
1150 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1151 spe_selb(gen
->f
, tmp_reg
[ch
], zero_reg
, one_reg
, tmp_reg
[ch
]);
1153 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1154 spe_fs(gen
->f
, tmp_reg
[ch
], s1_reg
[ch
], tmp_reg
[ch
]);
1157 /* Convert float to int */
1158 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1159 spe_cflts(gen
->f
, tmp_reg
[ch
], tmp_reg
[ch
], 0);
1162 /* Convert int to float */
1163 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1164 spe_csflt(gen
->f
, d_reg
[ch
], tmp_reg
[ch
], 0);
1167 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1168 store_dest_reg(gen
, d_reg
[ch
], ch
, &inst
->FullDstRegisters
[0]);
1177 * Compute frac = Input - FLR(Input)
1180 emit_FRC(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1182 int ch
, s1_reg
[4], d_reg
[4], tmp_reg
[4], zero_reg
, one_reg
;
1184 zero_reg
= get_itemp(gen
);
1185 spe_zero(gen
->f
, zero_reg
);
1186 one_reg
= get_const_one_reg(gen
);
1188 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1189 s1_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
1190 d_reg
[ch
] = get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
1191 tmp_reg
[ch
] = get_itemp(gen
);
1194 /* If negative, subtract 1.0 */
1195 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1196 spe_fcgt(gen
->f
, tmp_reg
[ch
], zero_reg
, s1_reg
[ch
]);
1198 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1199 spe_selb(gen
->f
, tmp_reg
[ch
], zero_reg
, one_reg
, tmp_reg
[ch
]);
1201 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1202 spe_fs(gen
->f
, tmp_reg
[ch
], s1_reg
[ch
], tmp_reg
[ch
]);
1205 /* Convert float to int */
1206 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1207 spe_cflts(gen
->f
, tmp_reg
[ch
], tmp_reg
[ch
], 0);
1210 /* Convert int to float */
1211 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1212 spe_csflt(gen
->f
, tmp_reg
[ch
], tmp_reg
[ch
], 0);
1215 /* d = s1 - FLR(s1) */
1216 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1217 spe_fs(gen
->f
, d_reg
[ch
], s1_reg
[ch
], tmp_reg
[ch
]);
1221 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1222 store_dest_reg(gen
, d_reg
[ch
], ch
, &inst
->FullDstRegisters
[0]);
1232 print_functions(struct cell_context
*cell
)
1234 struct cell_spu_function_info
*funcs
= &cell
->spu_functions
;
1236 for (i
= 0; i
< funcs
->num
; i
++) {
1237 printf("SPU func %u: %s at %u\n",
1238 i
, funcs
->names
[i
], funcs
->addrs
[i
]);
1245 lookup_function(struct cell_context
*cell
, const char *funcname
)
1247 const struct cell_spu_function_info
*funcs
= &cell
->spu_functions
;
1249 for (i
= 0; i
< funcs
->num
; i
++) {
1250 if (strcmp(funcs
->names
[i
], funcname
) == 0) {
1251 addr
= funcs
->addrs
[i
];
1254 assert(addr
&& "spu function not found");
1255 return addr
/ 4; /* discard 2 least significant bits */
1260 * Emit code to call a SPU function.
1261 * Used to implement instructions like SIN/COS/POW/TEX/etc.
1262 * If scalar, only the X components of the src regs are used, and the
1263 * result is replicated across the dest register's XYZW components.
1266 emit_function_call(struct codegen
*gen
,
1267 const struct tgsi_full_instruction
*inst
,
1268 char *funcname
, uint num_args
, boolean scalar
)
1270 const uint addr
= lookup_function(gen
->cell
, funcname
);
1273 int func_called
= FALSE
;
1275 int retval_reg
= -1;
1277 assert(num_args
<= 3);
1279 snprintf(comment
, sizeof(comment
), "CALL %s:", funcname
);
1280 spe_comment(gen
->f
, -4, comment
);
1283 for (a
= 0; a
< num_args
; a
++) {
1284 s_regs
[a
] = get_src_reg(gen
, CHAN_X
, &inst
->FullSrcRegisters
[a
]);
1286 /* we'll call the function, put the return value in this register,
1287 * then replicate it across all write-enabled components in d_reg.
1289 retval_reg
= spe_allocate_available_register(gen
->f
);
1292 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1294 ubyte usedRegs
[SPE_NUM_REGS
];
1298 for (a
= 0; a
< num_args
; a
++) {
1299 s_regs
[a
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[a
]);
1303 d_reg
= get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
1305 if (!scalar
|| !func_called
) {
1306 /* for a scalar function, we'll really only call the function once */
1308 numUsed
= spe_get_registers_used(gen
->f
, usedRegs
);
1309 assert(numUsed
< gen
->frame_size
/ 16 - 2);
1311 /* save registers to stack */
1312 for (i
= 0; i
< numUsed
; i
++) {
1313 uint reg
= usedRegs
[i
];
1315 spe_stqd(gen
->f
, reg
, SPE_REG_SP
, 16 * offset
);
1318 /* setup function arguments */
1319 for (a
= 0; a
< num_args
; a
++) {
1320 spe_move(gen
->f
, 3 + a
, s_regs
[a
]);
1323 /* branch to function, save return addr */
1324 spe_brasl(gen
->f
, SPE_REG_RA
, addr
);
1326 /* save function's return value */
1328 spe_move(gen
->f
, retval_reg
, 3);
1330 spe_move(gen
->f
, d_reg
, 3);
1332 /* restore registers from stack */
1333 for (i
= 0; i
< numUsed
; i
++) {
1334 uint reg
= usedRegs
[i
];
1335 if (reg
!= d_reg
&& reg
!= retval_reg
) {
1337 spe_lqd(gen
->f
, reg
, SPE_REG_SP
, 16 * offset
);
1345 spe_move(gen
->f
, d_reg
, retval_reg
);
1348 store_dest_reg(gen
, d_reg
, ch
, &inst
->FullDstRegisters
[0]);
1353 spe_release_register(gen
->f
, retval_reg
);
1361 emit_TEX(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1363 const uint target
= inst
->InstructionExtTexture
.Texture
;
1364 const uint unit
= inst
->FullSrcRegisters
[1].SrcRegister
.Index
;
1367 int coord_regs
[4], d_regs
[4];
1370 case TGSI_TEXTURE_1D
:
1371 case TGSI_TEXTURE_2D
:
1372 addr
= lookup_function(gen
->cell
, "spu_tex_2d");
1374 case TGSI_TEXTURE_3D
:
1375 addr
= lookup_function(gen
->cell
, "spu_tex_3d");
1377 case TGSI_TEXTURE_CUBE
:
1378 addr
= lookup_function(gen
->cell
, "spu_tex_cube");
1381 ASSERT(0 && "unsupported texture target");
1385 assert(inst
->FullSrcRegisters
[1].SrcRegister
.File
== TGSI_FILE_SAMPLER
);
1387 spe_comment(gen
->f
, -4, "CALL tex:");
1389 /* get src/dst reg info */
1390 for (ch
= 0; ch
< 4; ch
++) {
1391 coord_regs
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
1392 d_regs
[ch
] = get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
1396 ubyte usedRegs
[SPE_NUM_REGS
];
1399 numUsed
= spe_get_registers_used(gen
->f
, usedRegs
);
1400 assert(numUsed
< gen
->frame_size
/ 16 - 2);
1402 /* save registers to stack */
1403 for (i
= 0; i
< numUsed
; i
++) {
1404 uint reg
= usedRegs
[i
];
1406 spe_stqd(gen
->f
, reg
, SPE_REG_SP
, 16 * offset
);
1409 /* setup function arguments (XXX depends on target) */
1410 for (i
= 0; i
< 4; i
++) {
1411 spe_move(gen
->f
, 3 + i
, coord_regs
[i
]);
1413 spe_load_uint(gen
->f
, 7, unit
); /* sampler unit */
1415 /* branch to function, save return addr */
1416 spe_brasl(gen
->f
, SPE_REG_RA
, addr
);
1418 /* save function's return values (four pixel's colors) */
1419 for (i
= 0; i
< 4; i
++) {
1420 spe_move(gen
->f
, d_regs
[i
], 3 + i
);
1423 /* restore registers from stack */
1424 for (i
= 0; i
< numUsed
; i
++) {
1425 uint reg
= usedRegs
[i
];
1426 if (reg
!= d_regs
[0] &&
1431 spe_lqd(gen
->f
, reg
, SPE_REG_SP
, 16 * offset
);
1436 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1437 store_dest_reg(gen
, d_regs
[ch
], ch
, &inst
->FullDstRegisters
[0]);
1446 * KILL if any of src reg values are less than zero.
1449 emit_KIL(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1452 int s_regs
[4], kil_reg
= -1, cmp_reg
, zero_reg
;
1454 spe_comment(gen
->f
, -4, "CALL kil:");
1456 /* zero = {0,0,0,0} */
1457 zero_reg
= get_itemp(gen
);
1458 spe_zero(gen
->f
, zero_reg
);
1460 cmp_reg
= get_itemp(gen
);
1463 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1464 s_regs
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
1467 /* test if any src regs are < 0 */
1468 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1470 /* cmp = 0 > src ? : ~0 : 0 */
1471 spe_fcgt(gen
->f
, cmp_reg
, zero_reg
, s_regs
[ch
]);
1472 /* kil = kil | cmp */
1473 spe_or(gen
->f
, kil_reg
, kil_reg
, cmp_reg
);
1476 kil_reg
= get_itemp(gen
);
1477 /* kil = 0 > src ? : ~0 : 0 */
1478 spe_fcgt(gen
->f
, kil_reg
, zero_reg
, s_regs
[ch
]);
1482 if (gen
->if_nesting
|| gen
->loop_nesting
) {
1483 /* may have been a conditional kil */
1484 spe_and(gen
->f
, kil_reg
, kil_reg
, gen
->exec_mask_reg
);
1487 /* allocate the kill mask reg if needed */
1488 if (gen
->kill_mask_reg
<= 0) {
1489 gen
->kill_mask_reg
= spe_allocate_available_register(gen
->f
);
1490 spe_move(gen
->f
, gen
->kill_mask_reg
, kil_reg
);
1493 spe_or(gen
->f
, gen
->kill_mask_reg
, gen
->kill_mask_reg
, kil_reg
);
1507 emit_MIN_MAX(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1509 int ch
, s0_reg
[4], s1_reg
[4], d_reg
[4], tmp_reg
[4];
1511 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1512 s0_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
1513 s1_reg
[ch
] = get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[1]);
1514 d_reg
[ch
] = get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
1515 tmp_reg
[ch
] = get_itemp(gen
);
1518 /* d = (s0 > s1) ? s0 : s1 */
1519 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1520 if (inst
->Instruction
.Opcode
== TGSI_OPCODE_MAX
)
1521 spe_fcgt(gen
->f
, tmp_reg
[ch
], s0_reg
[ch
], s1_reg
[ch
]);
1523 spe_fcgt(gen
->f
, tmp_reg
[ch
], s1_reg
[ch
], s0_reg
[ch
]);
1525 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1526 spe_selb(gen
->f
, d_reg
[ch
], s1_reg
[ch
], s0_reg
[ch
], tmp_reg
[ch
]);
1529 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1530 store_dest_reg(gen
, d_reg
[ch
], ch
, &inst
->FullDstRegisters
[0]);
1539 * Emit code to update the execution mask.
1540 * This needs to be done whenever the execution status of a conditional
1541 * or loop is changed.
1544 emit_update_exec_mask(struct codegen
*gen
)
1546 const int exec_reg
= get_exec_mask_reg(gen
);
1547 const int cond_reg
= gen
->cond_mask_reg
;
1548 const int loop_reg
= gen
->loop_mask_reg
;
1550 spe_comment(gen
->f
, 0, "Update master execution mask");
1552 if (gen
->if_nesting
> 0 && gen
->loop_nesting
> 0) {
1553 /* exec_mask = cond_mask & loop_mask */
1554 assert(cond_reg
> 0);
1555 assert(loop_reg
> 0);
1556 spe_and(gen
->f
, exec_reg
, cond_reg
, loop_reg
);
1558 else if (gen
->if_nesting
> 0) {
1559 assert(cond_reg
> 0);
1560 spe_move(gen
->f
, exec_reg
, cond_reg
);
1562 else if (gen
->loop_nesting
> 0) {
1563 assert(loop_reg
> 0);
1564 spe_move(gen
->f
, exec_reg
, loop_reg
);
1567 spe_load_int(gen
->f
, exec_reg
, ~0x0);
1573 emit_IF(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1575 const int channel
= 0;
1578 cond_reg
= get_cond_mask_reg(gen
);
1580 /* XXX push cond exec mask */
1582 spe_comment(gen
->f
, 0, "init conditional exec mask = ~0:");
1583 spe_load_int(gen
->f
, cond_reg
, ~0);
1585 /* update conditional execution mask with the predicate register */
1586 int tmp_reg
= get_itemp(gen
);
1587 int s1_reg
= get_src_reg(gen
, channel
, &inst
->FullSrcRegisters
[0]);
1589 /* tmp = (s1_reg == 0) */
1590 spe_ceqi(gen
->f
, tmp_reg
, s1_reg
, 0);
1592 spe_complement(gen
->f
, tmp_reg
, tmp_reg
);
1593 /* cond_mask = cond_mask & tmp */
1594 spe_and(gen
->f
, cond_reg
, cond_reg
, tmp_reg
);
1598 /* update the master execution mask */
1599 emit_update_exec_mask(gen
);
1608 emit_ELSE(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1610 const int cond_reg
= get_cond_mask_reg(gen
);
1612 spe_comment(gen
->f
, 0, "cond exec mask = !cond exec mask");
1613 spe_complement(gen
->f
, cond_reg
, cond_reg
);
1614 emit_update_exec_mask(gen
);
1621 emit_ENDIF(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1623 /* XXX todo: pop cond exec mask */
1627 emit_update_exec_mask(gen
);
1634 emit_BGNLOOP(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1636 int exec_reg
, loop_reg
;
1638 exec_reg
= get_exec_mask_reg(gen
);
1639 loop_reg
= get_loop_mask_reg(gen
);
1641 /* XXX push loop_exec mask */
1643 spe_comment(gen
->f
, 0*-4, "initialize loop exec mask = ~0");
1644 spe_load_int(gen
->f
, loop_reg
, ~0x0);
1646 gen
->loop_nesting
++;
1647 gen
->loop_start
= spe_code_size(gen
->f
); /* in bytes */
1654 emit_ENDLOOP(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1656 const int loop_reg
= get_loop_mask_reg(gen
);
1657 const int tmp_reg
= get_itemp(gen
);
1660 /* tmp_reg = exec[0] | exec[1] | exec[2] | exec[3] */
1661 spe_orx(gen
->f
, tmp_reg
, loop_reg
);
1663 offset
= gen
->loop_start
- spe_code_size(gen
->f
); /* in bytes */
1665 /* branch back to top of loop if tmp_reg != 0 */
1666 spe_brnz(gen
->f
, tmp_reg
, offset
/ 4);
1668 /* XXX pop loop_exec mask */
1670 gen
->loop_nesting
--;
1672 emit_update_exec_mask(gen
);
1679 emit_BRK(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1681 const int exec_reg
= get_exec_mask_reg(gen
);
1682 const int loop_reg
= get_loop_mask_reg(gen
);
1684 assert(gen
->loop_nesting
> 0);
1686 spe_comment(gen
->f
, 0, "loop exec mask &= ~master exec mask");
1687 spe_andc(gen
->f
, loop_reg
, loop_reg
, exec_reg
);
1689 emit_update_exec_mask(gen
);
1696 emit_CONT(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
)
1698 assert(gen
->loop_nesting
> 0);
1705 emit_DDX_DDY(struct codegen
*gen
, const struct tgsi_full_instruction
*inst
,
1710 FOR_EACH_ENABLED_CHANNEL(inst
, ch
) {
1711 int s_reg
= get_src_reg(gen
, ch
, &inst
->FullSrcRegisters
[0]);
1712 int d_reg
= get_dst_reg(gen
, ch
, &inst
->FullDstRegisters
[0]);
1714 int t1_reg
= get_itemp(gen
);
1715 int t2_reg
= get_itemp(gen
);
1717 spe_splat_word(gen
->f
, t1_reg
, s_reg
, 0); /* upper-left pixel */
1719 spe_splat_word(gen
->f
, t2_reg
, s_reg
, 1); /* upper-right pixel */
1722 spe_splat_word(gen
->f
, t2_reg
, s_reg
, 2); /* lower-left pixel */
1724 spe_fs(gen
->f
, d_reg
, t2_reg
, t1_reg
);
1736 * Emit END instruction.
1737 * We just return from the shader function at this point.
1739 * Note that there may be more code after this that would be
1740 * called by TGSI_OPCODE_CALL.
1743 emit_END(struct codegen
*gen
)
1751 * Emit code for the given instruction. Just a big switch stmt.
1754 emit_instruction(struct codegen
*gen
,
1755 const struct tgsi_full_instruction
*inst
)
1757 switch (inst
->Instruction
.Opcode
) {
1758 case TGSI_OPCODE_ARL
:
1759 return emit_ARL(gen
, inst
);
1760 case TGSI_OPCODE_MOV
:
1761 case TGSI_OPCODE_SWZ
:
1762 return emit_MOV(gen
, inst
);
1763 case TGSI_OPCODE_ADD
:
1764 case TGSI_OPCODE_SUB
:
1765 case TGSI_OPCODE_MUL
:
1766 return emit_binop(gen
, inst
);
1767 case TGSI_OPCODE_MAD
:
1768 return emit_MAD(gen
, inst
);
1769 case TGSI_OPCODE_LERP
:
1770 return emit_LRP(gen
, inst
);
1771 case TGSI_OPCODE_DP3
:
1772 return emit_DP3(gen
, inst
);
1773 case TGSI_OPCODE_DP4
:
1774 return emit_DP4(gen
, inst
);
1775 case TGSI_OPCODE_DPH
:
1776 return emit_DPH(gen
, inst
);
1777 case TGSI_OPCODE_NRM
:
1778 return emit_NRM3(gen
, inst
);
1779 case TGSI_OPCODE_XPD
:
1780 return emit_XPD(gen
, inst
);
1781 case TGSI_OPCODE_RCP
:
1782 case TGSI_OPCODE_RSQ
:
1783 return emit_RCP_RSQ(gen
, inst
);
1784 case TGSI_OPCODE_ABS
:
1785 return emit_ABS(gen
, inst
);
1786 case TGSI_OPCODE_SGT
:
1787 case TGSI_OPCODE_SLT
:
1788 case TGSI_OPCODE_SGE
:
1789 case TGSI_OPCODE_SLE
:
1790 case TGSI_OPCODE_SEQ
:
1791 case TGSI_OPCODE_SNE
:
1792 return emit_inequality(gen
, inst
);
1793 case TGSI_OPCODE_CMP
:
1794 return emit_CMP(gen
, inst
);
1795 case TGSI_OPCODE_MIN
:
1796 case TGSI_OPCODE_MAX
:
1797 return emit_MIN_MAX(gen
, inst
);
1798 case TGSI_OPCODE_TRUNC
:
1799 return emit_TRUNC(gen
, inst
);
1800 case TGSI_OPCODE_FLR
:
1801 return emit_FLR(gen
, inst
);
1802 case TGSI_OPCODE_FRC
:
1803 return emit_FRC(gen
, inst
);
1804 case TGSI_OPCODE_END
:
1805 return emit_END(gen
);
1807 case TGSI_OPCODE_COS
:
1808 return emit_function_call(gen
, inst
, "spu_cos", 1, TRUE
);
1809 case TGSI_OPCODE_SIN
:
1810 return emit_function_call(gen
, inst
, "spu_sin", 1, TRUE
);
1811 case TGSI_OPCODE_POW
:
1812 return emit_function_call(gen
, inst
, "spu_pow", 2, TRUE
);
1813 case TGSI_OPCODE_EX2
:
1814 return emit_function_call(gen
, inst
, "spu_exp2", 1, TRUE
);
1815 case TGSI_OPCODE_LG2
:
1816 return emit_function_call(gen
, inst
, "spu_log2", 1, TRUE
);
1817 case TGSI_OPCODE_TEX
:
1818 /* fall-through for now */
1819 case TGSI_OPCODE_TXD
:
1820 /* fall-through for now */
1821 case TGSI_OPCODE_TXB
:
1822 /* fall-through for now */
1823 case TGSI_OPCODE_TXL
:
1824 /* fall-through for now */
1825 case TGSI_OPCODE_TXP
:
1826 return emit_TEX(gen
, inst
);
1827 case TGSI_OPCODE_KIL
:
1828 return emit_KIL(gen
, inst
);
1830 case TGSI_OPCODE_IF
:
1831 return emit_IF(gen
, inst
);
1832 case TGSI_OPCODE_ELSE
:
1833 return emit_ELSE(gen
, inst
);
1834 case TGSI_OPCODE_ENDIF
:
1835 return emit_ENDIF(gen
, inst
);
1837 case TGSI_OPCODE_BGNLOOP
:
1838 return emit_BGNLOOP(gen
, inst
);
1839 case TGSI_OPCODE_ENDLOOP
:
1840 return emit_ENDLOOP(gen
, inst
);
1841 case TGSI_OPCODE_BRK
:
1842 return emit_BRK(gen
, inst
);
1843 case TGSI_OPCODE_CONT
:
1844 return emit_CONT(gen
, inst
);
1846 case TGSI_OPCODE_DDX
:
1847 return emit_DDX_DDY(gen
, inst
, TRUE
);
1848 case TGSI_OPCODE_DDY
:
1849 return emit_DDX_DDY(gen
, inst
, FALSE
);
1851 /* XXX lots more cases to do... */
1854 fprintf(stderr
, "Cell: unimplemented TGSI instruction %d!\n",
1855 inst
->Instruction
.Opcode
);
1865 * Emit code for a TGSI immediate value (vector of four floats).
1866 * This involves register allocation and initialization.
1867 * XXX the initialization should be done by a "prepare" stage, not
1868 * per quad execution!
1871 emit_immediate(struct codegen
*gen
, const struct tgsi_full_immediate
*immed
)
1875 assert(gen
->num_imm
< MAX_TEMPS
);
1877 for (ch
= 0; ch
< 4; ch
++) {
1878 float val
= immed
->u
[ch
].Float
;
1880 if (ch
> 0 && val
== immed
->u
[ch
- 1].Float
) {
1881 /* re-use previous register */
1882 gen
->imm_regs
[gen
->num_imm
][ch
] = gen
->imm_regs
[gen
->num_imm
][ch
- 1];
1886 int reg
= spe_allocate_available_register(gen
->f
);
1891 sprintf(str
, "init $%d = %f", reg
, val
);
1892 spe_comment(gen
->f
, 0, str
);
1894 /* update immediate map */
1895 gen
->imm_regs
[gen
->num_imm
][ch
] = reg
;
1897 /* emit initializer instruction */
1898 spe_load_float(gen
->f
, reg
, val
);
1910 * Emit "code" for a TGSI declaration.
1911 * We only care about TGSI TEMPORARY register declarations at this time.
1912 * For each TGSI TEMPORARY we allocate four SPE registers.
1915 emit_declaration(struct cell_context
*cell
,
1916 struct codegen
*gen
, const struct tgsi_full_declaration
*decl
)
1920 switch (decl
->Declaration
.File
) {
1921 case TGSI_FILE_TEMPORARY
:
1922 for (i
= decl
->DeclarationRange
.First
;
1923 i
<= decl
->DeclarationRange
.Last
;
1925 assert(i
< MAX_TEMPS
);
1926 for (ch
= 0; ch
< 4; ch
++) {
1927 gen
->temp_regs
[i
][ch
] = spe_allocate_available_register(gen
->f
);
1928 if (gen
->temp_regs
[i
][ch
] < 0)
1929 return FALSE
; /* out of regs */
1932 /* XXX if we run out of SPE registers, we need to spill
1933 * to SPU memory. someday...
1938 sprintf(buf
, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i
,
1939 gen
->temp_regs
[i
][0], gen
->temp_regs
[i
][1],
1940 gen
->temp_regs
[i
][2], gen
->temp_regs
[i
][3]);
1941 spe_comment(gen
->f
, 0, buf
);
1955 * Translate TGSI shader code to SPE instructions. This is done when
1956 * the state tracker gives us a new shader (via pipe->create_fs_state()).
1958 * \param cell the rendering context (in)
1959 * \param tokens the TGSI shader (in)
1960 * \param f the generated function (out)
1963 cell_gen_fragment_program(struct cell_context
*cell
,
1964 const struct tgsi_token
*tokens
,
1965 struct spe_function
*f
)
1967 struct tgsi_parse_context parse
;
1971 memset(&gen
, 0, sizeof(gen
));
1975 /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
1976 gen
.inputs_reg
= 3; /* pointer to inputs array */
1977 gen
.outputs_reg
= 4; /* pointer to outputs array */
1978 gen
.constants_reg
= 5; /* pointer to constants array */
1980 spe_init_func(f
, SPU_MAX_FRAGMENT_PROGRAM_INSTS
* SPE_INST_SIZE
);
1981 spe_allocate_register(f
, gen
.inputs_reg
);
1982 spe_allocate_register(f
, gen
.outputs_reg
);
1983 spe_allocate_register(f
, gen
.constants_reg
);
1985 if (cell
->debug_flags
& CELL_DEBUG_ASM
) {
1986 spe_print_code(f
, TRUE
);
1988 printf("Begin %s\n", __FUNCTION__
);
1989 tgsi_dump(tokens
, 0);
1992 tgsi_parse_init(&parse
, tokens
);
1994 emit_prologue(&gen
);
1996 while (!tgsi_parse_end_of_tokens(&parse
) && !gen
.error
) {
1997 tgsi_parse_token(&parse
);
1999 switch (parse
.FullToken
.Token
.Type
) {
2000 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2002 _debug_printf(" # ");
2003 tgsi_dump_immediate(&parse
.FullToken
.FullImmediate
);
2005 if (!emit_immediate(&gen
, &parse
.FullToken
.FullImmediate
))
2009 case TGSI_TOKEN_TYPE_DECLARATION
:
2011 _debug_printf(" # ");
2012 tgsi_dump_declaration(&parse
.FullToken
.FullDeclaration
);
2014 if (!emit_declaration(cell
, &gen
, &parse
.FullToken
.FullDeclaration
))
2018 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2020 _debug_printf(" # ");
2022 tgsi_dump_instruction(&parse
.FullToken
.FullInstruction
, ic
);
2024 if (!emit_instruction(&gen
, &parse
.FullToken
.FullInstruction
))
2034 /* terminate the SPE code */
2035 return emit_END(&gen
);
2038 if (cell
->debug_flags
& CELL_DEBUG_ASM
) {
2039 printf("cell_gen_fragment_program nr instructions: %d\n", f
->num_inst
);
2040 printf("End %s\n", __FUNCTION__
);
2043 tgsi_parse_free( &parse
);