1 /**************************************************************************
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
29 * TGSI interpreter/executor.
31 * Flow control information:
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers. This is the ExecMask.
43 * The ExecMask is computed from three other masks (CondMask, LoopMask and
44 * ContMask) which are controlled by the flow control instructions (namely:
45 * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
53 #include <transpose_matrix4x4.h>
54 #include <simdmath/ceilf4.h>
55 #include <simdmath/cosf4.h>
56 #include <simdmath/divf4.h>
57 #include <simdmath/floorf4.h>
58 #include <simdmath/log2f4.h>
59 #include <simdmath/powf4.h>
60 #include <simdmath/sinf4.h>
61 #include <simdmath/sqrtf4.h>
62 #include <simdmath/truncf4.h>
64 #include "pipe/p_compiler.h"
65 #include "pipe/p_state.h"
66 #include "pipe/p_shader_tokens.h"
67 #include "tgsi/tgsi_parse.h"
68 #include "tgsi/tgsi_util.h"
71 #include "spu_vertex_shader.h"
72 #include "spu_dcache.h"
73 #include "cell/common.h"
75 #define TILE_TOP_LEFT 0
76 #define TILE_TOP_RIGHT 1
77 #define TILE_BOTTOM_LEFT 2
78 #define TILE_BOTTOM_RIGHT 3
81 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
83 #define TEMP_0_I TGSI_EXEC_TEMP_00000000_I
84 #define TEMP_0_C TGSI_EXEC_TEMP_00000000_C
85 #define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I
86 #define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C
87 #define TEMP_80_I TGSI_EXEC_TEMP_80000000_I
88 #define TEMP_80_C TGSI_EXEC_TEMP_80000000_C
89 #define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I
90 #define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C
91 #define TEMP_1_I TGSI_EXEC_TEMP_ONE_I
92 #define TEMP_1_C TGSI_EXEC_TEMP_ONE_C
93 #define TEMP_2_I TGSI_EXEC_TEMP_TWO_I
94 #define TEMP_2_C TGSI_EXEC_TEMP_TWO_C
95 #define TEMP_128_I TGSI_EXEC_TEMP_128_I
96 #define TEMP_128_C TGSI_EXEC_TEMP_128_C
97 #define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I
98 #define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C
99 #define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I
100 #define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C
101 #define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I
102 #define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C
103 #define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
104 #define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
105 #define TEMP_R0 TGSI_EXEC_TEMP_R0
107 #define FOR_EACH_CHANNEL(CHAN)\
108 for (CHAN = 0; CHAN < 4; CHAN++)
110 #define IS_CHANNEL_ENABLED(INST, CHAN)\
111 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
113 #define IS_CHANNEL_ENABLED2(INST, CHAN)\
114 ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
116 #define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
117 FOR_EACH_CHANNEL( CHAN )\
118 if (IS_CHANNEL_ENABLED( INST, CHAN ))
120 #define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
121 FOR_EACH_CHANNEL( CHAN )\
122 if (IS_CHANNEL_ENABLED2( INST, CHAN ))
125 /** The execution mask depends on the conditional mask and the loop mask */
126 #define UPDATE_EXEC_MASK(MACH) \
127 MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
138 * Initialize machine state by expanding tokens to full instructions,
139 * allocating temporary storage, setting up constants, etc.
140 * After this, we can call spu_exec_machine_run() many times.
143 spu_exec_machine_init(struct spu_exec_machine
*mach
,
145 struct spu_sampler
*samplers
,
148 const qword zero
= si_il(0);
149 const qword not_zero
= si_il(~0);
152 mach
->Samplers
= samplers
;
153 mach
->Processor
= processor
;
154 mach
->Addrs
= &mach
->Temps
[TGSI_EXEC_NUM_TEMPS
];
156 /* Setup constants. */
157 mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
= zero
;
158 mach
->Temps
[TEMP_FF_I
].xyzw
[TEMP_FF_C
].q
= not_zero
;
159 mach
->Temps
[TEMP_7F_I
].xyzw
[TEMP_7F_C
].q
= si_shli(not_zero
, -1);
160 mach
->Temps
[TEMP_80_I
].xyzw
[TEMP_80_C
].q
= si_shli(not_zero
, 31);
162 mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
= (qword
) spu_splats(1.0f
);
163 mach
->Temps
[TEMP_2_I
].xyzw
[TEMP_2_C
].q
= (qword
) spu_splats(2.0f
);
164 mach
->Temps
[TEMP_128_I
].xyzw
[TEMP_128_C
].q
= (qword
) spu_splats(128.0f
);
165 mach
->Temps
[TEMP_M128_I
].xyzw
[TEMP_M128_C
].q
= (qword
) spu_splats(-128.0f
);
172 return si_rotmi(si_shli(src
, 1), -1);
176 micro_ceil(qword src
)
178 return (qword
) _ceilf4((vec_float4
) src
);
184 return (qword
) _cosf4((vec_float4
) src
);
187 static const qword br_shuf
= {
188 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
189 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
190 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
191 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
192 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
193 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
194 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
195 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
198 static const qword bl_shuf
= {
199 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
200 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
201 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
202 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
203 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
204 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
205 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
206 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
209 static const qword tl_shuf
= {
210 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
211 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
212 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
213 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
214 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
215 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
216 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
217 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
223 qword bottom_right
= si_shufb(src
, src
, br_shuf
);
224 qword bottom_left
= si_shufb(src
, src
, bl_shuf
);
226 return si_fs(bottom_right
, bottom_left
);
232 qword top_left
= si_shufb(src
, src
, tl_shuf
);
233 qword bottom_left
= si_shufb(src
, src
, bl_shuf
);
235 return si_fs(top_left
, bottom_left
);
239 micro_div(qword src0
, qword src1
)
241 return (qword
) _divf4((vec_float4
) src0
, (vec_float4
) src1
);
/**
 * Per-component floor (round toward -infinity) of four packed floats.
 *
 * NOTE(review): the function header is not visible in this chunk; name and
 * signature reconstructed from the TGSI_OPCODE_FLR call site and the
 * sibling micro_* wrappers — confirm against the full file.
 */
static qword
micro_flr(qword src)
{
   return (qword) _floorf4((vec_float4) src);
}
/**
 * Per-component fractional part: src - floor(src).
 *
 * NOTE(review): the function header is not visible in this chunk; name and
 * signature reconstructed from the TGSI_OPCODE_FRC call site — confirm
 * against the full file.
 */
static qword
micro_frc(qword src)
{
   return si_fs(src, (qword) _floorf4((vec_float4) src));
}
257 micro_ge(qword src0
, qword src1
)
259 return si_or(si_fceq(src0
, src1
), si_fcgt(src0
, src1
));
265 return (qword
) _log2f4((vec_float4
) src
);
/**
 * Per-component less-than mask: the inverse of (src0 >= src1).
 * Builds the GE mask (equal OR greater) and then inverts it.
 *
 * NOTE(review): si_xori XORs each word with a sign-extended immediate
 * (0x000000FF here), which does not flip every bit of a 0 / ~0 word mask —
 * presumably a byte-wise invert (e.g. si_xorbi) was intended; confirm
 * against the SPU intrinsics reference before relying on this mask.
 */
static qword
micro_lt(qword src0, qword src1)
{
   const qword tmp = si_or(si_fceq(src0, src1), si_fcgt(src0, src1));

   return si_xori(tmp, 0xff);
}
277 micro_max(qword src0
, qword src1
)
279 return si_selb(src1
, src0
, si_fcgt(src0
, src1
));
283 micro_min(qword src0
, qword src1
)
285 return si_selb(src0
, src1
, si_fcgt(src0
, src1
));
291 return si_xor(src
, (qword
) spu_splats(0x80000000));
295 micro_set_sign(qword src
)
297 return si_or(src
, (qword
) spu_splats(0x80000000));
301 micro_pow(qword src0
, qword src1
)
303 return (qword
) _powf4((vec_float4
) src0
, (vec_float4
) src1
);
/**
 * Per-component round-to-nearest, computed as floor(src + 0.5).
 *
 * NOTE(review): the function header is not visible in this chunk; name and
 * signature reconstructed from the TGSI_OPCODE_ROUND call site — confirm
 * against the full file.
 */
static qword
micro_rnd(qword src)
{
   const qword half = (qword) spu_splats(0.5f);

   /* May be able to use _roundf4. There may be some difference, though. */
   return (qword) _floorf4((vec_float4) si_fa(src, half));
}
317 micro_ishr(qword src0
, qword src1
)
319 return si_rotma(src0
, si_sfi(src1
, 0));
323 micro_trunc(qword src
)
325 return (qword
) _truncf4((vec_float4
) src
);
331 return (qword
) _sinf4((vec_float4
) src
);
335 micro_sqrt(qword src
)
337 return (qword
) _sqrtf4((vec_float4
) src
);
341 fetch_src_file_channel(
342 const struct spu_exec_machine
*mach
,
345 const union spu_exec_channel
*index
,
346 union spu_exec_channel
*chan
)
349 case TGSI_EXTSWIZZLE_X
:
350 case TGSI_EXTSWIZZLE_Y
:
351 case TGSI_EXTSWIZZLE_Z
:
352 case TGSI_EXTSWIZZLE_W
:
354 case TGSI_FILE_CONSTANT
: {
357 for (i
= 0; i
< 4; i
++) {
358 const float *ptr
= mach
->Consts
[index
->i
[i
]];
361 spu_dcache_fetch_unaligned((qword
*) tmp
,
362 (uintptr_t)(ptr
+ swizzle
),
370 case TGSI_FILE_INPUT
:
371 chan
->u
[0] = mach
->Inputs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
372 chan
->u
[1] = mach
->Inputs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
373 chan
->u
[2] = mach
->Inputs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
374 chan
->u
[3] = mach
->Inputs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
377 case TGSI_FILE_TEMPORARY
:
378 chan
->u
[0] = mach
->Temps
[index
->i
[0]].xyzw
[swizzle
].u
[0];
379 chan
->u
[1] = mach
->Temps
[index
->i
[1]].xyzw
[swizzle
].u
[1];
380 chan
->u
[2] = mach
->Temps
[index
->i
[2]].xyzw
[swizzle
].u
[2];
381 chan
->u
[3] = mach
->Temps
[index
->i
[3]].xyzw
[swizzle
].u
[3];
384 case TGSI_FILE_IMMEDIATE
:
385 ASSERT( index
->i
[0] < (int) mach
->ImmLimit
);
386 ASSERT( index
->i
[1] < (int) mach
->ImmLimit
);
387 ASSERT( index
->i
[2] < (int) mach
->ImmLimit
);
388 ASSERT( index
->i
[3] < (int) mach
->ImmLimit
);
390 chan
->f
[0] = mach
->Imms
[index
->i
[0]][swizzle
];
391 chan
->f
[1] = mach
->Imms
[index
->i
[1]][swizzle
];
392 chan
->f
[2] = mach
->Imms
[index
->i
[2]][swizzle
];
393 chan
->f
[3] = mach
->Imms
[index
->i
[3]][swizzle
];
396 case TGSI_FILE_ADDRESS
:
397 chan
->u
[0] = mach
->Addrs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
398 chan
->u
[1] = mach
->Addrs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
399 chan
->u
[2] = mach
->Addrs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
400 chan
->u
[3] = mach
->Addrs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
403 case TGSI_FILE_OUTPUT
:
404 /* vertex/fragment output vars can be read too */
405 chan
->u
[0] = mach
->Outputs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
406 chan
->u
[1] = mach
->Outputs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
407 chan
->u
[2] = mach
->Outputs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
408 chan
->u
[3] = mach
->Outputs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
416 case TGSI_EXTSWIZZLE_ZERO
:
417 *chan
= mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
];
420 case TGSI_EXTSWIZZLE_ONE
:
421 *chan
= mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
];
431 const struct spu_exec_machine
*mach
,
432 union spu_exec_channel
*chan
,
433 const struct tgsi_full_src_register
*reg
,
434 const uint chan_index
)
436 union spu_exec_channel index
;
442 index
.i
[3] = reg
->SrcRegister
.Index
;
444 if (reg
->SrcRegister
.Indirect
) {
445 union spu_exec_channel index2
;
446 union spu_exec_channel indir_index
;
451 index2
.i
[3] = reg
->SrcRegisterInd
.Index
;
453 swizzle
= tgsi_util_get_src_register_swizzle(®
->SrcRegisterInd
,
455 fetch_src_file_channel(
457 reg
->SrcRegisterInd
.File
,
462 index
.q
= si_a(index
.q
, indir_index
.q
);
465 if( reg
->SrcRegister
.Dimension
) {
466 switch( reg
->SrcRegister
.File
) {
467 case TGSI_FILE_INPUT
:
468 index
.q
= si_mpyi(index
.q
, 17);
470 case TGSI_FILE_CONSTANT
:
471 index
.q
= si_shli(index
.q
, 12);
477 index
.i
[0] += reg
->SrcRegisterDim
.Index
;
478 index
.i
[1] += reg
->SrcRegisterDim
.Index
;
479 index
.i
[2] += reg
->SrcRegisterDim
.Index
;
480 index
.i
[3] += reg
->SrcRegisterDim
.Index
;
482 if (reg
->SrcRegisterDim
.Indirect
) {
483 union spu_exec_channel index2
;
484 union spu_exec_channel indir_index
;
489 index2
.i
[3] = reg
->SrcRegisterDimInd
.Index
;
491 swizzle
= tgsi_util_get_src_register_swizzle( ®
->SrcRegisterDimInd
, CHAN_X
);
492 fetch_src_file_channel(
494 reg
->SrcRegisterDimInd
.File
,
499 index
.q
= si_a(index
.q
, indir_index
.q
);
503 swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
504 fetch_src_file_channel(
506 reg
->SrcRegister
.File
,
511 switch (tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
)) {
512 case TGSI_UTIL_SIGN_CLEAR
:
513 chan
->q
= micro_abs(chan
->q
);
516 case TGSI_UTIL_SIGN_SET
:
517 chan
->q
= micro_set_sign(chan
->q
);
520 case TGSI_UTIL_SIGN_TOGGLE
:
521 chan
->q
= micro_neg(chan
->q
);
524 case TGSI_UTIL_SIGN_KEEP
:
528 if (reg
->SrcRegisterExtMod
.Complement
) {
529 chan
->q
= si_fs(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, chan
->q
);
535 struct spu_exec_machine
*mach
,
536 const union spu_exec_channel
*chan
,
537 const struct tgsi_full_dst_register
*reg
,
538 const struct tgsi_full_instruction
*inst
,
541 union spu_exec_channel
*dst
;
543 switch( reg
->DstRegister
.File
) {
547 case TGSI_FILE_OUTPUT
:
548 dst
= &mach
->Outputs
[mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0]
549 + reg
->DstRegister
.Index
].xyzw
[chan_index
];
552 case TGSI_FILE_TEMPORARY
:
553 dst
= &mach
->Temps
[reg
->DstRegister
.Index
].xyzw
[chan_index
];
556 case TGSI_FILE_ADDRESS
:
557 dst
= &mach
->Addrs
[reg
->DstRegister
.Index
].xyzw
[chan_index
];
565 switch (inst
->Instruction
.Saturate
)
568 if (mach
->ExecMask
& 0x1)
569 dst
->i
[0] = chan
->i
[0];
570 if (mach
->ExecMask
& 0x2)
571 dst
->i
[1] = chan
->i
[1];
572 if (mach
->ExecMask
& 0x4)
573 dst
->i
[2] = chan
->i
[2];
574 if (mach
->ExecMask
& 0x8)
575 dst
->i
[3] = chan
->i
[3];
578 case TGSI_SAT_ZERO_ONE
:
579 /* XXX need to obey ExecMask here */
580 dst
->q
= micro_max(chan
->q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
581 dst
->q
= micro_min(dst
->q
, mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
);
584 case TGSI_SAT_MINUS_PLUS_ONE
:
593 #define FETCH(VAL,INDEX,CHAN)\
594 fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
596 #define STORE(VAL,INDEX,CHAN)\
597 store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
601 * Execute ARB-style KIL which is predicated by a src register.
602 * Kill fragment if any of the four values is less than zero.
605 exec_kil(struct spu_exec_machine
*mach
,
606 const struct tgsi_full_instruction
*inst
)
610 uint kilmask
= 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
611 union spu_exec_channel r
[1];
613 /* This mask stores component bits that were already tested. Note that
614 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
616 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
618 for (chan_index
= 0; chan_index
< 4; chan_index
++)
623 /* unswizzle channel */
624 swizzle
= tgsi_util_get_full_src_register_extswizzle (
625 &inst
->FullSrcRegisters
[0],
628 /* check if the component has not been already tested */
629 if (uniquemask
& (1 << swizzle
))
631 uniquemask
|= 1 << swizzle
;
633 FETCH(&r
[0], 0, chan_index
);
634 for (i
= 0; i
< 4; i
++)
635 if (r
[0].f
[i
] < 0.0f
)
639 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] |= kilmask
;
643 * Execute NVIDIA-style KIL which is predicated by a condition code.
644 * Kill fragment if the condition code is TRUE.
647 exec_kilp(struct tgsi_exec_machine
*mach
,
648 const struct tgsi_full_instruction
*inst
)
650 uint kilmask
= 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
652 /* TODO: build kilmask from CC mask */
654 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] |= kilmask
;
658 * Fetch a texel using STR texture coordinates.
661 fetch_texel( struct spu_sampler
*sampler
,
662 const union spu_exec_channel
*s
,
663 const union spu_exec_channel
*t
,
664 const union spu_exec_channel
*p
,
665 float lodbias
, /* XXX should be float[4] */
666 union spu_exec_channel
*r
,
667 union spu_exec_channel
*g
,
668 union spu_exec_channel
*b
,
669 union spu_exec_channel
*a
)
674 sampler
->get_samples(sampler
, s
->f
, t
->f
, p
->f
, lodbias
,
675 (float (*)[4]) rgba
);
677 _transpose_matrix4x4((vec_float4
*) out
, (vec_float4
*) rgba
);
686 exec_tex(struct spu_exec_machine
*mach
,
687 const struct tgsi_full_instruction
*inst
,
688 boolean biasLod
, boolean projected
)
690 const uint unit
= inst
->FullSrcRegisters
[1].SrcRegister
.Index
;
691 union spu_exec_channel r
[8];
695 /* printf("Sampler %u unit %u\n", sampler, unit); */
697 switch (inst
->InstructionExtTexture
.Texture
) {
698 case TGSI_TEXTURE_1D
:
700 FETCH(&r
[0], 0, CHAN_X
);
703 FETCH(&r
[1], 0, CHAN_W
);
704 r
[0].q
= micro_div(r
[0].q
, r
[1].q
);
708 FETCH(&r
[1], 0, CHAN_W
);
714 fetch_texel(&mach
->Samplers
[unit
],
715 &r
[0], NULL
, NULL
, lodBias
, /* S, T, P, BIAS */
716 &r
[0], &r
[1], &r
[2], &r
[3]); /* R, G, B, A */
719 case TGSI_TEXTURE_2D
:
720 case TGSI_TEXTURE_RECT
:
722 FETCH(&r
[0], 0, CHAN_X
);
723 FETCH(&r
[1], 0, CHAN_Y
);
724 FETCH(&r
[2], 0, CHAN_Z
);
727 FETCH(&r
[3], 0, CHAN_W
);
728 r
[0].q
= micro_div(r
[0].q
, r
[3].q
);
729 r
[1].q
= micro_div(r
[1].q
, r
[3].q
);
730 r
[2].q
= micro_div(r
[2].q
, r
[3].q
);
734 FETCH(&r
[3], 0, CHAN_W
);
740 fetch_texel(&mach
->Samplers
[unit
],
741 &r
[0], &r
[1], &r
[2], lodBias
, /* inputs */
742 &r
[0], &r
[1], &r
[2], &r
[3]); /* outputs */
745 case TGSI_TEXTURE_3D
:
746 case TGSI_TEXTURE_CUBE
:
748 FETCH(&r
[0], 0, CHAN_X
);
749 FETCH(&r
[1], 0, CHAN_Y
);
750 FETCH(&r
[2], 0, CHAN_Z
);
753 FETCH(&r
[3], 0, CHAN_W
);
754 r
[0].q
= micro_div(r
[0].q
, r
[3].q
);
755 r
[1].q
= micro_div(r
[1].q
, r
[3].q
);
756 r
[2].q
= micro_div(r
[2].q
, r
[3].q
);
760 FETCH(&r
[3], 0, CHAN_W
);
766 fetch_texel(&mach
->Samplers
[unit
],
767 &r
[0], &r
[1], &r
[2], lodBias
,
768 &r
[0], &r
[1], &r
[2], &r
[3]);
775 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
776 STORE( &r
[chan_index
], 0, chan_index
);
/**
 * Constant (flat) interpolation: copy the coefficient a0 unchanged into
 * all four quad slots of one input channel — no x/y dependence.
 *
 * NOTE(review): the attrib/chan parameter lines are not visible in this
 * chunk; names and types inferred from the loop body and the call sites
 * (interp(mach, i, j)) — confirm against the full file.
 */
static void
constant_interpolation(
   struct spu_exec_machine *mach,
   uint attrib,
   uint chan )
{
   uint i;

   for( i = 0; i < QUAD_SIZE; i++ ) {
      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
   }
}
/**
 * Linear (non-perspective) interpolation of one input channel across the
 * 2x2 quad: evaluate the plane equation a0 + dadx*x + dady*y at the quad's
 * first pixel, then step by dadx/dady to fill the other three pixels.
 *
 * NOTE(review): the attrib/chan parameter lines are not visible in this
 * chunk; names and types inferred from the body and the call sites — 
 * confirm against the full file.
 */
static void
linear_interpolation(
   struct spu_exec_machine *mach,
   uint attrib,
   uint chan )
{
   /* Window coordinates of the quad's first pixel. */
   const float x = mach->QuadPos.xyzw[0].f[0];
   const float y = mach->QuadPos.xyzw[1].f[0];
   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
   const float dady = mach->InterpCoefs[attrib].dady[chan];
   /* Attribute value at pixel (x, y). */
   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
}
/**
 * Perspective-correct interpolation of one input channel across the 2x2
 * quad: evaluate the plane equation a0 + dadx*x + dady*y per pixel, then
 * divide each pixel's value by that pixel's W.
 *
 * NOTE(review): the attrib/chan parameter lines are not visible in this
 * chunk; names and types inferred from the body and the call sites — 
 * confirm against the full file.
 */
static void
perspective_interpolation(
   struct spu_exec_machine *mach,
   uint attrib,
   uint chan )
{
   /* Window coordinates of the quad's first pixel. */
   const float x = mach->QuadPos.xyzw[0].f[0];
   const float y = mach->QuadPos.xyzw[1].f[0];
   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
   const float dady = mach->InterpCoefs[attrib].dady[chan];
   /* Attribute value at pixel (x, y), before the perspective divide. */
   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
   /* Per-pixel W values for the quad. */
   const float *w = mach->QuadPos.xyzw[3].f;
   /* divide by W here */
   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
}
832 typedef void (* interpolation_func
)(
833 struct spu_exec_machine
*mach
,
838 exec_declaration(struct spu_exec_machine
*mach
,
839 const struct tgsi_full_declaration
*decl
)
841 if( mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
842 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
843 unsigned first
, last
, mask
;
844 interpolation_func interp
;
846 first
= decl
->DeclarationRange
.First
;
847 last
= decl
->DeclarationRange
.Last
;
848 mask
= decl
->Declaration
.UsageMask
;
850 switch( decl
->Declaration
.Interpolate
) {
851 case TGSI_INTERPOLATE_CONSTANT
:
852 interp
= constant_interpolation
;
855 case TGSI_INTERPOLATE_LINEAR
:
856 interp
= linear_interpolation
;
859 case TGSI_INTERPOLATE_PERSPECTIVE
:
860 interp
= perspective_interpolation
;
867 if( mask
== TGSI_WRITEMASK_XYZW
) {
870 for( i
= first
; i
<= last
; i
++ ) {
871 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
872 interp( mach
, i
, j
);
879 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
880 if( mask
& (1 << j
) ) {
881 for( i
= first
; i
<= last
; i
++ ) {
882 interp( mach
, i
, j
);
893 struct spu_exec_machine
*mach
,
894 const struct tgsi_full_instruction
*inst
,
898 union spu_exec_channel r
[8];
902 switch (inst
->Instruction
.Opcode
) {
903 case TGSI_OPCODE_ARL
:
904 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
905 FETCH( &r
[0], 0, chan_index
);
906 r
[0].q
= si_cflts(r
[0].q
, 0);
907 STORE( &r
[0], 0, chan_index
);
911 case TGSI_OPCODE_MOV
:
912 case TGSI_OPCODE_SWZ
:
913 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
914 FETCH( &r
[0], 0, chan_index
);
915 STORE( &r
[0], 0, chan_index
);
919 case TGSI_OPCODE_LIT
:
920 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
921 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_X
);
924 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) || IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
925 FETCH( &r
[0], 0, CHAN_X
);
926 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
927 r
[0].q
= micro_max(r
[0].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
928 STORE( &r
[0], 0, CHAN_Y
);
931 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
932 FETCH( &r
[1], 0, CHAN_Y
);
933 r
[1].q
= micro_max(r
[1].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
935 FETCH( &r
[2], 0, CHAN_W
);
936 r
[2].q
= micro_min(r
[2].q
, mach
->Temps
[TEMP_128_I
].xyzw
[TEMP_128_C
].q
);
937 r
[2].q
= micro_max(r
[2].q
, mach
->Temps
[TEMP_M128_I
].xyzw
[TEMP_M128_C
].q
);
938 r
[1].q
= micro_pow(r
[1].q
, r
[2].q
);
940 /* r0 = (r0 > 0.0) ? r1 : 0.0
942 r
[0].q
= si_fcgt(r
[0].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
943 r
[0].q
= si_selb(mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
, r
[1].q
,
945 STORE( &r
[0], 0, CHAN_Z
);
949 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
950 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
954 case TGSI_OPCODE_RCP
:
955 FETCH( &r
[0], 0, CHAN_X
);
956 r
[0].q
= micro_div(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, r
[0].q
);
957 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
958 STORE( &r
[0], 0, chan_index
);
962 case TGSI_OPCODE_RSQ
:
963 FETCH( &r
[0], 0, CHAN_X
);
964 r
[0].q
= micro_sqrt(r
[0].q
);
965 r
[0].q
= micro_div(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, r
[0].q
);
966 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
967 STORE( &r
[0], 0, chan_index
);
971 case TGSI_OPCODE_EXP
:
975 case TGSI_OPCODE_LOG
:
979 case TGSI_OPCODE_MUL
:
980 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
)
982 FETCH(&r
[0], 0, chan_index
);
983 FETCH(&r
[1], 1, chan_index
);
985 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
987 STORE(&r
[0], 0, chan_index
);
991 case TGSI_OPCODE_ADD
:
992 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
993 FETCH( &r
[0], 0, chan_index
);
994 FETCH( &r
[1], 1, chan_index
);
995 r
[0].q
= si_fa(r
[0].q
, r
[1].q
);
996 STORE( &r
[0], 0, chan_index
);
1000 case TGSI_OPCODE_DP3
:
1001 /* TGSI_OPCODE_DOT3 */
1002 FETCH( &r
[0], 0, CHAN_X
);
1003 FETCH( &r
[1], 1, CHAN_X
);
1004 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1006 FETCH( &r
[1], 0, CHAN_Y
);
1007 FETCH( &r
[2], 1, CHAN_Y
);
1008 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1011 FETCH( &r
[1], 0, CHAN_Z
);
1012 FETCH( &r
[2], 1, CHAN_Z
);
1013 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1015 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1016 STORE( &r
[0], 0, chan_index
);
1020 case TGSI_OPCODE_DP4
:
1021 /* TGSI_OPCODE_DOT4 */
1022 FETCH(&r
[0], 0, CHAN_X
);
1023 FETCH(&r
[1], 1, CHAN_X
);
1025 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1027 FETCH(&r
[1], 0, CHAN_Y
);
1028 FETCH(&r
[2], 1, CHAN_Y
);
1030 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1032 FETCH(&r
[1], 0, CHAN_Z
);
1033 FETCH(&r
[2], 1, CHAN_Z
);
1035 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1037 FETCH(&r
[1], 0, CHAN_W
);
1038 FETCH(&r
[2], 1, CHAN_W
);
1040 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1042 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1043 STORE( &r
[0], 0, chan_index
);
1047 case TGSI_OPCODE_DST
:
1048 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1049 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_X
);
1052 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1053 FETCH( &r
[0], 0, CHAN_Y
);
1054 FETCH( &r
[1], 1, CHAN_Y
);
1055 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1056 STORE( &r
[0], 0, CHAN_Y
);
1059 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1060 FETCH( &r
[0], 0, CHAN_Z
);
1061 STORE( &r
[0], 0, CHAN_Z
);
1064 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1065 FETCH( &r
[0], 1, CHAN_W
);
1066 STORE( &r
[0], 0, CHAN_W
);
1070 case TGSI_OPCODE_MIN
:
1071 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1072 FETCH(&r
[0], 0, chan_index
);
1073 FETCH(&r
[1], 1, chan_index
);
1075 r
[0].q
= micro_min(r
[0].q
, r
[1].q
);
1077 STORE(&r
[0], 0, chan_index
);
1081 case TGSI_OPCODE_MAX
:
1082 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1083 FETCH(&r
[0], 0, chan_index
);
1084 FETCH(&r
[1], 1, chan_index
);
1086 r
[0].q
= micro_max(r
[0].q
, r
[1].q
);
1088 STORE(&r
[0], 0, chan_index
);
1092 case TGSI_OPCODE_SLT
:
1093 /* TGSI_OPCODE_SETLT */
1094 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1095 FETCH( &r
[0], 0, chan_index
);
1096 FETCH( &r
[1], 1, chan_index
);
1098 r
[0].q
= micro_ge(r
[0].q
, r
[1].q
);
1099 r
[0].q
= si_xori(r
[0].q
, 0xff);
1101 STORE( &r
[0], 0, chan_index
);
1105 case TGSI_OPCODE_SGE
:
1106 /* TGSI_OPCODE_SETGE */
1107 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1108 FETCH( &r
[0], 0, chan_index
);
1109 FETCH( &r
[1], 1, chan_index
);
1110 r
[0].q
= micro_ge(r
[0].q
, r
[1].q
);
1111 STORE( &r
[0], 0, chan_index
);
1115 case TGSI_OPCODE_MAD
:
1116 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1117 FETCH( &r
[0], 0, chan_index
);
1118 FETCH( &r
[1], 1, chan_index
);
1119 FETCH( &r
[2], 2, chan_index
);
1120 r
[0].q
= si_fma(r
[0].q
, r
[1].q
, r
[2].q
);
1121 STORE( &r
[0], 0, chan_index
);
1125 case TGSI_OPCODE_SUB
:
1126 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1127 FETCH(&r
[0], 0, chan_index
);
1128 FETCH(&r
[1], 1, chan_index
);
1130 r
[0].q
= si_fs(r
[0].q
, r
[1].q
);
1132 STORE(&r
[0], 0, chan_index
);
1136 case TGSI_OPCODE_LRP
:
1137 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1138 FETCH(&r
[0], 0, chan_index
);
1139 FETCH(&r
[1], 1, chan_index
);
1140 FETCH(&r
[2], 2, chan_index
);
1142 r
[1].q
= si_fs(r
[1].q
, r
[2].q
);
1143 r
[0].q
= si_fma(r
[0].q
, r
[1].q
, r
[2].q
);
1145 STORE(&r
[0], 0, chan_index
);
1149 case TGSI_OPCODE_CND
:
1153 case TGSI_OPCODE_CND0
:
1157 case TGSI_OPCODE_DP2A
:
1161 case TGSI_OPCODE_FRC
:
1162 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1163 FETCH( &r
[0], 0, chan_index
);
1164 r
[0].q
= micro_frc(r
[0].q
);
1165 STORE( &r
[0], 0, chan_index
);
1169 case TGSI_OPCODE_CLAMP
:
1173 case TGSI_OPCODE_FLR
:
1174 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1175 FETCH( &r
[0], 0, chan_index
);
1176 r
[0].q
= micro_flr(r
[0].q
);
1177 STORE( &r
[0], 0, chan_index
);
1181 case TGSI_OPCODE_ROUND
:
1182 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1183 FETCH( &r
[0], 0, chan_index
);
1184 r
[0].q
= micro_rnd(r
[0].q
);
1185 STORE( &r
[0], 0, chan_index
);
1189 case TGSI_OPCODE_EX2
:
1190 FETCH(&r
[0], 0, CHAN_X
);
1192 r
[0].q
= micro_pow(mach
->Temps
[TEMP_2_I
].xyzw
[TEMP_2_C
].q
, r
[0].q
);
1194 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1195 STORE( &r
[0], 0, chan_index
);
1199 case TGSI_OPCODE_LG2
:
1200 FETCH( &r
[0], 0, CHAN_X
);
1201 r
[0].q
= micro_lg2(r
[0].q
);
1202 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1203 STORE( &r
[0], 0, chan_index
);
1207 case TGSI_OPCODE_POW
:
1208 FETCH(&r
[0], 0, CHAN_X
);
1209 FETCH(&r
[1], 1, CHAN_X
);
1211 r
[0].q
= micro_pow(r
[0].q
, r
[1].q
);
1213 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1214 STORE( &r
[0], 0, chan_index
);
1218 case TGSI_OPCODE_XPD
:
1219 /* TGSI_OPCODE_XPD */
1220 FETCH(&r
[0], 0, CHAN_Y
);
1221 FETCH(&r
[1], 1, CHAN_Z
);
1222 FETCH(&r
[3], 0, CHAN_Z
);
1223 FETCH(&r
[4], 1, CHAN_Y
);
1225 /* r2 = (r0 * r1) - (r3 * r5)
1227 r
[2].q
= si_fm(r
[3].q
, r
[5].q
);
1228 r
[2].q
= si_fms(r
[0].q
, r
[1].q
, r
[2].q
);
1230 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1231 STORE( &r
[2], 0, CHAN_X
);
1234 FETCH(&r
[2], 1, CHAN_X
);
1235 FETCH(&r
[5], 0, CHAN_X
);
1237 /* r3 = (r3 * r2) - (r1 * r5)
1239 r
[1].q
= si_fm(r
[1].q
, r
[5].q
);
1240 r
[3].q
= si_fms(r
[3].q
, r
[2].q
, r
[1].q
);
1242 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1243 STORE( &r
[3], 0, CHAN_Y
);
1246 /* r5 = (r5 * r4) - (r0 * r2)
1248 r
[0].q
= si_fm(r
[0].q
, r
[2].q
);
1249 r
[5].q
= si_fms(r
[5].q
, r
[4].q
, r
[0].q
);
1251 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1252 STORE( &r
[5], 0, CHAN_Z
);
1255 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1256 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
1260 case TGSI_OPCODE_ABS
:
1261 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1262 FETCH(&r
[0], 0, chan_index
);
1264 r
[0].q
= micro_abs(r
[0].q
);
1266 STORE(&r
[0], 0, chan_index
);
1270 case TGSI_OPCODE_RCC
:
1274 case TGSI_OPCODE_DPH
:
1275 FETCH(&r
[0], 0, CHAN_X
);
1276 FETCH(&r
[1], 1, CHAN_X
);
1278 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1280 FETCH(&r
[1], 0, CHAN_Y
);
1281 FETCH(&r
[2], 1, CHAN_Y
);
1283 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1285 FETCH(&r
[1], 0, CHAN_Z
);
1286 FETCH(&r
[2], 1, CHAN_Z
);
1288 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1290 FETCH(&r
[1], 1, CHAN_W
);
1292 r
[0].q
= si_fa(r
[0].q
, r
[1].q
);
1294 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1295 STORE( &r
[0], 0, chan_index
);
1299 case TGSI_OPCODE_COS
:
1300 FETCH(&r
[0], 0, CHAN_X
);
1302 r
[0].q
= micro_cos(r
[0].q
);
1304 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1305 STORE( &r
[0], 0, chan_index
);
1309 case TGSI_OPCODE_DDX
:
1310 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1311 FETCH( &r
[0], 0, chan_index
);
1312 r
[0].q
= micro_ddx(r
[0].q
);
1313 STORE( &r
[0], 0, chan_index
);
1317 case TGSI_OPCODE_DDY
:
1318 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1319 FETCH( &r
[0], 0, chan_index
);
1320 r
[0].q
= micro_ddy(r
[0].q
);
1321 STORE( &r
[0], 0, chan_index
);
1325 case TGSI_OPCODE_KILP
:
1326 exec_kilp (mach
, inst
);
1329 case TGSI_OPCODE_KIL
:
1330 exec_kil (mach
, inst
);
1333 case TGSI_OPCODE_PK2H
:
1337 case TGSI_OPCODE_PK2US
:
1341 case TGSI_OPCODE_PK4B
:
1345 case TGSI_OPCODE_PK4UB
:
1349 case TGSI_OPCODE_RFL
:
1353 case TGSI_OPCODE_SEQ
:
1354 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1355 FETCH( &r
[0], 0, chan_index
);
1356 FETCH( &r
[1], 1, chan_index
);
1358 r
[0].q
= si_fceq(r
[0].q
, r
[1].q
);
1360 STORE( &r
[0], 0, chan_index
);
1364 case TGSI_OPCODE_SFL
:
1368 case TGSI_OPCODE_SGT
:
1369 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1370 FETCH( &r
[0], 0, chan_index
);
1371 FETCH( &r
[1], 1, chan_index
);
1372 r
[0].q
= si_fcgt(r
[0].q
, r
[1].q
);
1373 STORE( &r
[0], 0, chan_index
);
1377 case TGSI_OPCODE_SIN
:
1378 FETCH( &r
[0], 0, CHAN_X
);
1379 r
[0].q
= micro_sin(r
[0].q
);
1380 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1381 STORE( &r
[0], 0, chan_index
);
1385 case TGSI_OPCODE_SLE
:
1386 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1387 FETCH( &r
[0], 0, chan_index
);
1388 FETCH( &r
[1], 1, chan_index
);
1390 r
[0].q
= si_fcgt(r
[0].q
, r
[1].q
);
1391 r
[0].q
= si_xori(r
[0].q
, 0xff);
1393 STORE( &r
[0], 0, chan_index
);
1397 case TGSI_OPCODE_SNE
:
1398 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1399 FETCH( &r
[0], 0, chan_index
);
1400 FETCH( &r
[1], 1, chan_index
);
1402 r
[0].q
= si_fceq(r
[0].q
, r
[1].q
);
1403 r
[0].q
= si_xori(r
[0].q
, 0xff);
1405 STORE( &r
[0], 0, chan_index
);
1409 case TGSI_OPCODE_STR
:
1413 case TGSI_OPCODE_TEX
:
1414 /* simple texture lookup */
1415 /* src[0] = texcoord */
1416 /* src[1] = sampler unit */
1417 exec_tex(mach
, inst
, FALSE
, FALSE
);
1420 case TGSI_OPCODE_TXB
:
1421 /* Texture lookup with lod bias */
1422 /* src[0] = texcoord (src[0].w = lod bias) */
1423 /* src[1] = sampler unit */
1424 exec_tex(mach
, inst
, TRUE
, FALSE
);
1427 case TGSI_OPCODE_TXD
:
1428 /* Texture lookup with explicit partial derivatives */
1429 /* src[0] = texcoord */
1430 /* src[1] = d[strq]/dx */
1431 /* src[2] = d[strq]/dy */
1432 /* src[3] = sampler unit */
1436 case TGSI_OPCODE_TXL
:
1437 /* Texture lookup with explicit LOD */
1438 /* src[0] = texcoord (src[0].w = LOD) */
1439 /* src[1] = sampler unit */
1440 exec_tex(mach
, inst
, TRUE
, FALSE
);
1443 case TGSI_OPCODE_TXP
:
1444 /* Texture lookup with projection */
1445 /* src[0] = texcoord (src[0].w = projection) */
1446 /* src[1] = sampler unit */
1447 exec_tex(mach
, inst
, TRUE
, TRUE
);
1450 case TGSI_OPCODE_UP2H
:
1454 case TGSI_OPCODE_UP2US
:
1458 case TGSI_OPCODE_UP4B
:
1462 case TGSI_OPCODE_UP4UB
:
1466 case TGSI_OPCODE_X2D
:
1470 case TGSI_OPCODE_ARA
:
1474 case TGSI_OPCODE_ARR
:
1478 case TGSI_OPCODE_BRA
:
1482 case TGSI_OPCODE_CAL
:
1483 /* skip the call if no execution channels are enabled */
1484 if (mach
->ExecMask
) {
1487 /* push the Cond, Loop, Cont stacks */
1488 ASSERT(mach
->CondStackTop
< TGSI_EXEC_MAX_COND_NESTING
);
1489 mach
->CondStack
[mach
->CondStackTop
++] = mach
->CondMask
;
1490 ASSERT(mach
->LoopStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1491 mach
->LoopStack
[mach
->LoopStackTop
++] = mach
->LoopMask
;
1492 ASSERT(mach
->ContStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1493 mach
->ContStack
[mach
->ContStackTop
++] = mach
->ContMask
;
1495 ASSERT(mach
->FuncStackTop
< TGSI_EXEC_MAX_CALL_NESTING
);
1496 mach
->FuncStack
[mach
->FuncStackTop
++] = mach
->FuncMask
;
1498 /* note that PC was already incremented above */
1499 mach
->CallStack
[mach
->CallStackTop
++] = *pc
;
1500 *pc
= inst
->InstructionExtLabel
.Label
;
1504 case TGSI_OPCODE_RET
:
1505 mach
->FuncMask
&= ~mach
->ExecMask
;
1506 UPDATE_EXEC_MASK(mach
);
1508 if (mach
->ExecMask
== 0x0) {
1509 /* really return now (otherwise, keep executing) */
1511 if (mach
->CallStackTop
== 0) {
1512 /* returning from main() */
1516 *pc
= mach
->CallStack
[--mach
->CallStackTop
];
1518 /* pop the Cond, Loop, Cont stacks */
1519 ASSERT(mach
->CondStackTop
> 0);
1520 mach
->CondMask
= mach
->CondStack
[--mach
->CondStackTop
];
1521 ASSERT(mach
->LoopStackTop
> 0);
1522 mach
->LoopMask
= mach
->LoopStack
[--mach
->LoopStackTop
];
1523 ASSERT(mach
->ContStackTop
> 0);
1524 mach
->ContMask
= mach
->ContStack
[--mach
->ContStackTop
];
1525 ASSERT(mach
->FuncStackTop
> 0);
1526 mach
->FuncMask
= mach
->FuncStack
[--mach
->FuncStackTop
];
1528 UPDATE_EXEC_MASK(mach
);
1532 case TGSI_OPCODE_SSG
:
1536 case TGSI_OPCODE_CMP
:
1537 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1538 FETCH(&r
[0], 0, chan_index
);
1539 FETCH(&r
[1], 1, chan_index
);
1540 FETCH(&r
[2], 2, chan_index
);
1542 /* r0 = (r0 < 0.0) ? r1 : r2
1544 r
[3].q
= si_xor(r
[3].q
, r
[3].q
);
1545 r
[0].q
= micro_lt(r
[0].q
, r
[3].q
);
1546 r
[0].q
= si_selb(r
[1].q
, r
[2].q
, r
[0].q
);
1548 STORE(&r
[0], 0, chan_index
);
1552 case TGSI_OPCODE_SCS
:
1553 if( IS_CHANNEL_ENABLED( *inst
, CHAN_X
) || IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1554 FETCH( &r
[0], 0, CHAN_X
);
1556 if( IS_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1557 r
[1].q
= micro_cos(r
[0].q
);
1558 STORE( &r
[1], 0, CHAN_X
);
1560 if( IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1561 r
[1].q
= micro_sin(r
[0].q
);
1562 STORE( &r
[1], 0, CHAN_Y
);
1564 if( IS_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1565 STORE( &mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
], 0, CHAN_Z
);
1567 if( IS_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1568 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
1572 case TGSI_OPCODE_NRM
:
1576 case TGSI_OPCODE_DIV
:
1580 case TGSI_OPCODE_DP2
:
1581 FETCH( &r
[0], 0, CHAN_X
);
1582 FETCH( &r
[1], 1, CHAN_X
);
1583 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1585 FETCH( &r
[1], 0, CHAN_Y
);
1586 FETCH( &r
[2], 1, CHAN_Y
);
1587 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1589 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1590 STORE( &r
[0], 0, chan_index
);
1594 case TGSI_OPCODE_IF
:
1596 ASSERT(mach
->CondStackTop
< TGSI_EXEC_MAX_COND_NESTING
);
1597 mach
->CondStack
[mach
->CondStackTop
++] = mach
->CondMask
;
1598 FETCH( &r
[0], 0, CHAN_X
);
1599 /* update CondMask */
1601 mach
->CondMask
&= ~0x1;
1604 mach
->CondMask
&= ~0x2;
1607 mach
->CondMask
&= ~0x4;
1610 mach
->CondMask
&= ~0x8;
1612 UPDATE_EXEC_MASK(mach
);
1613 /* Todo: If CondMask==0, jump to ELSE */
1616 case TGSI_OPCODE_ELSE
:
1617 /* invert CondMask wrt previous mask */
1620 ASSERT(mach
->CondStackTop
> 0);
1621 prevMask
= mach
->CondStack
[mach
->CondStackTop
- 1];
1622 mach
->CondMask
= ~mach
->CondMask
& prevMask
;
1623 UPDATE_EXEC_MASK(mach
);
1624 /* Todo: If CondMask==0, jump to ENDIF */
1628 case TGSI_OPCODE_ENDIF
:
1630 ASSERT(mach
->CondStackTop
> 0);
1631 mach
->CondMask
= mach
->CondStack
[--mach
->CondStackTop
];
1632 UPDATE_EXEC_MASK(mach
);
1635 case TGSI_OPCODE_END
:
1636 /* halt execution */
1640 case TGSI_OPCODE_REP
:
1644 case TGSI_OPCODE_ENDREP
:
1648 case TGSI_OPCODE_PUSHA
:
1652 case TGSI_OPCODE_POPA
:
1656 case TGSI_OPCODE_CEIL
:
1657 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1658 FETCH( &r
[0], 0, chan_index
);
1659 r
[0].q
= micro_ceil(r
[0].q
);
1660 STORE( &r
[0], 0, chan_index
);
1664 case TGSI_OPCODE_I2F
:
1665 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1666 FETCH( &r
[0], 0, chan_index
);
1667 r
[0].q
= si_csflt(r
[0].q
, 0);
1668 STORE( &r
[0], 0, chan_index
);
1672 case TGSI_OPCODE_NOT
:
1673 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1674 FETCH( &r
[0], 0, chan_index
);
1675 r
[0].q
= si_xorbi(r
[0].q
, 0xff);
1676 STORE( &r
[0], 0, chan_index
);
1680 case TGSI_OPCODE_TRUNC
:
1681 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1682 FETCH( &r
[0], 0, chan_index
);
1683 r
[0].q
= micro_trunc(r
[0].q
);
1684 STORE( &r
[0], 0, chan_index
);
1688 case TGSI_OPCODE_SHL
:
1689 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1690 FETCH( &r
[0], 0, chan_index
);
1691 FETCH( &r
[1], 1, chan_index
);
1693 r
[0].q
= si_shl(r
[0].q
, r
[1].q
);
1695 STORE( &r
[0], 0, chan_index
);
1699 case TGSI_OPCODE_SHR
:
1700 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1701 FETCH( &r
[0], 0, chan_index
);
1702 FETCH( &r
[1], 1, chan_index
);
1703 r
[0].q
= micro_ishr(r
[0].q
, r
[1].q
);
1704 STORE( &r
[0], 0, chan_index
);
1708 case TGSI_OPCODE_AND
:
1709 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1710 FETCH( &r
[0], 0, chan_index
);
1711 FETCH( &r
[1], 1, chan_index
);
1712 r
[0].q
= si_and(r
[0].q
, r
[1].q
);
1713 STORE( &r
[0], 0, chan_index
);
1717 case TGSI_OPCODE_OR
:
1718 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1719 FETCH( &r
[0], 0, chan_index
);
1720 FETCH( &r
[1], 1, chan_index
);
1721 r
[0].q
= si_or(r
[0].q
, r
[1].q
);
1722 STORE( &r
[0], 0, chan_index
);
1726 case TGSI_OPCODE_MOD
:
1730 case TGSI_OPCODE_XOR
:
1731 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1732 FETCH( &r
[0], 0, chan_index
);
1733 FETCH( &r
[1], 1, chan_index
);
1734 r
[0].q
= si_xor(r
[0].q
, r
[1].q
);
1735 STORE( &r
[0], 0, chan_index
);
1739 case TGSI_OPCODE_SAD
:
1743 case TGSI_OPCODE_TXF
:
1747 case TGSI_OPCODE_TXQ
:
1751 case TGSI_OPCODE_EMIT
:
1752 mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0] += 16;
1753 mach
->Primitives
[mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]]++;
1756 case TGSI_OPCODE_ENDPRIM
:
1757 mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]++;
1758 mach
->Primitives
[mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]] = 0;
1761 case TGSI_OPCODE_BGNFOR
:
1762 /* fall-through (for now) */
1763 case TGSI_OPCODE_BGNLOOP
:
1764 /* push LoopMask and ContMasks */
1765 ASSERT(mach
->LoopStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1766 mach
->LoopStack
[mach
->LoopStackTop
++] = mach
->LoopMask
;
1767 ASSERT(mach
->ContStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1768 mach
->ContStack
[mach
->ContStackTop
++] = mach
->ContMask
;
1771 case TGSI_OPCODE_ENDFOR
:
1772 /* fall-through (for now at least) */
1773 case TGSI_OPCODE_ENDLOOP
:
1774 /* Restore ContMask, but don't pop */
1775 ASSERT(mach
->ContStackTop
> 0);
1776 mach
->ContMask
= mach
->ContStack
[mach
->ContStackTop
- 1];
1777 if (mach
->LoopMask
) {
1778 /* repeat loop: jump to instruction just past BGNLOOP */
1779 *pc
= inst
->InstructionExtLabel
.Label
+ 1;
1782 /* exit loop: pop LoopMask */
1783 ASSERT(mach
->LoopStackTop
> 0);
1784 mach
->LoopMask
= mach
->LoopStack
[--mach
->LoopStackTop
];
1786 ASSERT(mach
->ContStackTop
> 0);
1787 mach
->ContMask
= mach
->ContStack
[--mach
->ContStackTop
];
1789 UPDATE_EXEC_MASK(mach
);
1792 case TGSI_OPCODE_BRK
:
1793 /* turn off loop channels for each enabled exec channel */
1794 mach
->LoopMask
&= ~mach
->ExecMask
;
1795 /* Todo: if mach->LoopMask == 0, jump to end of loop */
1796 UPDATE_EXEC_MASK(mach
);
1799 case TGSI_OPCODE_CONT
:
1800 /* turn off cont channels for each enabled exec channel */
1801 mach
->ContMask
&= ~mach
->ExecMask
;
1802 /* Todo: if mach->LoopMask == 0, jump to end of loop */
1803 UPDATE_EXEC_MASK(mach
);
1806 case TGSI_OPCODE_BGNSUB
:
1810 case TGSI_OPCODE_ENDSUB
:
1814 case TGSI_OPCODE_NOISE1
:
1818 case TGSI_OPCODE_NOISE2
:
1822 case TGSI_OPCODE_NOISE3
:
1826 case TGSI_OPCODE_NOISE4
:
1830 case TGSI_OPCODE_NOP
:
1840 * Run TGSI interpreter.
1841 * \return bitmask of "alive" quad components
1844 spu_exec_machine_run( struct spu_exec_machine
*mach
)
1849 mach
->CondMask
= 0xf;
1850 mach
->LoopMask
= 0xf;
1851 mach
->ContMask
= 0xf;
1852 mach
->FuncMask
= 0xf;
1853 mach
->ExecMask
= 0xf;
1855 mach
->CondStackTop
= 0; /* temporarily subvert this ASSERTion */
1856 ASSERT(mach
->CondStackTop
== 0);
1857 ASSERT(mach
->LoopStackTop
== 0);
1858 ASSERT(mach
->ContStackTop
== 0);
1859 ASSERT(mach
->CallStackTop
== 0);
1861 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] = 0;
1862 mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0] = 0;
1864 if( mach
->Processor
== TGSI_PROCESSOR_GEOMETRY
) {
1865 mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0] = 0;
1866 mach
->Primitives
[0] = 0;
1870 /* execute declarations (interpolants) */
1871 if( mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
1872 for (i
= 0; i
< mach
->NumDeclarations
; i
++) {
1874 struct tgsi_full_declaration decl
;
1875 qword buffer
[ROUNDUP16(sizeof(struct tgsi_full_declaration
)) / 16];
1877 unsigned ea
= (unsigned) (mach
->Declarations
+ pc
);
1879 spu_dcache_fetch_unaligned(d
.buffer
, ea
, sizeof(d
.decl
));
1881 exec_declaration( mach
, &d
.decl
);
1885 /* execute instructions, until pc is set to -1 */
1888 struct tgsi_full_instruction inst
;
1889 qword buffer
[ROUNDUP16(sizeof(struct tgsi_full_instruction
)) / 16];
1891 unsigned ea
= (unsigned) (mach
->Instructions
+ pc
);
1893 spu_dcache_fetch_unaligned(i
.buffer
, ea
, sizeof(i
.inst
));
1894 exec_instruction( mach
, & i
.inst
, &pc
);
1898 /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
1899 if (mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
1901 * Scale back depth component.
1903 for (i
= 0; i
< 4; i
++)
1904 mach
->Outputs
[0].xyzw
[2].f
[i
] *= ctx
->DrawBuffer
->_DepthMaxF
;
1908 return ~mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0];