1 /**************************************************************************
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
29 * TGSI interpreter/executor.
31 * Flow control information:
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers. This is the ExecMask.
43 * The ExecMask is computed from four other masks (CondMask, LoopMask,
44 * ContMask and FuncMask) which are controlled by the flow control
45 * instructions (namely: IF/ELSE/ENDIF, LOOP/ENDLOOP, CONT and CAL/RET).
53 #include <transpose_matrix4x4.h>
54 #include <simdmath/ceilf4.h>
55 #include <simdmath/cosf4.h>
56 #include <simdmath/divf4.h>
57 #include <simdmath/floorf4.h>
58 #include <simdmath/log2f4.h>
59 #include <simdmath/powf4.h>
60 #include <simdmath/sinf4.h>
61 #include <simdmath/sqrtf4.h>
62 #include <simdmath/truncf4.h>
64 #include "pipe/p_compiler.h"
65 #include "pipe/p_state.h"
66 #include "pipe/p_shader_tokens.h"
67 #include "tgsi/tgsi_parse.h"
68 #include "tgsi/tgsi_util.h"
71 #include "spu_vertex_shader.h"
72 #include "spu_dcache.h"
73 #include "cell/common.h"
/* Index of each pixel within a 2x2 quad; the br_shuf/bl_shuf/tl_shuf
 * shuffle tables in this file are built from these values.
 */
75 #define TILE_TOP_LEFT 0
76 #define TILE_TOP_RIGHT 1
77 #define TILE_BOTTOM_LEFT 2
78 #define TILE_BOTTOM_RIGHT 3
81 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
/* Each TEMP_*_I / TEMP_*_C pair names the Temps[] index and the channel of
 * one utility slot.  spu_exec_machine_init() fills the constant slots
 * (0, FF, 7F, 80, 1, 2, 128, -128); KILMASK, OUTPUT and PRIMITIVE are
 * read and written while instructions execute.
 */
83 #define TEMP_0_I TGSI_EXEC_TEMP_00000000_I
84 #define TEMP_0_C TGSI_EXEC_TEMP_00000000_C
85 #define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I
86 #define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C
87 #define TEMP_80_I TGSI_EXEC_TEMP_80000000_I
88 #define TEMP_80_C TGSI_EXEC_TEMP_80000000_C
89 #define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I
90 #define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C
91 #define TEMP_1_I TGSI_EXEC_TEMP_ONE_I
92 #define TEMP_1_C TGSI_EXEC_TEMP_ONE_C
93 #define TEMP_2_I TGSI_EXEC_TEMP_TWO_I
94 #define TEMP_2_C TGSI_EXEC_TEMP_TWO_C
95 #define TEMP_128_I TGSI_EXEC_TEMP_128_I
96 #define TEMP_128_C TGSI_EXEC_TEMP_128_C
97 #define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I
98 #define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C
99 #define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I
100 #define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C
101 #define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I
102 #define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C
103 #define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
104 #define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
105 #define TEMP_R0 TGSI_EXEC_TEMP_R0
/** Iterate CHAN over the four channels (0..3) of a register. */
#define FOR_EACH_CHANNEL(CHAN) \
   for ((CHAN) = 0; (CHAN) < 4; (CHAN)++)

/** Non-zero when bit CHAN is set in the write mask of destination 0 of INST. */
#define IS_CHANNEL_ENABLED(INST, CHAN) \
   ((INST).Dst[0].DstRegister.WriteMask & (1 << (CHAN)))

/** Non-zero when bit CHAN is set in the write mask of destination 1 of INST. */
#define IS_CHANNEL_ENABLED2(INST, CHAN) \
   ((INST).Dst[1].DstRegister.WriteMask & (1 << (CHAN)))

/**
 * Run the following statement once for every channel enabled in the write
 * mask of destination 0 of INST.
 *
 * The "if (!cond) { } else" shape keeps an "else" written after the macro's
 * statement from binding to the macro's own hidden "if".
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN) \
   FOR_EACH_CHANNEL( CHAN ) \
      if (!IS_CHANNEL_ENABLED( INST, CHAN )) { } else

/** Same as FOR_EACH_ENABLED_CHANNEL but tests destination 1 of INST. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN) \
   FOR_EACH_CHANNEL( CHAN ) \
      if (!IS_CHANNEL_ENABLED2( INST, CHAN )) { } else
/**
 * Recompute the execution mask of MACH.
 * A channel stays enabled only while it is set in all four of the
 * condition, loop, continue and function-call masks.
 * MACH is evaluated several times; pass an expression without side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
   ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                       (MACH)->ContMask & (MACH)->FuncMask)
138 * Initialize machine state by expanding tokens to full instructions,
139 * allocating temporary storage, setting up constants, etc.
140 * After this, we can call spu_exec_machine_run() many times.
143 spu_exec_machine_init(struct spu_exec_machine
*mach
,
145 struct spu_sampler
*samplers
,
148 const qword zero
= si_il(0);
149 const qword not_zero
= si_il(~0);
152 mach
->Samplers
= samplers
;
153 mach
->Processor
= processor
;
154 mach
->Addrs
= &mach
->Temps
[TGSI_EXEC_NUM_TEMPS
];
156 /* Setup constants. */
157 mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
= zero
;
158 mach
->Temps
[TEMP_FF_I
].xyzw
[TEMP_FF_C
].q
= not_zero
;
159 mach
->Temps
[TEMP_7F_I
].xyzw
[TEMP_7F_C
].q
= si_shli(not_zero
, -1);
160 mach
->Temps
[TEMP_80_I
].xyzw
[TEMP_80_C
].q
= si_shli(not_zero
, 31);
162 mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
= (qword
) spu_splats(1.0f
);
163 mach
->Temps
[TEMP_2_I
].xyzw
[TEMP_2_C
].q
= (qword
) spu_splats(2.0f
);
164 mach
->Temps
[TEMP_128_I
].xyzw
[TEMP_128_C
].q
= (qword
) spu_splats(128.0f
);
165 mach
->Temps
[TEMP_M128_I
].xyzw
[TEMP_M128_C
].q
= (qword
) spu_splats(-128.0f
);
172 return si_rotmi(si_shli(src
, 1), -1);
176 micro_ceil(qword src
)
178 return (qword
) _ceilf4((vec_float4
) src
);
184 return (qword
) _cosf4((vec_float4
) src
);
187 static const qword br_shuf
= {
188 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
189 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
190 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
191 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
192 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
193 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
194 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
195 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
198 static const qword bl_shuf
= {
199 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
200 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
201 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
202 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
203 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
204 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
205 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
206 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
209 static const qword tl_shuf
= {
210 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
211 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
212 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
213 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
214 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
215 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
216 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
217 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
223 qword bottom_right
= si_shufb(src
, src
, br_shuf
);
224 qword bottom_left
= si_shufb(src
, src
, bl_shuf
);
226 return si_fs(bottom_right
, bottom_left
);
232 qword top_left
= si_shufb(src
, src
, tl_shuf
);
233 qword bottom_left
= si_shufb(src
, src
, bl_shuf
);
235 return si_fs(top_left
, bottom_left
);
239 micro_div(qword src0
, qword src1
)
241 return (qword
) _divf4((vec_float4
) src0
, (vec_float4
) src1
);
247 return (qword
) _floorf4((vec_float4
) src
);
253 return si_fs(src
, (qword
) _floorf4((vec_float4
) src
));
257 micro_ge(qword src0
, qword src1
)
259 return si_or(si_fceq(src0
, src1
), si_fcgt(src0
, src1
));
265 return (qword
) _log2f4((vec_float4
) src
);
269 micro_lt(qword src0
, qword src1
)
271 const qword tmp
= si_or(si_fceq(src0
, src1
), si_fcgt(src0
, src1
));
273 return si_xori(tmp
, 0xff);
277 micro_max(qword src0
, qword src1
)
279 return si_selb(src1
, src0
, si_fcgt(src0
, src1
));
283 micro_min(qword src0
, qword src1
)
285 return si_selb(src0
, src1
, si_fcgt(src0
, src1
));
291 return si_xor(src
, (qword
) spu_splats(0x80000000));
295 micro_set_sign(qword src
)
297 return si_or(src
, (qword
) spu_splats(0x80000000));
301 micro_pow(qword src0
, qword src1
)
303 return (qword
) _powf4((vec_float4
) src0
, (vec_float4
) src1
);
309 const qword half
= (qword
) spu_splats(0.5f
);
311 /* May be able to use _roundf4. There may be some difference, though.
313 return (qword
) _floorf4((vec_float4
) si_fa(src
, half
));
317 micro_ishr(qword src0
, qword src1
)
319 return si_rotma(src0
, si_sfi(src1
, 0));
323 micro_trunc(qword src
)
325 return (qword
) _truncf4((vec_float4
) src
);
331 return (qword
) _sinf4((vec_float4
) src
);
335 micro_sqrt(qword src
)
337 return (qword
) _sqrtf4((vec_float4
) src
);
341 fetch_src_file_channel(
342 const struct spu_exec_machine
*mach
,
345 const union spu_exec_channel
*index
,
346 union spu_exec_channel
*chan
)
354 case TGSI_FILE_CONSTANT
: {
357 for (i
= 0; i
< 4; i
++) {
358 const float *ptr
= mach
->Consts
[index
->i
[i
]];
361 spu_dcache_fetch_unaligned((qword
*) tmp
,
362 (uintptr_t)(ptr
+ swizzle
),
370 case TGSI_FILE_INPUT
:
371 chan
->u
[0] = mach
->Inputs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
372 chan
->u
[1] = mach
->Inputs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
373 chan
->u
[2] = mach
->Inputs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
374 chan
->u
[3] = mach
->Inputs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
377 case TGSI_FILE_TEMPORARY
:
378 chan
->u
[0] = mach
->Temps
[index
->i
[0]].xyzw
[swizzle
].u
[0];
379 chan
->u
[1] = mach
->Temps
[index
->i
[1]].xyzw
[swizzle
].u
[1];
380 chan
->u
[2] = mach
->Temps
[index
->i
[2]].xyzw
[swizzle
].u
[2];
381 chan
->u
[3] = mach
->Temps
[index
->i
[3]].xyzw
[swizzle
].u
[3];
384 case TGSI_FILE_IMMEDIATE
:
385 ASSERT( index
->i
[0] < (int) mach
->ImmLimit
);
386 ASSERT( index
->i
[1] < (int) mach
->ImmLimit
);
387 ASSERT( index
->i
[2] < (int) mach
->ImmLimit
);
388 ASSERT( index
->i
[3] < (int) mach
->ImmLimit
);
390 chan
->f
[0] = mach
->Imms
[index
->i
[0]][swizzle
];
391 chan
->f
[1] = mach
->Imms
[index
->i
[1]][swizzle
];
392 chan
->f
[2] = mach
->Imms
[index
->i
[2]][swizzle
];
393 chan
->f
[3] = mach
->Imms
[index
->i
[3]][swizzle
];
396 case TGSI_FILE_ADDRESS
:
397 chan
->u
[0] = mach
->Addrs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
398 chan
->u
[1] = mach
->Addrs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
399 chan
->u
[2] = mach
->Addrs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
400 chan
->u
[3] = mach
->Addrs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
403 case TGSI_FILE_OUTPUT
:
404 /* vertex/fragment output vars can be read too */
405 chan
->u
[0] = mach
->Outputs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
406 chan
->u
[1] = mach
->Outputs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
407 chan
->u
[2] = mach
->Outputs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
408 chan
->u
[3] = mach
->Outputs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
423 const struct spu_exec_machine
*mach
,
424 union spu_exec_channel
*chan
,
425 const struct tgsi_full_src_register
*reg
,
426 const uint chan_index
)
428 union spu_exec_channel index
;
434 index
.i
[3] = reg
->SrcRegister
.Index
;
436 if (reg
->SrcRegister
.Indirect
) {
437 union spu_exec_channel index2
;
438 union spu_exec_channel indir_index
;
443 index2
.i
[3] = reg
->SrcRegisterInd
.Index
;
445 swizzle
= tgsi_util_get_src_register_swizzle(®
->SrcRegisterInd
,
447 fetch_src_file_channel(
449 reg
->SrcRegisterInd
.File
,
454 index
.q
= si_a(index
.q
, indir_index
.q
);
457 if( reg
->SrcRegister
.Dimension
) {
458 switch( reg
->SrcRegister
.File
) {
459 case TGSI_FILE_INPUT
:
460 index
.q
= si_mpyi(index
.q
, 17);
462 case TGSI_FILE_CONSTANT
:
463 index
.q
= si_shli(index
.q
, 12);
469 index
.i
[0] += reg
->SrcRegisterDim
.Index
;
470 index
.i
[1] += reg
->SrcRegisterDim
.Index
;
471 index
.i
[2] += reg
->SrcRegisterDim
.Index
;
472 index
.i
[3] += reg
->SrcRegisterDim
.Index
;
474 if (reg
->SrcRegisterDim
.Indirect
) {
475 union spu_exec_channel index2
;
476 union spu_exec_channel indir_index
;
481 index2
.i
[3] = reg
->SrcRegisterDimInd
.Index
;
483 swizzle
= tgsi_util_get_src_register_swizzle( ®
->SrcRegisterDimInd
, CHAN_X
);
484 fetch_src_file_channel(
486 reg
->SrcRegisterDimInd
.File
,
491 index
.q
= si_a(index
.q
, indir_index
.q
);
495 swizzle
= tgsi_util_get_full_src_register_swizzle( reg
, chan_index
);
496 fetch_src_file_channel(
498 reg
->SrcRegister
.File
,
503 switch (tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
)) {
504 case TGSI_UTIL_SIGN_CLEAR
:
505 chan
->q
= micro_abs(chan
->q
);
508 case TGSI_UTIL_SIGN_SET
:
509 chan
->q
= micro_set_sign(chan
->q
);
512 case TGSI_UTIL_SIGN_TOGGLE
:
513 chan
->q
= micro_neg(chan
->q
);
516 case TGSI_UTIL_SIGN_KEEP
:
520 if (reg
->SrcRegisterExtMod
.Complement
) {
521 chan
->q
= si_fs(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, chan
->q
);
527 struct spu_exec_machine
*mach
,
528 const union spu_exec_channel
*chan
,
529 const struct tgsi_full_dst_register
*reg
,
530 const struct tgsi_full_instruction
*inst
,
533 union spu_exec_channel
*dst
;
535 switch( reg
->DstRegister
.File
) {
539 case TGSI_FILE_OUTPUT
:
540 dst
= &mach
->Outputs
[mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0]
541 + reg
->DstRegister
.Index
].xyzw
[chan_index
];
544 case TGSI_FILE_TEMPORARY
:
545 dst
= &mach
->Temps
[reg
->DstRegister
.Index
].xyzw
[chan_index
];
548 case TGSI_FILE_ADDRESS
:
549 dst
= &mach
->Addrs
[reg
->DstRegister
.Index
].xyzw
[chan_index
];
557 switch (inst
->Instruction
.Saturate
)
560 if (mach
->ExecMask
& 0x1)
561 dst
->i
[0] = chan
->i
[0];
562 if (mach
->ExecMask
& 0x2)
563 dst
->i
[1] = chan
->i
[1];
564 if (mach
->ExecMask
& 0x4)
565 dst
->i
[2] = chan
->i
[2];
566 if (mach
->ExecMask
& 0x8)
567 dst
->i
[3] = chan
->i
[3];
570 case TGSI_SAT_ZERO_ONE
:
571 /* XXX need to obey ExecMask here */
572 dst
->q
= micro_max(chan
->q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
573 dst
->q
= micro_min(dst
->q
, mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
);
576 case TGSI_SAT_MINUS_PLUS_ONE
:
/* Shorthand for fetch_source() / store_dest().  Both expansions refer to
 * variables named `mach` and `inst`, so they can only be used inside a
 * function that has those two names in scope.  INDEX selects the source
 * (FETCH) or destination (STORE) register of the instruction.
 */
585 #define FETCH(VAL,INDEX,CHAN)\
586 fetch_source (mach, VAL, &inst->Src[INDEX], CHAN)
588 #define STORE(VAL,INDEX,CHAN)\
589 store_dest (mach, VAL, &inst->Dst[INDEX], inst, CHAN )
593 * Execute ARB-style KIL which is predicated by a src register.
594 * Kill fragment if any of the four values is less than zero.
597 exec_kil(struct spu_exec_machine
*mach
,
598 const struct tgsi_full_instruction
*inst
)
602 uint kilmask
= 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
603 union spu_exec_channel r
[1];
605 /* This mask stores component bits that were already tested. */
608 for (chan_index
= 0; chan_index
< 4; chan_index
++)
613 /* unswizzle channel */
614 swizzle
= tgsi_util_get_full_src_register_swizzle (
618 /* check if the component has not been already tested */
619 if (uniquemask
& (1 << swizzle
))
621 uniquemask
|= 1 << swizzle
;
623 FETCH(&r
[0], 0, chan_index
);
624 for (i
= 0; i
< 4; i
++)
625 if (r
[0].f
[i
] < 0.0f
)
629 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] |= kilmask
;
633 * Execute NVIDIA-style KIL which is predicated by a condition code.
634 * Kill fragment if the condition code is TRUE.
637 exec_kilp(struct tgsi_exec_machine
*mach
,
638 const struct tgsi_full_instruction
*inst
)
640 uint kilmask
= 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
642 /* TODO: build kilmask from CC mask */
644 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] |= kilmask
;
648 * Fetch a texel using STR texture coordinates.
651 fetch_texel( struct spu_sampler
*sampler
,
652 const union spu_exec_channel
*s
,
653 const union spu_exec_channel
*t
,
654 const union spu_exec_channel
*p
,
655 float lodbias
, /* XXX should be float[4] */
656 union spu_exec_channel
*r
,
657 union spu_exec_channel
*g
,
658 union spu_exec_channel
*b
,
659 union spu_exec_channel
*a
)
664 sampler
->get_samples(sampler
, s
->f
, t
->f
, p
->f
, lodbias
,
665 (float (*)[4]) rgba
);
667 _transpose_matrix4x4((vec_float4
*) out
, (vec_float4
*) rgba
);
676 exec_tex(struct spu_exec_machine
*mach
,
677 const struct tgsi_full_instruction
*inst
,
678 boolean biasLod
, boolean projected
)
680 const uint unit
= inst
->Src
[1].SrcRegister
.Index
;
681 union spu_exec_channel r
[8];
685 /* printf("Sampler %u unit %u\n", sampler, unit); */
687 switch (inst
->InstructionExtTexture
.Texture
) {
688 case TGSI_TEXTURE_1D
:
690 FETCH(&r
[0], 0, CHAN_X
);
693 FETCH(&r
[1], 0, CHAN_W
);
694 r
[0].q
= micro_div(r
[0].q
, r
[1].q
);
698 FETCH(&r
[1], 0, CHAN_W
);
704 fetch_texel(&mach
->Samplers
[unit
],
705 &r
[0], NULL
, NULL
, lodBias
, /* S, T, P, BIAS */
706 &r
[0], &r
[1], &r
[2], &r
[3]); /* R, G, B, A */
709 case TGSI_TEXTURE_2D
:
710 case TGSI_TEXTURE_RECT
:
712 FETCH(&r
[0], 0, CHAN_X
);
713 FETCH(&r
[1], 0, CHAN_Y
);
714 FETCH(&r
[2], 0, CHAN_Z
);
717 FETCH(&r
[3], 0, CHAN_W
);
718 r
[0].q
= micro_div(r
[0].q
, r
[3].q
);
719 r
[1].q
= micro_div(r
[1].q
, r
[3].q
);
720 r
[2].q
= micro_div(r
[2].q
, r
[3].q
);
724 FETCH(&r
[3], 0, CHAN_W
);
730 fetch_texel(&mach
->Samplers
[unit
],
731 &r
[0], &r
[1], &r
[2], lodBias
, /* inputs */
732 &r
[0], &r
[1], &r
[2], &r
[3]); /* outputs */
735 case TGSI_TEXTURE_3D
:
736 case TGSI_TEXTURE_CUBE
:
738 FETCH(&r
[0], 0, CHAN_X
);
739 FETCH(&r
[1], 0, CHAN_Y
);
740 FETCH(&r
[2], 0, CHAN_Z
);
743 FETCH(&r
[3], 0, CHAN_W
);
744 r
[0].q
= micro_div(r
[0].q
, r
[3].q
);
745 r
[1].q
= micro_div(r
[1].q
, r
[3].q
);
746 r
[2].q
= micro_div(r
[2].q
, r
[3].q
);
750 FETCH(&r
[3], 0, CHAN_W
);
756 fetch_texel(&mach
->Samplers
[unit
],
757 &r
[0], &r
[1], &r
[2], lodBias
,
758 &r
[0], &r
[1], &r
[2], &r
[3]);
765 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
766 STORE( &r
[chan_index
], 0, chan_index
);
773 constant_interpolation(
774 struct spu_exec_machine
*mach
,
780 for( i
= 0; i
< QUAD_SIZE
; i
++ ) {
781 mach
->Inputs
[attrib
].xyzw
[chan
].f
[i
] = mach
->InterpCoefs
[attrib
].a0
[chan
];
786 linear_interpolation(
787 struct spu_exec_machine
*mach
,
791 const float x
= mach
->QuadPos
.xyzw
[0].f
[0];
792 const float y
= mach
->QuadPos
.xyzw
[1].f
[0];
793 const float dadx
= mach
->InterpCoefs
[attrib
].dadx
[chan
];
794 const float dady
= mach
->InterpCoefs
[attrib
].dady
[chan
];
795 const float a0
= mach
->InterpCoefs
[attrib
].a0
[chan
] + dadx
* x
+ dady
* y
;
796 mach
->Inputs
[attrib
].xyzw
[chan
].f
[0] = a0
;
797 mach
->Inputs
[attrib
].xyzw
[chan
].f
[1] = a0
+ dadx
;
798 mach
->Inputs
[attrib
].xyzw
[chan
].f
[2] = a0
+ dady
;
799 mach
->Inputs
[attrib
].xyzw
[chan
].f
[3] = a0
+ dadx
+ dady
;
803 perspective_interpolation(
804 struct spu_exec_machine
*mach
,
808 const float x
= mach
->QuadPos
.xyzw
[0].f
[0];
809 const float y
= mach
->QuadPos
.xyzw
[1].f
[0];
810 const float dadx
= mach
->InterpCoefs
[attrib
].dadx
[chan
];
811 const float dady
= mach
->InterpCoefs
[attrib
].dady
[chan
];
812 const float a0
= mach
->InterpCoefs
[attrib
].a0
[chan
] + dadx
* x
+ dady
* y
;
813 const float *w
= mach
->QuadPos
.xyzw
[3].f
;
814 /* divide by W here */
815 mach
->Inputs
[attrib
].xyzw
[chan
].f
[0] = a0
/ w
[0];
816 mach
->Inputs
[attrib
].xyzw
[chan
].f
[1] = (a0
+ dadx
) / w
[1];
817 mach
->Inputs
[attrib
].xyzw
[chan
].f
[2] = (a0
+ dady
) / w
[2];
818 mach
->Inputs
[attrib
].xyzw
[chan
].f
[3] = (a0
+ dadx
+ dady
) / w
[3];
822 typedef void (* interpolation_func
)(
823 struct spu_exec_machine
*mach
,
828 exec_declaration(struct spu_exec_machine
*mach
,
829 const struct tgsi_full_declaration
*decl
)
831 if( mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
832 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
833 unsigned first
, last
, mask
;
834 interpolation_func interp
;
836 first
= decl
->Range
.First
;
837 last
= decl
->Range
.Last
;
838 mask
= decl
->Declaration
.UsageMask
;
840 switch( decl
->Declaration
.Interpolate
) {
841 case TGSI_INTERPOLATE_CONSTANT
:
842 interp
= constant_interpolation
;
845 case TGSI_INTERPOLATE_LINEAR
:
846 interp
= linear_interpolation
;
849 case TGSI_INTERPOLATE_PERSPECTIVE
:
850 interp
= perspective_interpolation
;
857 if( mask
== TGSI_WRITEMASK_XYZW
) {
860 for( i
= first
; i
<= last
; i
++ ) {
861 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
862 interp( mach
, i
, j
);
869 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
870 if( mask
& (1 << j
) ) {
871 for( i
= first
; i
<= last
; i
++ ) {
872 interp( mach
, i
, j
);
883 struct spu_exec_machine
*mach
,
884 const struct tgsi_full_instruction
*inst
,
888 union spu_exec_channel r
[8];
892 switch (inst
->Instruction
.Opcode
) {
893 case TGSI_OPCODE_ARL
:
894 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
895 FETCH( &r
[0], 0, chan_index
);
896 r
[0].q
= si_cflts(r
[0].q
, 0);
897 STORE( &r
[0], 0, chan_index
);
901 case TGSI_OPCODE_MOV
:
902 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
903 FETCH( &r
[0], 0, chan_index
);
904 STORE( &r
[0], 0, chan_index
);
908 case TGSI_OPCODE_LIT
:
909 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
910 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_X
);
913 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) || IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
914 FETCH( &r
[0], 0, CHAN_X
);
915 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
916 r
[0].q
= micro_max(r
[0].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
917 STORE( &r
[0], 0, CHAN_Y
);
920 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
921 FETCH( &r
[1], 0, CHAN_Y
);
922 r
[1].q
= micro_max(r
[1].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
924 FETCH( &r
[2], 0, CHAN_W
);
925 r
[2].q
= micro_min(r
[2].q
, mach
->Temps
[TEMP_128_I
].xyzw
[TEMP_128_C
].q
);
926 r
[2].q
= micro_max(r
[2].q
, mach
->Temps
[TEMP_M128_I
].xyzw
[TEMP_M128_C
].q
);
927 r
[1].q
= micro_pow(r
[1].q
, r
[2].q
);
929 /* r0 = (r0 > 0.0) ? r1 : 0.0
931 r
[0].q
= si_fcgt(r
[0].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
932 r
[0].q
= si_selb(mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
, r
[1].q
,
934 STORE( &r
[0], 0, CHAN_Z
);
938 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
939 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
943 case TGSI_OPCODE_RCP
:
944 FETCH( &r
[0], 0, CHAN_X
);
945 r
[0].q
= micro_div(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, r
[0].q
);
946 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
947 STORE( &r
[0], 0, chan_index
);
951 case TGSI_OPCODE_RSQ
:
952 FETCH( &r
[0], 0, CHAN_X
);
953 r
[0].q
= micro_sqrt(r
[0].q
);
954 r
[0].q
= micro_div(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, r
[0].q
);
955 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
956 STORE( &r
[0], 0, chan_index
);
960 case TGSI_OPCODE_EXP
:
964 case TGSI_OPCODE_LOG
:
968 case TGSI_OPCODE_MUL
:
969 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
)
971 FETCH(&r
[0], 0, chan_index
);
972 FETCH(&r
[1], 1, chan_index
);
974 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
976 STORE(&r
[0], 0, chan_index
);
980 case TGSI_OPCODE_ADD
:
981 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
982 FETCH( &r
[0], 0, chan_index
);
983 FETCH( &r
[1], 1, chan_index
);
984 r
[0].q
= si_fa(r
[0].q
, r
[1].q
);
985 STORE( &r
[0], 0, chan_index
);
989 case TGSI_OPCODE_DP3
:
990 /* TGSI_OPCODE_DOT3 */
991 FETCH( &r
[0], 0, CHAN_X
);
992 FETCH( &r
[1], 1, CHAN_X
);
993 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
995 FETCH( &r
[1], 0, CHAN_Y
);
996 FETCH( &r
[2], 1, CHAN_Y
);
997 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1000 FETCH( &r
[1], 0, CHAN_Z
);
1001 FETCH( &r
[2], 1, CHAN_Z
);
1002 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1004 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1005 STORE( &r
[0], 0, chan_index
);
1009 case TGSI_OPCODE_DP4
:
1010 /* TGSI_OPCODE_DOT4 */
1011 FETCH(&r
[0], 0, CHAN_X
);
1012 FETCH(&r
[1], 1, CHAN_X
);
1014 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1016 FETCH(&r
[1], 0, CHAN_Y
);
1017 FETCH(&r
[2], 1, CHAN_Y
);
1019 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1021 FETCH(&r
[1], 0, CHAN_Z
);
1022 FETCH(&r
[2], 1, CHAN_Z
);
1024 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1026 FETCH(&r
[1], 0, CHAN_W
);
1027 FETCH(&r
[2], 1, CHAN_W
);
1029 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1031 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1032 STORE( &r
[0], 0, chan_index
);
1036 case TGSI_OPCODE_DST
:
1037 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1038 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_X
);
1041 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1042 FETCH( &r
[0], 0, CHAN_Y
);
1043 FETCH( &r
[1], 1, CHAN_Y
);
1044 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1045 STORE( &r
[0], 0, CHAN_Y
);
1048 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1049 FETCH( &r
[0], 0, CHAN_Z
);
1050 STORE( &r
[0], 0, CHAN_Z
);
1053 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1054 FETCH( &r
[0], 1, CHAN_W
);
1055 STORE( &r
[0], 0, CHAN_W
);
1059 case TGSI_OPCODE_MIN
:
1060 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1061 FETCH(&r
[0], 0, chan_index
);
1062 FETCH(&r
[1], 1, chan_index
);
1064 r
[0].q
= micro_min(r
[0].q
, r
[1].q
);
1066 STORE(&r
[0], 0, chan_index
);
1070 case TGSI_OPCODE_MAX
:
1071 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1072 FETCH(&r
[0], 0, chan_index
);
1073 FETCH(&r
[1], 1, chan_index
);
1075 r
[0].q
= micro_max(r
[0].q
, r
[1].q
);
1077 STORE(&r
[0], 0, chan_index
);
1081 case TGSI_OPCODE_SLT
:
1082 /* TGSI_OPCODE_SETLT */
1083 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1084 FETCH( &r
[0], 0, chan_index
);
1085 FETCH( &r
[1], 1, chan_index
);
1087 r
[0].q
= micro_ge(r
[0].q
, r
[1].q
);
1088 r
[0].q
= si_xori(r
[0].q
, 0xff);
1090 STORE( &r
[0], 0, chan_index
);
1094 case TGSI_OPCODE_SGE
:
1095 /* TGSI_OPCODE_SETGE */
1096 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1097 FETCH( &r
[0], 0, chan_index
);
1098 FETCH( &r
[1], 1, chan_index
);
1099 r
[0].q
= micro_ge(r
[0].q
, r
[1].q
);
1100 STORE( &r
[0], 0, chan_index
);
1104 case TGSI_OPCODE_MAD
:
1105 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1106 FETCH( &r
[0], 0, chan_index
);
1107 FETCH( &r
[1], 1, chan_index
);
1108 FETCH( &r
[2], 2, chan_index
);
1109 r
[0].q
= si_fma(r
[0].q
, r
[1].q
, r
[2].q
);
1110 STORE( &r
[0], 0, chan_index
);
1114 case TGSI_OPCODE_SUB
:
1115 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1116 FETCH(&r
[0], 0, chan_index
);
1117 FETCH(&r
[1], 1, chan_index
);
1119 r
[0].q
= si_fs(r
[0].q
, r
[1].q
);
1121 STORE(&r
[0], 0, chan_index
);
1125 case TGSI_OPCODE_LRP
:
1126 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1127 FETCH(&r
[0], 0, chan_index
);
1128 FETCH(&r
[1], 1, chan_index
);
1129 FETCH(&r
[2], 2, chan_index
);
1131 r
[1].q
= si_fs(r
[1].q
, r
[2].q
);
1132 r
[0].q
= si_fma(r
[0].q
, r
[1].q
, r
[2].q
);
1134 STORE(&r
[0], 0, chan_index
);
1138 case TGSI_OPCODE_CND
:
1142 case TGSI_OPCODE_DP2A
:
1146 case TGSI_OPCODE_FRC
:
1147 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1148 FETCH( &r
[0], 0, chan_index
);
1149 r
[0].q
= micro_frc(r
[0].q
);
1150 STORE( &r
[0], 0, chan_index
);
1154 case TGSI_OPCODE_CLAMP
:
1158 case TGSI_OPCODE_FLR
:
1159 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1160 FETCH( &r
[0], 0, chan_index
);
1161 r
[0].q
= micro_flr(r
[0].q
);
1162 STORE( &r
[0], 0, chan_index
);
1166 case TGSI_OPCODE_ROUND
:
1167 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1168 FETCH( &r
[0], 0, chan_index
);
1169 r
[0].q
= micro_rnd(r
[0].q
);
1170 STORE( &r
[0], 0, chan_index
);
1174 case TGSI_OPCODE_EX2
:
1175 FETCH(&r
[0], 0, CHAN_X
);
1177 r
[0].q
= micro_pow(mach
->Temps
[TEMP_2_I
].xyzw
[TEMP_2_C
].q
, r
[0].q
);
1179 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1180 STORE( &r
[0], 0, chan_index
);
1184 case TGSI_OPCODE_LG2
:
1185 FETCH( &r
[0], 0, CHAN_X
);
1186 r
[0].q
= micro_lg2(r
[0].q
);
1187 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1188 STORE( &r
[0], 0, chan_index
);
1192 case TGSI_OPCODE_POW
:
1193 FETCH(&r
[0], 0, CHAN_X
);
1194 FETCH(&r
[1], 1, CHAN_X
);
1196 r
[0].q
= micro_pow(r
[0].q
, r
[1].q
);
1198 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1199 STORE( &r
[0], 0, chan_index
);
1203 case TGSI_OPCODE_XPD
:
1204 /* TGSI_OPCODE_XPD */
1205 FETCH(&r
[0], 0, CHAN_Y
);
1206 FETCH(&r
[1], 1, CHAN_Z
);
1207 FETCH(&r
[3], 0, CHAN_Z
);
1208 FETCH(&r
[4], 1, CHAN_Y
);
1210 /* r2 = (r0 * r1) - (r3 * r5) -- XXX review: r5 is only fetched further down; the second product presumably should use r4
1212 r
[2].q
= si_fm(r
[3].q
, r
[5].q
);
1213 r
[2].q
= si_fms(r
[0].q
, r
[1].q
, r
[2].q
);
1215 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1216 STORE( &r
[2], 0, CHAN_X
);
1219 FETCH(&r
[2], 1, CHAN_X
);
1220 FETCH(&r
[5], 0, CHAN_X
);
1222 /* r3 = (r3 * r2) - (r1 * r5)
1224 r
[1].q
= si_fm(r
[1].q
, r
[5].q
);
1225 r
[3].q
= si_fms(r
[3].q
, r
[2].q
, r
[1].q
);
1227 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1228 STORE( &r
[3], 0, CHAN_Y
);
1231 /* r5 = (r5 * r4) - (r0 * r2)
1233 r
[0].q
= si_fm(r
[0].q
, r
[2].q
);
1234 r
[5].q
= si_fms(r
[5].q
, r
[4].q
, r
[0].q
);
1236 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1237 STORE( &r
[5], 0, CHAN_Z
);
1240 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1241 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
1245 case TGSI_OPCODE_ABS
:
1246 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1247 FETCH(&r
[0], 0, chan_index
);
1249 r
[0].q
= micro_abs(r
[0].q
);
1251 STORE(&r
[0], 0, chan_index
);
1255 case TGSI_OPCODE_RCC
:
1259 case TGSI_OPCODE_DPH
:
1260 FETCH(&r
[0], 0, CHAN_X
);
1261 FETCH(&r
[1], 1, CHAN_X
);
1263 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1265 FETCH(&r
[1], 0, CHAN_Y
);
1266 FETCH(&r
[2], 1, CHAN_Y
);
1268 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1270 FETCH(&r
[1], 0, CHAN_Z
);
1271 FETCH(&r
[2], 1, CHAN_Z
);
1273 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1275 FETCH(&r
[1], 1, CHAN_W
);
1277 r
[0].q
= si_fa(r
[0].q
, r
[1].q
);
1279 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1280 STORE( &r
[0], 0, chan_index
);
1284 case TGSI_OPCODE_COS
:
1285 FETCH(&r
[0], 0, CHAN_X
);
1287 r
[0].q
= micro_cos(r
[0].q
);
1289 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1290 STORE( &r
[0], 0, chan_index
);
1294 case TGSI_OPCODE_DDX
:
1295 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1296 FETCH( &r
[0], 0, chan_index
);
1297 r
[0].q
= micro_ddx(r
[0].q
);
1298 STORE( &r
[0], 0, chan_index
);
1302 case TGSI_OPCODE_DDY
:
1303 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1304 FETCH( &r
[0], 0, chan_index
);
1305 r
[0].q
= micro_ddy(r
[0].q
);
1306 STORE( &r
[0], 0, chan_index
);
1310 case TGSI_OPCODE_KILP
:
1311 exec_kilp (mach
, inst
);
1314 case TGSI_OPCODE_KIL
:
1315 exec_kil (mach
, inst
);
1318 case TGSI_OPCODE_PK2H
:
1322 case TGSI_OPCODE_PK2US
:
1326 case TGSI_OPCODE_PK4B
:
1330 case TGSI_OPCODE_PK4UB
:
1334 case TGSI_OPCODE_RFL
:
1338 case TGSI_OPCODE_SEQ
:
1339 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1340 FETCH( &r
[0], 0, chan_index
);
1341 FETCH( &r
[1], 1, chan_index
);
1343 r
[0].q
= si_fceq(r
[0].q
, r
[1].q
);
1345 STORE( &r
[0], 0, chan_index
);
1349 case TGSI_OPCODE_SFL
:
1353 case TGSI_OPCODE_SGT
:
1354 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1355 FETCH( &r
[0], 0, chan_index
);
1356 FETCH( &r
[1], 1, chan_index
);
1357 r
[0].q
= si_fcgt(r
[0].q
, r
[1].q
);
1358 STORE( &r
[0], 0, chan_index
);
1362 case TGSI_OPCODE_SIN
:
1363 FETCH( &r
[0], 0, CHAN_X
);
1364 r
[0].q
= micro_sin(r
[0].q
);
1365 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1366 STORE( &r
[0], 0, chan_index
);
1370 case TGSI_OPCODE_SLE
:
1371 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1372 FETCH( &r
[0], 0, chan_index
);
1373 FETCH( &r
[1], 1, chan_index
);
1375 r
[0].q
= si_fcgt(r
[0].q
, r
[1].q
);
1376 r
[0].q
= si_xori(r
[0].q
, 0xff);
1378 STORE( &r
[0], 0, chan_index
);
1382 case TGSI_OPCODE_SNE
:
1383 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1384 FETCH( &r
[0], 0, chan_index
);
1385 FETCH( &r
[1], 1, chan_index
);
1387 r
[0].q
= si_fceq(r
[0].q
, r
[1].q
);
1388 r
[0].q
= si_xori(r
[0].q
, 0xff);
1390 STORE( &r
[0], 0, chan_index
);
1394 case TGSI_OPCODE_STR
:
1398 case TGSI_OPCODE_TEX
:
1399 /* simple texture lookup */
1400 /* src[0] = texcoord */
1401 /* src[1] = sampler unit */
1402 exec_tex(mach
, inst
, FALSE
, FALSE
);
1405 case TGSI_OPCODE_TXB
:
1406 /* Texture lookup with lod bias */
1407 /* src[0] = texcoord (src[0].w = lod bias) */
1408 /* src[1] = sampler unit */
1409 exec_tex(mach
, inst
, TRUE
, FALSE
);
1412 case TGSI_OPCODE_TXD
:
1413 /* Texture lookup with explicit partial derivatives */
1414 /* src[0] = texcoord */
1415 /* src[1] = d[strq]/dx */
1416 /* src[2] = d[strq]/dy */
1417 /* src[3] = sampler unit */
1421 case TGSI_OPCODE_TXL
:
1422 /* Texture lookup with explicit LOD */
1423 /* src[0] = texcoord (src[0].w = lod) */
1424 /* src[1] = sampler unit */
1425 exec_tex(mach
, inst
, TRUE
, FALSE
);
1428 case TGSI_OPCODE_TXP
:
1429 /* Texture lookup with projection */
1430 /* src[0] = texcoord (src[0].w = projection) */
1431 /* src[1] = sampler unit */
1432 exec_tex(mach
, inst
, TRUE
, TRUE
);
1435 case TGSI_OPCODE_UP2H
:
1439 case TGSI_OPCODE_UP2US
:
1443 case TGSI_OPCODE_UP4B
:
1447 case TGSI_OPCODE_UP4UB
:
1451 case TGSI_OPCODE_X2D
:
1455 case TGSI_OPCODE_ARA
:
1459 case TGSI_OPCODE_ARR
:
1463 case TGSI_OPCODE_BRA
:
1467 case TGSI_OPCODE_CAL
:
1468 /* skip the call if no execution channels are enabled */
1469 if (mach
->ExecMask
) {
1472 /* push the Cond, Loop, Cont stacks */
1473 ASSERT(mach
->CondStackTop
< TGSI_EXEC_MAX_COND_NESTING
);
1474 mach
->CondStack
[mach
->CondStackTop
++] = mach
->CondMask
;
1475 ASSERT(mach
->LoopStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1476 mach
->LoopStack
[mach
->LoopStackTop
++] = mach
->LoopMask
;
1477 ASSERT(mach
->ContStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1478 mach
->ContStack
[mach
->ContStackTop
++] = mach
->ContMask
;
1480 ASSERT(mach
->FuncStackTop
< TGSI_EXEC_MAX_CALL_NESTING
);
1481 mach
->FuncStack
[mach
->FuncStackTop
++] = mach
->FuncMask
;
1483 /* note that PC was already incremented above */
1484 mach
->CallStack
[mach
->CallStackTop
++] = *pc
;
1485 *pc
= inst
->InstructionExtLabel
.Label
;
1489 case TGSI_OPCODE_RET
:
1490 mach
->FuncMask
&= ~mach
->ExecMask
;
1491 UPDATE_EXEC_MASK(mach
);
1493 if (mach
->ExecMask
== 0x0) {
1494 /* really return now (otherwise, keep executing) */
1496 if (mach
->CallStackTop
== 0) {
1497 /* returning from main() */
1501 *pc
= mach
->CallStack
[--mach
->CallStackTop
];
1503 /* pop the Cond, Loop, Cont stacks */
1504 ASSERT(mach
->CondStackTop
> 0);
1505 mach
->CondMask
= mach
->CondStack
[--mach
->CondStackTop
];
1506 ASSERT(mach
->LoopStackTop
> 0);
1507 mach
->LoopMask
= mach
->LoopStack
[--mach
->LoopStackTop
];
1508 ASSERT(mach
->ContStackTop
> 0);
1509 mach
->ContMask
= mach
->ContStack
[--mach
->ContStackTop
];
1510 ASSERT(mach
->FuncStackTop
> 0);
1511 mach
->FuncMask
= mach
->FuncStack
[--mach
->FuncStackTop
];
1513 UPDATE_EXEC_MASK(mach
);
1517 case TGSI_OPCODE_SSG
:
1521 case TGSI_OPCODE_CMP
:
1522 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1523 FETCH(&r
[0], 0, chan_index
);
1524 FETCH(&r
[1], 1, chan_index
);
1525 FETCH(&r
[2], 2, chan_index
);
1527 /* r0 = (r0 < 0.0) ? r1 : r2
1529 r
[3].q
= si_xor(r
[3].q
, r
[3].q
);
1530 r
[0].q
= micro_lt(r
[0].q
, r
[3].q
);
1531 r
[0].q
= si_selb(r
[1].q
, r
[2].q
, r
[0].q
);
1533 STORE(&r
[0], 0, chan_index
);
1537 case TGSI_OPCODE_SCS
:
1538 if( IS_CHANNEL_ENABLED( *inst
, CHAN_X
) || IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1539 FETCH( &r
[0], 0, CHAN_X
);
1541 if( IS_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1542 r
[1].q
= micro_cos(r
[0].q
);
1543 STORE( &r
[1], 0, CHAN_X
);
1545 if( IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1546 r
[1].q
= micro_sin(r
[0].q
);
1547 STORE( &r
[1], 0, CHAN_Y
);
1549 if( IS_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1550 STORE( &mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
], 0, CHAN_Z
);
1552 if( IS_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1553 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
1557 case TGSI_OPCODE_NRM
:
1561 case TGSI_OPCODE_DIV
:
1565 case TGSI_OPCODE_DP2
:
1566 FETCH( &r
[0], 0, CHAN_X
);
1567 FETCH( &r
[1], 1, CHAN_X
);
1568 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1570 FETCH( &r
[1], 0, CHAN_Y
);
1571 FETCH( &r
[2], 1, CHAN_Y
);
1572 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1574 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1575 STORE( &r
[0], 0, chan_index
);
1579 case TGSI_OPCODE_IF
:
1581 ASSERT(mach
->CondStackTop
< TGSI_EXEC_MAX_COND_NESTING
);
1582 mach
->CondStack
[mach
->CondStackTop
++] = mach
->CondMask
;
1583 FETCH( &r
[0], 0, CHAN_X
);
1584 /* update CondMask */
1586 mach
->CondMask
&= ~0x1;
1589 mach
->CondMask
&= ~0x2;
1592 mach
->CondMask
&= ~0x4;
1595 mach
->CondMask
&= ~0x8;
1597 UPDATE_EXEC_MASK(mach
);
1598 /* Todo: If CondMask==0, jump to ELSE */
1601 case TGSI_OPCODE_ELSE
:
1602 /* invert CondMask wrt previous mask */
1605 ASSERT(mach
->CondStackTop
> 0);
1606 prevMask
= mach
->CondStack
[mach
->CondStackTop
- 1];
1607 mach
->CondMask
= ~mach
->CondMask
& prevMask
;
1608 UPDATE_EXEC_MASK(mach
);
1609 /* Todo: If CondMask==0, jump to ENDIF */
1613 case TGSI_OPCODE_ENDIF
:
1615 ASSERT(mach
->CondStackTop
> 0);
1616 mach
->CondMask
= mach
->CondStack
[--mach
->CondStackTop
];
1617 UPDATE_EXEC_MASK(mach
);
1620 case TGSI_OPCODE_END
:
1621 /* halt execution */
1625 case TGSI_OPCODE_REP
:
1629 case TGSI_OPCODE_ENDREP
:
1633 case TGSI_OPCODE_PUSHA
:
1637 case TGSI_OPCODE_POPA
:
1641 case TGSI_OPCODE_CEIL
:
1642 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1643 FETCH( &r
[0], 0, chan_index
);
1644 r
[0].q
= micro_ceil(r
[0].q
);
1645 STORE( &r
[0], 0, chan_index
);
1649 case TGSI_OPCODE_I2F
:
1650 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1651 FETCH( &r
[0], 0, chan_index
);
1652 r
[0].q
= si_csflt(r
[0].q
, 0);
1653 STORE( &r
[0], 0, chan_index
);
1657 case TGSI_OPCODE_NOT
:
1658 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1659 FETCH( &r
[0], 0, chan_index
);
1660 r
[0].q
= si_xorbi(r
[0].q
, 0xff);
1661 STORE( &r
[0], 0, chan_index
);
1665 case TGSI_OPCODE_TRUNC
:
1666 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1667 FETCH( &r
[0], 0, chan_index
);
1668 r
[0].q
= micro_trunc(r
[0].q
);
1669 STORE( &r
[0], 0, chan_index
);
1673 case TGSI_OPCODE_SHL
:
1674 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1675 FETCH( &r
[0], 0, chan_index
);
1676 FETCH( &r
[1], 1, chan_index
);
1678 r
[0].q
= si_shl(r
[0].q
, r
[1].q
);
1680 STORE( &r
[0], 0, chan_index
);
1684 case TGSI_OPCODE_SHR
:
1685 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1686 FETCH( &r
[0], 0, chan_index
);
1687 FETCH( &r
[1], 1, chan_index
);
1688 r
[0].q
= micro_ishr(r
[0].q
, r
[1].q
);
1689 STORE( &r
[0], 0, chan_index
);
1693 case TGSI_OPCODE_AND
:
1694 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1695 FETCH( &r
[0], 0, chan_index
);
1696 FETCH( &r
[1], 1, chan_index
);
1697 r
[0].q
= si_and(r
[0].q
, r
[1].q
);
1698 STORE( &r
[0], 0, chan_index
);
1702 case TGSI_OPCODE_OR
:
1703 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1704 FETCH( &r
[0], 0, chan_index
);
1705 FETCH( &r
[1], 1, chan_index
);
1706 r
[0].q
= si_or(r
[0].q
, r
[1].q
);
1707 STORE( &r
[0], 0, chan_index
);
1711 case TGSI_OPCODE_MOD
:
1715 case TGSI_OPCODE_XOR
:
1716 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1717 FETCH( &r
[0], 0, chan_index
);
1718 FETCH( &r
[1], 1, chan_index
);
1719 r
[0].q
= si_xor(r
[0].q
, r
[1].q
);
1720 STORE( &r
[0], 0, chan_index
);
1724 case TGSI_OPCODE_SAD
:
1728 case TGSI_OPCODE_TXF
:
1732 case TGSI_OPCODE_TXQ
:
1736 case TGSI_OPCODE_EMIT
:
1737 mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0] += 16;
1738 mach
->Primitives
[mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]]++;
1741 case TGSI_OPCODE_ENDPRIM
:
1742 mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]++;
1743 mach
->Primitives
[mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]] = 0;
1746 case TGSI_OPCODE_BGNFOR
:
1747 /* fall-through (for now) */
1748 case TGSI_OPCODE_BGNLOOP
:
1749 /* push LoopMask and ContMasks */
1750 ASSERT(mach
->LoopStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1751 mach
->LoopStack
[mach
->LoopStackTop
++] = mach
->LoopMask
;
1752 ASSERT(mach
->ContStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1753 mach
->ContStack
[mach
->ContStackTop
++] = mach
->ContMask
;
1756 case TGSI_OPCODE_ENDFOR
:
1757 /* fall-through (for now at least) */
1758 case TGSI_OPCODE_ENDLOOP
:
1759 /* Restore ContMask, but don't pop */
1760 ASSERT(mach
->ContStackTop
> 0);
1761 mach
->ContMask
= mach
->ContStack
[mach
->ContStackTop
- 1];
1762 if (mach
->LoopMask
) {
1763 /* repeat loop: jump to instruction just past BGNLOOP */
1764 *pc
= inst
->InstructionExtLabel
.Label
+ 1;
1767 /* exit loop: pop LoopMask */
1768 ASSERT(mach
->LoopStackTop
> 0);
1769 mach
->LoopMask
= mach
->LoopStack
[--mach
->LoopStackTop
];
1771 ASSERT(mach
->ContStackTop
> 0);
1772 mach
->ContMask
= mach
->ContStack
[--mach
->ContStackTop
];
1774 UPDATE_EXEC_MASK(mach
);
1777 case TGSI_OPCODE_BRK
:
1778 /* turn off loop channels for each enabled exec channel */
1779 mach
->LoopMask
&= ~mach
->ExecMask
;
1780 /* Todo: if mach->LoopMask == 0, jump to end of loop */
1781 UPDATE_EXEC_MASK(mach
);
1784 case TGSI_OPCODE_CONT
:
1785 /* turn off cont channels for each enabled exec channel */
1786 mach
->ContMask
&= ~mach
->ExecMask
;
1787 /* Todo: if mach->LoopMask == 0, jump to end of loop */
1788 UPDATE_EXEC_MASK(mach
);
1791 case TGSI_OPCODE_BGNSUB
:
1795 case TGSI_OPCODE_ENDSUB
:
1799 case TGSI_OPCODE_NOP
:
1809 * Run TGSI interpreter.
1810 * \return bitmask of "alive" quad components
/*
 * spu_exec_machine_run: resets the per-run state held in mach (the five
 * execution masks, CondStackTop, and the kill-mask / output words kept in
 * Temps; for geometry processors also the primitive word and
 * Primitives[0]), passes the declarations to exec_declaration() when the
 * processor is a fragment processor, and then hands instructions fetched
 * with spu_dcache_fetch_unaligned() to exec_instruction().  The returned
 * value is the bitwise complement of the kill-mask word.
 *
 * NOTE(review): the original line numbers embedded in this listing have
 * gaps, so some source lines (for example the declarations of i, pc and d,
 * and the header of the instruction loop) are not shown here.  Consult the
 * complete file before changing any code in this function.
 */
1813 spu_exec_machine_run( struct spu_exec_machine
*mach
)
/* Every mask starts as 0xf, i.e. its low four bits set; the notes at the
 * top of the file describe execution on quads of four components. */
1818 mach
->CondMask
= 0xf;
1819 mach
->LoopMask
= 0xf;
1820 mach
->ContMask
= 0xf;
1821 mach
->FuncMask
= 0xf;
1822 mach
->ExecMask
= 0xf;
/* CondStackTop is forced to 0 first, so the ASSERT on it below cannot
 * fire; the other three stack tops are only checked, not reset. */
1824 mach
->CondStackTop
= 0; /* temporarily subvert this ASSERTion */
1825 ASSERT(mach
->CondStackTop
== 0);
1826 ASSERT(mach
->LoopStackTop
== 0);
1827 ASSERT(mach
->ContStackTop
== 0);
1828 ASSERT(mach
->CallStackTop
== 0);
/* Clear the kill-mask and output words stored in Temps. */
1830 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] = 0;
1831 mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0] = 0;
1833 if( mach
->Processor
== TGSI_PROCESSOR_GEOMETRY
) {
1834 mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0] = 0;
1835 mach
->Primitives
[0] = 0;
1839 /* execute declarations (interpolants) */
1840 if( mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
1841 for (i
= 0; i
< mach
->NumDeclarations
; i
++) {
1843 struct tgsi_full_declaration decl
;
1844 qword buffer
[ROUNDUP16(sizeof(struct tgsi_full_declaration
)) / 16];
/* NOTE(review): the fetch address is derived from pc, not from the loop
 * counter i.  Nothing visible in this loop modifies pc, so each pass
 * would appear to fetch the same declaration -- confirm whether
 * "mach->Declarations + i" was intended. */
1846 unsigned ea
= (unsigned) (mach
->Declarations
+ pc
);
1848 spu_dcache_fetch_unaligned(d
.buffer
, ea
, sizeof(d
.decl
));
1850 exec_declaration( mach
, &d
.decl
);
1854 /* execute instructions, until pc is set to -1 */
1857 struct tgsi_full_instruction inst
;
1858 qword buffer
[ROUNDUP16(sizeof(struct tgsi_full_instruction
)) / 16];
1860 unsigned ea
= (unsigned) (mach
->Instructions
+ pc
);
/* NOTE(review): below, i is accessed as i.buffer / i.inst, so it appears
 * to name the fetch object here rather than the loop counter used in the
 * declaration loop; its declaration is not shown in this listing.
 * exec_instruction() receives &pc and so can change which instruction is
 * fetched next. */
1862 spu_dcache_fetch_unaligned(i
.buffer
, ea
, sizeof(i
.inst
));
1863 exec_instruction( mach
, & i
.inst
, &pc
);
/* NOTE(review): ctx is not declared in any visible line of this function;
 * presumably the depth-scaling block below is compiled out -- confirm
 * against the complete file. */
1867 /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
1868 if (mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
1870 * Scale back depth component.
1872 for (i
= 0; i
< 4; i
++)
1873 mach
->Outputs
[0].xyzw
[2].f
[i
] *= ctx
->DrawBuffer
->_DepthMaxF
;
/* Return the complement of the kill-mask word; per the "\return" note on
 * this function, the result is the bitmask of "alive" quad components. */
1877 return ~mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0];