1 /**************************************************************************
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
29 * TGSI interpreter/executor.
31 * Flow control information:
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers. This is the ExecMask.
43 * The ExecMask is computed from four other masks (CondMask, LoopMask,
44 * ContMask and FuncMask), which are controlled by the flow control
45 * instructions (e.g. IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
53 #include <transpose_matrix4x4.h>
54 #include <simdmath/ceilf4.h>
55 #include <simdmath/cosf4.h>
56 #include <simdmath/divf4.h>
57 #include <simdmath/floorf4.h>
58 #include <simdmath/log2f4.h>
59 #include <simdmath/powf4.h>
60 #include <simdmath/sinf4.h>
61 #include <simdmath/sqrtf4.h>
62 #include <simdmath/truncf4.h>
64 #include "pipe/p_compiler.h"
65 #include "pipe/p_state.h"
66 #include "pipe/p_util.h"
67 #include "pipe/p_shader_tokens.h"
68 #include "tgsi/util/tgsi_parse.h"
69 #include "tgsi/util/tgsi_util.h"
72 #include "spu_vertex_shader.h"
73 #include "spu_dcache.h"
74 #include "cell/common.h"
/*
 * Position of each pixel inside a 2x2 quad.  These values are used below to
 * build the br/bl/tl shuffle patterns, and they match the slot order written
 * by linear_interpolation() (slot 1 = +dadx, slot 2 = +dady, slot 3 = both).
 */
76 #define TILE_TOP_LEFT 0
77 #define TILE_TOP_RIGHT 1
78 #define TILE_BOTTOM_LEFT 2
79 #define TILE_BOTTOM_RIGHT 3
/*
 * Short aliases for the TGSI_EXEC_TEMP_* (index, channel) pairs.
 * spu_exec_machine_init() fills the 0, FF, 7F, 80, 1, 2, 128 and M128 slots;
 * KILMASK is accumulated by the KIL/KILP handling and OUTPUT is read when
 * storing to TGSI_FILE_OUTPUT.
 */
82 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
84 #define TEMP_0_I TGSI_EXEC_TEMP_00000000_I
85 #define TEMP_0_C TGSI_EXEC_TEMP_00000000_C
86 #define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I
87 #define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C
88 #define TEMP_80_I TGSI_EXEC_TEMP_80000000_I
89 #define TEMP_80_C TGSI_EXEC_TEMP_80000000_C
90 #define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I
91 #define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C
92 #define TEMP_1_I TGSI_EXEC_TEMP_ONE_I
93 #define TEMP_1_C TGSI_EXEC_TEMP_ONE_C
94 #define TEMP_2_I TGSI_EXEC_TEMP_TWO_I
95 #define TEMP_2_C TGSI_EXEC_TEMP_TWO_C
96 #define TEMP_128_I TGSI_EXEC_TEMP_128_I
97 #define TEMP_128_C TGSI_EXEC_TEMP_128_C
98 #define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I
99 #define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C
100 #define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I
101 #define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C
102 #define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I
103 #define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C
104 #define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
105 #define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
106 #define TEMP_R0 TGSI_EXEC_TEMP_R0
/**
 * Channel iteration helpers.
 *
 * FOR_EACH_CHANNEL(CHAN) loops CHAN over the four channels 0..3.  CHAN must
 * be a modifiable lvalue; it is parenthesized so that expressions such as
 * *ptr are incremented correctly (an unparenthesized *ptr++ would advance
 * the pointer instead of the counter).
 *
 * IS_CHANNEL_ENABLED / IS_CHANNEL_ENABLED2 test bit CHAN of the write mask
 * of destination register 0 / 1 of instruction INST (passed by value, not
 * by pointer).
 *
 * FOR_EACH_ENABLED_CHANNEL / FOR_EACH_ENABLED_CHANNEL2 expand to a
 * "for (...) if (...)" header: follow them with a single statement or a
 * braced block, and never with an "else".
 */
#define FOR_EACH_CHANNEL(CHAN)\
   for ((CHAN) = 0; (CHAN) < 4; (CHAN)++)

#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))

#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   FOR_EACH_CHANNEL( CHAN )\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   FOR_EACH_CHANNEL( CHAN )\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
/**
 * Recompute the execution mask of MACH (a pointer to the machine state) as
 * the bitwise AND of its condition, loop, continue and function masks.
 *
 * MACH is evaluated several times, so it must be free of side effects.  It
 * is parenthesized so that arguments such as &machines[i] expand correctly.
 */
#define UPDATE_EXEC_MASK(MACH) \
   ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                       (MACH)->ContMask & (MACH)->FuncMask)
139 * Initialize machine state by expanding tokens to full instructions,
140 * allocating temporary storage, setting up constants, etc.
141 * After this, we can call spu_exec_machine_run() many times.
144 spu_exec_machine_init(struct spu_exec_machine
*mach
,
146 struct spu_sampler
*samplers
,
149 const qword zero
= si_il(0);
150 const qword not_zero
= si_il(~0);
153 mach
->Samplers
= samplers
;
154 mach
->Processor
= processor
;
155 mach
->Addrs
= &mach
->Temps
[TGSI_EXEC_NUM_TEMPS
];
157 /* Setup constants. */
158 mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
= zero
;
159 mach
->Temps
[TEMP_FF_I
].xyzw
[TEMP_FF_C
].q
= not_zero
;
160 mach
->Temps
[TEMP_7F_I
].xyzw
[TEMP_7F_C
].q
= si_shli(not_zero
, -1);
161 mach
->Temps
[TEMP_80_I
].xyzw
[TEMP_80_C
].q
= si_shli(not_zero
, 31);
163 mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
= (qword
) spu_splats(1.0f
);
164 mach
->Temps
[TEMP_2_I
].xyzw
[TEMP_2_C
].q
= (qword
) spu_splats(2.0f
);
165 mach
->Temps
[TEMP_128_I
].xyzw
[TEMP_128_C
].q
= (qword
) spu_splats(128.0f
);
166 mach
->Temps
[TEMP_M128_I
].xyzw
[TEMP_M128_C
].q
= (qword
) spu_splats(-128.0f
);
173 return si_rotmi(si_shli(src
, 1), -1);
177 micro_ceil(qword src
)
179 return (qword
) _ceilf4((vec_float4
) src
);
185 return (qword
) _cosf4((vec_float4
) src
);
188 static const qword br_shuf
= {
189 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
190 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
191 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
192 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
193 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
194 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
195 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
196 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
199 static const qword bl_shuf
= {
200 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
201 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
202 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
203 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
204 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
205 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
206 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
207 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
210 static const qword tl_shuf
= {
211 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
212 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
213 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
214 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
215 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
216 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
217 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
218 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
224 qword bottom_right
= si_shufb(src
, src
, br_shuf
);
225 qword bottom_left
= si_shufb(src
, src
, bl_shuf
);
227 return si_fs(bottom_right
, bottom_left
);
233 qword top_left
= si_shufb(src
, src
, tl_shuf
);
234 qword bottom_left
= si_shufb(src
, src
, bl_shuf
);
236 return si_fs(top_left
, bottom_left
);
240 micro_div(qword src0
, qword src1
)
242 return (qword
) _divf4((vec_float4
) src0
, (vec_float4
) src1
);
248 return (qword
) _floorf4((vec_float4
) src
);
254 return si_fs(src
, (qword
) _floorf4((vec_float4
) src
));
258 micro_ge(qword src0
, qword src1
)
260 return si_or(si_fceq(src0
, src1
), si_fcgt(src0
, src1
));
266 return (qword
) _log2f4((vec_float4
) src
);
270 micro_lt(qword src0
, qword src1
)
272 const qword tmp
= si_or(si_fceq(src0
, src1
), si_fcgt(src0
, src1
));
274 return si_xori(tmp
, 0xff);
278 micro_max(qword src0
, qword src1
)
280 return si_selb(src1
, src0
, si_fcgt(src0
, src1
));
284 micro_min(qword src0
, qword src1
)
286 return si_selb(src0
, src1
, si_fcgt(src0
, src1
));
292 return si_xor(src
, (qword
) spu_splats(0x80000000));
296 micro_set_sign(qword src
)
298 return si_or(src
, (qword
) spu_splats(0x80000000));
302 micro_pow(qword src0
, qword src1
)
304 return (qword
) _powf4((vec_float4
) src0
, (vec_float4
) src1
);
310 const qword half
= (qword
) spu_splats(0.5f
);
312 /* May be able to use _roundf4. There may be some difference, though.
314 return (qword
) _floorf4((vec_float4
) si_fa(src
, half
));
318 micro_ishr(qword src0
, qword src1
)
320 return si_rotma(src0
, si_sfi(src1
, 0));
324 micro_trunc(qword src
)
326 return (qword
) _truncf4((vec_float4
) src
);
332 return (qword
) _sinf4((vec_float4
) src
);
336 micro_sqrt(qword src
)
338 return (qword
) _sqrtf4((vec_float4
) src
);
342 fetch_src_file_channel(
343 const struct spu_exec_machine
*mach
,
346 const union spu_exec_channel
*index
,
347 union spu_exec_channel
*chan
)
350 case TGSI_EXTSWIZZLE_X
:
351 case TGSI_EXTSWIZZLE_Y
:
352 case TGSI_EXTSWIZZLE_Z
:
353 case TGSI_EXTSWIZZLE_W
:
355 case TGSI_FILE_CONSTANT
: {
358 for (i
= 0; i
< 4; i
++) {
359 const float *ptr
= mach
->Consts
[index
->i
[i
]];
362 spu_dcache_fetch_unaligned((qword
*) tmp
,
363 (uintptr_t)(ptr
+ swizzle
),
371 case TGSI_FILE_INPUT
:
372 chan
->u
[0] = mach
->Inputs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
373 chan
->u
[1] = mach
->Inputs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
374 chan
->u
[2] = mach
->Inputs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
375 chan
->u
[3] = mach
->Inputs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
378 case TGSI_FILE_TEMPORARY
:
379 chan
->u
[0] = mach
->Temps
[index
->i
[0]].xyzw
[swizzle
].u
[0];
380 chan
->u
[1] = mach
->Temps
[index
->i
[1]].xyzw
[swizzle
].u
[1];
381 chan
->u
[2] = mach
->Temps
[index
->i
[2]].xyzw
[swizzle
].u
[2];
382 chan
->u
[3] = mach
->Temps
[index
->i
[3]].xyzw
[swizzle
].u
[3];
385 case TGSI_FILE_IMMEDIATE
:
386 assert( index
->i
[0] < (int) mach
->ImmLimit
);
387 assert( index
->i
[1] < (int) mach
->ImmLimit
);
388 assert( index
->i
[2] < (int) mach
->ImmLimit
);
389 assert( index
->i
[3] < (int) mach
->ImmLimit
);
391 chan
->f
[0] = mach
->Imms
[index
->i
[0]][swizzle
];
392 chan
->f
[1] = mach
->Imms
[index
->i
[1]][swizzle
];
393 chan
->f
[2] = mach
->Imms
[index
->i
[2]][swizzle
];
394 chan
->f
[3] = mach
->Imms
[index
->i
[3]][swizzle
];
397 case TGSI_FILE_ADDRESS
:
398 chan
->u
[0] = mach
->Addrs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
399 chan
->u
[1] = mach
->Addrs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
400 chan
->u
[2] = mach
->Addrs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
401 chan
->u
[3] = mach
->Addrs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
404 case TGSI_FILE_OUTPUT
:
405 /* vertex/fragment output vars can be read too */
406 chan
->u
[0] = mach
->Outputs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
407 chan
->u
[1] = mach
->Outputs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
408 chan
->u
[2] = mach
->Outputs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
409 chan
->u
[3] = mach
->Outputs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
417 case TGSI_EXTSWIZZLE_ZERO
:
418 *chan
= mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
];
421 case TGSI_EXTSWIZZLE_ONE
:
422 *chan
= mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
];
432 const struct spu_exec_machine
*mach
,
433 union spu_exec_channel
*chan
,
434 const struct tgsi_full_src_register
*reg
,
435 const uint chan_index
)
437 union spu_exec_channel index
;
443 index
.i
[3] = reg
->SrcRegister
.Index
;
445 if (reg
->SrcRegister
.Indirect
) {
446 union spu_exec_channel index2
;
447 union spu_exec_channel indir_index
;
452 index2
.i
[3] = reg
->SrcRegisterInd
.Index
;
454 swizzle
= tgsi_util_get_src_register_swizzle(®
->SrcRegisterInd
,
456 fetch_src_file_channel(
458 reg
->SrcRegisterInd
.File
,
463 index
.q
= si_a(index
.q
, indir_index
.q
);
466 if( reg
->SrcRegister
.Dimension
) {
467 switch( reg
->SrcRegister
.File
) {
468 case TGSI_FILE_INPUT
:
469 index
.q
= si_mpyi(index
.q
, 17);
471 case TGSI_FILE_CONSTANT
:
472 index
.q
= si_shli(index
.q
, 12);
478 index
.i
[0] += reg
->SrcRegisterDim
.Index
;
479 index
.i
[1] += reg
->SrcRegisterDim
.Index
;
480 index
.i
[2] += reg
->SrcRegisterDim
.Index
;
481 index
.i
[3] += reg
->SrcRegisterDim
.Index
;
483 if (reg
->SrcRegisterDim
.Indirect
) {
484 union spu_exec_channel index2
;
485 union spu_exec_channel indir_index
;
490 index2
.i
[3] = reg
->SrcRegisterDimInd
.Index
;
492 swizzle
= tgsi_util_get_src_register_swizzle( ®
->SrcRegisterDimInd
, CHAN_X
);
493 fetch_src_file_channel(
495 reg
->SrcRegisterDimInd
.File
,
500 index
.q
= si_a(index
.q
, indir_index
.q
);
504 swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
505 fetch_src_file_channel(
507 reg
->SrcRegister
.File
,
512 switch (tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
)) {
513 case TGSI_UTIL_SIGN_CLEAR
:
514 chan
->q
= micro_abs(chan
->q
);
517 case TGSI_UTIL_SIGN_SET
:
518 chan
->q
= micro_set_sign(chan
->q
);
521 case TGSI_UTIL_SIGN_TOGGLE
:
522 chan
->q
= micro_neg(chan
->q
);
525 case TGSI_UTIL_SIGN_KEEP
:
529 if (reg
->SrcRegisterExtMod
.Complement
) {
530 chan
->q
= si_fs(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, chan
->q
);
536 struct spu_exec_machine
*mach
,
537 const union spu_exec_channel
*chan
,
538 const struct tgsi_full_dst_register
*reg
,
539 const struct tgsi_full_instruction
*inst
,
542 union spu_exec_channel
*dst
;
544 switch( reg
->DstRegister
.File
) {
548 case TGSI_FILE_OUTPUT
:
549 dst
= &mach
->Outputs
[mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0]
550 + reg
->DstRegister
.Index
].xyzw
[chan_index
];
553 case TGSI_FILE_TEMPORARY
:
554 dst
= &mach
->Temps
[reg
->DstRegister
.Index
].xyzw
[chan_index
];
557 case TGSI_FILE_ADDRESS
:
558 dst
= &mach
->Addrs
[reg
->DstRegister
.Index
].xyzw
[chan_index
];
566 switch (inst
->Instruction
.Saturate
)
569 if (mach
->ExecMask
& 0x1)
570 dst
->i
[0] = chan
->i
[0];
571 if (mach
->ExecMask
& 0x2)
572 dst
->i
[1] = chan
->i
[1];
573 if (mach
->ExecMask
& 0x4)
574 dst
->i
[2] = chan
->i
[2];
575 if (mach
->ExecMask
& 0x8)
576 dst
->i
[3] = chan
->i
[3];
579 case TGSI_SAT_ZERO_ONE
:
580 /* XXX need to obey ExecMask here */
581 dst
->q
= micro_max(chan
->q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
582 dst
->q
= micro_min(dst
->q
, mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
);
585 case TGSI_SAT_MINUS_PLUS_ONE
:
594 #define FETCH(VAL,INDEX,CHAN)\
595 fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
597 #define STORE(VAL,INDEX,CHAN)\
598 store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
/*
 * exec_kilp: for each of the four source channels of operand 0, fetch the
 * channel and compare its four per-pixel floats against 0.0f.  The resulting
 * kilmask (one bit per pixel) is OR-ed into slot u[0] of the KILMASK
 * temporary at the end.  A uniquemask, pre-seeded with the ZERO and ONE
 * extended swizzles, tracks which swizzled components were already seen.
 * NOTE(review): the statements that skip already-tested components and that
 * set bits in kilmask are not visible in this listing -- confirm against the
 * full source.
 */
602 * Execute ARB-style KIL which is predicated by a src register.
603 * Kill fragment if any of the four values is less than zero.
606 exec_kilp(struct spu_exec_machine
*mach
,
607 const struct tgsi_full_instruction
*inst
)
611 uint kilmask
= 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
612 union spu_exec_channel r
[1];
614 /* This mask stores component bits that were already tested. Note that
615 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
617 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
619 for (chan_index
= 0; chan_index
< 4; chan_index
++)
624 /* unswizzle channel */
625 swizzle
= tgsi_util_get_full_src_register_extswizzle (
626 &inst
->FullSrcRegisters
[0],
629 /* check if the component has not been already tested */
630 if (uniquemask
& (1 << swizzle
))
632 uniquemask
|= 1 << swizzle
;
/* Fetch the channel for all four pixels, then test each pixel's value. */
634 FETCH(&r
[0], 0, chan_index
);
635 for (i
= 0; i
< 4; i
++)
636 if (r
[0].f
[i
] < 0.0f
)
/* Accumulate into the machine-wide kill mask; earlier kills are kept. */
640 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] |= kilmask
;
645 * Fetch a texel using STR texture coordinates.
648 fetch_texel( struct spu_sampler
*sampler
,
649 const union spu_exec_channel
*s
,
650 const union spu_exec_channel
*t
,
651 const union spu_exec_channel
*p
,
652 float lodbias
, /* XXX should be float[4] */
653 union spu_exec_channel
*r
,
654 union spu_exec_channel
*g
,
655 union spu_exec_channel
*b
,
656 union spu_exec_channel
*a
)
661 sampler
->get_samples(sampler
, s
->f
, t
->f
, p
->f
, lodbias
,
662 (float (*)[4]) rgba
);
664 _transpose_matrix4x4((vec_float4
*) out
, (vec_float4
*) rgba
);
673 exec_tex(struct spu_exec_machine
*mach
,
674 const struct tgsi_full_instruction
*inst
,
675 boolean biasLod
, boolean projected
)
677 const uint unit
= inst
->FullSrcRegisters
[1].SrcRegister
.Index
;
678 union spu_exec_channel r
[8];
682 /* printf("Sampler %u unit %u\n", sampler, unit); */
684 switch (inst
->InstructionExtTexture
.Texture
) {
685 case TGSI_TEXTURE_1D
:
687 FETCH(&r
[0], 0, CHAN_X
);
690 FETCH(&r
[1], 0, CHAN_W
);
691 r
[0].q
= micro_div(r
[0].q
, r
[1].q
);
695 FETCH(&r
[1], 0, CHAN_W
);
701 fetch_texel(&mach
->Samplers
[unit
],
702 &r
[0], NULL
, NULL
, lodBias
, /* S, T, P, BIAS */
703 &r
[0], &r
[1], &r
[2], &r
[3]); /* R, G, B, A */
706 case TGSI_TEXTURE_2D
:
707 case TGSI_TEXTURE_RECT
:
709 FETCH(&r
[0], 0, CHAN_X
);
710 FETCH(&r
[1], 0, CHAN_Y
);
711 FETCH(&r
[2], 0, CHAN_Z
);
714 FETCH(&r
[3], 0, CHAN_W
);
715 r
[0].q
= micro_div(r
[0].q
, r
[3].q
);
716 r
[1].q
= micro_div(r
[1].q
, r
[3].q
);
717 r
[2].q
= micro_div(r
[2].q
, r
[3].q
);
721 FETCH(&r
[3], 0, CHAN_W
);
727 fetch_texel(&mach
->Samplers
[unit
],
728 &r
[0], &r
[1], &r
[2], lodBias
, /* inputs */
729 &r
[0], &r
[1], &r
[2], &r
[3]); /* outputs */
732 case TGSI_TEXTURE_3D
:
733 case TGSI_TEXTURE_CUBE
:
735 FETCH(&r
[0], 0, CHAN_X
);
736 FETCH(&r
[1], 0, CHAN_Y
);
737 FETCH(&r
[2], 0, CHAN_Z
);
740 FETCH(&r
[3], 0, CHAN_W
);
741 r
[0].q
= micro_div(r
[0].q
, r
[3].q
);
742 r
[1].q
= micro_div(r
[1].q
, r
[3].q
);
743 r
[2].q
= micro_div(r
[2].q
, r
[3].q
);
747 FETCH(&r
[3], 0, CHAN_W
);
753 fetch_texel(&mach
->Samplers
[unit
],
754 &r
[0], &r
[1], &r
[2], lodBias
,
755 &r
[0], &r
[1], &r
[2], &r
[3]);
762 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
763 STORE( &r
[chan_index
], 0, chan_index
);
/*
 * constant_interpolation: write the same value, InterpCoefs[attrib].a0[chan],
 * into all QUAD_SIZE slots of Inputs[attrib].xyzw[chan].
 * NOTE(review): the declarations of attrib, chan and i are not visible in
 * this listing.
 */
770 constant_interpolation(
771 struct spu_exec_machine
*mach
,
777 for( i
= 0; i
< QUAD_SIZE
; i
++ ) {
778 mach
->Inputs
[attrib
].xyzw
[chan
].f
[i
] = mach
->InterpCoefs
[attrib
].a0
[chan
];
/*
 * linear_interpolation: evaluate the plane equation
 *    a0 + dadx * x + dady * y
 * for one attribute channel, where x and y are taken from slot 0 of
 * QuadPos.xyzw[0] and QuadPos.xyzw[1], and store the value for each of the
 * four quad slots into Inputs[attrib].xyzw[chan].
 * NOTE(review): the declarations of attrib and chan are not visible in this
 * listing.
 */
783 linear_interpolation(
784 struct spu_exec_machine
*mach
,
788 const float x
= mach
->QuadPos
.xyzw
[0].f
[0];
789 const float y
= mach
->QuadPos
.xyzw
[1].f
[0];
790 const float dadx
= mach
->InterpCoefs
[attrib
].dadx
[chan
];
791 const float dady
= mach
->InterpCoefs
[attrib
].dady
[chan
];
/* Value at the quad's reference position (slot 0). */
792 const float a0
= mach
->InterpCoefs
[attrib
].a0
[chan
] + dadx
* x
+ dady
* y
;
/* Slot 0 = base, slot 1 = +dadx, slot 2 = +dady, slot 3 = +dadx +dady. */
793 mach
->Inputs
[attrib
].xyzw
[chan
].f
[0] = a0
;
794 mach
->Inputs
[attrib
].xyzw
[chan
].f
[1] = a0
+ dadx
;
795 mach
->Inputs
[attrib
].xyzw
[chan
].f
[2] = a0
+ dady
;
796 mach
->Inputs
[attrib
].xyzw
[chan
].f
[3] = a0
+ dadx
+ dady
;
/*
 * perspective_interpolation: same plane-equation evaluation as the linear
 * case (a0 + dadx * x + dady * y with the per-slot dadx/dady offsets), but
 * each slot's value is divided by that slot's entry of QuadPos.xyzw[3]
 * before being stored into Inputs[attrib].xyzw[chan].
 * NOTE(review): no guard against a zero divisor is visible here; the
 * declarations of attrib and chan are not visible in this listing either.
 */
800 perspective_interpolation(
801 struct spu_exec_machine
*mach
,
805 const float x
= mach
->QuadPos
.xyzw
[0].f
[0];
806 const float y
= mach
->QuadPos
.xyzw
[1].f
[0];
807 const float dadx
= mach
->InterpCoefs
[attrib
].dadx
[chan
];
808 const float dady
= mach
->InterpCoefs
[attrib
].dady
[chan
];
/* Value at the quad's reference position (slot 0), before the divide. */
809 const float a0
= mach
->InterpCoefs
[attrib
].a0
[chan
] + dadx
* x
+ dady
* y
;
/* Per-slot divisors, one float for each of the four quad slots. */
810 const float *w
= mach
->QuadPos
.xyzw
[3].f
;
811 /* divide by W here */
812 mach
->Inputs
[attrib
].xyzw
[chan
].f
[0] = a0
/ w
[0];
813 mach
->Inputs
[attrib
].xyzw
[chan
].f
[1] = (a0
+ dadx
) / w
[1];
814 mach
->Inputs
[attrib
].xyzw
[chan
].f
[2] = (a0
+ dady
) / w
[2];
815 mach
->Inputs
[attrib
].xyzw
[chan
].f
[3] = (a0
+ dadx
+ dady
) / w
[3];
819 typedef void (* interpolation_func
)(
820 struct spu_exec_machine
*mach
,
825 exec_declaration(struct spu_exec_machine
*mach
,
826 const struct tgsi_full_declaration
*decl
)
828 if( mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
829 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
830 unsigned first
, last
, mask
;
831 interpolation_func interp
;
833 assert( decl
->Declaration
.Declare
== TGSI_DECLARE_RANGE
);
835 first
= decl
->u
.DeclarationRange
.First
;
836 last
= decl
->u
.DeclarationRange
.Last
;
837 mask
= decl
->Declaration
.UsageMask
;
839 switch( decl
->Interpolation
.Interpolate
) {
840 case TGSI_INTERPOLATE_CONSTANT
:
841 interp
= constant_interpolation
;
844 case TGSI_INTERPOLATE_LINEAR
:
845 interp
= linear_interpolation
;
848 case TGSI_INTERPOLATE_PERSPECTIVE
:
849 interp
= perspective_interpolation
;
856 if( mask
== TGSI_WRITEMASK_XYZW
) {
859 for( i
= first
; i
<= last
; i
++ ) {
860 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
861 interp( mach
, i
, j
);
868 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
869 if( mask
& (1 << j
) ) {
870 for( i
= first
; i
<= last
; i
++ ) {
871 interp( mach
, i
, j
);
882 struct spu_exec_machine
*mach
,
883 const struct tgsi_full_instruction
*inst
,
887 union spu_exec_channel r
[8];
891 switch (inst
->Instruction
.Opcode
) {
892 case TGSI_OPCODE_ARL
:
893 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
894 FETCH( &r
[0], 0, chan_index
);
895 r
[0].q
= si_cflts(r
[0].q
, 0);
896 STORE( &r
[0], 0, chan_index
);
900 case TGSI_OPCODE_MOV
:
901 /* TGSI_OPCODE_SWZ */
902 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
903 FETCH( &r
[0], 0, chan_index
);
904 STORE( &r
[0], 0, chan_index
);
908 case TGSI_OPCODE_LIT
:
909 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
910 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_X
);
913 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) || IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
914 FETCH( &r
[0], 0, CHAN_X
);
915 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
916 r
[0].q
= micro_max(r
[0].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
917 STORE( &r
[0], 0, CHAN_Y
);
920 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
921 FETCH( &r
[1], 0, CHAN_Y
);
922 r
[1].q
= micro_max(r
[1].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
924 FETCH( &r
[2], 0, CHAN_W
);
925 r
[2].q
= micro_min(r
[2].q
, mach
->Temps
[TEMP_128_I
].xyzw
[TEMP_128_C
].q
);
926 r
[2].q
= micro_max(r
[2].q
, mach
->Temps
[TEMP_M128_I
].xyzw
[TEMP_M128_C
].q
);
927 r
[1].q
= micro_pow(r
[1].q
, r
[2].q
);
929 /* r0 = (r0 > 0.0) ? r1 : 0.0
931 r
[0].q
= si_fcgt(r
[0].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
932 r
[0].q
= si_selb(mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
, r
[1].q
,
934 STORE( &r
[0], 0, CHAN_Z
);
938 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
939 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
943 case TGSI_OPCODE_RCP
:
944 /* TGSI_OPCODE_RECIP */
945 FETCH( &r
[0], 0, CHAN_X
);
946 r
[0].q
= micro_div(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, r
[0].q
);
947 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
948 STORE( &r
[0], 0, chan_index
);
952 case TGSI_OPCODE_RSQ
:
953 /* TGSI_OPCODE_RECIPSQRT */
954 FETCH( &r
[0], 0, CHAN_X
);
955 r
[0].q
= micro_sqrt(r
[0].q
);
956 r
[0].q
= micro_div(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, r
[0].q
);
957 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
958 STORE( &r
[0], 0, chan_index
);
962 case TGSI_OPCODE_EXP
:
966 case TGSI_OPCODE_LOG
:
970 case TGSI_OPCODE_MUL
:
971 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
)
973 FETCH(&r
[0], 0, chan_index
);
974 FETCH(&r
[1], 1, chan_index
);
976 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
978 STORE(&r
[0], 0, chan_index
);
982 case TGSI_OPCODE_ADD
:
983 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
984 FETCH( &r
[0], 0, chan_index
);
985 FETCH( &r
[1], 1, chan_index
);
986 r
[0].q
= si_fa(r
[0].q
, r
[1].q
);
987 STORE( &r
[0], 0, chan_index
);
991 case TGSI_OPCODE_DP3
:
992 /* TGSI_OPCODE_DOT3 */
993 FETCH( &r
[0], 0, CHAN_X
);
994 FETCH( &r
[1], 1, CHAN_X
);
995 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
997 FETCH( &r
[1], 0, CHAN_Y
);
998 FETCH( &r
[2], 1, CHAN_Y
);
999 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1002 FETCH( &r
[1], 0, CHAN_Z
);
1003 FETCH( &r
[2], 1, CHAN_Z
);
1004 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1006 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1007 STORE( &r
[0], 0, chan_index
);
1011 case TGSI_OPCODE_DP4
:
1012 /* TGSI_OPCODE_DOT4 */
1013 FETCH(&r
[0], 0, CHAN_X
);
1014 FETCH(&r
[1], 1, CHAN_X
);
1016 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1018 FETCH(&r
[1], 0, CHAN_Y
);
1019 FETCH(&r
[2], 1, CHAN_Y
);
1021 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1023 FETCH(&r
[1], 0, CHAN_Z
);
1024 FETCH(&r
[2], 1, CHAN_Z
);
1026 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1028 FETCH(&r
[1], 0, CHAN_W
);
1029 FETCH(&r
[2], 1, CHAN_W
);
1031 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1033 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1034 STORE( &r
[0], 0, chan_index
);
1038 case TGSI_OPCODE_DST
:
1039 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1040 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_X
);
1043 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1044 FETCH( &r
[0], 0, CHAN_Y
);
1045 FETCH( &r
[1], 1, CHAN_Y
);
1046 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1047 STORE( &r
[0], 0, CHAN_Y
);
1050 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1051 FETCH( &r
[0], 0, CHAN_Z
);
1052 STORE( &r
[0], 0, CHAN_Z
);
1055 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1056 FETCH( &r
[0], 1, CHAN_W
);
1057 STORE( &r
[0], 0, CHAN_W
);
1061 case TGSI_OPCODE_MIN
:
1062 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1063 FETCH(&r
[0], 0, chan_index
);
1064 FETCH(&r
[1], 1, chan_index
);
1066 r
[0].q
= micro_min(r
[0].q
, r
[1].q
);
1068 STORE(&r
[0], 0, chan_index
);
1072 case TGSI_OPCODE_MAX
:
1073 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1074 FETCH(&r
[0], 0, chan_index
);
1075 FETCH(&r
[1], 1, chan_index
);
1077 r
[0].q
= micro_max(r
[0].q
, r
[1].q
);
1079 STORE(&r
[0], 0, chan_index
);
1083 case TGSI_OPCODE_SLT
:
1084 /* TGSI_OPCODE_SETLT */
1085 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1086 FETCH( &r
[0], 0, chan_index
);
1087 FETCH( &r
[1], 1, chan_index
);
1089 r
[0].q
= micro_ge(r
[0].q
, r
[1].q
);
1090 r
[0].q
= si_xori(r
[0].q
, 0xff);
1092 STORE( &r
[0], 0, chan_index
);
1096 case TGSI_OPCODE_SGE
:
1097 /* TGSI_OPCODE_SETGE */
1098 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1099 FETCH( &r
[0], 0, chan_index
);
1100 FETCH( &r
[1], 1, chan_index
);
1101 r
[0].q
= micro_ge(r
[0].q
, r
[1].q
);
1102 STORE( &r
[0], 0, chan_index
);
1106 case TGSI_OPCODE_MAD
:
1107 /* TGSI_OPCODE_MADD */
1108 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1109 FETCH( &r
[0], 0, chan_index
);
1110 FETCH( &r
[1], 1, chan_index
);
1111 FETCH( &r
[2], 2, chan_index
);
1112 r
[0].q
= si_fma(r
[0].q
, r
[1].q
, r
[2].q
);
1113 STORE( &r
[0], 0, chan_index
);
1117 case TGSI_OPCODE_SUB
:
1118 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1119 FETCH(&r
[0], 0, chan_index
);
1120 FETCH(&r
[1], 1, chan_index
);
1122 r
[0].q
= si_fs(r
[0].q
, r
[1].q
);
1124 STORE(&r
[0], 0, chan_index
);
1128 case TGSI_OPCODE_LERP
:
1129 /* TGSI_OPCODE_LRP */
1130 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1131 FETCH(&r
[0], 0, chan_index
);
1132 FETCH(&r
[1], 1, chan_index
);
1133 FETCH(&r
[2], 2, chan_index
);
1135 r
[1].q
= si_fs(r
[1].q
, r
[2].q
);
1136 r
[0].q
= si_fma(r
[0].q
, r
[1].q
, r
[2].q
);
1138 STORE(&r
[0], 0, chan_index
);
1142 case TGSI_OPCODE_CND
:
1146 case TGSI_OPCODE_CND0
:
1150 case TGSI_OPCODE_DOT2ADD
:
1151 /* TGSI_OPCODE_DP2A */
1155 case TGSI_OPCODE_INDEX
:
1159 case TGSI_OPCODE_NEGATE
:
1163 case TGSI_OPCODE_FRAC
:
1164 /* TGSI_OPCODE_FRC */
1165 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1166 FETCH( &r
[0], 0, chan_index
);
1167 r
[0].q
= micro_frc(r
[0].q
);
1168 STORE( &r
[0], 0, chan_index
);
1172 case TGSI_OPCODE_CLAMP
:
1176 case TGSI_OPCODE_FLOOR
:
1177 /* TGSI_OPCODE_FLR */
1178 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1179 FETCH( &r
[0], 0, chan_index
);
1180 r
[0].q
= micro_flr(r
[0].q
);
1181 STORE( &r
[0], 0, chan_index
);
1185 case TGSI_OPCODE_ROUND
:
1186 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1187 FETCH( &r
[0], 0, chan_index
);
1188 r
[0].q
= micro_rnd(r
[0].q
);
1189 STORE( &r
[0], 0, chan_index
);
1193 case TGSI_OPCODE_EXPBASE2
:
1194 /* TGSI_OPCODE_EX2 */
1195 FETCH(&r
[0], 0, CHAN_X
);
1197 r
[0].q
= micro_pow(mach
->Temps
[TEMP_2_I
].xyzw
[TEMP_2_C
].q
, r
[0].q
);
1199 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1200 STORE( &r
[0], 0, chan_index
);
1204 case TGSI_OPCODE_LOGBASE2
:
1205 /* TGSI_OPCODE_LG2 */
1206 FETCH( &r
[0], 0, CHAN_X
);
1207 r
[0].q
= micro_lg2(r
[0].q
);
1208 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1209 STORE( &r
[0], 0, chan_index
);
1213 case TGSI_OPCODE_POWER
:
1214 /* TGSI_OPCODE_POW */
1215 FETCH(&r
[0], 0, CHAN_X
);
1216 FETCH(&r
[1], 1, CHAN_X
);
1218 r
[0].q
= micro_pow(r
[0].q
, r
[1].q
);
1220 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1221 STORE( &r
[0], 0, chan_index
);
1225 case TGSI_OPCODE_CROSSPRODUCT
:
1226 /* TGSI_OPCODE_XPD */
1227 FETCH(&r
[0], 0, CHAN_Y
);
1228 FETCH(&r
[1], 1, CHAN_Z
);
1229 FETCH(&r
[3], 0, CHAN_Z
);
1230 FETCH(&r
[4], 1, CHAN_Y
);
1232 /* r2 = (r0 * r1) - (r3 * r5)
1234 r
[2].q
= si_fm(r
[3].q
, r
[5].q
);
1235 r
[2].q
= si_fms(r
[0].q
, r
[1].q
, r
[2].q
);
1237 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1238 STORE( &r
[2], 0, CHAN_X
);
1241 FETCH(&r
[2], 1, CHAN_X
);
1242 FETCH(&r
[5], 0, CHAN_X
);
1244 /* r3 = (r3 * r2) - (r1 * r5)
1246 r
[1].q
= si_fm(r
[1].q
, r
[5].q
);
1247 r
[3].q
= si_fms(r
[3].q
, r
[2].q
, r
[1].q
);
1249 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1250 STORE( &r
[3], 0, CHAN_Y
);
1253 /* r5 = (r5 * r4) - (r0 * r2)
1255 r
[0].q
= si_fm(r
[0].q
, r
[2].q
);
1256 r
[5].q
= si_fms(r
[5].q
, r
[4].q
, r
[0].q
);
1258 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1259 STORE( &r
[5], 0, CHAN_Z
);
1262 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1263 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
1267 case TGSI_OPCODE_MULTIPLYMATRIX
:
1271 case TGSI_OPCODE_ABS
:
1272 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1273 FETCH(&r
[0], 0, chan_index
);
1275 r
[0].q
= micro_abs(r
[0].q
);
1277 STORE(&r
[0], 0, chan_index
);
1281 case TGSI_OPCODE_RCC
:
1285 case TGSI_OPCODE_DPH
:
1286 FETCH(&r
[0], 0, CHAN_X
);
1287 FETCH(&r
[1], 1, CHAN_X
);
1289 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1291 FETCH(&r
[1], 0, CHAN_Y
);
1292 FETCH(&r
[2], 1, CHAN_Y
);
1294 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1296 FETCH(&r
[1], 0, CHAN_Z
);
1297 FETCH(&r
[2], 1, CHAN_Z
);
1299 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1301 FETCH(&r
[1], 1, CHAN_W
);
1303 r
[0].q
= si_fa(r
[0].q
, r
[1].q
);
1305 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1306 STORE( &r
[0], 0, chan_index
);
1310 case TGSI_OPCODE_COS
:
1311 FETCH(&r
[0], 0, CHAN_X
);
1313 r
[0].q
= micro_cos(r
[0].q
);
1315 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1316 STORE( &r
[0], 0, chan_index
);
1320 case TGSI_OPCODE_DDX
:
1321 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1322 FETCH( &r
[0], 0, chan_index
);
1323 r
[0].q
= micro_ddx(r
[0].q
);
1324 STORE( &r
[0], 0, chan_index
);
1328 case TGSI_OPCODE_DDY
:
1329 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1330 FETCH( &r
[0], 0, chan_index
);
1331 r
[0].q
= micro_ddy(r
[0].q
);
1332 STORE( &r
[0], 0, chan_index
);
1336 case TGSI_OPCODE_KILP
:
1337 exec_kilp (mach
, inst
);
1340 case TGSI_OPCODE_KIL
:
1341 /* for enabled ExecMask bits, set the killed bit */
1342 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] |= mach
->ExecMask
;
1345 case TGSI_OPCODE_PK2H
:
1349 case TGSI_OPCODE_PK2US
:
1353 case TGSI_OPCODE_PK4B
:
1357 case TGSI_OPCODE_PK4UB
:
1361 case TGSI_OPCODE_RFL
:
1365 case TGSI_OPCODE_SEQ
:
1366 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1367 FETCH( &r
[0], 0, chan_index
);
1368 FETCH( &r
[1], 1, chan_index
);
1370 r
[0].q
= si_fceq(r
[0].q
, r
[1].q
);
1372 STORE( &r
[0], 0, chan_index
);
1376 case TGSI_OPCODE_SFL
:
1380 case TGSI_OPCODE_SGT
:
1381 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1382 FETCH( &r
[0], 0, chan_index
);
1383 FETCH( &r
[1], 1, chan_index
);
1384 r
[0].q
= si_fcgt(r
[0].q
, r
[1].q
);
1385 STORE( &r
[0], 0, chan_index
);
1389 case TGSI_OPCODE_SIN
:
1390 FETCH( &r
[0], 0, CHAN_X
);
1391 r
[0].q
= micro_sin(r
[0].q
);
1392 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1393 STORE( &r
[0], 0, chan_index
);
1397 case TGSI_OPCODE_SLE
:
1398 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1399 FETCH( &r
[0], 0, chan_index
);
1400 FETCH( &r
[1], 1, chan_index
);
1402 r
[0].q
= si_fcgt(r
[0].q
, r
[1].q
);
1403 r
[0].q
= si_xori(r
[0].q
, 0xff);
1405 STORE( &r
[0], 0, chan_index
);
1409 case TGSI_OPCODE_SNE
:
1410 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1411 FETCH( &r
[0], 0, chan_index
);
1412 FETCH( &r
[1], 1, chan_index
);
1414 r
[0].q
= si_fceq(r
[0].q
, r
[1].q
);
1415 r
[0].q
= si_xori(r
[0].q
, 0xff);
1417 STORE( &r
[0], 0, chan_index
);
1421 case TGSI_OPCODE_STR
:
1425 case TGSI_OPCODE_TEX
:
1426 /* simple texture lookup */
1427 /* src[0] = texcoord */
1428 /* src[1] = sampler unit */
1429 exec_tex(mach
, inst
, FALSE
, FALSE
);
1432 case TGSI_OPCODE_TXB
:
1433 /* Texture lookup with lod bias */
1434 /* src[0] = texcoord (src[0].w = lod bias) */
1435 /* src[1] = sampler unit */
1436 exec_tex(mach
, inst
, TRUE
, FALSE
);
1439 case TGSI_OPCODE_TXD
:
1440 /* Texture lookup with explicit partial derivatives */
1441 /* src[0] = texcoord */
1442 /* src[1] = d[strq]/dx */
1443 /* src[2] = d[strq]/dy */
1444 /* src[3] = sampler unit */
1448 case TGSI_OPCODE_TXL
:
1449 /* Texture lookup with explicit LOD */
1450 /* src[0] = texcoord (src[0].w = lod bias) */
1451 /* src[1] = sampler unit */
1452 exec_tex(mach
, inst
, TRUE
, FALSE
);
1455 case TGSI_OPCODE_TXP
:
1456 /* Texture lookup with projection */
1457 /* src[0] = texcoord (src[0].w = projection) */
1458 /* src[1] = sampler unit */
1459 exec_tex(mach
, inst
, TRUE
, TRUE
);
1462 case TGSI_OPCODE_UP2H
:
1466 case TGSI_OPCODE_UP2US
:
1470 case TGSI_OPCODE_UP4B
:
1474 case TGSI_OPCODE_UP4UB
:
1478 case TGSI_OPCODE_X2D
:
1482 case TGSI_OPCODE_ARA
:
1486 case TGSI_OPCODE_ARR
:
1490 case TGSI_OPCODE_BRA
:
1494 case TGSI_OPCODE_CAL
:
1495 /* skip the call if no execution channels are enabled */
1496 if (mach
->ExecMask
) {
1499 /* push the Cond, Loop, Cont stacks */
1500 assert(mach
->CondStackTop
< TGSI_EXEC_MAX_COND_NESTING
);
1501 mach
->CondStack
[mach
->CondStackTop
++] = mach
->CondMask
;
1502 assert(mach
->LoopStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1503 mach
->LoopStack
[mach
->LoopStackTop
++] = mach
->LoopMask
;
1504 assert(mach
->ContStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1505 mach
->ContStack
[mach
->ContStackTop
++] = mach
->ContMask
;
1507 assert(mach
->FuncStackTop
< TGSI_EXEC_MAX_CALL_NESTING
);
1508 mach
->FuncStack
[mach
->FuncStackTop
++] = mach
->FuncMask
;
1510 /* note that PC was already incremented above */
1511 mach
->CallStack
[mach
->CallStackTop
++] = *pc
;
1512 *pc
= inst
->InstructionExtLabel
.Label
;
1516 case TGSI_OPCODE_RET
:
1517 mach
->FuncMask
&= ~mach
->ExecMask
;
1518 UPDATE_EXEC_MASK(mach
);
1520 if (mach
->ExecMask
== 0x0) {
1521 /* really return now (otherwise, keep executing) */
1523 if (mach
->CallStackTop
== 0) {
1524 /* returning from main() */
1528 *pc
= mach
->CallStack
[--mach
->CallStackTop
];
1530 /* pop the Cond, Loop, Cont stacks */
1531 assert(mach
->CondStackTop
> 0);
1532 mach
->CondMask
= mach
->CondStack
[--mach
->CondStackTop
];
1533 assert(mach
->LoopStackTop
> 0);
1534 mach
->LoopMask
= mach
->LoopStack
[--mach
->LoopStackTop
];
1535 assert(mach
->ContStackTop
> 0);
1536 mach
->ContMask
= mach
->ContStack
[--mach
->ContStackTop
];
1537 assert(mach
->FuncStackTop
> 0);
1538 mach
->FuncMask
= mach
->FuncStack
[--mach
->FuncStackTop
];
1540 UPDATE_EXEC_MASK(mach
);
1544 case TGSI_OPCODE_SSG
:
1548 case TGSI_OPCODE_CMP
:
1549 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1550 FETCH(&r
[0], 0, chan_index
);
1551 FETCH(&r
[1], 1, chan_index
);
1552 FETCH(&r
[2], 2, chan_index
);
1554 /* r0 = (r0 < 0.0) ? r1 : r2
1556 r
[3].q
= si_xor(r
[3].q
, r
[3].q
);
1557 r
[0].q
= micro_lt(r
[0].q
, r
[3].q
);
1558 r
[0].q
= si_selb(r
[1].q
, r
[2].q
, r
[0].q
);
1560 STORE(&r
[0], 0, chan_index
);
1564 case TGSI_OPCODE_SCS
:
1565 if( IS_CHANNEL_ENABLED( *inst
, CHAN_X
) || IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1566 FETCH( &r
[0], 0, CHAN_X
);
1568 if( IS_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1569 r
[1].q
= micro_cos(r
[0].q
);
1570 STORE( &r
[1], 0, CHAN_X
);
1572 if( IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1573 r
[1].q
= micro_sin(r
[0].q
);
1574 STORE( &r
[1], 0, CHAN_Y
);
1576 if( IS_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1577 STORE( &mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
], 0, CHAN_Z
);
1579 if( IS_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1580 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
1584 case TGSI_OPCODE_NRM
:
1588 case TGSI_OPCODE_DIV
:
1592 case TGSI_OPCODE_DP2
:
1593 FETCH( &r
[0], 0, CHAN_X
);
1594 FETCH( &r
[1], 1, CHAN_X
);
1595 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1597 FETCH( &r
[1], 0, CHAN_Y
);
1598 FETCH( &r
[2], 1, CHAN_Y
);
1599 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1601 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1602 STORE( &r
[0], 0, chan_index
);
1606 case TGSI_OPCODE_IF
:
1608 assert(mach
->CondStackTop
< TGSI_EXEC_MAX_COND_NESTING
);
1609 mach
->CondStack
[mach
->CondStackTop
++] = mach
->CondMask
;
1610 FETCH( &r
[0], 0, CHAN_X
);
1611 /* update CondMask */
1613 mach
->CondMask
&= ~0x1;
1616 mach
->CondMask
&= ~0x2;
1619 mach
->CondMask
&= ~0x4;
1622 mach
->CondMask
&= ~0x8;
1624 UPDATE_EXEC_MASK(mach
);
1625 /* Todo: If CondMask==0, jump to ELSE */
1628 case TGSI_OPCODE_ELSE
:
1629 /* invert CondMask wrt previous mask */
1632 assert(mach
->CondStackTop
> 0);
1633 prevMask
= mach
->CondStack
[mach
->CondStackTop
- 1];
1634 mach
->CondMask
= ~mach
->CondMask
& prevMask
;
1635 UPDATE_EXEC_MASK(mach
);
1636 /* Todo: If CondMask==0, jump to ENDIF */
1640 case TGSI_OPCODE_ENDIF
:
1642 assert(mach
->CondStackTop
> 0);
1643 mach
->CondMask
= mach
->CondStack
[--mach
->CondStackTop
];
1644 UPDATE_EXEC_MASK(mach
);
1647 case TGSI_OPCODE_END
:
1648 /* halt execution */
1652 case TGSI_OPCODE_REP
:
1656 case TGSI_OPCODE_ENDREP
:
1660 case TGSI_OPCODE_PUSHA
:
1664 case TGSI_OPCODE_POPA
:
1668 case TGSI_OPCODE_CEIL
:
1669 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1670 FETCH( &r
[0], 0, chan_index
);
1671 r
[0].q
= micro_ceil(r
[0].q
);
1672 STORE( &r
[0], 0, chan_index
);
1676 case TGSI_OPCODE_I2F
:
1677 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1678 FETCH( &r
[0], 0, chan_index
);
1679 r
[0].q
= si_csflt(r
[0].q
, 0);
1680 STORE( &r
[0], 0, chan_index
);
1684 case TGSI_OPCODE_NOT
:
1685 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1686 FETCH( &r
[0], 0, chan_index
);
1687 r
[0].q
= si_xorbi(r
[0].q
, 0xff);
1688 STORE( &r
[0], 0, chan_index
);
1692 case TGSI_OPCODE_TRUNC
:
1693 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1694 FETCH( &r
[0], 0, chan_index
);
1695 r
[0].q
= micro_trunc(r
[0].q
);
1696 STORE( &r
[0], 0, chan_index
);
1700 case TGSI_OPCODE_SHL
:
1701 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1702 FETCH( &r
[0], 0, chan_index
);
1703 FETCH( &r
[1], 1, chan_index
);
1705 r
[0].q
= si_shl(r
[0].q
, r
[1].q
);
1707 STORE( &r
[0], 0, chan_index
);
1711 case TGSI_OPCODE_SHR
:
1712 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1713 FETCH( &r
[0], 0, chan_index
);
1714 FETCH( &r
[1], 1, chan_index
);
1715 r
[0].q
= micro_ishr(r
[0].q
, r
[1].q
);
1716 STORE( &r
[0], 0, chan_index
);
1720 case TGSI_OPCODE_AND
:
1721 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1722 FETCH( &r
[0], 0, chan_index
);
1723 FETCH( &r
[1], 1, chan_index
);
1724 r
[0].q
= si_and(r
[0].q
, r
[1].q
);
1725 STORE( &r
[0], 0, chan_index
);
1729 case TGSI_OPCODE_OR
:
1730 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1731 FETCH( &r
[0], 0, chan_index
);
1732 FETCH( &r
[1], 1, chan_index
);
1733 r
[0].q
= si_or(r
[0].q
, r
[1].q
);
1734 STORE( &r
[0], 0, chan_index
);
1738 case TGSI_OPCODE_MOD
:
1742 case TGSI_OPCODE_XOR
:
1743 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1744 FETCH( &r
[0], 0, chan_index
);
1745 FETCH( &r
[1], 1, chan_index
);
1746 r
[0].q
= si_xor(r
[0].q
, r
[1].q
);
1747 STORE( &r
[0], 0, chan_index
);
1751 case TGSI_OPCODE_SAD
:
1755 case TGSI_OPCODE_TXF
:
1759 case TGSI_OPCODE_TXQ
:
1763 case TGSI_OPCODE_EMIT
:
1764 mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0] += 16;
1765 mach
->Primitives
[mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]]++;
1768 case TGSI_OPCODE_ENDPRIM
:
1769 mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]++;
1770 mach
->Primitives
[mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]] = 0;
1773 case TGSI_OPCODE_LOOP
:
1774 /* fall-through (for now) */
1775 case TGSI_OPCODE_BGNLOOP2
:
1776 /* push LoopMask and ContMasks */
1777 assert(mach
->LoopStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1778 mach
->LoopStack
[mach
->LoopStackTop
++] = mach
->LoopMask
;
1779 assert(mach
->ContStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1780 mach
->ContStack
[mach
->ContStackTop
++] = mach
->ContMask
;
1783 case TGSI_OPCODE_ENDLOOP
:
1784 /* fall-through (for now at least) */
1785 case TGSI_OPCODE_ENDLOOP2
:
1786 /* Restore ContMask, but don't pop */
1787 assert(mach
->ContStackTop
> 0);
1788 mach
->ContMask
= mach
->ContStack
[mach
->ContStackTop
- 1];
1789 if (mach
->LoopMask
) {
1790 /* repeat loop: jump to instruction just past BGNLOOP */
1791 *pc
= inst
->InstructionExtLabel
.Label
+ 1;
1794 /* exit loop: pop LoopMask */
1795 assert(mach
->LoopStackTop
> 0);
1796 mach
->LoopMask
= mach
->LoopStack
[--mach
->LoopStackTop
];
1798 assert(mach
->ContStackTop
> 0);
1799 mach
->ContMask
= mach
->ContStack
[--mach
->ContStackTop
];
1801 UPDATE_EXEC_MASK(mach
);
1804 case TGSI_OPCODE_BRK
:
1805 /* turn off loop channels for each enabled exec channel */
1806 mach
->LoopMask
&= ~mach
->ExecMask
;
1807 /* Todo: if mach->LoopMask == 0, jump to end of loop */
1808 UPDATE_EXEC_MASK(mach
);
1811 case TGSI_OPCODE_CONT
:
1812 /* turn off cont channels for each enabled exec channel */
1813 mach
->ContMask
&= ~mach
->ExecMask
;
1814 /* Todo: if mach->LoopMask == 0, jump to end of loop */
1815 UPDATE_EXEC_MASK(mach
);
1818 case TGSI_OPCODE_BGNSUB
:
1822 case TGSI_OPCODE_ENDSUB
:
1826 case TGSI_OPCODE_NOISE1
:
1830 case TGSI_OPCODE_NOISE2
:
1834 case TGSI_OPCODE_NOISE3
:
1838 case TGSI_OPCODE_NOISE4
:
1842 case TGSI_OPCODE_NOP
:
1852 * Run TGSI interpreter.
1853 * \return bitmask of "alive" quad components
1856 spu_exec_machine_run( struct spu_exec_machine
*mach
)
1861 mach
->CondMask
= 0xf;
1862 mach
->LoopMask
= 0xf;
1863 mach
->ContMask
= 0xf;
1864 mach
->FuncMask
= 0xf;
1865 mach
->ExecMask
= 0xf;
1867 mach
->CondStackTop
= 0; /* temporarily subvert this assertion */
1868 assert(mach
->CondStackTop
== 0);
1869 assert(mach
->LoopStackTop
== 0);
1870 assert(mach
->ContStackTop
== 0);
1871 assert(mach
->CallStackTop
== 0);
1873 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] = 0;
1874 mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0] = 0;
1876 if( mach
->Processor
== TGSI_PROCESSOR_GEOMETRY
) {
1877 mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0] = 0;
1878 mach
->Primitives
[0] = 0;
1882 /* execute declarations (interpolants) */
1883 if( mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
1884 for (i
= 0; i
< mach
->NumDeclarations
; i
++) {
1886 struct tgsi_full_declaration decl
;
1887 qword buffer
[ROUNDUP16(sizeof(struct tgsi_full_declaration
)) / 16];
1889 unsigned ea
= (unsigned) (mach
->Declarations
+ pc
);
1891 spu_dcache_fetch_unaligned(d
.buffer
, ea
, sizeof(d
.decl
));
1893 exec_declaration( mach
, &d
.decl
);
1897 /* execute instructions, until pc is set to -1 */
1900 struct tgsi_full_instruction inst
;
1901 qword buffer
[ROUNDUP16(sizeof(struct tgsi_full_instruction
)) / 16];
1903 unsigned ea
= (unsigned) (mach
->Instructions
+ pc
);
1905 spu_dcache_fetch_unaligned(i
.buffer
, ea
, sizeof(i
.inst
));
1906 exec_instruction( mach
, & i
.inst
, &pc
);
1910 /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
1911 if (mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
1913 * Scale back depth component.
1915 for (i
= 0; i
< 4; i
++)
1916 mach
->Outputs
[0].xyzw
[2].f
[i
] *= ctx
->DrawBuffer
->_DepthMaxF
;
1920 return ~mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0];