1 /**************************************************************************
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
29 * TGSI interpreter/executor.
31 * Flow control information:
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers. This is the ExecMask.
43 * The ExecMask is computed from three other masks (CondMask, LoopMask and
44 * ContMask) which are controlled by the flow control instructions (namely:
45 * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
53 #include <transpose_matrix4x4.h>
54 #include <simdmath/ceilf4.h>
55 #include <simdmath/cosf4.h>
56 #include <simdmath/divf4.h>
57 #include <simdmath/floorf4.h>
58 #include <simdmath/log2f4.h>
59 #include <simdmath/powf4.h>
60 #include <simdmath/sinf4.h>
61 #include <simdmath/sqrtf4.h>
62 #include <simdmath/truncf4.h>
64 #include "pipe/p_compiler.h"
65 #include "pipe/p_state.h"
66 #include "pipe/p_shader_tokens.h"
67 #include "tgsi/tgsi_parse.h"
68 #include "tgsi/tgsi_util.h"
71 #include "spu_vertex_shader.h"
72 #include "spu_dcache.h"
73 #include "cell/common.h"
/* Positions of the four pixels within a 2x2 quad; also used as base
 * offsets when building the si_shufb patterns for the DDX/DDY helpers
 * below.
 */
#define TILE_TOP_LEFT 0
#define TILE_TOP_RIGHT 1
#define TILE_BOTTOM_LEFT 2
#define TILE_BOTTOM_RIGHT 3
81 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
/* Shorthand for the well-known utility temp registers declared by the
 * TGSI exec interface (_I = temp register Index, _C = Channel within it).
 * The value constants (0, 0x7FFFFFFF, 0x80000000, 0xFFFFFFFF, 1.0, 2.0,
 * 128.0, -128.0) are initialized in spu_exec_machine_init().
 */
#define TEMP_0_I TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I TGSI_EXEC_TEMP_128_I
#define TEMP_128_C TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C
/* Accumulated fragment-kill mask (see exec_kil / exec_kilp) */
#define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_R0 TGSI_EXEC_TEMP_R0
/* Iterate CHAN over the four channels (X=0, Y=1, Z=2, W=3). */
#define FOR_EACH_CHANNEL(CHAN)\
   for (CHAN = 0; CHAN < 4; CHAN++)

/* True if channel CHAN is enabled in dest register 0's writemask. */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/* Same check against the instruction's second dest register. */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))

/* Iterate CHAN over only the channels enabled in dest register 0. */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   FOR_EACH_CHANNEL( CHAN )\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/* Iterate CHAN over only the channels enabled in dest register 1. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   FOR_EACH_CHANNEL( CHAN )\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))


/** The execution mask depends on the conditional mask and the loop mask */
#define UPDATE_EXEC_MASK(MACH) \
   MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
138 * Initialize machine state by expanding tokens to full instructions,
139 * allocating temporary storage, setting up constants, etc.
140 * After this, we can call spu_exec_machine_run() many times.
143 spu_exec_machine_init(struct spu_exec_machine
*mach
,
145 struct spu_sampler
*samplers
,
148 const qword zero
= si_il(0);
149 const qword not_zero
= si_il(~0);
152 mach
->Samplers
= samplers
;
153 mach
->Processor
= processor
;
154 mach
->Addrs
= &mach
->Temps
[TGSI_EXEC_NUM_TEMPS
];
156 /* Setup constants. */
157 mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
= zero
;
158 mach
->Temps
[TEMP_FF_I
].xyzw
[TEMP_FF_C
].q
= not_zero
;
159 mach
->Temps
[TEMP_7F_I
].xyzw
[TEMP_7F_C
].q
= si_shli(not_zero
, -1);
160 mach
->Temps
[TEMP_80_I
].xyzw
[TEMP_80_C
].q
= si_shli(not_zero
, 31);
162 mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
= (qword
) spu_splats(1.0f
);
163 mach
->Temps
[TEMP_2_I
].xyzw
[TEMP_2_C
].q
= (qword
) spu_splats(2.0f
);
164 mach
->Temps
[TEMP_128_I
].xyzw
[TEMP_128_C
].q
= (qword
) spu_splats(128.0f
);
165 mach
->Temps
[TEMP_M128_I
].xyzw
[TEMP_M128_C
].q
= (qword
) spu_splats(-128.0f
);
172 return si_rotmi(si_shli(src
, 1), -1);
176 micro_ceil(qword src
)
178 return (qword
) _ceilf4((vec_float4
) src
);
184 return (qword
) _cosf4((vec_float4
) src
);
187 static const qword br_shuf
= {
188 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
189 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
190 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
191 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
192 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
193 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
194 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
195 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
198 static const qword bl_shuf
= {
199 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
200 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
201 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
202 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
203 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
204 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
205 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
206 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
209 static const qword tl_shuf
= {
210 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
211 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
212 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
213 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
214 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
215 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
216 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
217 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
223 qword bottom_right
= si_shufb(src
, src
, br_shuf
);
224 qword bottom_left
= si_shufb(src
, src
, bl_shuf
);
226 return si_fs(bottom_right
, bottom_left
);
232 qword top_left
= si_shufb(src
, src
, tl_shuf
);
233 qword bottom_left
= si_shufb(src
, src
, bl_shuf
);
235 return si_fs(top_left
, bottom_left
);
239 micro_div(qword src0
, qword src1
)
241 return (qword
) _divf4((vec_float4
) src0
, (vec_float4
) src1
);
247 return (qword
) _floorf4((vec_float4
) src
);
253 return si_fs(src
, (qword
) _floorf4((vec_float4
) src
));
257 micro_ge(qword src0
, qword src1
)
259 return si_or(si_fceq(src0
, src1
), si_fcgt(src0
, src1
));
265 return (qword
) _log2f4((vec_float4
) src
);
269 micro_lt(qword src0
, qword src1
)
271 const qword tmp
= si_or(si_fceq(src0
, src1
), si_fcgt(src0
, src1
));
273 return si_xori(tmp
, 0xff);
277 micro_max(qword src0
, qword src1
)
279 return si_selb(src1
, src0
, si_fcgt(src0
, src1
));
283 micro_min(qword src0
, qword src1
)
285 return si_selb(src0
, src1
, si_fcgt(src0
, src1
));
291 return si_xor(src
, (qword
) spu_splats(0x80000000));
295 micro_set_sign(qword src
)
297 return si_or(src
, (qword
) spu_splats(0x80000000));
301 micro_pow(qword src0
, qword src1
)
303 return (qword
) _powf4((vec_float4
) src0
, (vec_float4
) src1
);
309 const qword half
= (qword
) spu_splats(0.5f
);
311 /* May be able to use _roundf4. There may be some difference, though.
313 return (qword
) _floorf4((vec_float4
) si_fa(src
, half
));
317 micro_ishr(qword src0
, qword src1
)
319 return si_rotma(src0
, si_sfi(src1
, 0));
323 micro_trunc(qword src
)
325 return (qword
) _truncf4((vec_float4
) src
);
331 return (qword
) _sinf4((vec_float4
) src
);
335 micro_sqrt(qword src
)
337 return (qword
) _sqrtf4((vec_float4
) src
);
341 fetch_src_file_channel(
342 const struct spu_exec_machine
*mach
,
345 const union spu_exec_channel
*index
,
346 union spu_exec_channel
*chan
)
349 case TGSI_EXTSWIZZLE_X
:
350 case TGSI_EXTSWIZZLE_Y
:
351 case TGSI_EXTSWIZZLE_Z
:
352 case TGSI_EXTSWIZZLE_W
:
354 case TGSI_FILE_CONSTANT
: {
357 for (i
= 0; i
< 4; i
++) {
358 const float *ptr
= mach
->Consts
[index
->i
[i
]];
361 spu_dcache_fetch_unaligned((qword
*) tmp
,
362 (uintptr_t)(ptr
+ swizzle
),
370 case TGSI_FILE_INPUT
:
371 chan
->u
[0] = mach
->Inputs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
372 chan
->u
[1] = mach
->Inputs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
373 chan
->u
[2] = mach
->Inputs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
374 chan
->u
[3] = mach
->Inputs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
377 case TGSI_FILE_TEMPORARY
:
378 chan
->u
[0] = mach
->Temps
[index
->i
[0]].xyzw
[swizzle
].u
[0];
379 chan
->u
[1] = mach
->Temps
[index
->i
[1]].xyzw
[swizzle
].u
[1];
380 chan
->u
[2] = mach
->Temps
[index
->i
[2]].xyzw
[swizzle
].u
[2];
381 chan
->u
[3] = mach
->Temps
[index
->i
[3]].xyzw
[swizzle
].u
[3];
384 case TGSI_FILE_IMMEDIATE
:
385 ASSERT( index
->i
[0] < (int) mach
->ImmLimit
);
386 ASSERT( index
->i
[1] < (int) mach
->ImmLimit
);
387 ASSERT( index
->i
[2] < (int) mach
->ImmLimit
);
388 ASSERT( index
->i
[3] < (int) mach
->ImmLimit
);
390 chan
->f
[0] = mach
->Imms
[index
->i
[0]][swizzle
];
391 chan
->f
[1] = mach
->Imms
[index
->i
[1]][swizzle
];
392 chan
->f
[2] = mach
->Imms
[index
->i
[2]][swizzle
];
393 chan
->f
[3] = mach
->Imms
[index
->i
[3]][swizzle
];
396 case TGSI_FILE_ADDRESS
:
397 chan
->u
[0] = mach
->Addrs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
398 chan
->u
[1] = mach
->Addrs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
399 chan
->u
[2] = mach
->Addrs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
400 chan
->u
[3] = mach
->Addrs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
403 case TGSI_FILE_OUTPUT
:
404 /* vertex/fragment output vars can be read too */
405 chan
->u
[0] = mach
->Outputs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
406 chan
->u
[1] = mach
->Outputs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
407 chan
->u
[2] = mach
->Outputs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
408 chan
->u
[3] = mach
->Outputs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
416 case TGSI_EXTSWIZZLE_ZERO
:
417 *chan
= mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
];
420 case TGSI_EXTSWIZZLE_ONE
:
421 *chan
= mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
];
431 const struct spu_exec_machine
*mach
,
432 union spu_exec_channel
*chan
,
433 const struct tgsi_full_src_register
*reg
,
434 const uint chan_index
)
436 union spu_exec_channel index
;
442 index
.i
[3] = reg
->SrcRegister
.Index
;
444 if (reg
->SrcRegister
.Indirect
) {
445 union spu_exec_channel index2
;
446 union spu_exec_channel indir_index
;
451 index2
.i
[3] = reg
->SrcRegisterInd
.Index
;
453 swizzle
= tgsi_util_get_src_register_swizzle(®
->SrcRegisterInd
,
455 fetch_src_file_channel(
457 reg
->SrcRegisterInd
.File
,
462 index
.q
= si_a(index
.q
, indir_index
.q
);
465 if( reg
->SrcRegister
.Dimension
) {
466 switch( reg
->SrcRegister
.File
) {
467 case TGSI_FILE_INPUT
:
468 index
.q
= si_mpyi(index
.q
, 17);
470 case TGSI_FILE_CONSTANT
:
471 index
.q
= si_shli(index
.q
, 12);
477 index
.i
[0] += reg
->SrcRegisterDim
.Index
;
478 index
.i
[1] += reg
->SrcRegisterDim
.Index
;
479 index
.i
[2] += reg
->SrcRegisterDim
.Index
;
480 index
.i
[3] += reg
->SrcRegisterDim
.Index
;
482 if (reg
->SrcRegisterDim
.Indirect
) {
483 union spu_exec_channel index2
;
484 union spu_exec_channel indir_index
;
489 index2
.i
[3] = reg
->SrcRegisterDimInd
.Index
;
491 swizzle
= tgsi_util_get_src_register_swizzle( ®
->SrcRegisterDimInd
, CHAN_X
);
492 fetch_src_file_channel(
494 reg
->SrcRegisterDimInd
.File
,
499 index
.q
= si_a(index
.q
, indir_index
.q
);
503 swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
504 fetch_src_file_channel(
506 reg
->SrcRegister
.File
,
511 switch (tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
)) {
512 case TGSI_UTIL_SIGN_CLEAR
:
513 chan
->q
= micro_abs(chan
->q
);
516 case TGSI_UTIL_SIGN_SET
:
517 chan
->q
= micro_set_sign(chan
->q
);
520 case TGSI_UTIL_SIGN_TOGGLE
:
521 chan
->q
= micro_neg(chan
->q
);
524 case TGSI_UTIL_SIGN_KEEP
:
528 if (reg
->SrcRegisterExtMod
.Complement
) {
529 chan
->q
= si_fs(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, chan
->q
);
535 struct spu_exec_machine
*mach
,
536 const union spu_exec_channel
*chan
,
537 const struct tgsi_full_dst_register
*reg
,
538 const struct tgsi_full_instruction
*inst
,
541 union spu_exec_channel
*dst
;
543 switch( reg
->DstRegister
.File
) {
547 case TGSI_FILE_OUTPUT
:
548 dst
= &mach
->Outputs
[mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0]
549 + reg
->DstRegister
.Index
].xyzw
[chan_index
];
552 case TGSI_FILE_TEMPORARY
:
553 dst
= &mach
->Temps
[reg
->DstRegister
.Index
].xyzw
[chan_index
];
556 case TGSI_FILE_ADDRESS
:
557 dst
= &mach
->Addrs
[reg
->DstRegister
.Index
].xyzw
[chan_index
];
565 switch (inst
->Instruction
.Saturate
)
568 if (mach
->ExecMask
& 0x1)
569 dst
->i
[0] = chan
->i
[0];
570 if (mach
->ExecMask
& 0x2)
571 dst
->i
[1] = chan
->i
[1];
572 if (mach
->ExecMask
& 0x4)
573 dst
->i
[2] = chan
->i
[2];
574 if (mach
->ExecMask
& 0x8)
575 dst
->i
[3] = chan
->i
[3];
578 case TGSI_SAT_ZERO_ONE
:
579 /* XXX need to obey ExecMask here */
580 dst
->q
= micro_max(chan
->q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
581 dst
->q
= micro_min(dst
->q
, mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
);
584 case TGSI_SAT_MINUS_PLUS_ONE
:
/* Fetch source operand: VAL <- channel CHAN of src register INDEX.
 * Requires locals `mach` and `inst` in the expanding function.
 */
#define FETCH(VAL,INDEX,CHAN)\
   fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)

/* Store VAL to channel CHAN of dest register INDEX; store_dest applies
 * the ExecMask and the instruction's saturation mode.
 */
#define STORE(VAL,INDEX,CHAN)\
   store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
601 * Execute ARB-style KIL which is predicated by a src register.
602 * Kill fragment if any of the four values is less than zero.
605 exec_kil(struct spu_exec_machine
*mach
,
606 const struct tgsi_full_instruction
*inst
)
610 uint kilmask
= 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
611 union spu_exec_channel r
[1];
613 /* This mask stores component bits that were already tested. Note that
614 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
616 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
618 for (chan_index
= 0; chan_index
< 4; chan_index
++)
623 /* unswizzle channel */
624 swizzle
= tgsi_util_get_full_src_register_extswizzle (
625 &inst
->FullSrcRegisters
[0],
628 /* check if the component has not been already tested */
629 if (uniquemask
& (1 << swizzle
))
631 uniquemask
|= 1 << swizzle
;
633 FETCH(&r
[0], 0, chan_index
);
634 for (i
= 0; i
< 4; i
++)
635 if (r
[0].f
[i
] < 0.0f
)
639 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] |= kilmask
;
643 * Execute NVIDIA-style KIL which is predicated by a condition code.
644 * Kill fragment if the condition code is TRUE.
647 exec_kilp(struct tgsi_exec_machine
*mach
,
648 const struct tgsi_full_instruction
*inst
)
650 uint kilmask
= 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
652 /* TODO: build kilmask from CC mask */
654 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] |= kilmask
;
658 * Fetch a texel using STR texture coordinates.
661 fetch_texel( struct spu_sampler
*sampler
,
662 const union spu_exec_channel
*s
,
663 const union spu_exec_channel
*t
,
664 const union spu_exec_channel
*p
,
665 float lodbias
, /* XXX should be float[4] */
666 union spu_exec_channel
*r
,
667 union spu_exec_channel
*g
,
668 union spu_exec_channel
*b
,
669 union spu_exec_channel
*a
)
674 sampler
->get_samples(sampler
, s
->f
, t
->f
, p
->f
, lodbias
,
675 (float (*)[4]) rgba
);
677 _transpose_matrix4x4((vec_float4
*) out
, (vec_float4
*) rgba
);
686 exec_tex(struct spu_exec_machine
*mach
,
687 const struct tgsi_full_instruction
*inst
,
688 boolean biasLod
, boolean projected
)
690 const uint unit
= inst
->FullSrcRegisters
[1].SrcRegister
.Index
;
691 union spu_exec_channel r
[8];
695 /* printf("Sampler %u unit %u\n", sampler, unit); */
697 switch (inst
->InstructionExtTexture
.Texture
) {
698 case TGSI_TEXTURE_1D
:
700 FETCH(&r
[0], 0, CHAN_X
);
703 FETCH(&r
[1], 0, CHAN_W
);
704 r
[0].q
= micro_div(r
[0].q
, r
[1].q
);
708 FETCH(&r
[1], 0, CHAN_W
);
714 fetch_texel(&mach
->Samplers
[unit
],
715 &r
[0], NULL
, NULL
, lodBias
, /* S, T, P, BIAS */
716 &r
[0], &r
[1], &r
[2], &r
[3]); /* R, G, B, A */
719 case TGSI_TEXTURE_2D
:
720 case TGSI_TEXTURE_RECT
:
722 FETCH(&r
[0], 0, CHAN_X
);
723 FETCH(&r
[1], 0, CHAN_Y
);
724 FETCH(&r
[2], 0, CHAN_Z
);
727 FETCH(&r
[3], 0, CHAN_W
);
728 r
[0].q
= micro_div(r
[0].q
, r
[3].q
);
729 r
[1].q
= micro_div(r
[1].q
, r
[3].q
);
730 r
[2].q
= micro_div(r
[2].q
, r
[3].q
);
734 FETCH(&r
[3], 0, CHAN_W
);
740 fetch_texel(&mach
->Samplers
[unit
],
741 &r
[0], &r
[1], &r
[2], lodBias
, /* inputs */
742 &r
[0], &r
[1], &r
[2], &r
[3]); /* outputs */
745 case TGSI_TEXTURE_3D
:
746 case TGSI_TEXTURE_CUBE
:
748 FETCH(&r
[0], 0, CHAN_X
);
749 FETCH(&r
[1], 0, CHAN_Y
);
750 FETCH(&r
[2], 0, CHAN_Z
);
753 FETCH(&r
[3], 0, CHAN_W
);
754 r
[0].q
= micro_div(r
[0].q
, r
[3].q
);
755 r
[1].q
= micro_div(r
[1].q
, r
[3].q
);
756 r
[2].q
= micro_div(r
[2].q
, r
[3].q
);
760 FETCH(&r
[3], 0, CHAN_W
);
766 fetch_texel(&mach
->Samplers
[unit
],
767 &r
[0], &r
[1], &r
[2], lodBias
,
768 &r
[0], &r
[1], &r
[2], &r
[3]);
775 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
776 STORE( &r
[chan_index
], 0, chan_index
);
783 constant_interpolation(
784 struct spu_exec_machine
*mach
,
790 for( i
= 0; i
< QUAD_SIZE
; i
++ ) {
791 mach
->Inputs
[attrib
].xyzw
[chan
].f
[i
] = mach
->InterpCoefs
[attrib
].a0
[chan
];
796 linear_interpolation(
797 struct spu_exec_machine
*mach
,
801 const float x
= mach
->QuadPos
.xyzw
[0].f
[0];
802 const float y
= mach
->QuadPos
.xyzw
[1].f
[0];
803 const float dadx
= mach
->InterpCoefs
[attrib
].dadx
[chan
];
804 const float dady
= mach
->InterpCoefs
[attrib
].dady
[chan
];
805 const float a0
= mach
->InterpCoefs
[attrib
].a0
[chan
] + dadx
* x
+ dady
* y
;
806 mach
->Inputs
[attrib
].xyzw
[chan
].f
[0] = a0
;
807 mach
->Inputs
[attrib
].xyzw
[chan
].f
[1] = a0
+ dadx
;
808 mach
->Inputs
[attrib
].xyzw
[chan
].f
[2] = a0
+ dady
;
809 mach
->Inputs
[attrib
].xyzw
[chan
].f
[3] = a0
+ dadx
+ dady
;
813 perspective_interpolation(
814 struct spu_exec_machine
*mach
,
818 const float x
= mach
->QuadPos
.xyzw
[0].f
[0];
819 const float y
= mach
->QuadPos
.xyzw
[1].f
[0];
820 const float dadx
= mach
->InterpCoefs
[attrib
].dadx
[chan
];
821 const float dady
= mach
->InterpCoefs
[attrib
].dady
[chan
];
822 const float a0
= mach
->InterpCoefs
[attrib
].a0
[chan
] + dadx
* x
+ dady
* y
;
823 const float *w
= mach
->QuadPos
.xyzw
[3].f
;
824 /* divide by W here */
825 mach
->Inputs
[attrib
].xyzw
[chan
].f
[0] = a0
/ w
[0];
826 mach
->Inputs
[attrib
].xyzw
[chan
].f
[1] = (a0
+ dadx
) / w
[1];
827 mach
->Inputs
[attrib
].xyzw
[chan
].f
[2] = (a0
+ dady
) / w
[2];
828 mach
->Inputs
[attrib
].xyzw
[chan
].f
[3] = (a0
+ dadx
+ dady
) / w
[3];
832 typedef void (* interpolation_func
)(
833 struct spu_exec_machine
*mach
,
838 exec_declaration(struct spu_exec_machine
*mach
,
839 const struct tgsi_full_declaration
*decl
)
841 if( mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
842 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
843 unsigned first
, last
, mask
;
844 interpolation_func interp
;
846 first
= decl
->DeclarationRange
.First
;
847 last
= decl
->DeclarationRange
.Last
;
848 mask
= decl
->Declaration
.UsageMask
;
850 switch( decl
->Declaration
.Interpolate
) {
851 case TGSI_INTERPOLATE_CONSTANT
:
852 interp
= constant_interpolation
;
855 case TGSI_INTERPOLATE_LINEAR
:
856 interp
= linear_interpolation
;
859 case TGSI_INTERPOLATE_PERSPECTIVE
:
860 interp
= perspective_interpolation
;
867 if( mask
== TGSI_WRITEMASK_XYZW
) {
870 for( i
= first
; i
<= last
; i
++ ) {
871 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
872 interp( mach
, i
, j
);
879 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
880 if( mask
& (1 << j
) ) {
881 for( i
= first
; i
<= last
; i
++ ) {
882 interp( mach
, i
, j
);
893 struct spu_exec_machine
*mach
,
894 const struct tgsi_full_instruction
*inst
,
898 union spu_exec_channel r
[8];
902 switch (inst
->Instruction
.Opcode
) {
903 case TGSI_OPCODE_ARL
:
904 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
905 FETCH( &r
[0], 0, chan_index
);
906 r
[0].q
= si_cflts(r
[0].q
, 0);
907 STORE( &r
[0], 0, chan_index
);
911 case TGSI_OPCODE_MOV
:
912 case TGSI_OPCODE_SWZ
:
913 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
914 FETCH( &r
[0], 0, chan_index
);
915 STORE( &r
[0], 0, chan_index
);
919 case TGSI_OPCODE_LIT
:
920 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
921 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_X
);
924 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) || IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
925 FETCH( &r
[0], 0, CHAN_X
);
926 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
927 r
[0].q
= micro_max(r
[0].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
928 STORE( &r
[0], 0, CHAN_Y
);
931 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
932 FETCH( &r
[1], 0, CHAN_Y
);
933 r
[1].q
= micro_max(r
[1].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
935 FETCH( &r
[2], 0, CHAN_W
);
936 r
[2].q
= micro_min(r
[2].q
, mach
->Temps
[TEMP_128_I
].xyzw
[TEMP_128_C
].q
);
937 r
[2].q
= micro_max(r
[2].q
, mach
->Temps
[TEMP_M128_I
].xyzw
[TEMP_M128_C
].q
);
938 r
[1].q
= micro_pow(r
[1].q
, r
[2].q
);
940 /* r0 = (r0 > 0.0) ? r1 : 0.0
942 r
[0].q
= si_fcgt(r
[0].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
943 r
[0].q
= si_selb(mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
, r
[1].q
,
945 STORE( &r
[0], 0, CHAN_Z
);
949 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
950 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
954 case TGSI_OPCODE_RCP
:
955 /* TGSI_OPCODE_RECIP */
956 FETCH( &r
[0], 0, CHAN_X
);
957 r
[0].q
= micro_div(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, r
[0].q
);
958 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
959 STORE( &r
[0], 0, chan_index
);
963 case TGSI_OPCODE_RSQ
:
964 /* TGSI_OPCODE_RECIPSQRT */
965 FETCH( &r
[0], 0, CHAN_X
);
966 r
[0].q
= micro_sqrt(r
[0].q
);
967 r
[0].q
= micro_div(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, r
[0].q
);
968 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
969 STORE( &r
[0], 0, chan_index
);
973 case TGSI_OPCODE_EXP
:
977 case TGSI_OPCODE_LOG
:
981 case TGSI_OPCODE_MUL
:
982 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
)
984 FETCH(&r
[0], 0, chan_index
);
985 FETCH(&r
[1], 1, chan_index
);
987 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
989 STORE(&r
[0], 0, chan_index
);
993 case TGSI_OPCODE_ADD
:
994 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
995 FETCH( &r
[0], 0, chan_index
);
996 FETCH( &r
[1], 1, chan_index
);
997 r
[0].q
= si_fa(r
[0].q
, r
[1].q
);
998 STORE( &r
[0], 0, chan_index
);
1002 case TGSI_OPCODE_DP3
:
1003 /* TGSI_OPCODE_DOT3 */
1004 FETCH( &r
[0], 0, CHAN_X
);
1005 FETCH( &r
[1], 1, CHAN_X
);
1006 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1008 FETCH( &r
[1], 0, CHAN_Y
);
1009 FETCH( &r
[2], 1, CHAN_Y
);
1010 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1013 FETCH( &r
[1], 0, CHAN_Z
);
1014 FETCH( &r
[2], 1, CHAN_Z
);
1015 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1017 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1018 STORE( &r
[0], 0, chan_index
);
1022 case TGSI_OPCODE_DP4
:
1023 /* TGSI_OPCODE_DOT4 */
1024 FETCH(&r
[0], 0, CHAN_X
);
1025 FETCH(&r
[1], 1, CHAN_X
);
1027 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1029 FETCH(&r
[1], 0, CHAN_Y
);
1030 FETCH(&r
[2], 1, CHAN_Y
);
1032 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1034 FETCH(&r
[1], 0, CHAN_Z
);
1035 FETCH(&r
[2], 1, CHAN_Z
);
1037 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1039 FETCH(&r
[1], 0, CHAN_W
);
1040 FETCH(&r
[2], 1, CHAN_W
);
1042 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1044 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1045 STORE( &r
[0], 0, chan_index
);
1049 case TGSI_OPCODE_DST
:
1050 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1051 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_X
);
1054 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1055 FETCH( &r
[0], 0, CHAN_Y
);
1056 FETCH( &r
[1], 1, CHAN_Y
);
1057 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1058 STORE( &r
[0], 0, CHAN_Y
);
1061 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1062 FETCH( &r
[0], 0, CHAN_Z
);
1063 STORE( &r
[0], 0, CHAN_Z
);
1066 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1067 FETCH( &r
[0], 1, CHAN_W
);
1068 STORE( &r
[0], 0, CHAN_W
);
1072 case TGSI_OPCODE_MIN
:
1073 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1074 FETCH(&r
[0], 0, chan_index
);
1075 FETCH(&r
[1], 1, chan_index
);
1077 r
[0].q
= micro_min(r
[0].q
, r
[1].q
);
1079 STORE(&r
[0], 0, chan_index
);
1083 case TGSI_OPCODE_MAX
:
1084 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1085 FETCH(&r
[0], 0, chan_index
);
1086 FETCH(&r
[1], 1, chan_index
);
1088 r
[0].q
= micro_max(r
[0].q
, r
[1].q
);
1090 STORE(&r
[0], 0, chan_index
);
1094 case TGSI_OPCODE_SLT
:
1095 /* TGSI_OPCODE_SETLT */
1096 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1097 FETCH( &r
[0], 0, chan_index
);
1098 FETCH( &r
[1], 1, chan_index
);
1100 r
[0].q
= micro_ge(r
[0].q
, r
[1].q
);
1101 r
[0].q
= si_xori(r
[0].q
, 0xff);
1103 STORE( &r
[0], 0, chan_index
);
1107 case TGSI_OPCODE_SGE
:
1108 /* TGSI_OPCODE_SETGE */
1109 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1110 FETCH( &r
[0], 0, chan_index
);
1111 FETCH( &r
[1], 1, chan_index
);
1112 r
[0].q
= micro_ge(r
[0].q
, r
[1].q
);
1113 STORE( &r
[0], 0, chan_index
);
1117 case TGSI_OPCODE_MAD
:
1118 /* TGSI_OPCODE_MADD */
1119 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1120 FETCH( &r
[0], 0, chan_index
);
1121 FETCH( &r
[1], 1, chan_index
);
1122 FETCH( &r
[2], 2, chan_index
);
1123 r
[0].q
= si_fma(r
[0].q
, r
[1].q
, r
[2].q
);
1124 STORE( &r
[0], 0, chan_index
);
1128 case TGSI_OPCODE_SUB
:
1129 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1130 FETCH(&r
[0], 0, chan_index
);
1131 FETCH(&r
[1], 1, chan_index
);
1133 r
[0].q
= si_fs(r
[0].q
, r
[1].q
);
1135 STORE(&r
[0], 0, chan_index
);
1139 case TGSI_OPCODE_LERP
:
1140 /* TGSI_OPCODE_LRP */
1141 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1142 FETCH(&r
[0], 0, chan_index
);
1143 FETCH(&r
[1], 1, chan_index
);
1144 FETCH(&r
[2], 2, chan_index
);
1146 r
[1].q
= si_fs(r
[1].q
, r
[2].q
);
1147 r
[0].q
= si_fma(r
[0].q
, r
[1].q
, r
[2].q
);
1149 STORE(&r
[0], 0, chan_index
);
1153 case TGSI_OPCODE_CND
:
1157 case TGSI_OPCODE_CND0
:
1161 case TGSI_OPCODE_DOT2ADD
:
1162 /* TGSI_OPCODE_DP2A */
1166 case TGSI_OPCODE_INDEX
:
1170 case TGSI_OPCODE_NEGATE
:
1174 case TGSI_OPCODE_FRAC
:
1175 /* TGSI_OPCODE_FRC */
1176 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1177 FETCH( &r
[0], 0, chan_index
);
1178 r
[0].q
= micro_frc(r
[0].q
);
1179 STORE( &r
[0], 0, chan_index
);
1183 case TGSI_OPCODE_CLAMP
:
1187 case TGSI_OPCODE_FLOOR
:
1188 /* TGSI_OPCODE_FLR */
1189 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1190 FETCH( &r
[0], 0, chan_index
);
1191 r
[0].q
= micro_flr(r
[0].q
);
1192 STORE( &r
[0], 0, chan_index
);
1196 case TGSI_OPCODE_ROUND
:
1197 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1198 FETCH( &r
[0], 0, chan_index
);
1199 r
[0].q
= micro_rnd(r
[0].q
);
1200 STORE( &r
[0], 0, chan_index
);
1204 case TGSI_OPCODE_EXPBASE2
:
1205 /* TGSI_OPCODE_EX2 */
1206 FETCH(&r
[0], 0, CHAN_X
);
1208 r
[0].q
= micro_pow(mach
->Temps
[TEMP_2_I
].xyzw
[TEMP_2_C
].q
, r
[0].q
);
1210 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1211 STORE( &r
[0], 0, chan_index
);
1215 case TGSI_OPCODE_LOGBASE2
:
1216 /* TGSI_OPCODE_LG2 */
1217 FETCH( &r
[0], 0, CHAN_X
);
1218 r
[0].q
= micro_lg2(r
[0].q
);
1219 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1220 STORE( &r
[0], 0, chan_index
);
1224 case TGSI_OPCODE_POWER
:
1225 /* TGSI_OPCODE_POW */
1226 FETCH(&r
[0], 0, CHAN_X
);
1227 FETCH(&r
[1], 1, CHAN_X
);
1229 r
[0].q
= micro_pow(r
[0].q
, r
[1].q
);
1231 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1232 STORE( &r
[0], 0, chan_index
);
1236 case TGSI_OPCODE_CROSSPRODUCT
:
1237 /* TGSI_OPCODE_XPD */
1238 FETCH(&r
[0], 0, CHAN_Y
);
1239 FETCH(&r
[1], 1, CHAN_Z
);
1240 FETCH(&r
[3], 0, CHAN_Z
);
1241 FETCH(&r
[4], 1, CHAN_Y
);
1243 /* r2 = (r0 * r1) - (r3 * r5)
1245 r
[2].q
= si_fm(r
[3].q
, r
[5].q
);
1246 r
[2].q
= si_fms(r
[0].q
, r
[1].q
, r
[2].q
);
1248 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1249 STORE( &r
[2], 0, CHAN_X
);
1252 FETCH(&r
[2], 1, CHAN_X
);
1253 FETCH(&r
[5], 0, CHAN_X
);
1255 /* r3 = (r3 * r2) - (r1 * r5)
1257 r
[1].q
= si_fm(r
[1].q
, r
[5].q
);
1258 r
[3].q
= si_fms(r
[3].q
, r
[2].q
, r
[1].q
);
1260 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1261 STORE( &r
[3], 0, CHAN_Y
);
1264 /* r5 = (r5 * r4) - (r0 * r2)
1266 r
[0].q
= si_fm(r
[0].q
, r
[2].q
);
1267 r
[5].q
= si_fms(r
[5].q
, r
[4].q
, r
[0].q
);
1269 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1270 STORE( &r
[5], 0, CHAN_Z
);
1273 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1274 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
1278 case TGSI_OPCODE_MULTIPLYMATRIX
:
1282 case TGSI_OPCODE_ABS
:
1283 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1284 FETCH(&r
[0], 0, chan_index
);
1286 r
[0].q
= micro_abs(r
[0].q
);
1288 STORE(&r
[0], 0, chan_index
);
1292 case TGSI_OPCODE_RCC
:
1296 case TGSI_OPCODE_DPH
:
1297 FETCH(&r
[0], 0, CHAN_X
);
1298 FETCH(&r
[1], 1, CHAN_X
);
1300 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1302 FETCH(&r
[1], 0, CHAN_Y
);
1303 FETCH(&r
[2], 1, CHAN_Y
);
1305 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1307 FETCH(&r
[1], 0, CHAN_Z
);
1308 FETCH(&r
[2], 1, CHAN_Z
);
1310 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1312 FETCH(&r
[1], 1, CHAN_W
);
1314 r
[0].q
= si_fa(r
[0].q
, r
[1].q
);
1316 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1317 STORE( &r
[0], 0, chan_index
);
1321 case TGSI_OPCODE_COS
:
1322 FETCH(&r
[0], 0, CHAN_X
);
1324 r
[0].q
= micro_cos(r
[0].q
);
1326 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1327 STORE( &r
[0], 0, chan_index
);
1331 case TGSI_OPCODE_DDX
:
1332 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1333 FETCH( &r
[0], 0, chan_index
);
1334 r
[0].q
= micro_ddx(r
[0].q
);
1335 STORE( &r
[0], 0, chan_index
);
1339 case TGSI_OPCODE_DDY
:
1340 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1341 FETCH( &r
[0], 0, chan_index
);
1342 r
[0].q
= micro_ddy(r
[0].q
);
1343 STORE( &r
[0], 0, chan_index
);
1347 case TGSI_OPCODE_KILP
:
1348 exec_kilp (mach
, inst
);
1351 case TGSI_OPCODE_KIL
:
1352 exec_kil (mach
, inst
);
1355 case TGSI_OPCODE_PK2H
:
1359 case TGSI_OPCODE_PK2US
:
1363 case TGSI_OPCODE_PK4B
:
1367 case TGSI_OPCODE_PK4UB
:
1371 case TGSI_OPCODE_RFL
:
1375 case TGSI_OPCODE_SEQ
:
1376 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1377 FETCH( &r
[0], 0, chan_index
);
1378 FETCH( &r
[1], 1, chan_index
);
1380 r
[0].q
= si_fceq(r
[0].q
, r
[1].q
);
1382 STORE( &r
[0], 0, chan_index
);
1386 case TGSI_OPCODE_SFL
:
1390 case TGSI_OPCODE_SGT
:
1391 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1392 FETCH( &r
[0], 0, chan_index
);
1393 FETCH( &r
[1], 1, chan_index
);
1394 r
[0].q
= si_fcgt(r
[0].q
, r
[1].q
);
1395 STORE( &r
[0], 0, chan_index
);
1399 case TGSI_OPCODE_SIN
:
1400 FETCH( &r
[0], 0, CHAN_X
);
1401 r
[0].q
= micro_sin(r
[0].q
);
1402 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1403 STORE( &r
[0], 0, chan_index
);
1407 case TGSI_OPCODE_SLE
:
1408 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1409 FETCH( &r
[0], 0, chan_index
);
1410 FETCH( &r
[1], 1, chan_index
);
1412 r
[0].q
= si_fcgt(r
[0].q
, r
[1].q
);
1413 r
[0].q
= si_xori(r
[0].q
, 0xff);
1415 STORE( &r
[0], 0, chan_index
);
1419 case TGSI_OPCODE_SNE
:
1420 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1421 FETCH( &r
[0], 0, chan_index
);
1422 FETCH( &r
[1], 1, chan_index
);
1424 r
[0].q
= si_fceq(r
[0].q
, r
[1].q
);
1425 r
[0].q
= si_xori(r
[0].q
, 0xff);
1427 STORE( &r
[0], 0, chan_index
);
1431 case TGSI_OPCODE_STR
:
1435 case TGSI_OPCODE_TEX
:
1436 /* simple texture lookup */
1437 /* src[0] = texcoord */
1438 /* src[1] = sampler unit */
1439 exec_tex(mach
, inst
, FALSE
, FALSE
);
1442 case TGSI_OPCODE_TXB
:
1443 /* Texture lookup with lod bias */
1444 /* src[0] = texcoord (src[0].w = lod bias) */
1445 /* src[1] = sampler unit */
1446 exec_tex(mach
, inst
, TRUE
, FALSE
);
1449 case TGSI_OPCODE_TXD
:
1450 /* Texture lookup with explicit partial derivatives */
1451 /* src[0] = texcoord */
1452 /* src[1] = d[strq]/dx */
1453 /* src[2] = d[strq]/dy */
1454 /* src[3] = sampler unit */
1458 case TGSI_OPCODE_TXL
:
1459 /* Texture lookup with explicit LOD */
1460 /* src[0] = texcoord (src[0].w = lod) */
1461 /* src[1] = sampler unit */
1462 exec_tex(mach
, inst
, TRUE
, FALSE
);
1465 case TGSI_OPCODE_TXP
:
1466 /* Texture lookup with projection */
1467 /* src[0] = texcoord (src[0].w = projection) */
1468 /* src[1] = sampler unit */
1469 exec_tex(mach
, inst
, TRUE
, TRUE
);
1472 case TGSI_OPCODE_UP2H
:
1476 case TGSI_OPCODE_UP2US
:
1480 case TGSI_OPCODE_UP4B
:
1484 case TGSI_OPCODE_UP4UB
:
1488 case TGSI_OPCODE_X2D
:
1492 case TGSI_OPCODE_ARA
:
1496 case TGSI_OPCODE_ARR
:
1500 case TGSI_OPCODE_BRA
:
1504 case TGSI_OPCODE_CAL
:
1505 /* skip the call if no execution channels are enabled */
1506 if (mach
->ExecMask
) {
1509 /* push the Cond, Loop, Cont stacks */
1510 ASSERT(mach
->CondStackTop
< TGSI_EXEC_MAX_COND_NESTING
);
1511 mach
->CondStack
[mach
->CondStackTop
++] = mach
->CondMask
;
1512 ASSERT(mach
->LoopStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1513 mach
->LoopStack
[mach
->LoopStackTop
++] = mach
->LoopMask
;
1514 ASSERT(mach
->ContStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1515 mach
->ContStack
[mach
->ContStackTop
++] = mach
->ContMask
;
1517 ASSERT(mach
->FuncStackTop
< TGSI_EXEC_MAX_CALL_NESTING
);
1518 mach
->FuncStack
[mach
->FuncStackTop
++] = mach
->FuncMask
;
1520 /* note that PC was already incremented above */
1521 mach
->CallStack
[mach
->CallStackTop
++] = *pc
;
1522 *pc
= inst
->InstructionExtLabel
.Label
;
1526 case TGSI_OPCODE_RET
:
1527 mach
->FuncMask
&= ~mach
->ExecMask
;
1528 UPDATE_EXEC_MASK(mach
);
1530 if (mach
->ExecMask
== 0x0) {
1531 /* really return now (otherwise, keep executing) */
1533 if (mach
->CallStackTop
== 0) {
1534 /* returning from main() */
1538 *pc
= mach
->CallStack
[--mach
->CallStackTop
];
1540 /* pop the Cond, Loop, Cont stacks */
1541 ASSERT(mach
->CondStackTop
> 0);
1542 mach
->CondMask
= mach
->CondStack
[--mach
->CondStackTop
];
1543 ASSERT(mach
->LoopStackTop
> 0);
1544 mach
->LoopMask
= mach
->LoopStack
[--mach
->LoopStackTop
];
1545 ASSERT(mach
->ContStackTop
> 0);
1546 mach
->ContMask
= mach
->ContStack
[--mach
->ContStackTop
];
1547 ASSERT(mach
->FuncStackTop
> 0);
1548 mach
->FuncMask
= mach
->FuncStack
[--mach
->FuncStackTop
];
1550 UPDATE_EXEC_MASK(mach
);
1554 case TGSI_OPCODE_SSG
:
1558 case TGSI_OPCODE_CMP
:
1559 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1560 FETCH(&r
[0], 0, chan_index
);
1561 FETCH(&r
[1], 1, chan_index
);
1562 FETCH(&r
[2], 2, chan_index
);
1564 /* r0 = (r0 < 0.0) ? r1 : r2 */
1566 r
[3].q
= si_xor(r
[3].q
, r
[3].q
);
1567 r
[0].q
= micro_lt(r
[0].q
, r
[3].q
);
1568 r
[0].q
= si_selb(r
[1].q
, r
[2].q
, r
[0].q
);
1570 STORE(&r
[0], 0, chan_index
);
1574 case TGSI_OPCODE_SCS
:
1575 if( IS_CHANNEL_ENABLED( *inst
, CHAN_X
) || IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1576 FETCH( &r
[0], 0, CHAN_X
);
1578 if( IS_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1579 r
[1].q
= micro_cos(r
[0].q
);
1580 STORE( &r
[1], 0, CHAN_X
);
1582 if( IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1583 r
[1].q
= micro_sin(r
[0].q
);
1584 STORE( &r
[1], 0, CHAN_Y
);
1586 if( IS_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1587 STORE( &mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
], 0, CHAN_Z
);
1589 if( IS_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1590 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
1594 case TGSI_OPCODE_NRM
:
1598 case TGSI_OPCODE_DIV
:
1602 case TGSI_OPCODE_DP2
:
1603 FETCH( &r
[0], 0, CHAN_X
);
1604 FETCH( &r
[1], 1, CHAN_X
);
1605 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1607 FETCH( &r
[1], 0, CHAN_Y
);
1608 FETCH( &r
[2], 1, CHAN_Y
);
1609 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1611 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1612 STORE( &r
[0], 0, chan_index
);
1616 case TGSI_OPCODE_IF
:
1618 ASSERT(mach
->CondStackTop
< TGSI_EXEC_MAX_COND_NESTING
);
1619 mach
->CondStack
[mach
->CondStackTop
++] = mach
->CondMask
;
1620 FETCH( &r
[0], 0, CHAN_X
);
1621 /* update CondMask */
1623 mach
->CondMask
&= ~0x1;
1626 mach
->CondMask
&= ~0x2;
1629 mach
->CondMask
&= ~0x4;
1632 mach
->CondMask
&= ~0x8;
1634 UPDATE_EXEC_MASK(mach
);
1635 /* Todo: If CondMask==0, jump to ELSE */
1638 case TGSI_OPCODE_ELSE
:
1639 /* invert CondMask wrt previous mask */
1642 ASSERT(mach
->CondStackTop
> 0);
1643 prevMask
= mach
->CondStack
[mach
->CondStackTop
- 1];
1644 mach
->CondMask
= ~mach
->CondMask
& prevMask
;
1645 UPDATE_EXEC_MASK(mach
);
1646 /* Todo: If CondMask==0, jump to ENDIF */
1650 case TGSI_OPCODE_ENDIF
:
1652 ASSERT(mach
->CondStackTop
> 0);
1653 mach
->CondMask
= mach
->CondStack
[--mach
->CondStackTop
];
1654 UPDATE_EXEC_MASK(mach
);
1657 case TGSI_OPCODE_END
:
1658 /* halt execution */
1662 case TGSI_OPCODE_REP
:
1666 case TGSI_OPCODE_ENDREP
:
1670 case TGSI_OPCODE_PUSHA
:
1674 case TGSI_OPCODE_POPA
:
1678 case TGSI_OPCODE_CEIL
:
1679 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1680 FETCH( &r
[0], 0, chan_index
);
1681 r
[0].q
= micro_ceil(r
[0].q
);
1682 STORE( &r
[0], 0, chan_index
);
1686 case TGSI_OPCODE_I2F
:
1687 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1688 FETCH( &r
[0], 0, chan_index
);
1689 r
[0].q
= si_csflt(r
[0].q
, 0);
1690 STORE( &r
[0], 0, chan_index
);
1694 case TGSI_OPCODE_NOT
:
1695 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1696 FETCH( &r
[0], 0, chan_index
);
1697 r
[0].q
= si_xorbi(r
[0].q
, 0xff);
1698 STORE( &r
[0], 0, chan_index
);
1702 case TGSI_OPCODE_TRUNC
:
1703 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1704 FETCH( &r
[0], 0, chan_index
);
1705 r
[0].q
= micro_trunc(r
[0].q
);
1706 STORE( &r
[0], 0, chan_index
);
1710 case TGSI_OPCODE_SHL
:
1711 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1712 FETCH( &r
[0], 0, chan_index
);
1713 FETCH( &r
[1], 1, chan_index
);
1715 r
[0].q
= si_shl(r
[0].q
, r
[1].q
);
1717 STORE( &r
[0], 0, chan_index
);
1721 case TGSI_OPCODE_SHR
:
1722 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1723 FETCH( &r
[0], 0, chan_index
);
1724 FETCH( &r
[1], 1, chan_index
);
1725 r
[0].q
= micro_ishr(r
[0].q
, r
[1].q
);
1726 STORE( &r
[0], 0, chan_index
);
1730 case TGSI_OPCODE_AND
:
1731 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1732 FETCH( &r
[0], 0, chan_index
);
1733 FETCH( &r
[1], 1, chan_index
);
1734 r
[0].q
= si_and(r
[0].q
, r
[1].q
);
1735 STORE( &r
[0], 0, chan_index
);
1739 case TGSI_OPCODE_OR
:
1740 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1741 FETCH( &r
[0], 0, chan_index
);
1742 FETCH( &r
[1], 1, chan_index
);
1743 r
[0].q
= si_or(r
[0].q
, r
[1].q
);
1744 STORE( &r
[0], 0, chan_index
);
1748 case TGSI_OPCODE_MOD
:
1752 case TGSI_OPCODE_XOR
:
1753 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1754 FETCH( &r
[0], 0, chan_index
);
1755 FETCH( &r
[1], 1, chan_index
);
1756 r
[0].q
= si_xor(r
[0].q
, r
[1].q
);
1757 STORE( &r
[0], 0, chan_index
);
1761 case TGSI_OPCODE_SAD
:
1765 case TGSI_OPCODE_TXF
:
1769 case TGSI_OPCODE_TXQ
:
1773 case TGSI_OPCODE_EMIT
:
1774 mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0] += 16;
1775 mach
->Primitives
[mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]]++;
1778 case TGSI_OPCODE_ENDPRIM
:
1779 mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]++;
1780 mach
->Primitives
[mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]] = 0;
1783 case TGSI_OPCODE_LOOP
:
1784 /* fall-through (for now) */
1785 case TGSI_OPCODE_BGNLOOP2
:
1786 /* push LoopMask and ContMasks */
1787 ASSERT(mach
->LoopStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1788 mach
->LoopStack
[mach
->LoopStackTop
++] = mach
->LoopMask
;
1789 ASSERT(mach
->ContStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1790 mach
->ContStack
[mach
->ContStackTop
++] = mach
->ContMask
;
1793 case TGSI_OPCODE_ENDLOOP
:
1794 /* fall-through (for now at least) */
1795 case TGSI_OPCODE_ENDLOOP2
:
1796 /* Restore ContMask, but don't pop */
1797 ASSERT(mach
->ContStackTop
> 0);
1798 mach
->ContMask
= mach
->ContStack
[mach
->ContStackTop
- 1];
1799 if (mach
->LoopMask
) {
1800 /* repeat loop: jump to instruction just past BGNLOOP */
1801 *pc
= inst
->InstructionExtLabel
.Label
+ 1;
1804 /* exit loop: pop LoopMask */
1805 ASSERT(mach
->LoopStackTop
> 0);
1806 mach
->LoopMask
= mach
->LoopStack
[--mach
->LoopStackTop
];
1808 ASSERT(mach
->ContStackTop
> 0);
1809 mach
->ContMask
= mach
->ContStack
[--mach
->ContStackTop
];
1811 UPDATE_EXEC_MASK(mach
);
1814 case TGSI_OPCODE_BRK
:
1815 /* turn off loop channels for each enabled exec channel */
1816 mach
->LoopMask
&= ~mach
->ExecMask
;
1817 /* Todo: if mach->LoopMask == 0, jump to end of loop */
1818 UPDATE_EXEC_MASK(mach
);
1821 case TGSI_OPCODE_CONT
:
1822 /* turn off cont channels for each enabled exec channel */
1823 mach
->ContMask
&= ~mach
->ExecMask
;
1824 /* Todo: if mach->ContMask == 0, jump to end of loop */
1825 UPDATE_EXEC_MASK(mach
);
1828 case TGSI_OPCODE_BGNSUB
:
1832 case TGSI_OPCODE_ENDSUB
:
1836 case TGSI_OPCODE_NOISE1
:
1840 case TGSI_OPCODE_NOISE2
:
1844 case TGSI_OPCODE_NOISE3
:
1848 case TGSI_OPCODE_NOISE4
:
1852 case TGSI_OPCODE_NOP
:
1862 * Run TGSI interpreter.
1863 * \return bitmask of "alive" quad components
1866 spu_exec_machine_run( struct spu_exec_machine
*mach
)
1871 mach
->CondMask
= 0xf;
1872 mach
->LoopMask
= 0xf;
1873 mach
->ContMask
= 0xf;
1874 mach
->FuncMask
= 0xf;
1875 mach
->ExecMask
= 0xf;
1877 mach
->CondStackTop
= 0; /* temporarily subvert this ASSERTion */
1878 ASSERT(mach
->CondStackTop
== 0);
1879 ASSERT(mach
->LoopStackTop
== 0);
1880 ASSERT(mach
->ContStackTop
== 0);
1881 ASSERT(mach
->CallStackTop
== 0);
1883 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] = 0;
1884 mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0] = 0;
1886 if( mach
->Processor
== TGSI_PROCESSOR_GEOMETRY
) {
1887 mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0] = 0;
1888 mach
->Primitives
[0] = 0;
1892 /* execute declarations (interpolants) */
1893 if( mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
1894 for (i
= 0; i
< mach
->NumDeclarations
; i
++) {
1896 struct tgsi_full_declaration decl
;
1897 qword buffer
[ROUNDUP16(sizeof(struct tgsi_full_declaration
)) / 16];
1899 unsigned ea
= (unsigned) (mach
->Declarations
+ pc
);
1901 spu_dcache_fetch_unaligned(d
.buffer
, ea
, sizeof(d
.decl
));
1903 exec_declaration( mach
, &d
.decl
);
1907 /* execute instructions, until pc is set to -1 */
1910 struct tgsi_full_instruction inst
;
1911 qword buffer
[ROUNDUP16(sizeof(struct tgsi_full_instruction
)) / 16];
1913 unsigned ea
= (unsigned) (mach
->Instructions
+ pc
);
1915 spu_dcache_fetch_unaligned(i
.buffer
, ea
, sizeof(i
.inst
));
1916 exec_instruction( mach
, & i
.inst
, &pc
);
1920 /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
1921 if (mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
1923 * Scale back depth component.
1925 for (i
= 0; i
< 4; i
++)
1926 mach
->Outputs
[0].xyzw
[2].f
[i
] *= ctx
->DrawBuffer
->_DepthMaxF
;
1930 return ~mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0];