1 /**************************************************************************
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
29 * TGSI interpreter/executor.
31 * Flow control information:
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers. This is the ExecMask.
43 * The ExecMask is computed from four other masks (CondMask, LoopMask,
44 * ContMask and FuncMask); the first three are controlled by the flow
45 * control instructions (namely: IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
54 #include <spu_mfcio.h>
55 #include <transpose_matrix4x4.h>
56 #include <simdmath/ceilf4.h>
57 #include <simdmath/cosf4.h>
58 #include <simdmath/divf4.h>
59 #include <simdmath/floorf4.h>
60 #include <simdmath/log2f4.h>
61 #include <simdmath/powf4.h>
62 #include <simdmath/sinf4.h>
63 #include <simdmath/sqrtf4.h>
64 #include <simdmath/truncf4.h>
66 #include "pipe/p_compiler.h"
67 #include "pipe/p_state.h"
68 #include "pipe/p_util.h"
69 #include "pipe/p_shader_tokens.h"
70 #include "tgsi/util/tgsi_parse.h"
71 #include "tgsi/util/tgsi_util.h"
74 #include "spu_vertex_shader.h"
76 #define TILE_TOP_LEFT 0
77 #define TILE_TOP_RIGHT 1
78 #define TILE_BOTTOM_LEFT 2
79 #define TILE_BOTTOM_RIGHT 3
82 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
84 #define TEMP_0_I TGSI_EXEC_TEMP_00000000_I
85 #define TEMP_0_C TGSI_EXEC_TEMP_00000000_C
86 #define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I
87 #define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C
88 #define TEMP_80_I TGSI_EXEC_TEMP_80000000_I
89 #define TEMP_80_C TGSI_EXEC_TEMP_80000000_C
90 #define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I
91 #define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C
92 #define TEMP_1_I TGSI_EXEC_TEMP_ONE_I
93 #define TEMP_1_C TGSI_EXEC_TEMP_ONE_C
94 #define TEMP_2_I TGSI_EXEC_TEMP_TWO_I
95 #define TEMP_2_C TGSI_EXEC_TEMP_TWO_C
96 #define TEMP_128_I TGSI_EXEC_TEMP_128_I
97 #define TEMP_128_C TGSI_EXEC_TEMP_128_C
98 #define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I
99 #define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C
100 #define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I
101 #define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C
102 #define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I
103 #define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C
104 #define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
105 #define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
106 #define TEMP_R0 TGSI_EXEC_TEMP_R0
/** Loop header that steps CHAN through the four channel indices 0..3. */
#define FOR_EACH_CHANNEL(CHAN) \
   for ((CHAN) = 0; (CHAN) < 4; (CHAN)++)

/**
 * Non-zero when bit CHAN is set in the write mask of destination register
 * number DST of instruction INST.
 */
#define IS_DST_CHANNEL_ENABLED(INST, DST, CHAN) \
   ((INST).FullDstRegisters[DST].DstRegister.WriteMask & (1 << (CHAN)))

/** Write-mask test for the first destination register. */
#define IS_CHANNEL_ENABLED(INST, CHAN) \
   IS_DST_CHANNEL_ENABLED(INST, 0, CHAN)

/** Write-mask test for the second destination register. */
#define IS_CHANNEL_ENABLED2(INST, CHAN) \
   IS_DST_CHANNEL_ENABLED(INST, 1, CHAN)

/**
 * Loop header that visits only the channels enabled in the write mask of
 * the first destination register.  The expansion ends in an `if`, so the
 * statement that follows is that `if`'s body; beware of a trailing `else`.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN) \
   FOR_EACH_CHANNEL( CHAN ) \
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/** Same as FOR_EACH_ENABLED_CHANNEL, for the second destination register. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN) \
   FOR_EACH_CHANNEL( CHAN ) \
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
/**
 * Recompute the execution mask of machine MACH (a pointer expression).
 *
 * ExecMask is the bitwise AND of the conditional, loop, continue and
 * function masks.  MACH is evaluated more than once, so it must not have
 * side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
   ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                       (MACH)->ContMask & (MACH)->FuncMask)
139 * Initialize machine state by expanding tokens to full instructions,
140 * allocating temporary storage, setting up constants, etc.
141 * After this, we can call spu_exec_machine_run() many times.
144 spu_exec_machine_init(struct spu_exec_machine
*mach
,
146 struct spu_sampler
*samplers
,
149 const qword zero
= si_il(0);
150 const qword not_zero
= si_il(~0);
152 mach
->Samplers
= samplers
;
153 mach
->Processor
= processor
;
154 mach
->Addrs
= &mach
->Temps
[TGSI_EXEC_NUM_TEMPS
];
156 /* Setup constants. */
157 mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
= zero
;
158 mach
->Temps
[TEMP_FF_I
].xyzw
[TEMP_FF_C
].q
= not_zero
;
159 mach
->Temps
[TEMP_7F_I
].xyzw
[TEMP_7F_C
].q
= si_shli(not_zero
, -1);
160 mach
->Temps
[TEMP_80_I
].xyzw
[TEMP_80_C
].q
= si_shli(not_zero
, 31);
162 mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
= (qword
) spu_splats(1.0f
);
163 mach
->Temps
[TEMP_2_I
].xyzw
[TEMP_2_C
].q
= (qword
) spu_splats(2.0f
);
164 mach
->Temps
[TEMP_128_I
].xyzw
[TEMP_128_C
].q
= (qword
) spu_splats(128.0f
);
165 mach
->Temps
[TEMP_M128_I
].xyzw
[TEMP_M128_C
].q
= (qword
) spu_splats(-128.0f
);
172 return si_rotmi(si_shli(src
, 1), -1);
176 micro_ceil(qword src
)
178 return (qword
) _ceilf4((vec_float4
) src
);
184 return (qword
) _cosf4((vec_float4
) src
);
187 static const qword br_shuf
= {
188 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
189 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
190 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
191 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
192 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
193 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
194 TILE_BOTTOM_RIGHT
+ 0, TILE_BOTTOM_RIGHT
+ 1,
195 TILE_BOTTOM_RIGHT
+ 2, TILE_BOTTOM_RIGHT
+ 3,
198 static const qword bl_shuf
= {
199 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
200 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
201 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
202 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
203 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
204 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
205 TILE_BOTTOM_LEFT
+ 0, TILE_BOTTOM_LEFT
+ 1,
206 TILE_BOTTOM_LEFT
+ 2, TILE_BOTTOM_LEFT
+ 3,
209 static const qword tl_shuf
= {
210 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
211 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
212 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
213 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
214 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
215 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
216 TILE_TOP_LEFT
+ 0, TILE_TOP_LEFT
+ 1,
217 TILE_TOP_LEFT
+ 2, TILE_TOP_LEFT
+ 3,
223 qword bottom_right
= si_shufb(src
, src
, br_shuf
);
224 qword bottom_left
= si_shufb(src
, src
, bl_shuf
);
226 return si_fs(bottom_right
, bottom_left
);
232 qword top_left
= si_shufb(src
, src
, tl_shuf
);
233 qword bottom_left
= si_shufb(src
, src
, bl_shuf
);
235 return si_fs(top_left
, bottom_left
);
239 micro_div(qword src0
, qword src1
)
241 return (qword
) _divf4((vec_float4
) src0
, (vec_float4
) src1
);
247 return (qword
) _floorf4((vec_float4
) src
);
253 return si_fs(src
, (qword
) _floorf4((vec_float4
) src
));
257 micro_ge(qword src0
, qword src1
)
259 return si_or(si_fceq(src0
, src1
), si_fcgt(src0
, src1
));
265 return (qword
) _log2f4((vec_float4
) src
);
269 micro_lt(qword src0
, qword src1
)
271 const qword tmp
= si_or(si_fceq(src0
, src1
), si_fcgt(src0
, src1
));
273 return si_xori(tmp
, 0xff);
277 micro_max(qword src0
, qword src1
)
279 return si_selb(src1
, src0
, si_fcgt(src0
, src1
));
283 micro_min(qword src0
, qword src1
)
285 return si_selb(src0
, src1
, si_fcgt(src0
, src1
));
291 return si_xor(src
, (qword
) spu_splats(0x80000000));
295 micro_set_sign(qword src
)
297 return si_or(src
, (qword
) spu_splats(0x80000000));
301 micro_pow(qword src0
, qword src1
)
303 return (qword
) _powf4((vec_float4
) src0
, (vec_float4
) src1
);
309 const qword half
= (qword
) spu_splats(0.5f
);
311 /* May be able to use _roundf4. There may be some difference, though.
313 return (qword
) _floorf4((vec_float4
) si_fa(src
, half
));
317 micro_ishr(qword src0
, qword src1
)
319 return si_rotma(src0
, si_sfi(src1
, 0));
323 micro_trunc(qword src
)
325 return (qword
) _truncf4((vec_float4
) src
);
331 return (qword
) _sinf4((vec_float4
) src
);
335 micro_sqrt(qword src
)
337 return (qword
) _sqrtf4((vec_float4
) src
);
341 fetch_src_file_channel(
342 const struct spu_exec_machine
*mach
,
345 const union spu_exec_channel
*index
,
346 union spu_exec_channel
*chan
)
349 case TGSI_EXTSWIZZLE_X
:
350 case TGSI_EXTSWIZZLE_Y
:
351 case TGSI_EXTSWIZZLE_Z
:
352 case TGSI_EXTSWIZZLE_W
:
354 case TGSI_FILE_CONSTANT
: {
355 unsigned char buffer
[32] ALIGN16_ATTRIB
;
358 for (i
= 0; i
< 4; i
++) {
359 const float *ptr
= mach
->Consts
[index
->i
[i
]];
360 const uint64_t addr
= (uint64_t)(uintptr_t) ptr
;
361 const unsigned size
= ((addr
& 0x0f) == 0) ? 16 : 32;
363 mfc_get(buffer
, addr
& ~0x0f, size
, TAG_VERTEX_BUFFER
, 0, 0);
364 wait_on_mask(1 << TAG_VERTEX_BUFFER
);
366 (void) memcpy(& chan
->f
[i
], &buffer
[(addr
& 0x0f)
367 + (sizeof(float) * swizzle
)], sizeof(float));
372 case TGSI_FILE_INPUT
:
373 chan
->u
[0] = mach
->Inputs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
374 chan
->u
[1] = mach
->Inputs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
375 chan
->u
[2] = mach
->Inputs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
376 chan
->u
[3] = mach
->Inputs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
379 case TGSI_FILE_TEMPORARY
:
380 chan
->u
[0] = mach
->Temps
[index
->i
[0]].xyzw
[swizzle
].u
[0];
381 chan
->u
[1] = mach
->Temps
[index
->i
[1]].xyzw
[swizzle
].u
[1];
382 chan
->u
[2] = mach
->Temps
[index
->i
[2]].xyzw
[swizzle
].u
[2];
383 chan
->u
[3] = mach
->Temps
[index
->i
[3]].xyzw
[swizzle
].u
[3];
386 case TGSI_FILE_IMMEDIATE
:
387 assert( index
->i
[0] < (int) mach
->ImmLimit
);
388 assert( index
->i
[1] < (int) mach
->ImmLimit
);
389 assert( index
->i
[2] < (int) mach
->ImmLimit
);
390 assert( index
->i
[3] < (int) mach
->ImmLimit
);
392 chan
->f
[0] = mach
->Imms
[index
->i
[0]][swizzle
];
393 chan
->f
[1] = mach
->Imms
[index
->i
[1]][swizzle
];
394 chan
->f
[2] = mach
->Imms
[index
->i
[2]][swizzle
];
395 chan
->f
[3] = mach
->Imms
[index
->i
[3]][swizzle
];
398 case TGSI_FILE_ADDRESS
:
399 chan
->u
[0] = mach
->Addrs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
400 chan
->u
[1] = mach
->Addrs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
401 chan
->u
[2] = mach
->Addrs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
402 chan
->u
[3] = mach
->Addrs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
405 case TGSI_FILE_OUTPUT
:
406 /* vertex/fragment output vars can be read too */
407 chan
->u
[0] = mach
->Outputs
[index
->i
[0]].xyzw
[swizzle
].u
[0];
408 chan
->u
[1] = mach
->Outputs
[index
->i
[1]].xyzw
[swizzle
].u
[1];
409 chan
->u
[2] = mach
->Outputs
[index
->i
[2]].xyzw
[swizzle
].u
[2];
410 chan
->u
[3] = mach
->Outputs
[index
->i
[3]].xyzw
[swizzle
].u
[3];
418 case TGSI_EXTSWIZZLE_ZERO
:
419 *chan
= mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
];
422 case TGSI_EXTSWIZZLE_ONE
:
423 *chan
= mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
];
433 const struct spu_exec_machine
*mach
,
434 union spu_exec_channel
*chan
,
435 const struct tgsi_full_src_register
*reg
,
436 const uint chan_index
)
438 union spu_exec_channel index
;
444 index
.i
[3] = reg
->SrcRegister
.Index
;
446 if (reg
->SrcRegister
.Indirect
) {
447 union spu_exec_channel index2
;
448 union spu_exec_channel indir_index
;
453 index2
.i
[3] = reg
->SrcRegisterInd
.Index
;
455 swizzle
= tgsi_util_get_src_register_swizzle(®
->SrcRegisterInd
,
457 fetch_src_file_channel(
459 reg
->SrcRegisterInd
.File
,
464 index
.q
= si_a(index
.q
, indir_index
.q
);
467 if( reg
->SrcRegister
.Dimension
) {
468 switch( reg
->SrcRegister
.File
) {
469 case TGSI_FILE_INPUT
:
470 index
.q
= si_mpyi(index
.q
, 17);
472 case TGSI_FILE_CONSTANT
:
473 index
.q
= si_shli(index
.q
, 12);
479 index
.i
[0] += reg
->SrcRegisterDim
.Index
;
480 index
.i
[1] += reg
->SrcRegisterDim
.Index
;
481 index
.i
[2] += reg
->SrcRegisterDim
.Index
;
482 index
.i
[3] += reg
->SrcRegisterDim
.Index
;
484 if (reg
->SrcRegisterDim
.Indirect
) {
485 union spu_exec_channel index2
;
486 union spu_exec_channel indir_index
;
491 index2
.i
[3] = reg
->SrcRegisterDimInd
.Index
;
493 swizzle
= tgsi_util_get_src_register_swizzle( ®
->SrcRegisterDimInd
, CHAN_X
);
494 fetch_src_file_channel(
496 reg
->SrcRegisterDimInd
.File
,
501 index
.q
= si_a(index
.q
, indir_index
.q
);
505 swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
506 fetch_src_file_channel(
508 reg
->SrcRegister
.File
,
513 switch (tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
)) {
514 case TGSI_UTIL_SIGN_CLEAR
:
515 chan
->q
= micro_abs(chan
->q
);
518 case TGSI_UTIL_SIGN_SET
:
519 chan
->q
= micro_set_sign(chan
->q
);
522 case TGSI_UTIL_SIGN_TOGGLE
:
523 chan
->q
= micro_neg(chan
->q
);
526 case TGSI_UTIL_SIGN_KEEP
:
530 if (reg
->SrcRegisterExtMod
.Complement
) {
531 chan
->q
= si_fs(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, chan
->q
);
537 struct spu_exec_machine
*mach
,
538 const union spu_exec_channel
*chan
,
539 const struct tgsi_full_dst_register
*reg
,
540 const struct tgsi_full_instruction
*inst
,
543 union spu_exec_channel
*dst
;
545 switch( reg
->DstRegister
.File
) {
549 case TGSI_FILE_OUTPUT
:
550 dst
= &mach
->Outputs
[mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0]
551 + reg
->DstRegister
.Index
].xyzw
[chan_index
];
554 case TGSI_FILE_TEMPORARY
:
555 dst
= &mach
->Temps
[reg
->DstRegister
.Index
].xyzw
[chan_index
];
558 case TGSI_FILE_ADDRESS
:
559 dst
= &mach
->Addrs
[reg
->DstRegister
.Index
].xyzw
[chan_index
];
567 switch (inst
->Instruction
.Saturate
)
570 if (mach
->ExecMask
& 0x1)
571 dst
->i
[0] = chan
->i
[0];
572 if (mach
->ExecMask
& 0x2)
573 dst
->i
[1] = chan
->i
[1];
574 if (mach
->ExecMask
& 0x4)
575 dst
->i
[2] = chan
->i
[2];
576 if (mach
->ExecMask
& 0x8)
577 dst
->i
[3] = chan
->i
[3];
580 case TGSI_SAT_ZERO_ONE
:
581 /* XXX need to obey ExecMask here */
582 dst
->q
= micro_max(chan
->q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
583 dst
->q
= micro_min(dst
->q
, mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
);
586 case TGSI_SAT_MINUS_PLUS_ONE
:
/**
 * Shorthand for fetch_source(): read channel CHAN of source operand INDEX
 * into VAL.  Expects `mach` and `inst` to be in scope at the point of use.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)

/**
 * Shorthand for store_dest(): write VAL to channel CHAN of destination
 * operand INDEX.  Expects `mach` and `inst` to be in scope at the point of
 * use.
 */
#define STORE(VAL, INDEX, CHAN) \
   store_dest(mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN)
603 * Execute ARB-style KIL which is predicated by a src register.
604 * Kill fragment if any of the four values is less than zero.
607 exec_kilp(struct spu_exec_machine
*mach
,
608 const struct tgsi_full_instruction
*inst
)
612 uint kilmask
= 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
613 union spu_exec_channel r
[1];
615 /* This mask stores component bits that were already tested. Note that
616 * we test if the value is less than zero, so 1.0 and 0.0 need not be
618 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
620 for (chan_index
= 0; chan_index
< 4; chan_index
++)
625 /* unswizzle channel */
626 swizzle
= tgsi_util_get_full_src_register_extswizzle (
627 &inst
->FullSrcRegisters
[0],
630 /* check if the component has not been already tested */
631 if (uniquemask
& (1 << swizzle
))
633 uniquemask
|= 1 << swizzle
;
635 FETCH(&r
[0], 0, chan_index
);
636 for (i
= 0; i
< 4; i
++)
637 if (r
[0].f
[i
] < 0.0f
)
641 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] |= kilmask
;
646 * Fetch a texel using STR texture coordinates.
649 fetch_texel( struct spu_sampler
*sampler
,
650 const union spu_exec_channel
*s
,
651 const union spu_exec_channel
*t
,
652 const union spu_exec_channel
*p
,
653 float lodbias
, /* XXX should be float[4] */
654 union spu_exec_channel
*r
,
655 union spu_exec_channel
*g
,
656 union spu_exec_channel
*b
,
657 union spu_exec_channel
*a
)
662 sampler
->get_samples(sampler
, s
->f
, t
->f
, p
->f
, lodbias
, (float *) rgba
);
664 _transpose_matrix4x4(out
, rgba
);
673 exec_tex(struct spu_exec_machine
*mach
,
674 const struct tgsi_full_instruction
*inst
,
677 const uint unit
= inst
->FullSrcRegisters
[1].SrcRegister
.Index
;
678 union spu_exec_channel r
[8];
682 /* printf("Sampler %u unit %u\n", sampler, unit); */
684 switch (inst
->InstructionExtTexture
.Texture
) {
685 case TGSI_TEXTURE_1D
:
687 FETCH(&r
[0], 0, CHAN_X
);
689 switch (inst
->FullSrcRegisters
[0].SrcRegisterExtSwz
.ExtDivide
) {
690 case TGSI_EXTSWIZZLE_W
:
691 FETCH(&r
[1], 0, CHAN_W
);
692 r
[0].q
= micro_div(r
[0].q
, r
[1].q
);
695 case TGSI_EXTSWIZZLE_ONE
:
703 FETCH(&r
[1], 0, CHAN_W
);
709 fetch_texel(&mach
->Samplers
[unit
],
710 &r
[0], NULL
, NULL
, lodBias
, /* S, T, P, BIAS */
711 &r
[0], &r
[1], &r
[2], &r
[3]); /* R, G, B, A */
714 case TGSI_TEXTURE_2D
:
715 case TGSI_TEXTURE_RECT
:
717 FETCH(&r
[0], 0, CHAN_X
);
718 FETCH(&r
[1], 0, CHAN_Y
);
719 FETCH(&r
[2], 0, CHAN_Z
);
721 switch (inst
->FullSrcRegisters
[0].SrcRegisterExtSwz
.ExtDivide
) {
722 case TGSI_EXTSWIZZLE_W
:
723 FETCH(&r
[3], 0, CHAN_W
);
724 r
[0].q
= micro_div(r
[0].q
, r
[3].q
);
725 r
[1].q
= micro_div(r
[1].q
, r
[3].q
);
726 r
[2].q
= micro_div(r
[2].q
, r
[3].q
);
729 case TGSI_EXTSWIZZLE_ONE
:
737 FETCH(&r
[3], 0, CHAN_W
);
743 fetch_texel(&mach
->Samplers
[unit
],
744 &r
[0], &r
[1], &r
[2], lodBias
, /* inputs */
745 &r
[0], &r
[1], &r
[2], &r
[3]); /* outputs */
748 case TGSI_TEXTURE_3D
:
749 case TGSI_TEXTURE_CUBE
:
751 FETCH(&r
[0], 0, CHAN_X
);
752 FETCH(&r
[1], 0, CHAN_Y
);
753 FETCH(&r
[2], 0, CHAN_Z
);
755 switch (inst
->FullSrcRegisters
[0].SrcRegisterExtSwz
.ExtDivide
) {
756 case TGSI_EXTSWIZZLE_W
:
757 FETCH(&r
[3], 0, CHAN_W
);
758 r
[0].q
= micro_div(r
[0].q
, r
[3].q
);
759 r
[1].q
= micro_div(r
[1].q
, r
[3].q
);
760 r
[2].q
= micro_div(r
[2].q
, r
[3].q
);
763 case TGSI_EXTSWIZZLE_ONE
:
771 FETCH(&r
[3], 0, CHAN_W
);
777 fetch_texel(&mach
->Samplers
[unit
],
778 &r
[0], &r
[1], &r
[2], lodBias
,
779 &r
[0], &r
[1], &r
[2], &r
[3]);
786 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
787 STORE( &r
[chan_index
], 0, chan_index
);
794 constant_interpolation(
795 struct spu_exec_machine
*mach
,
801 for( i
= 0; i
< QUAD_SIZE
; i
++ ) {
802 mach
->Inputs
[attrib
].xyzw
[chan
].f
[i
] = mach
->InterpCoefs
[attrib
].a0
[chan
];
807 linear_interpolation(
808 struct spu_exec_machine
*mach
,
812 const float x
= mach
->QuadPos
.xyzw
[0].f
[0];
813 const float y
= mach
->QuadPos
.xyzw
[1].f
[0];
814 const float dadx
= mach
->InterpCoefs
[attrib
].dadx
[chan
];
815 const float dady
= mach
->InterpCoefs
[attrib
].dady
[chan
];
816 const float a0
= mach
->InterpCoefs
[attrib
].a0
[chan
] + dadx
* x
+ dady
* y
;
817 mach
->Inputs
[attrib
].xyzw
[chan
].f
[0] = a0
;
818 mach
->Inputs
[attrib
].xyzw
[chan
].f
[1] = a0
+ dadx
;
819 mach
->Inputs
[attrib
].xyzw
[chan
].f
[2] = a0
+ dady
;
820 mach
->Inputs
[attrib
].xyzw
[chan
].f
[3] = a0
+ dadx
+ dady
;
824 perspective_interpolation(
825 struct spu_exec_machine
*mach
,
829 const float x
= mach
->QuadPos
.xyzw
[0].f
[0];
830 const float y
= mach
->QuadPos
.xyzw
[1].f
[0];
831 const float dadx
= mach
->InterpCoefs
[attrib
].dadx
[chan
];
832 const float dady
= mach
->InterpCoefs
[attrib
].dady
[chan
];
833 const float a0
= mach
->InterpCoefs
[attrib
].a0
[chan
] + dadx
* x
+ dady
* y
;
834 const float *w
= mach
->QuadPos
.xyzw
[3].f
;
835 /* divide by W here */
836 mach
->Inputs
[attrib
].xyzw
[chan
].f
[0] = a0
/ w
[0];
837 mach
->Inputs
[attrib
].xyzw
[chan
].f
[1] = (a0
+ dadx
) / w
[1];
838 mach
->Inputs
[attrib
].xyzw
[chan
].f
[2] = (a0
+ dady
) / w
[2];
839 mach
->Inputs
[attrib
].xyzw
[chan
].f
[3] = (a0
+ dadx
+ dady
) / w
[3];
843 typedef void (* interpolation_func
)(
844 struct spu_exec_machine
*mach
,
849 exec_declaration(struct spu_exec_machine
*mach
,
850 const struct tgsi_full_declaration
*decl
)
852 if( mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
853 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
854 unsigned first
, last
, mask
;
855 interpolation_func interp
;
857 assert( decl
->Declaration
.Declare
== TGSI_DECLARE_RANGE
);
859 first
= decl
->u
.DeclarationRange
.First
;
860 last
= decl
->u
.DeclarationRange
.Last
;
861 mask
= decl
->Declaration
.UsageMask
;
863 switch( decl
->Interpolation
.Interpolate
) {
864 case TGSI_INTERPOLATE_CONSTANT
:
865 interp
= constant_interpolation
;
868 case TGSI_INTERPOLATE_LINEAR
:
869 interp
= linear_interpolation
;
872 case TGSI_INTERPOLATE_PERSPECTIVE
:
873 interp
= perspective_interpolation
;
880 if( mask
== TGSI_WRITEMASK_XYZW
) {
883 for( i
= first
; i
<= last
; i
++ ) {
884 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
885 interp( mach
, i
, j
);
892 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
893 if( mask
& (1 << j
) ) {
894 for( i
= first
; i
<= last
; i
++ ) {
895 interp( mach
, i
, j
);
906 struct spu_exec_machine
*mach
,
907 const struct tgsi_full_instruction
*inst
,
911 union spu_exec_channel r
[8];
915 switch (inst
->Instruction
.Opcode
) {
916 case TGSI_OPCODE_ARL
:
917 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
918 FETCH( &r
[0], 0, chan_index
);
919 r
[0].q
= si_cflts(r
[0].q
, 0);
920 STORE( &r
[0], 0, chan_index
);
924 case TGSI_OPCODE_MOV
:
925 /* TGSI_OPCODE_SWZ */
926 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
927 FETCH( &r
[0], 0, chan_index
);
928 STORE( &r
[0], 0, chan_index
);
932 case TGSI_OPCODE_LIT
:
933 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
934 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_X
);
937 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) || IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
938 FETCH( &r
[0], 0, CHAN_X
);
939 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
940 r
[0].q
= micro_max(r
[0].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
941 STORE( &r
[0], 0, CHAN_Y
);
944 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
945 FETCH( &r
[1], 0, CHAN_Y
);
946 r
[1].q
= micro_max(r
[1].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
948 FETCH( &r
[2], 0, CHAN_W
);
949 r
[2].q
= micro_min(r
[2].q
, mach
->Temps
[TEMP_128_I
].xyzw
[TEMP_128_C
].q
);
950 r
[2].q
= micro_max(r
[2].q
, mach
->Temps
[TEMP_M128_I
].xyzw
[TEMP_M128_C
].q
);
951 r
[1].q
= micro_pow(r
[1].q
, r
[2].q
);
953 /* r0 = (r0 > 0.0) ? r1 : 0.0
955 r
[0].q
= si_fcgt(r
[0].q
, mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
);
956 r
[0].q
= si_selb(mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
].q
, r
[1].q
,
958 STORE( &r
[0], 0, CHAN_Z
);
962 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
963 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
967 case TGSI_OPCODE_RCP
:
968 /* TGSI_OPCODE_RECIP */
969 FETCH( &r
[0], 0, CHAN_X
);
970 r
[0].q
= micro_div(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, r
[0].q
);
971 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
972 STORE( &r
[0], 0, chan_index
);
976 case TGSI_OPCODE_RSQ
:
977 /* TGSI_OPCODE_RECIPSQRT */
978 FETCH( &r
[0], 0, CHAN_X
);
979 r
[0].q
= micro_sqrt(r
[0].q
);
980 r
[0].q
= micro_div(mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
].q
, r
[0].q
);
981 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
982 STORE( &r
[0], 0, chan_index
);
986 case TGSI_OPCODE_EXP
:
990 case TGSI_OPCODE_LOG
:
994 case TGSI_OPCODE_MUL
:
995 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
)
997 FETCH(&r
[0], 0, chan_index
);
998 FETCH(&r
[1], 1, chan_index
);
1000 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1002 STORE(&r
[0], 0, chan_index
);
1006 case TGSI_OPCODE_ADD
:
1007 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1008 FETCH( &r
[0], 0, chan_index
);
1009 FETCH( &r
[1], 1, chan_index
);
1010 r
[0].q
= si_fa(r
[0].q
, r
[1].q
);
1011 STORE( &r
[0], 0, chan_index
);
1015 case TGSI_OPCODE_DP3
:
1016 /* TGSI_OPCODE_DOT3 */
1017 FETCH( &r
[0], 0, CHAN_X
);
1018 FETCH( &r
[1], 1, CHAN_X
);
1019 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1021 FETCH( &r
[1], 0, CHAN_Y
);
1022 FETCH( &r
[2], 1, CHAN_Y
);
1023 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1026 FETCH( &r
[1], 0, CHAN_Z
);
1027 FETCH( &r
[2], 1, CHAN_Z
);
1028 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1030 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1031 STORE( &r
[0], 0, chan_index
);
1035 case TGSI_OPCODE_DP4
:
1036 /* TGSI_OPCODE_DOT4 */
1037 FETCH(&r
[0], 0, CHAN_X
);
1038 FETCH(&r
[1], 1, CHAN_X
);
1040 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1042 FETCH(&r
[1], 0, CHAN_Y
);
1043 FETCH(&r
[2], 1, CHAN_Y
);
1045 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1047 FETCH(&r
[1], 0, CHAN_Z
);
1048 FETCH(&r
[2], 1, CHAN_Z
);
1050 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1052 FETCH(&r
[1], 0, CHAN_W
);
1053 FETCH(&r
[2], 1, CHAN_W
);
1055 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1057 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1058 STORE( &r
[0], 0, chan_index
);
1062 case TGSI_OPCODE_DST
:
1063 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1064 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_X
);
1067 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1068 FETCH( &r
[0], 0, CHAN_Y
);
1069 FETCH( &r
[1], 1, CHAN_Y
);
1070 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1071 STORE( &r
[0], 0, CHAN_Y
);
1074 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1075 FETCH( &r
[0], 0, CHAN_Z
);
1076 STORE( &r
[0], 0, CHAN_Z
);
1079 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1080 FETCH( &r
[0], 1, CHAN_W
);
1081 STORE( &r
[0], 0, CHAN_W
);
1085 case TGSI_OPCODE_MIN
:
1086 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1087 FETCH(&r
[0], 0, chan_index
);
1088 FETCH(&r
[1], 1, chan_index
);
1090 r
[0].q
= micro_min(r
[0].q
, r
[1].q
);
1092 STORE(&r
[0], 0, chan_index
);
1096 case TGSI_OPCODE_MAX
:
1097 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1098 FETCH(&r
[0], 0, chan_index
);
1099 FETCH(&r
[1], 1, chan_index
);
1101 r
[0].q
= micro_max(r
[0].q
, r
[1].q
);
1103 STORE(&r
[0], 0, chan_index
);
1107 case TGSI_OPCODE_SLT
:
1108 /* TGSI_OPCODE_SETLT */
1109 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1110 FETCH( &r
[0], 0, chan_index
);
1111 FETCH( &r
[1], 1, chan_index
);
1113 r
[0].q
= micro_ge(r
[0].q
, r
[1].q
);
1114 r
[0].q
= si_xori(r
[0].q
, 0xff);
1116 STORE( &r
[0], 0, chan_index
);
1120 case TGSI_OPCODE_SGE
:
1121 /* TGSI_OPCODE_SETGE */
1122 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1123 FETCH( &r
[0], 0, chan_index
);
1124 FETCH( &r
[1], 1, chan_index
);
1125 r
[0].q
= micro_ge(r
[0].q
, r
[1].q
);
1126 STORE( &r
[0], 0, chan_index
);
1130 case TGSI_OPCODE_MAD
:
1131 /* TGSI_OPCODE_MADD */
1132 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1133 FETCH( &r
[0], 0, chan_index
);
1134 FETCH( &r
[1], 1, chan_index
);
1135 FETCH( &r
[2], 2, chan_index
);
1136 r
[0].q
= si_fma(r
[0].q
, r
[1].q
, r
[2].q
);
1137 STORE( &r
[0], 0, chan_index
);
1141 case TGSI_OPCODE_SUB
:
1142 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1143 FETCH(&r
[0], 0, chan_index
);
1144 FETCH(&r
[1], 1, chan_index
);
1146 r
[0].q
= si_fs(r
[0].q
, r
[1].q
);
1148 STORE(&r
[0], 0, chan_index
);
1152 case TGSI_OPCODE_LERP
:
1153 /* TGSI_OPCODE_LRP */
1154 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1155 FETCH(&r
[0], 0, chan_index
);
1156 FETCH(&r
[1], 1, chan_index
);
1157 FETCH(&r
[2], 2, chan_index
);
1159 r
[1].q
= si_fs(r
[1].q
, r
[2].q
);
1160 r
[0].q
= si_fma(r
[0].q
, r
[1].q
, r
[2].q
);
1162 STORE(&r
[0], 0, chan_index
);
1166 case TGSI_OPCODE_CND
:
1170 case TGSI_OPCODE_CND0
:
1174 case TGSI_OPCODE_DOT2ADD
:
1175 /* TGSI_OPCODE_DP2A */
1179 case TGSI_OPCODE_INDEX
:
1183 case TGSI_OPCODE_NEGATE
:
1187 case TGSI_OPCODE_FRAC
:
1188 /* TGSI_OPCODE_FRC */
1189 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1190 FETCH( &r
[0], 0, chan_index
);
1191 r
[0].q
= micro_frc(r
[0].q
);
1192 STORE( &r
[0], 0, chan_index
);
1196 case TGSI_OPCODE_CLAMP
:
1200 case TGSI_OPCODE_FLOOR
:
1201 /* TGSI_OPCODE_FLR */
1202 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1203 FETCH( &r
[0], 0, chan_index
);
1204 r
[0].q
= micro_flr(r
[0].q
);
1205 STORE( &r
[0], 0, chan_index
);
1209 case TGSI_OPCODE_ROUND
:
1210 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1211 FETCH( &r
[0], 0, chan_index
);
1212 r
[0].q
= micro_rnd(r
[0].q
);
1213 STORE( &r
[0], 0, chan_index
);
1217 case TGSI_OPCODE_EXPBASE2
:
1218 /* TGSI_OPCODE_EX2 */
1219 FETCH(&r
[0], 0, CHAN_X
);
1221 r
[0].q
= micro_pow(mach
->Temps
[TEMP_2_I
].xyzw
[TEMP_2_C
].q
, r
[0].q
);
1223 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1224 STORE( &r
[0], 0, chan_index
);
1228 case TGSI_OPCODE_LOGBASE2
:
1229 /* TGSI_OPCODE_LG2 */
1230 FETCH( &r
[0], 0, CHAN_X
);
1231 r
[0].q
= micro_lg2(r
[0].q
);
1232 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1233 STORE( &r
[0], 0, chan_index
);
1237 case TGSI_OPCODE_POWER
:
1238 /* TGSI_OPCODE_POW */
1239 FETCH(&r
[0], 0, CHAN_X
);
1240 FETCH(&r
[1], 1, CHAN_X
);
1242 r
[0].q
= micro_pow(r
[0].q
, r
[1].q
);
1244 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1245 STORE( &r
[0], 0, chan_index
);
1249 case TGSI_OPCODE_CROSSPRODUCT
:
1250 /* TGSI_OPCODE_XPD */
1251 FETCH(&r
[0], 0, CHAN_Y
);
1252 FETCH(&r
[1], 1, CHAN_Z
);
1253 FETCH(&r
[3], 0, CHAN_Z
);
1254 FETCH(&r
[4], 1, CHAN_Y
);
1256 /* r2 = (r0 * r1) - (r3 * r5)
1258 r
[2].q
= si_fm(r
[3].q
, r
[5].q
);
1259 r
[2].q
= si_fms(r
[0].q
, r
[1].q
, r
[2].q
);
1261 if (IS_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1262 STORE( &r
[2], 0, CHAN_X
);
1265 FETCH(&r
[2], 1, CHAN_X
);
1266 FETCH(&r
[5], 0, CHAN_X
);
1268 /* r3 = (r3 * r2) - (r1 * r5)
1270 r
[1].q
= si_fm(r
[1].q
, r
[5].q
);
1271 r
[3].q
= si_fms(r
[3].q
, r
[2].q
, r
[1].q
);
1273 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1274 STORE( &r
[3], 0, CHAN_Y
);
1277 /* r5 = (r5 * r4) - (r0 * r2)
1279 r
[0].q
= si_fm(r
[0].q
, r
[2].q
);
1280 r
[5].q
= si_fms(r
[5].q
, r
[4].q
, r
[0].q
);
1282 if (IS_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1283 STORE( &r
[5], 0, CHAN_Z
);
1286 if (IS_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1287 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
1291 case TGSI_OPCODE_MULTIPLYMATRIX
:
1295 case TGSI_OPCODE_ABS
:
1296 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1297 FETCH(&r
[0], 0, chan_index
);
1299 r
[0].q
= micro_abs(r
[0].q
);
1301 STORE(&r
[0], 0, chan_index
);
1305 case TGSI_OPCODE_RCC
:
1309 case TGSI_OPCODE_DPH
:
1310 FETCH(&r
[0], 0, CHAN_X
);
1311 FETCH(&r
[1], 1, CHAN_X
);
1313 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1315 FETCH(&r
[1], 0, CHAN_Y
);
1316 FETCH(&r
[2], 1, CHAN_Y
);
1318 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1320 FETCH(&r
[1], 0, CHAN_Z
);
1321 FETCH(&r
[2], 1, CHAN_Z
);
1323 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1325 FETCH(&r
[1], 1, CHAN_W
);
1327 r
[0].q
= si_fa(r
[0].q
, r
[1].q
);
1329 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1330 STORE( &r
[0], 0, chan_index
);
1334 case TGSI_OPCODE_COS
:
1335 FETCH(&r
[0], 0, CHAN_X
);
1337 r
[0].q
= micro_cos(r
[0].q
);
1339 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1340 STORE( &r
[0], 0, chan_index
);
1344 case TGSI_OPCODE_DDX
:
1345 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1346 FETCH( &r
[0], 0, chan_index
);
1347 r
[0].q
= micro_ddx(r
[0].q
);
1348 STORE( &r
[0], 0, chan_index
);
1352 case TGSI_OPCODE_DDY
:
1353 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1354 FETCH( &r
[0], 0, chan_index
);
1355 r
[0].q
= micro_ddy(r
[0].q
);
1356 STORE( &r
[0], 0, chan_index
);
1360 case TGSI_OPCODE_KILP
:
1361 exec_kilp (mach
, inst
);
1364 case TGSI_OPCODE_KIL
:
1365 /* for enabled ExecMask bits, set the killed bit */
1366 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] |= mach
->ExecMask
;
1369 case TGSI_OPCODE_PK2H
:
1373 case TGSI_OPCODE_PK2US
:
1377 case TGSI_OPCODE_PK4B
:
1381 case TGSI_OPCODE_PK4UB
:
1385 case TGSI_OPCODE_RFL
:
1389 case TGSI_OPCODE_SEQ
:
1390 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1391 FETCH( &r
[0], 0, chan_index
);
1392 FETCH( &r
[1], 1, chan_index
);
1394 r
[0].q
= si_fceq(r
[0].q
, r
[1].q
);
1396 STORE( &r
[0], 0, chan_index
);
1400 case TGSI_OPCODE_SFL
:
1404 case TGSI_OPCODE_SGT
:
1405 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1406 FETCH( &r
[0], 0, chan_index
);
1407 FETCH( &r
[1], 1, chan_index
);
1408 r
[0].q
= si_fcgt(r
[0].q
, r
[1].q
);
1409 STORE( &r
[0], 0, chan_index
);
1413 case TGSI_OPCODE_SIN
:
1414 FETCH( &r
[0], 0, CHAN_X
);
1415 r
[0].q
= micro_sin(r
[0].q
);
1416 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1417 STORE( &r
[0], 0, chan_index
);
1421 case TGSI_OPCODE_SLE
:
1422 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1423 FETCH( &r
[0], 0, chan_index
);
1424 FETCH( &r
[1], 1, chan_index
);
1426 r
[0].q
= si_fcgt(r
[0].q
, r
[1].q
);
1427 r
[0].q
= si_xori(r
[0].q
, 0xff);
1429 STORE( &r
[0], 0, chan_index
);
1433 case TGSI_OPCODE_SNE
:
1434 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1435 FETCH( &r
[0], 0, chan_index
);
1436 FETCH( &r
[1], 1, chan_index
);
1438 r
[0].q
= si_fceq(r
[0].q
, r
[1].q
);
1439 r
[0].q
= si_xori(r
[0].q
, 0xff);
1441 STORE( &r
[0], 0, chan_index
);
1445 case TGSI_OPCODE_STR
:
1449 case TGSI_OPCODE_TEX
:
1450 /* simple texture lookup */
1451 /* src[0] = texcoord */
1452 /* src[1] = sampler unit */
1453 exec_tex(mach
, inst
, FALSE
);
1456 case TGSI_OPCODE_TXB
:
1457 /* Texture lookup with lod bias */
1458 /* src[0] = texcoord (src[0].w = lod bias) */
1459 /* src[1] = sampler unit */
1460 exec_tex(mach
, inst
, TRUE
);
1463 case TGSI_OPCODE_TXD
:
1464 /* Texture lookup with explicit partial derivatives */
1465 /* src[0] = texcoord */
1466 /* src[1] = d[strq]/dx */
1467 /* src[2] = d[strq]/dy */
1468 /* src[3] = sampler unit */
1472 case TGSI_OPCODE_TXL
:
1473 /* Texture lookup with explicit LOD */
1474 /* src[0] = texcoord (src[0].w = lod bias) */
1475 /* src[1] = sampler unit */
1476 exec_tex(mach
, inst
, TRUE
);
1479 case TGSI_OPCODE_UP2H
:
1483 case TGSI_OPCODE_UP2US
:
1487 case TGSI_OPCODE_UP4B
:
1491 case TGSI_OPCODE_UP4UB
:
1495 case TGSI_OPCODE_X2D
:
1499 case TGSI_OPCODE_ARA
:
1503 case TGSI_OPCODE_ARR
:
1507 case TGSI_OPCODE_BRA
:
1511 case TGSI_OPCODE_CAL
:
1512 /* skip the call if no execution channels are enabled */
1513 if (mach
->ExecMask
) {
1516 /* push the Cond, Loop, Cont stacks */
1517 assert(mach
->CondStackTop
< TGSI_EXEC_MAX_COND_NESTING
);
1518 mach
->CondStack
[mach
->CondStackTop
++] = mach
->CondMask
;
1519 assert(mach
->LoopStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1520 mach
->LoopStack
[mach
->LoopStackTop
++] = mach
->LoopMask
;
1521 assert(mach
->ContStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1522 mach
->ContStack
[mach
->ContStackTop
++] = mach
->ContMask
;
1524 assert(mach
->FuncStackTop
< TGSI_EXEC_MAX_CALL_NESTING
);
1525 mach
->FuncStack
[mach
->FuncStackTop
++] = mach
->FuncMask
;
1527 /* note that PC was already incremented above */
1528 mach
->CallStack
[mach
->CallStackTop
++] = *pc
;
1529 *pc
= inst
->InstructionExtLabel
.Label
;
1533 case TGSI_OPCODE_RET
:
1534 mach
->FuncMask
&= ~mach
->ExecMask
;
1535 UPDATE_EXEC_MASK(mach
);
1537 if (mach
->ExecMask
== 0x0) {
1538 /* really return now (otherwise, keep executing) */
1540 if (mach
->CallStackTop
== 0) {
1541 /* returning from main() */
1545 *pc
= mach
->CallStack
[--mach
->CallStackTop
];
1547 /* pop the Cond, Loop, Cont stacks */
1548 assert(mach
->CondStackTop
> 0);
1549 mach
->CondMask
= mach
->CondStack
[--mach
->CondStackTop
];
1550 assert(mach
->LoopStackTop
> 0);
1551 mach
->LoopMask
= mach
->LoopStack
[--mach
->LoopStackTop
];
1552 assert(mach
->ContStackTop
> 0);
1553 mach
->ContMask
= mach
->ContStack
[--mach
->ContStackTop
];
1554 assert(mach
->FuncStackTop
> 0);
1555 mach
->FuncMask
= mach
->FuncStack
[--mach
->FuncStackTop
];
1557 UPDATE_EXEC_MASK(mach
);
1561 case TGSI_OPCODE_SSG
:
1565 case TGSI_OPCODE_CMP
:
1566 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1567 FETCH(&r
[0], 0, chan_index
);
1568 FETCH(&r
[1], 1, chan_index
);
1569 FETCH(&r
[2], 2, chan_index
);
1571 /* r0 = (r0 < 0.0) ? r1 : r2
1573 r
[3].q
= si_xor(r
[3].q
, r
[3].q
);
1574 r
[0].q
= micro_lt(r
[0].q
, r
[3].q
);
1575 r
[0].q
= si_selb(r
[1].q
, r
[2].q
, r
[0].q
);
1577 STORE(&r
[0], 0, chan_index
);
1581 case TGSI_OPCODE_SCS
:
1582 if( IS_CHANNEL_ENABLED( *inst
, CHAN_X
) || IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1583 FETCH( &r
[0], 0, CHAN_X
);
1585 if( IS_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1586 r
[1].q
= micro_cos(r
[0].q
);
1587 STORE( &r
[1], 0, CHAN_X
);
1589 if( IS_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1590 r
[1].q
= micro_sin(r
[0].q
);
1591 STORE( &r
[1], 0, CHAN_Y
);
1593 if( IS_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1594 STORE( &mach
->Temps
[TEMP_0_I
].xyzw
[TEMP_0_C
], 0, CHAN_Z
);
1596 if( IS_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1597 STORE( &mach
->Temps
[TEMP_1_I
].xyzw
[TEMP_1_C
], 0, CHAN_W
);
1601 case TGSI_OPCODE_NRM
:
1605 case TGSI_OPCODE_DIV
:
1609 case TGSI_OPCODE_DP2
:
1610 FETCH( &r
[0], 0, CHAN_X
);
1611 FETCH( &r
[1], 1, CHAN_X
);
1612 r
[0].q
= si_fm(r
[0].q
, r
[1].q
);
1614 FETCH( &r
[1], 0, CHAN_Y
);
1615 FETCH( &r
[2], 1, CHAN_Y
);
1616 r
[0].q
= si_fma(r
[1].q
, r
[2].q
, r
[0].q
);
1618 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1619 STORE( &r
[0], 0, chan_index
);
1623 case TGSI_OPCODE_IF
:
1625 assert(mach
->CondStackTop
< TGSI_EXEC_MAX_COND_NESTING
);
1626 mach
->CondStack
[mach
->CondStackTop
++] = mach
->CondMask
;
1627 FETCH( &r
[0], 0, CHAN_X
);
1628 /* update CondMask */
1630 mach
->CondMask
&= ~0x1;
1633 mach
->CondMask
&= ~0x2;
1636 mach
->CondMask
&= ~0x4;
1639 mach
->CondMask
&= ~0x8;
1641 UPDATE_EXEC_MASK(mach
);
1642 /* Todo: If CondMask==0, jump to ELSE */
1645 case TGSI_OPCODE_ELSE
:
1646 /* invert CondMask wrt previous mask */
1649 assert(mach
->CondStackTop
> 0);
1650 prevMask
= mach
->CondStack
[mach
->CondStackTop
- 1];
1651 mach
->CondMask
= ~mach
->CondMask
& prevMask
;
1652 UPDATE_EXEC_MASK(mach
);
1653 /* Todo: If CondMask==0, jump to ENDIF */
1657 case TGSI_OPCODE_ENDIF
:
1659 assert(mach
->CondStackTop
> 0);
1660 mach
->CondMask
= mach
->CondStack
[--mach
->CondStackTop
];
1661 UPDATE_EXEC_MASK(mach
);
1664 case TGSI_OPCODE_END
:
1665 /* halt execution */
1669 case TGSI_OPCODE_REP
:
1673 case TGSI_OPCODE_ENDREP
:
1677 case TGSI_OPCODE_PUSHA
:
1681 case TGSI_OPCODE_POPA
:
1685 case TGSI_OPCODE_CEIL
:
1686 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1687 FETCH( &r
[0], 0, chan_index
);
1688 r
[0].q
= micro_ceil(r
[0].q
);
1689 STORE( &r
[0], 0, chan_index
);
1693 case TGSI_OPCODE_I2F
:
1694 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1695 FETCH( &r
[0], 0, chan_index
);
1696 r
[0].q
= si_csflt(r
[0].q
, 0);
1697 STORE( &r
[0], 0, chan_index
);
1701 case TGSI_OPCODE_NOT
:
1702 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1703 FETCH( &r
[0], 0, chan_index
);
1704 r
[0].q
= si_xorbi(r
[0].q
, 0xff);
1705 STORE( &r
[0], 0, chan_index
);
1709 case TGSI_OPCODE_TRUNC
:
1710 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1711 FETCH( &r
[0], 0, chan_index
);
1712 r
[0].q
= micro_trunc(r
[0].q
);
1713 STORE( &r
[0], 0, chan_index
);
1717 case TGSI_OPCODE_SHL
:
1718 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1719 FETCH( &r
[0], 0, chan_index
);
1720 FETCH( &r
[1], 1, chan_index
);
1722 r
[0].q
= si_shl(r
[0].q
, r
[1].q
);
1724 STORE( &r
[0], 0, chan_index
);
1728 case TGSI_OPCODE_SHR
:
1729 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1730 FETCH( &r
[0], 0, chan_index
);
1731 FETCH( &r
[1], 1, chan_index
);
1732 r
[0].q
= micro_ishr(r
[0].q
, r
[1].q
);
1733 STORE( &r
[0], 0, chan_index
);
1737 case TGSI_OPCODE_AND
:
1738 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1739 FETCH( &r
[0], 0, chan_index
);
1740 FETCH( &r
[1], 1, chan_index
);
1741 r
[0].q
= si_and(r
[0].q
, r
[1].q
);
1742 STORE( &r
[0], 0, chan_index
);
1746 case TGSI_OPCODE_OR
:
1747 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1748 FETCH( &r
[0], 0, chan_index
);
1749 FETCH( &r
[1], 1, chan_index
);
1750 r
[0].q
= si_or(r
[0].q
, r
[1].q
);
1751 STORE( &r
[0], 0, chan_index
);
1755 case TGSI_OPCODE_MOD
:
1759 case TGSI_OPCODE_XOR
:
1760 FOR_EACH_ENABLED_CHANNEL( *inst
, chan_index
) {
1761 FETCH( &r
[0], 0, chan_index
);
1762 FETCH( &r
[1], 1, chan_index
);
1763 r
[0].q
= si_xor(r
[0].q
, r
[1].q
);
1764 STORE( &r
[0], 0, chan_index
);
1768 case TGSI_OPCODE_SAD
:
1772 case TGSI_OPCODE_TXF
:
1776 case TGSI_OPCODE_TXQ
:
1780 case TGSI_OPCODE_EMIT
:
1781 mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0] += 16;
1782 mach
->Primitives
[mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]]++;
1785 case TGSI_OPCODE_ENDPRIM
:
1786 mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]++;
1787 mach
->Primitives
[mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0]] = 0;
1790 case TGSI_OPCODE_LOOP
:
1791 /* fall-through (for now) */
1792 case TGSI_OPCODE_BGNLOOP2
:
1793 /* push LoopMask and ContMasks */
1794 assert(mach
->LoopStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1795 mach
->LoopStack
[mach
->LoopStackTop
++] = mach
->LoopMask
;
1796 assert(mach
->ContStackTop
< TGSI_EXEC_MAX_LOOP_NESTING
);
1797 mach
->ContStack
[mach
->ContStackTop
++] = mach
->ContMask
;
1800 case TGSI_OPCODE_ENDLOOP
:
1801 /* fall-through (for now at least) */
1802 case TGSI_OPCODE_ENDLOOP2
:
1803 /* Restore ContMask, but don't pop */
1804 assert(mach
->ContStackTop
> 0);
1805 mach
->ContMask
= mach
->ContStack
[mach
->ContStackTop
- 1];
1806 if (mach
->LoopMask
) {
1807 /* repeat loop: jump to instruction just past BGNLOOP */
1808 *pc
= inst
->InstructionExtLabel
.Label
+ 1;
1811 /* exit loop: pop LoopMask */
1812 assert(mach
->LoopStackTop
> 0);
1813 mach
->LoopMask
= mach
->LoopStack
[--mach
->LoopStackTop
];
1815 assert(mach
->ContStackTop
> 0);
1816 mach
->ContMask
= mach
->ContStack
[--mach
->ContStackTop
];
1818 UPDATE_EXEC_MASK(mach
);
1821 case TGSI_OPCODE_BRK
:
1822 /* turn off loop channels for each enabled exec channel */
1823 mach
->LoopMask
&= ~mach
->ExecMask
;
1824 /* Todo: if mach->LoopMask == 0, jump to end of loop */
1825 UPDATE_EXEC_MASK(mach
);
1828 case TGSI_OPCODE_CONT
:
1829 /* turn off cont channels for each enabled exec channel */
1830 mach
->ContMask
&= ~mach
->ExecMask
;
1831 /* Todo: if mach->LoopMask == 0, jump to end of loop */
1832 UPDATE_EXEC_MASK(mach
);
1835 case TGSI_OPCODE_BGNSUB
:
1839 case TGSI_OPCODE_ENDSUB
:
1843 case TGSI_OPCODE_NOISE1
:
1847 case TGSI_OPCODE_NOISE2
:
1851 case TGSI_OPCODE_NOISE3
:
1855 case TGSI_OPCODE_NOISE4
:
1859 case TGSI_OPCODE_NOP
:
1869 * Run TGSI interpreter.
1870 * \return bitmask of "alive" quad components
1873 spu_exec_machine_run( struct spu_exec_machine
*mach
)
/*
 * Resets the per-run mask and stack state in *mach, runs the declarations
 * (fragment programs only), then fetches and executes instructions one at
 * a time through exec_instruction().  The value returned is the bitwise
 * inverse of the kill mask accumulated during the run.
 *
 * NOTE(review): this listing omits some of the original lines (the embedded
 * numbering jumps 1873 to 1878, 1911 to 1915 and 1927 to 1931).  The
 * declarations of `i` and `pc`, and the loop header implied by the
 * "until pc is set to -1" comment below, are not visible here -- confirm
 * against the full file before changing anything.
 */
/* Enable all four quad components in every mask (0xf = four low bits). */
1878 mach
->CondMask
= 0xf;
1879 mach
->LoopMask
= 0xf;
1880 mach
->ContMask
= 0xf;
1881 mach
->FuncMask
= 0xf;
1882 mach
->ExecMask
= 0xf;
/*
 * CondStackTop is forced to 0 right here, so the first assert below can
 * never fire.  The loop, cont and call stack tops are only checked, not
 * reset.
 */
1884 mach
->CondStackTop
= 0; /* temporarily subvert this assertion */
1885 assert(mach
->CondStackTop
== 0);
1886 assert(mach
->LoopStackTop
== 0);
1887 assert(mach
->ContStackTop
== 0);
1888 assert(mach
->CallStackTop
== 0);
/*
 * Clear the kill mask (KIL ORs ExecMask into it; it is returned inverted
 * at the end) and the output counter (EMIT advances it by 16).
 */
1890 mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0] = 0;
1891 mach
->Temps
[TEMP_OUTPUT_I
].xyzw
[TEMP_OUTPUT_C
].u
[0] = 0;
/*
 * Geometry programs additionally start at primitive index 0 with
 * Primitives[0] cleared (EMIT increments the current slot, ENDPRIM moves
 * to the next slot and zeroes it).
 */
1893 if( mach
->Processor
== TGSI_PROCESSOR_GEOMETRY
) {
1894 mach
->Temps
[TEMP_PRIMITIVE_I
].xyzw
[TEMP_PRIMITIVE_C
].u
[0] = 0;
1895 mach
->Primitives
[0] = 0;
1899 /* execute declarations (interpolants) */
/*
 * Only fragment programs run their declarations.  Each declaration is
 * pulled into a local, 16-byte-aligned buffer with mfc_get (presumably an
 * MFC DMA read -- confirm) and then copied into `decl`.
 */
1900 if( mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
1901 for (i
= 0; i
< mach
->NumDeclarations
; i
++) {
1902 uint8_t buffer
[sizeof(struct tgsi_full_declaration
) + 32] ALIGN16_ATTRIB
;
1903 struct tgsi_full_declaration decl
;
1904 unsigned long decl_addr
= (unsigned long) (mach
->Declarations
+i
);
1905 unsigned size
= ((sizeof(decl
) + (decl_addr
& 0x0f) + 0x0f) & ~0x0f);
/*
 * The transfer starts at decl_addr rounded down to a 16-byte boundary.
 * `size` is sizeof(decl) plus that leading misalignment, rounded up to a
 * multiple of 16; the 32 spare bytes in `buffer` cover this padding.
 * The memcpy then skips the leading misalignment to reach the real data.
 */
1907 mfc_get(buffer
, decl_addr
& ~0x0f, size
, TAG_INSTRUCTION_FETCH
, 0, 0);
1908 wait_on_mask(1 << TAG_INSTRUCTION_FETCH
);
1910 memcpy(& decl
, buffer
+ (decl_addr
& 0x0f), sizeof(decl
));
1911 exec_declaration( mach
, &decl
);
1915 /* execute instructions, until pc is set to -1 */
/*
 * Same aligned-fetch scheme as for declarations, applied to the
 * instruction at index `pc`.
 */
1917 uint8_t buffer
[sizeof(struct tgsi_full_instruction
) + 32] ALIGN16_ATTRIB
;
1918 struct tgsi_full_instruction inst
;
1919 unsigned long inst_addr
= (unsigned long) (mach
->Instructions
+ pc
);
1920 unsigned size
= ((sizeof(inst
) + (inst_addr
& 0x0f) + 0x0f) & ~0x0f);
/* Guard against fetching past the end of the instruction array. */
1922 assert(pc
< mach
->NumInstructions
);
1923 mfc_get(buffer
, inst_addr
& ~0x0f, size
, TAG_INSTRUCTION_FETCH
, 0, 0);
1924 wait_on_mask(1 << TAG_INSTRUCTION_FETCH
);
1926 memcpy(& inst
, buffer
+ (inst_addr
& 0x0f), sizeof(inst
));
/*
 * pc is passed by address so the opcode handlers can redirect control
 * flow (CAL, RET and ENDLOOP all write *pc).
 */
1927 exec_instruction( mach
, & inst
, &pc
);
1931 /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
/*
 * NOTE(review): `ctx` is not a parameter of this function and is not
 * declared anywhere in the visible text.  Confirm whether this block is
 * compiled out in the full file; as shown it would not compile.
 */
1932 if (mach
->Processor
== TGSI_PROCESSOR_FRAGMENT
) {
1934 * Scale back depth component.
1936 for (i
= 0; i
< 4; i
++)
1937 mach
->Outputs
[0].xyzw
[2].f
[i
] *= ctx
->DrawBuffer
->_DepthMaxF
;
/*
 * Invert the kill mask so that set bits denote components that were not
 * killed (the "alive" bitmask described in the header comment).
 */
1941 return ~mach
->Temps
[TEMP_KILMASK_I
].xyzw
[TEMP_KILMASK_C
].u
[0];