Merge branch 'mesa_7_7_branch'
[mesa.git] / src / gallium / drivers / cell / spu / spu_exec.c
1 /**************************************************************************
2 *
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers. This is the ExecMask.
41 * See store_dest().
42 *
43 * The ExecMask is computed from four other masks (CondMask, LoopMask,
44 * ContMask and FuncMask) which are controlled by the flow control
45 * instructions (such as IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
46 *
47 *
48 * Authors:
49 * Michal Krol
50 * Brian Paul
51 */
52
53 #include <transpose_matrix4x4.h>
54 #include <simdmath/ceilf4.h>
55 #include <simdmath/cosf4.h>
56 #include <simdmath/divf4.h>
57 #include <simdmath/floorf4.h>
58 #include <simdmath/log2f4.h>
59 #include <simdmath/powf4.h>
60 #include <simdmath/sinf4.h>
61 #include <simdmath/sqrtf4.h>
62 #include <simdmath/truncf4.h>
63
64 #include "pipe/p_compiler.h"
65 #include "pipe/p_state.h"
66 #include "pipe/p_shader_tokens.h"
67 #include "tgsi/tgsi_parse.h"
68 #include "tgsi/tgsi_util.h"
69 #include "spu_exec.h"
70 #include "spu_main.h"
71 #include "spu_vertex_shader.h"
72 #include "spu_dcache.h"
73 #include "cell/common.h"
74
75 #define TILE_TOP_LEFT 0
76 #define TILE_TOP_RIGHT 1
77 #define TILE_BOTTOM_LEFT 2
78 #define TILE_BOTTOM_RIGHT 3
79
80 /*
81 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
82 */
83 #define TEMP_0_I TGSI_EXEC_TEMP_00000000_I
84 #define TEMP_0_C TGSI_EXEC_TEMP_00000000_C
85 #define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I
86 #define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C
87 #define TEMP_80_I TGSI_EXEC_TEMP_80000000_I
88 #define TEMP_80_C TGSI_EXEC_TEMP_80000000_C
89 #define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I
90 #define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C
91 #define TEMP_1_I TGSI_EXEC_TEMP_ONE_I
92 #define TEMP_1_C TGSI_EXEC_TEMP_ONE_C
93 #define TEMP_2_I TGSI_EXEC_TEMP_TWO_I
94 #define TEMP_2_C TGSI_EXEC_TEMP_TWO_C
95 #define TEMP_128_I TGSI_EXEC_TEMP_128_I
96 #define TEMP_128_C TGSI_EXEC_TEMP_128_C
97 #define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I
98 #define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C
99 #define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I
100 #define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C
101 #define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I
102 #define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C
103 #define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
104 #define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
105 #define TEMP_R0 TGSI_EXEC_TEMP_R0
106
/** Iterate CHAN over the four register channels (0..3). */
#define FOR_EACH_CHANNEL(CHAN)\
   for ((CHAN) = 0; (CHAN) < 4; (CHAN)++)

/** Non-zero if CHAN is set in the write mask of the first destination. */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/** Non-zero if CHAN is set in the write mask of the second destination. */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))

/**
 * Loop header: the statement that follows runs once for every channel
 * enabled in the first destination's write mask.  The expansion ends in a
 * bare "if", so do not follow the controlled statement with an "else".
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   FOR_EACH_CHANNEL( CHAN )\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/** Same as FOR_EACH_ENABLED_CHANNEL, but for the second destination. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   FOR_EACH_CHANNEL( CHAN )\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))


/**
 * The execution mask is the AND of the conditional, loop, continue and
 * function masks.  The argument and the whole expansion are parenthesized
 * so the macro is safe with any pointer expression.
 */
#define UPDATE_EXEC_MASK(MACH) \
   ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                       (MACH)->ContMask & (MACH)->FuncMask)
128
129
130 #define CHAN_X 0
131 #define CHAN_Y 1
132 #define CHAN_Z 2
133 #define CHAN_W 3
134
135
136
137 /**
138 * Initialize machine state by expanding tokens to full instructions,
139 * allocating temporary storage, setting up constants, etc.
140 * After this, we can call spu_exec_machine_run() many times.
141 */
142 void
143 spu_exec_machine_init(struct spu_exec_machine *mach,
144 uint numSamplers,
145 struct spu_sampler *samplers,
146 unsigned processor)
147 {
148 const qword zero = si_il(0);
149 const qword not_zero = si_il(~0);
150
151 (void) numSamplers;
152 mach->Samplers = samplers;
153 mach->Processor = processor;
154 mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
155
156 /* Setup constants. */
157 mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero;
158 mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero;
159 mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_shli(not_zero, -1);
160 mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31);
161
162 mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f);
163 mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f);
164 mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f);
165 mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f);
166 }
167
168
169 static INLINE qword
170 micro_abs(qword src)
171 {
172 return si_rotmi(si_shli(src, 1), -1);
173 }
174
175 static INLINE qword
176 micro_ceil(qword src)
177 {
178 return (qword) _ceilf4((vec_float4) src);
179 }
180
181 static INLINE qword
182 micro_cos(qword src)
183 {
184 return (qword) _cosf4((vec_float4) src);
185 }
186
187 static const qword br_shuf = {
188 TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
189 TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
190 TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
191 TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
192 TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
193 TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
194 TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
195 TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
196 };
197
198 static const qword bl_shuf = {
199 TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
200 TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
201 TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
202 TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
203 TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
204 TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
205 TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
206 TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
207 };
208
209 static const qword tl_shuf = {
210 TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
211 TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
212 TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
213 TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
214 TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
215 TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
216 TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
217 TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
218 };
219
220 static qword
221 micro_ddx(qword src)
222 {
223 qword bottom_right = si_shufb(src, src, br_shuf);
224 qword bottom_left = si_shufb(src, src, bl_shuf);
225
226 return si_fs(bottom_right, bottom_left);
227 }
228
229 static qword
230 micro_ddy(qword src)
231 {
232 qword top_left = si_shufb(src, src, tl_shuf);
233 qword bottom_left = si_shufb(src, src, bl_shuf);
234
235 return si_fs(top_left, bottom_left);
236 }
237
238 static INLINE qword
239 micro_div(qword src0, qword src1)
240 {
241 return (qword) _divf4((vec_float4) src0, (vec_float4) src1);
242 }
243
244 static qword
245 micro_flr(qword src)
246 {
247 return (qword) _floorf4((vec_float4) src);
248 }
249
250 static qword
251 micro_frc(qword src)
252 {
253 return si_fs(src, (qword) _floorf4((vec_float4) src));
254 }
255
256 static INLINE qword
257 micro_ge(qword src0, qword src1)
258 {
259 return si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
260 }
261
262 static qword
263 micro_lg2(qword src)
264 {
265 return (qword) _log2f4((vec_float4) src);
266 }
267
268 static INLINE qword
269 micro_lt(qword src0, qword src1)
270 {
271 const qword tmp = si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
272
273 return si_xori(tmp, 0xff);
274 }
275
276 static INLINE qword
277 micro_max(qword src0, qword src1)
278 {
279 return si_selb(src1, src0, si_fcgt(src0, src1));
280 }
281
282 static INLINE qword
283 micro_min(qword src0, qword src1)
284 {
285 return si_selb(src0, src1, si_fcgt(src0, src1));
286 }
287
288 static qword
289 micro_neg(qword src)
290 {
291 return si_xor(src, (qword) spu_splats(0x80000000));
292 }
293
294 static qword
295 micro_set_sign(qword src)
296 {
297 return si_or(src, (qword) spu_splats(0x80000000));
298 }
299
300 static qword
301 micro_pow(qword src0, qword src1)
302 {
303 return (qword) _powf4((vec_float4) src0, (vec_float4) src1);
304 }
305
306 static qword
307 micro_rnd(qword src)
308 {
309 const qword half = (qword) spu_splats(0.5f);
310
311 /* May be able to use _roundf4. There may be some difference, though.
312 */
313 return (qword) _floorf4((vec_float4) si_fa(src, half));
314 }
315
316 static INLINE qword
317 micro_ishr(qword src0, qword src1)
318 {
319 return si_rotma(src0, si_sfi(src1, 0));
320 }
321
322 static qword
323 micro_trunc(qword src)
324 {
325 return (qword) _truncf4((vec_float4) src);
326 }
327
328 static qword
329 micro_sin(qword src)
330 {
331 return (qword) _sinf4((vec_float4) src);
332 }
333
334 static INLINE qword
335 micro_sqrt(qword src)
336 {
337 return (qword) _sqrtf4((vec_float4) src);
338 }
339
340 static void
341 fetch_src_file_channel(
342 const struct spu_exec_machine *mach,
343 const uint file,
344 const uint swizzle,
345 const union spu_exec_channel *index,
346 union spu_exec_channel *chan )
347 {
348 switch( swizzle ) {
349 case TGSI_SWIZZLE_X:
350 case TGSI_SWIZZLE_Y:
351 case TGSI_SWIZZLE_Z:
352 case TGSI_SWIZZLE_W:
353 switch( file ) {
354 case TGSI_FILE_CONSTANT: {
355 unsigned i;
356
357 for (i = 0; i < 4; i++) {
358 const float *ptr = mach->Consts[index->i[i]];
359 float tmp[4];
360
361 spu_dcache_fetch_unaligned((qword *) tmp,
362 (uintptr_t)(ptr + swizzle),
363 sizeof(float));
364
365 chan->f[i] = tmp[0];
366 }
367 break;
368 }
369
370 case TGSI_FILE_INPUT:
371 chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
372 chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
373 chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
374 chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
375 break;
376
377 case TGSI_FILE_TEMPORARY:
378 chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
379 chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
380 chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
381 chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
382 break;
383
384 case TGSI_FILE_IMMEDIATE:
385 ASSERT( index->i[0] < (int) mach->ImmLimit );
386 ASSERT( index->i[1] < (int) mach->ImmLimit );
387 ASSERT( index->i[2] < (int) mach->ImmLimit );
388 ASSERT( index->i[3] < (int) mach->ImmLimit );
389
390 chan->f[0] = mach->Imms[index->i[0]][swizzle];
391 chan->f[1] = mach->Imms[index->i[1]][swizzle];
392 chan->f[2] = mach->Imms[index->i[2]][swizzle];
393 chan->f[3] = mach->Imms[index->i[3]][swizzle];
394 break;
395
396 case TGSI_FILE_ADDRESS:
397 chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
398 chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
399 chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
400 chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
401 break;
402
403 case TGSI_FILE_OUTPUT:
404 /* vertex/fragment output vars can be read too */
405 chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
406 chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
407 chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
408 chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
409 break;
410
411 default:
412 ASSERT( 0 );
413 }
414 break;
415
416 default:
417 ASSERT( 0 );
418 }
419 }
420
421 static void
422 fetch_source(
423 const struct spu_exec_machine *mach,
424 union spu_exec_channel *chan,
425 const struct tgsi_full_src_register *reg,
426 const uint chan_index )
427 {
428 union spu_exec_channel index;
429 uint swizzle;
430
431 index.i[0] =
432 index.i[1] =
433 index.i[2] =
434 index.i[3] = reg->Register.Index;
435
436 if (reg->Register.Indirect) {
437 union spu_exec_channel index2;
438 union spu_exec_channel indir_index;
439
440 index2.i[0] =
441 index2.i[1] =
442 index2.i[2] =
443 index2.i[3] = reg->Indirect.Index;
444
445 swizzle = tgsi_util_get_src_register_swizzle(&reg->Indirect,
446 CHAN_X);
447 fetch_src_file_channel(
448 mach,
449 reg->Indirect.File,
450 swizzle,
451 &index2,
452 &indir_index );
453
454 index.q = si_a(index.q, indir_index.q);
455 }
456
457 if( reg->Register.Dimension ) {
458 switch( reg->Register.File ) {
459 case TGSI_FILE_INPUT:
460 index.q = si_mpyi(index.q, 17);
461 break;
462 case TGSI_FILE_CONSTANT:
463 index.q = si_shli(index.q, 12);
464 break;
465 default:
466 ASSERT( 0 );
467 }
468
469 index.i[0] += reg->Dimension.Index;
470 index.i[1] += reg->Dimension.Index;
471 index.i[2] += reg->Dimension.Index;
472 index.i[3] += reg->Dimension.Index;
473
474 if (reg->Dimension.Indirect) {
475 union spu_exec_channel index2;
476 union spu_exec_channel indir_index;
477
478 index2.i[0] =
479 index2.i[1] =
480 index2.i[2] =
481 index2.i[3] = reg->DimIndirect.Index;
482
483 swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
484 fetch_src_file_channel(
485 mach,
486 reg->DimIndirect.File,
487 swizzle,
488 &index2,
489 &indir_index );
490
491 index.q = si_a(index.q, indir_index.q);
492 }
493 }
494
495 swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
496 fetch_src_file_channel(
497 mach,
498 reg->Register.File,
499 swizzle,
500 &index,
501 chan );
502
503 switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
504 case TGSI_UTIL_SIGN_CLEAR:
505 chan->q = micro_abs(chan->q);
506 break;
507
508 case TGSI_UTIL_SIGN_SET:
509 chan->q = micro_set_sign(chan->q);
510 break;
511
512 case TGSI_UTIL_SIGN_TOGGLE:
513 chan->q = micro_neg(chan->q);
514 break;
515
516 case TGSI_UTIL_SIGN_KEEP:
517 break;
518 }
519
520 if (reg->RegisterExtMod.Complement) {
521 chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q);
522 }
523 }
524
525 static void
526 store_dest(
527 struct spu_exec_machine *mach,
528 const union spu_exec_channel *chan,
529 const struct tgsi_full_dst_register *reg,
530 const struct tgsi_full_instruction *inst,
531 uint chan_index )
532 {
533 union spu_exec_channel *dst;
534
535 switch( reg->Register.File ) {
536 case TGSI_FILE_NULL:
537 return;
538
539 case TGSI_FILE_OUTPUT:
540 dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
541 + reg->Register.Index].xyzw[chan_index];
542 break;
543
544 case TGSI_FILE_TEMPORARY:
545 dst = &mach->Temps[reg->Register.Index].xyzw[chan_index];
546 break;
547
548 case TGSI_FILE_ADDRESS:
549 dst = &mach->Addrs[reg->Register.Index].xyzw[chan_index];
550 break;
551
552 default:
553 ASSERT( 0 );
554 return;
555 }
556
557 switch (inst->Instruction.Saturate)
558 {
559 case TGSI_SAT_NONE:
560 if (mach->ExecMask & 0x1)
561 dst->i[0] = chan->i[0];
562 if (mach->ExecMask & 0x2)
563 dst->i[1] = chan->i[1];
564 if (mach->ExecMask & 0x4)
565 dst->i[2] = chan->i[2];
566 if (mach->ExecMask & 0x8)
567 dst->i[3] = chan->i[3];
568 break;
569
570 case TGSI_SAT_ZERO_ONE:
571 /* XXX need to obey ExecMask here */
572 dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
573 dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q);
574 break;
575
576 case TGSI_SAT_MINUS_PLUS_ONE:
577 ASSERT( 0 );
578 break;
579
580 default:
581 ASSERT( 0 );
582 }
583 }
584
585 #define FETCH(VAL,INDEX,CHAN)\
586 fetch_source (mach, VAL, &inst->Src[INDEX], CHAN)
587
588 #define STORE(VAL,INDEX,CHAN)\
589 store_dest (mach, VAL, &inst->Dst[INDEX], inst, CHAN )
590
591
592 /**
593 * Execute ARB-style KIL which is predicated by a src register.
594 * Kill fragment if any of the four values is less than zero.
595 */
596 static void
597 exec_kil(struct spu_exec_machine *mach,
598 const struct tgsi_full_instruction *inst)
599 {
600 uint uniquemask;
601 uint chan_index;
602 uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
603 union spu_exec_channel r[1];
604
605 /* This mask stores component bits that were already tested. */
606 uniquemask = 0;
607
608 for (chan_index = 0; chan_index < 4; chan_index++)
609 {
610 uint swizzle;
611 uint i;
612
613 /* unswizzle channel */
614 swizzle = tgsi_util_get_full_src_register_swizzle (
615 &inst->Src[0],
616 chan_index);
617
618 /* check if the component has not been already tested */
619 if (uniquemask & (1 << swizzle))
620 continue;
621 uniquemask |= 1 << swizzle;
622
623 FETCH(&r[0], 0, chan_index);
624 for (i = 0; i < 4; i++)
625 if (r[0].f[i] < 0.0f)
626 kilmask |= 1 << i;
627 }
628
629 mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
630 }
631
632 /**
633 * Execute NVIDIA-style KIL which is predicated by a condition code.
634 * Kill fragment if the condition code is TRUE.
635 */
636 static void
637 exec_kilp(struct tgsi_exec_machine *mach,
638 const struct tgsi_full_instruction *inst)
639 {
640 uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
641
642 /* TODO: build kilmask from CC mask */
643
644 mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
645 }
646
647 /*
648 * Fetch a texel using STR texture coordinates.
649 */
650 static void
651 fetch_texel( struct spu_sampler *sampler,
652 const union spu_exec_channel *s,
653 const union spu_exec_channel *t,
654 const union spu_exec_channel *p,
655 float lodbias, /* XXX should be float[4] */
656 union spu_exec_channel *r,
657 union spu_exec_channel *g,
658 union spu_exec_channel *b,
659 union spu_exec_channel *a )
660 {
661 qword rgba[4];
662 qword out[4];
663
664 sampler->get_samples(sampler, s->f, t->f, p->f, lodbias,
665 (float (*)[4]) rgba);
666
667 _transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) rgba);
668 r->q = out[0];
669 g->q = out[1];
670 b->q = out[2];
671 a->q = out[3];
672 }
673
674
675 static void
676 exec_tex(struct spu_exec_machine *mach,
677 const struct tgsi_full_instruction *inst,
678 boolean biasLod, boolean projected)
679 {
680 const uint unit = inst->Src[1].Register.Index;
681 union spu_exec_channel r[8];
682 uint chan_index;
683 float lodBias;
684
685 /* printf("Sampler %u unit %u\n", sampler, unit); */
686
687 switch (inst->InstructionExtTexture.Texture) {
688 case TGSI_TEXTURE_1D:
689
690 FETCH(&r[0], 0, CHAN_X);
691
692 if (projected) {
693 FETCH(&r[1], 0, CHAN_W);
694 r[0].q = micro_div(r[0].q, r[1].q);
695 }
696
697 if (biasLod) {
698 FETCH(&r[1], 0, CHAN_W);
699 lodBias = r[2].f[0];
700 }
701 else
702 lodBias = 0.0;
703
704 fetch_texel(&mach->Samplers[unit],
705 &r[0], NULL, NULL, lodBias, /* S, T, P, BIAS */
706 &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
707 break;
708
709 case TGSI_TEXTURE_2D:
710 case TGSI_TEXTURE_RECT:
711
712 FETCH(&r[0], 0, CHAN_X);
713 FETCH(&r[1], 0, CHAN_Y);
714 FETCH(&r[2], 0, CHAN_Z);
715
716 if (projected) {
717 FETCH(&r[3], 0, CHAN_W);
718 r[0].q = micro_div(r[0].q, r[3].q);
719 r[1].q = micro_div(r[1].q, r[3].q);
720 r[2].q = micro_div(r[2].q, r[3].q);
721 }
722
723 if (biasLod) {
724 FETCH(&r[3], 0, CHAN_W);
725 lodBias = r[3].f[0];
726 }
727 else
728 lodBias = 0.0;
729
730 fetch_texel(&mach->Samplers[unit],
731 &r[0], &r[1], &r[2], lodBias, /* inputs */
732 &r[0], &r[1], &r[2], &r[3]); /* outputs */
733 break;
734
735 case TGSI_TEXTURE_3D:
736 case TGSI_TEXTURE_CUBE:
737
738 FETCH(&r[0], 0, CHAN_X);
739 FETCH(&r[1], 0, CHAN_Y);
740 FETCH(&r[2], 0, CHAN_Z);
741
742 if (projected) {
743 FETCH(&r[3], 0, CHAN_W);
744 r[0].q = micro_div(r[0].q, r[3].q);
745 r[1].q = micro_div(r[1].q, r[3].q);
746 r[2].q = micro_div(r[2].q, r[3].q);
747 }
748
749 if (biasLod) {
750 FETCH(&r[3], 0, CHAN_W);
751 lodBias = r[3].f[0];
752 }
753 else
754 lodBias = 0.0;
755
756 fetch_texel(&mach->Samplers[unit],
757 &r[0], &r[1], &r[2], lodBias,
758 &r[0], &r[1], &r[2], &r[3]);
759 break;
760
761 default:
762 ASSERT (0);
763 }
764
765 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
766 STORE( &r[chan_index], 0, chan_index );
767 }
768 }
769
770
771
772 static void
773 constant_interpolation(
774 struct spu_exec_machine *mach,
775 unsigned attrib,
776 unsigned chan )
777 {
778 unsigned i;
779
780 for( i = 0; i < QUAD_SIZE; i++ ) {
781 mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
782 }
783 }
784
785 static void
786 linear_interpolation(
787 struct spu_exec_machine *mach,
788 unsigned attrib,
789 unsigned chan )
790 {
791 const float x = mach->QuadPos.xyzw[0].f[0];
792 const float y = mach->QuadPos.xyzw[1].f[0];
793 const float dadx = mach->InterpCoefs[attrib].dadx[chan];
794 const float dady = mach->InterpCoefs[attrib].dady[chan];
795 const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
796 mach->Inputs[attrib].xyzw[chan].f[0] = a0;
797 mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
798 mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
799 mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
800 }
801
802 static void
803 perspective_interpolation(
804 struct spu_exec_machine *mach,
805 unsigned attrib,
806 unsigned chan )
807 {
808 const float x = mach->QuadPos.xyzw[0].f[0];
809 const float y = mach->QuadPos.xyzw[1].f[0];
810 const float dadx = mach->InterpCoefs[attrib].dadx[chan];
811 const float dady = mach->InterpCoefs[attrib].dady[chan];
812 const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
813 const float *w = mach->QuadPos.xyzw[3].f;
814 /* divide by W here */
815 mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
816 mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
817 mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
818 mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
819 }
820
821
822 typedef void (* interpolation_func)(
823 struct spu_exec_machine *mach,
824 unsigned attrib,
825 unsigned chan );
826
827 static void
828 exec_declaration(struct spu_exec_machine *mach,
829 const struct tgsi_full_declaration *decl)
830 {
831 if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
832 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
833 unsigned first, last, mask;
834 interpolation_func interp;
835
836 first = decl->Range.First;
837 last = decl->Range.Last;
838 mask = decl->Declaration.UsageMask;
839
840 switch( decl->Declaration.Interpolate ) {
841 case TGSI_INTERPOLATE_CONSTANT:
842 interp = constant_interpolation;
843 break;
844
845 case TGSI_INTERPOLATE_LINEAR:
846 interp = linear_interpolation;
847 break;
848
849 case TGSI_INTERPOLATE_PERSPECTIVE:
850 interp = perspective_interpolation;
851 break;
852
853 default:
854 ASSERT( 0 );
855 }
856
857 if( mask == TGSI_WRITEMASK_XYZW ) {
858 unsigned i, j;
859
860 for( i = first; i <= last; i++ ) {
861 for( j = 0; j < NUM_CHANNELS; j++ ) {
862 interp( mach, i, j );
863 }
864 }
865 }
866 else {
867 unsigned i, j;
868
869 for( j = 0; j < NUM_CHANNELS; j++ ) {
870 if( mask & (1 << j) ) {
871 for( i = first; i <= last; i++ ) {
872 interp( mach, i, j );
873 }
874 }
875 }
876 }
877 }
878 }
879 }
880
881 static void
882 exec_instruction(
883 struct spu_exec_machine *mach,
884 const struct tgsi_full_instruction *inst,
885 int *pc )
886 {
887 uint chan_index;
888 union spu_exec_channel r[8];
889
890 (*pc)++;
891
892 switch (inst->Instruction.Opcode) {
893 case TGSI_OPCODE_ARL:
894 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
895 FETCH( &r[0], 0, chan_index );
896 r[0].q = si_cflts(r[0].q, 0);
897 STORE( &r[0], 0, chan_index );
898 }
899 break;
900
901 case TGSI_OPCODE_MOV:
902 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
903 FETCH( &r[0], 0, chan_index );
904 STORE( &r[0], 0, chan_index );
905 }
906 break;
907
908 case TGSI_OPCODE_LIT:
909 if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
910 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
911 }
912
913 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
914 FETCH( &r[0], 0, CHAN_X );
915 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
916 r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
917 STORE( &r[0], 0, CHAN_Y );
918 }
919
920 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
921 FETCH( &r[1], 0, CHAN_Y );
922 r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
923
924 FETCH( &r[2], 0, CHAN_W );
925 r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q);
926 r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q);
927 r[1].q = micro_pow(r[1].q, r[2].q);
928
929 /* r0 = (r0 > 0.0) ? r1 : 0.0
930 */
931 r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
932 r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q,
933 r[0].q);
934 STORE( &r[0], 0, CHAN_Z );
935 }
936 }
937
938 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
939 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
940 }
941 break;
942
943 case TGSI_OPCODE_RCP:
944 FETCH( &r[0], 0, CHAN_X );
945 r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
946 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
947 STORE( &r[0], 0, chan_index );
948 }
949 break;
950
951 case TGSI_OPCODE_RSQ:
952 FETCH( &r[0], 0, CHAN_X );
953 r[0].q = micro_sqrt(r[0].q);
954 r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
955 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
956 STORE( &r[0], 0, chan_index );
957 }
958 break;
959
960 case TGSI_OPCODE_EXP:
961 ASSERT (0);
962 break;
963
964 case TGSI_OPCODE_LOG:
965 ASSERT (0);
966 break;
967
968 case TGSI_OPCODE_MUL:
969 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
970 {
971 FETCH(&r[0], 0, chan_index);
972 FETCH(&r[1], 1, chan_index);
973
974 r[0].q = si_fm(r[0].q, r[1].q);
975
976 STORE(&r[0], 0, chan_index);
977 }
978 break;
979
980 case TGSI_OPCODE_ADD:
981 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
982 FETCH( &r[0], 0, chan_index );
983 FETCH( &r[1], 1, chan_index );
984 r[0].q = si_fa(r[0].q, r[1].q);
985 STORE( &r[0], 0, chan_index );
986 }
987 break;
988
989 case TGSI_OPCODE_DP3:
990 /* TGSI_OPCODE_DOT3 */
991 FETCH( &r[0], 0, CHAN_X );
992 FETCH( &r[1], 1, CHAN_X );
993 r[0].q = si_fm(r[0].q, r[1].q);
994
995 FETCH( &r[1], 0, CHAN_Y );
996 FETCH( &r[2], 1, CHAN_Y );
997 r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
998
999
1000 FETCH( &r[1], 0, CHAN_Z );
1001 FETCH( &r[2], 1, CHAN_Z );
1002 r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
1003
1004 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1005 STORE( &r[0], 0, chan_index );
1006 }
1007 break;
1008
1009 case TGSI_OPCODE_DP4:
1010 /* TGSI_OPCODE_DOT4 */
1011 FETCH(&r[0], 0, CHAN_X);
1012 FETCH(&r[1], 1, CHAN_X);
1013
1014 r[0].q = si_fm(r[0].q, r[1].q);
1015
1016 FETCH(&r[1], 0, CHAN_Y);
1017 FETCH(&r[2], 1, CHAN_Y);
1018
1019 r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
1020
1021 FETCH(&r[1], 0, CHAN_Z);
1022 FETCH(&r[2], 1, CHAN_Z);
1023
1024 r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
1025
1026 FETCH(&r[1], 0, CHAN_W);
1027 FETCH(&r[2], 1, CHAN_W);
1028
1029 r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
1030
1031 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1032 STORE( &r[0], 0, chan_index );
1033 }
1034 break;
1035
1036 case TGSI_OPCODE_DST:
1037 if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1038 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
1039 }
1040
1041 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1042 FETCH( &r[0], 0, CHAN_Y );
1043 FETCH( &r[1], 1, CHAN_Y);
1044 r[0].q = si_fm(r[0].q, r[1].q);
1045 STORE( &r[0], 0, CHAN_Y );
1046 }
1047
1048 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1049 FETCH( &r[0], 0, CHAN_Z );
1050 STORE( &r[0], 0, CHAN_Z );
1051 }
1052
1053 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1054 FETCH( &r[0], 1, CHAN_W );
1055 STORE( &r[0], 0, CHAN_W );
1056 }
1057 break;
1058
1059 case TGSI_OPCODE_MIN:
1060 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1061 FETCH(&r[0], 0, chan_index);
1062 FETCH(&r[1], 1, chan_index);
1063
1064 r[0].q = micro_min(r[0].q, r[1].q);
1065
1066 STORE(&r[0], 0, chan_index);
1067 }
1068 break;
1069
1070 case TGSI_OPCODE_MAX:
1071 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1072 FETCH(&r[0], 0, chan_index);
1073 FETCH(&r[1], 1, chan_index);
1074
1075 r[0].q = micro_max(r[0].q, r[1].q);
1076
1077 STORE(&r[0], 0, chan_index );
1078 }
1079 break;
1080
1081 case TGSI_OPCODE_SLT:
1082 /* TGSI_OPCODE_SETLT */
1083 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1084 FETCH( &r[0], 0, chan_index );
1085 FETCH( &r[1], 1, chan_index );
1086
1087 r[0].q = micro_ge(r[0].q, r[1].q);
1088 r[0].q = si_xori(r[0].q, 0xff);
1089
1090 STORE( &r[0], 0, chan_index );
1091 }
1092 break;
1093
1094 case TGSI_OPCODE_SGE:
1095 /* TGSI_OPCODE_SETGE */
1096 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1097 FETCH( &r[0], 0, chan_index );
1098 FETCH( &r[1], 1, chan_index );
1099 r[0].q = micro_ge(r[0].q, r[1].q);
1100 STORE( &r[0], 0, chan_index );
1101 }
1102 break;
1103
1104 case TGSI_OPCODE_MAD:
1105 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1106 FETCH( &r[0], 0, chan_index );
1107 FETCH( &r[1], 1, chan_index );
1108 FETCH( &r[2], 2, chan_index );
1109 r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
1110 STORE( &r[0], 0, chan_index );
1111 }
1112 break;
1113
1114 case TGSI_OPCODE_SUB:
1115 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1116 FETCH(&r[0], 0, chan_index);
1117 FETCH(&r[1], 1, chan_index);
1118
1119 r[0].q = si_fs(r[0].q, r[1].q);
1120
1121 STORE(&r[0], 0, chan_index);
1122 }
1123 break;
1124
1125 case TGSI_OPCODE_LRP:
1126 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1127 FETCH(&r[0], 0, chan_index);
1128 FETCH(&r[1], 1, chan_index);
1129 FETCH(&r[2], 2, chan_index);
1130
1131 r[1].q = si_fs(r[1].q, r[2].q);
1132 r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
1133
1134 STORE(&r[0], 0, chan_index);
1135 }
1136 break;
1137
1138 case TGSI_OPCODE_CND:
1139 ASSERT (0);
1140 break;
1141
1142 case TGSI_OPCODE_DP2A:
1143 ASSERT (0);
1144 break;
1145
1146 case TGSI_OPCODE_FRC:
1147 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1148 FETCH( &r[0], 0, chan_index );
1149 r[0].q = micro_frc(r[0].q);
1150 STORE( &r[0], 0, chan_index );
1151 }
1152 break;
1153
1154 case TGSI_OPCODE_CLAMP:
1155 ASSERT (0);
1156 break;
1157
1158 case TGSI_OPCODE_FLR:
1159 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1160 FETCH( &r[0], 0, chan_index );
1161 r[0].q = micro_flr(r[0].q);
1162 STORE( &r[0], 0, chan_index );
1163 }
1164 break;
1165
1166 case TGSI_OPCODE_ROUND:
1167 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1168 FETCH( &r[0], 0, chan_index );
1169 r[0].q = micro_rnd(r[0].q);
1170 STORE( &r[0], 0, chan_index );
1171 }
1172 break;
1173
1174 case TGSI_OPCODE_EX2:
1175 FETCH(&r[0], 0, CHAN_X);
1176
1177 r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q);
1178
1179 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1180 STORE( &r[0], 0, chan_index );
1181 }
1182 break;
1183
1184 case TGSI_OPCODE_LG2:
1185 FETCH( &r[0], 0, CHAN_X );
1186 r[0].q = micro_lg2(r[0].q);
1187 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1188 STORE( &r[0], 0, chan_index );
1189 }
1190 break;
1191
1192 case TGSI_OPCODE_POW:
1193 FETCH(&r[0], 0, CHAN_X);
1194 FETCH(&r[1], 1, CHAN_X);
1195
1196 r[0].q = micro_pow(r[0].q, r[1].q);
1197
1198 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1199 STORE( &r[0], 0, chan_index );
1200 }
1201 break;
1202
1203 case TGSI_OPCODE_XPD:
1204 /* TGSI_OPCODE_XPD */
1205 FETCH(&r[0], 0, CHAN_Y);
1206 FETCH(&r[1], 1, CHAN_Z);
1207 FETCH(&r[3], 0, CHAN_Z);
1208 FETCH(&r[4], 1, CHAN_Y);
1209
1210 /* r2 = (r0 * r1) - (r3 * r5)
1211 */
1212 r[2].q = si_fm(r[3].q, r[5].q);
1213 r[2].q = si_fms(r[0].q, r[1].q, r[2].q);
1214
1215 if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1216 STORE( &r[2], 0, CHAN_X );
1217 }
1218
1219 FETCH(&r[2], 1, CHAN_X);
1220 FETCH(&r[5], 0, CHAN_X);
1221
1222 /* r3 = (r3 * r2) - (r1 * r5)
1223 */
1224 r[1].q = si_fm(r[1].q, r[5].q);
1225 r[3].q = si_fms(r[3].q, r[2].q, r[1].q);
1226
1227 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1228 STORE( &r[3], 0, CHAN_Y );
1229 }
1230
1231 /* r5 = (r5 * r4) - (r0 * r2)
1232 */
1233 r[0].q = si_fm(r[0].q, r[2].q);
1234 r[5].q = si_fms(r[5].q, r[4].q, r[0].q);
1235
1236 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1237 STORE( &r[5], 0, CHAN_Z );
1238 }
1239
1240 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1241 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1242 }
1243 break;
1244
1245 case TGSI_OPCODE_ABS:
1246 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1247 FETCH(&r[0], 0, chan_index);
1248
1249 r[0].q = micro_abs(r[0].q);
1250
1251 STORE(&r[0], 0, chan_index);
1252 }
1253 break;
1254
1255 case TGSI_OPCODE_RCC:
1256 ASSERT (0);
1257 break;
1258
1259 case TGSI_OPCODE_DPH:
1260 FETCH(&r[0], 0, CHAN_X);
1261 FETCH(&r[1], 1, CHAN_X);
1262
1263 r[0].q = si_fm(r[0].q, r[1].q);
1264
1265 FETCH(&r[1], 0, CHAN_Y);
1266 FETCH(&r[2], 1, CHAN_Y);
1267
1268 r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
1269
1270 FETCH(&r[1], 0, CHAN_Z);
1271 FETCH(&r[2], 1, CHAN_Z);
1272
1273 r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
1274
1275 FETCH(&r[1], 1, CHAN_W);
1276
1277 r[0].q = si_fa(r[0].q, r[1].q);
1278
1279 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1280 STORE( &r[0], 0, chan_index );
1281 }
1282 break;
1283
1284 case TGSI_OPCODE_COS:
1285 FETCH(&r[0], 0, CHAN_X);
1286
1287 r[0].q = micro_cos(r[0].q);
1288
1289 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1290 STORE( &r[0], 0, chan_index );
1291 }
1292 break;
1293
1294 case TGSI_OPCODE_DDX:
1295 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1296 FETCH( &r[0], 0, chan_index );
1297 r[0].q = micro_ddx(r[0].q);
1298 STORE( &r[0], 0, chan_index );
1299 }
1300 break;
1301
1302 case TGSI_OPCODE_DDY:
1303 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1304 FETCH( &r[0], 0, chan_index );
1305 r[0].q = micro_ddy(r[0].q);
1306 STORE( &r[0], 0, chan_index );
1307 }
1308 break;
1309
1310 case TGSI_OPCODE_KILP:
1311 exec_kilp (mach, inst);
1312 break;
1313
1314 case TGSI_OPCODE_KIL:
1315 exec_kil (mach, inst);
1316 break;
1317
1318 case TGSI_OPCODE_PK2H:
1319 ASSERT (0);
1320 break;
1321
1322 case TGSI_OPCODE_PK2US:
1323 ASSERT (0);
1324 break;
1325
1326 case TGSI_OPCODE_PK4B:
1327 ASSERT (0);
1328 break;
1329
1330 case TGSI_OPCODE_PK4UB:
1331 ASSERT (0);
1332 break;
1333
1334 case TGSI_OPCODE_RFL:
1335 ASSERT (0);
1336 break;
1337
1338 case TGSI_OPCODE_SEQ:
1339 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1340 FETCH( &r[0], 0, chan_index );
1341 FETCH( &r[1], 1, chan_index );
1342
1343 r[0].q = si_fceq(r[0].q, r[1].q);
1344
1345 STORE( &r[0], 0, chan_index );
1346 }
1347 break;
1348
1349 case TGSI_OPCODE_SFL:
1350 ASSERT (0);
1351 break;
1352
1353 case TGSI_OPCODE_SGT:
1354 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1355 FETCH( &r[0], 0, chan_index );
1356 FETCH( &r[1], 1, chan_index );
1357 r[0].q = si_fcgt(r[0].q, r[1].q);
1358 STORE( &r[0], 0, chan_index );
1359 }
1360 break;
1361
1362 case TGSI_OPCODE_SIN:
1363 FETCH( &r[0], 0, CHAN_X );
1364 r[0].q = micro_sin(r[0].q);
1365 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1366 STORE( &r[0], 0, chan_index );
1367 }
1368 break;
1369
1370 case TGSI_OPCODE_SLE:
1371 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1372 FETCH( &r[0], 0, chan_index );
1373 FETCH( &r[1], 1, chan_index );
1374
1375 r[0].q = si_fcgt(r[0].q, r[1].q);
1376 r[0].q = si_xori(r[0].q, 0xff);
1377
1378 STORE( &r[0], 0, chan_index );
1379 }
1380 break;
1381
1382 case TGSI_OPCODE_SNE:
1383 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1384 FETCH( &r[0], 0, chan_index );
1385 FETCH( &r[1], 1, chan_index );
1386
1387 r[0].q = si_fceq(r[0].q, r[1].q);
1388 r[0].q = si_xori(r[0].q, 0xff);
1389
1390 STORE( &r[0], 0, chan_index );
1391 }
1392 break;
1393
1394 case TGSI_OPCODE_STR:
1395 ASSERT (0);
1396 break;
1397
1398 case TGSI_OPCODE_TEX:
1399 /* simple texture lookup */
1400 /* src[0] = texcoord */
1401 /* src[1] = sampler unit */
1402 exec_tex(mach, inst, FALSE, FALSE);
1403 break;
1404
1405 case TGSI_OPCODE_TXB:
1406 /* Texture lookup with lod bias */
1407 /* src[0] = texcoord (src[0].w = load bias) */
1408 /* src[1] = sampler unit */
1409 exec_tex(mach, inst, TRUE, FALSE);
1410 break;
1411
1412 case TGSI_OPCODE_TXD:
1413 /* Texture lookup with explict partial derivatives */
1414 /* src[0] = texcoord */
1415 /* src[1] = d[strq]/dx */
1416 /* src[2] = d[strq]/dy */
1417 /* src[3] = sampler unit */
1418 ASSERT (0);
1419 break;
1420
1421 case TGSI_OPCODE_TXL:
1422 /* Texture lookup with explit LOD */
1423 /* src[0] = texcoord (src[0].w = load bias) */
1424 /* src[1] = sampler unit */
1425 exec_tex(mach, inst, TRUE, FALSE);
1426 break;
1427
1428 case TGSI_OPCODE_TXP:
1429 /* Texture lookup with projection */
1430 /* src[0] = texcoord (src[0].w = projection) */
1431 /* src[1] = sampler unit */
1432 exec_tex(mach, inst, TRUE, TRUE);
1433 break;
1434
1435 case TGSI_OPCODE_UP2H:
1436 ASSERT (0);
1437 break;
1438
1439 case TGSI_OPCODE_UP2US:
1440 ASSERT (0);
1441 break;
1442
1443 case TGSI_OPCODE_UP4B:
1444 ASSERT (0);
1445 break;
1446
1447 case TGSI_OPCODE_UP4UB:
1448 ASSERT (0);
1449 break;
1450
1451 case TGSI_OPCODE_X2D:
1452 ASSERT (0);
1453 break;
1454
1455 case TGSI_OPCODE_ARA:
1456 ASSERT (0);
1457 break;
1458
1459 case TGSI_OPCODE_ARR:
1460 ASSERT (0);
1461 break;
1462
1463 case TGSI_OPCODE_BRA:
1464 ASSERT (0);
1465 break;
1466
1467 case TGSI_OPCODE_CAL:
1468 /* skip the call if no execution channels are enabled */
1469 if (mach->ExecMask) {
1470 /* do the call */
1471
1472 /* push the Cond, Loop, Cont stacks */
1473 ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
1474 mach->CondStack[mach->CondStackTop++] = mach->CondMask;
1475 ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
1476 mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
1477 ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
1478 mach->ContStack[mach->ContStackTop++] = mach->ContMask;
1479
1480 ASSERT(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
1481 mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
1482
1483 /* note that PC was already incremented above */
1484 mach->CallStack[mach->CallStackTop++] = *pc;
1485 *pc = inst->InstructionExtLabel.Label;
1486 }
1487 break;
1488
1489 case TGSI_OPCODE_RET:
1490 mach->FuncMask &= ~mach->ExecMask;
1491 UPDATE_EXEC_MASK(mach);
1492
1493 if (mach->ExecMask == 0x0) {
1494 /* really return now (otherwise, keep executing */
1495
1496 if (mach->CallStackTop == 0) {
1497 /* returning from main() */
1498 *pc = -1;
1499 return;
1500 }
1501 *pc = mach->CallStack[--mach->CallStackTop];
1502
1503 /* pop the Cond, Loop, Cont stacks */
1504 ASSERT(mach->CondStackTop > 0);
1505 mach->CondMask = mach->CondStack[--mach->CondStackTop];
1506 ASSERT(mach->LoopStackTop > 0);
1507 mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
1508 ASSERT(mach->ContStackTop > 0);
1509 mach->ContMask = mach->ContStack[--mach->ContStackTop];
1510 ASSERT(mach->FuncStackTop > 0);
1511 mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
1512
1513 UPDATE_EXEC_MASK(mach);
1514 }
1515 break;
1516
1517 case TGSI_OPCODE_SSG:
1518 ASSERT (0);
1519 break;
1520
1521 case TGSI_OPCODE_CMP:
1522 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1523 FETCH(&r[0], 0, chan_index);
1524 FETCH(&r[1], 1, chan_index);
1525 FETCH(&r[2], 2, chan_index);
1526
1527 /* r0 = (r0 < 0.0) ? r1 : r2
1528 */
1529 r[3].q = si_xor(r[3].q, r[3].q);
1530 r[0].q = micro_lt(r[0].q, r[3].q);
1531 r[0].q = si_selb(r[1].q, r[2].q, r[0].q);
1532
1533 STORE(&r[0], 0, chan_index);
1534 }
1535 break;
1536
1537 case TGSI_OPCODE_SCS:
1538 if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1539 FETCH( &r[0], 0, CHAN_X );
1540 }
1541 if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1542 r[1].q = micro_cos(r[0].q);
1543 STORE( &r[1], 0, CHAN_X );
1544 }
1545 if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1546 r[1].q = micro_sin(r[0].q);
1547 STORE( &r[1], 0, CHAN_Y );
1548 }
1549 if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1550 STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
1551 }
1552 if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1553 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1554 }
1555 break;
1556
1557 case TGSI_OPCODE_NRM:
1558 ASSERT (0);
1559 break;
1560
1561 case TGSI_OPCODE_DIV:
1562 ASSERT( 0 );
1563 break;
1564
1565 case TGSI_OPCODE_DP2:
1566 FETCH( &r[0], 0, CHAN_X );
1567 FETCH( &r[1], 1, CHAN_X );
1568 r[0].q = si_fm(r[0].q, r[1].q);
1569
1570 FETCH( &r[1], 0, CHAN_Y );
1571 FETCH( &r[2], 1, CHAN_Y );
1572 r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
1573
1574 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1575 STORE( &r[0], 0, chan_index );
1576 }
1577 break;
1578
1579 case TGSI_OPCODE_IF:
1580 /* push CondMask */
1581 ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
1582 mach->CondStack[mach->CondStackTop++] = mach->CondMask;
1583 FETCH( &r[0], 0, CHAN_X );
1584 /* update CondMask */
1585 if( ! r[0].u[0] ) {
1586 mach->CondMask &= ~0x1;
1587 }
1588 if( ! r[0].u[1] ) {
1589 mach->CondMask &= ~0x2;
1590 }
1591 if( ! r[0].u[2] ) {
1592 mach->CondMask &= ~0x4;
1593 }
1594 if( ! r[0].u[3] ) {
1595 mach->CondMask &= ~0x8;
1596 }
1597 UPDATE_EXEC_MASK(mach);
1598 /* Todo: If CondMask==0, jump to ELSE */
1599 break;
1600
1601 case TGSI_OPCODE_ELSE:
1602 /* invert CondMask wrt previous mask */
1603 {
1604 uint prevMask;
1605 ASSERT(mach->CondStackTop > 0);
1606 prevMask = mach->CondStack[mach->CondStackTop - 1];
1607 mach->CondMask = ~mach->CondMask & prevMask;
1608 UPDATE_EXEC_MASK(mach);
1609 /* Todo: If CondMask==0, jump to ENDIF */
1610 }
1611 break;
1612
1613 case TGSI_OPCODE_ENDIF:
1614 /* pop CondMask */
1615 ASSERT(mach->CondStackTop > 0);
1616 mach->CondMask = mach->CondStack[--mach->CondStackTop];
1617 UPDATE_EXEC_MASK(mach);
1618 break;
1619
1620 case TGSI_OPCODE_END:
1621 /* halt execution */
1622 *pc = -1;
1623 break;
1624
1625 case TGSI_OPCODE_REP:
1626 ASSERT (0);
1627 break;
1628
1629 case TGSI_OPCODE_ENDREP:
1630 ASSERT (0);
1631 break;
1632
1633 case TGSI_OPCODE_PUSHA:
1634 ASSERT (0);
1635 break;
1636
1637 case TGSI_OPCODE_POPA:
1638 ASSERT (0);
1639 break;
1640
1641 case TGSI_OPCODE_CEIL:
1642 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1643 FETCH( &r[0], 0, chan_index );
1644 r[0].q = micro_ceil(r[0].q);
1645 STORE( &r[0], 0, chan_index );
1646 }
1647 break;
1648
1649 case TGSI_OPCODE_I2F:
1650 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1651 FETCH( &r[0], 0, chan_index );
1652 r[0].q = si_csflt(r[0].q, 0);
1653 STORE( &r[0], 0, chan_index );
1654 }
1655 break;
1656
1657 case TGSI_OPCODE_NOT:
1658 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1659 FETCH( &r[0], 0, chan_index );
1660 r[0].q = si_xorbi(r[0].q, 0xff);
1661 STORE( &r[0], 0, chan_index );
1662 }
1663 break;
1664
1665 case TGSI_OPCODE_TRUNC:
1666 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1667 FETCH( &r[0], 0, chan_index );
1668 r[0].q = micro_trunc(r[0].q);
1669 STORE( &r[0], 0, chan_index );
1670 }
1671 break;
1672
1673 case TGSI_OPCODE_SHL:
1674 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1675 FETCH( &r[0], 0, chan_index );
1676 FETCH( &r[1], 1, chan_index );
1677
1678 r[0].q = si_shl(r[0].q, r[1].q);
1679
1680 STORE( &r[0], 0, chan_index );
1681 }
1682 break;
1683
1684 case TGSI_OPCODE_ISHR:
1685 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1686 FETCH( &r[0], 0, chan_index );
1687 FETCH( &r[1], 1, chan_index );
1688 r[0].q = micro_ishr(r[0].q, r[1].q);
1689 STORE( &r[0], 0, chan_index );
1690 }
1691 break;
1692
1693 case TGSI_OPCODE_AND:
1694 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1695 FETCH( &r[0], 0, chan_index );
1696 FETCH( &r[1], 1, chan_index );
1697 r[0].q = si_and(r[0].q, r[1].q);
1698 STORE( &r[0], 0, chan_index );
1699 }
1700 break;
1701
1702 case TGSI_OPCODE_OR:
1703 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1704 FETCH( &r[0], 0, chan_index );
1705 FETCH( &r[1], 1, chan_index );
1706 r[0].q = si_or(r[0].q, r[1].q);
1707 STORE( &r[0], 0, chan_index );
1708 }
1709 break;
1710
1711 case TGSI_OPCODE_MOD:
1712 ASSERT (0);
1713 break;
1714
1715 case TGSI_OPCODE_XOR:
1716 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1717 FETCH( &r[0], 0, chan_index );
1718 FETCH( &r[1], 1, chan_index );
1719 r[0].q = si_xor(r[0].q, r[1].q);
1720 STORE( &r[0], 0, chan_index );
1721 }
1722 break;
1723
1724 case TGSI_OPCODE_SAD:
1725 ASSERT (0);
1726 break;
1727
1728 case TGSI_OPCODE_TXF:
1729 ASSERT (0);
1730 break;
1731
1732 case TGSI_OPCODE_TXQ:
1733 ASSERT (0);
1734 break;
1735
1736 case TGSI_OPCODE_EMIT:
1737 mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
1738 mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1739 break;
1740
1741 case TGSI_OPCODE_ENDPRIM:
1742 mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
1743 mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
1744 break;
1745
1746 case TGSI_OPCODE_BGNFOR:
1747 /* fall-through (for now) */
1748 case TGSI_OPCODE_BGNLOOP:
1749 /* push LoopMask and ContMasks */
1750 ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
1751 mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
1752 ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
1753 mach->ContStack[mach->ContStackTop++] = mach->ContMask;
1754 break;
1755
1756 case TGSI_OPCODE_ENDFOR:
1757 /* fall-through (for now at least) */
1758 case TGSI_OPCODE_ENDLOOP:
1759 /* Restore ContMask, but don't pop */
1760 ASSERT(mach->ContStackTop > 0);
1761 mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
1762 if (mach->LoopMask) {
1763 /* repeat loop: jump to instruction just past BGNLOOP */
1764 *pc = inst->InstructionExtLabel.Label + 1;
1765 }
1766 else {
1767 /* exit loop: pop LoopMask */
1768 ASSERT(mach->LoopStackTop > 0);
1769 mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
1770 /* pop ContMask */
1771 ASSERT(mach->ContStackTop > 0);
1772 mach->ContMask = mach->ContStack[--mach->ContStackTop];
1773 }
1774 UPDATE_EXEC_MASK(mach);
1775 break;
1776
1777 case TGSI_OPCODE_BRK:
1778 /* turn off loop channels for each enabled exec channel */
1779 mach->LoopMask &= ~mach->ExecMask;
1780 /* Todo: if mach->LoopMask == 0, jump to end of loop */
1781 UPDATE_EXEC_MASK(mach);
1782 break;
1783
1784 case TGSI_OPCODE_CONT:
1785 /* turn off cont channels for each enabled exec channel */
1786 mach->ContMask &= ~mach->ExecMask;
1787 /* Todo: if mach->LoopMask == 0, jump to end of loop */
1788 UPDATE_EXEC_MASK(mach);
1789 break;
1790
1791 case TGSI_OPCODE_BGNSUB:
1792 /* no-op */
1793 break;
1794
1795 case TGSI_OPCODE_ENDSUB:
1796 /* no-op */
1797 break;
1798
1799 case TGSI_OPCODE_NOP:
1800 break;
1801
1802 default:
1803 ASSERT( 0 );
1804 }
1805 }
1806
1807
1808 /**
1809 * Run TGSI interpreter.
1810 * \return bitmask of "alive" quad components
1811 */
1812 uint
1813 spu_exec_machine_run( struct spu_exec_machine *mach )
1814 {
1815 uint i;
1816 int pc = 0;
1817
1818 mach->CondMask = 0xf;
1819 mach->LoopMask = 0xf;
1820 mach->ContMask = 0xf;
1821 mach->FuncMask = 0xf;
1822 mach->ExecMask = 0xf;
1823
1824 mach->CondStackTop = 0; /* temporarily subvert this ASSERTion */
1825 ASSERT(mach->CondStackTop == 0);
1826 ASSERT(mach->LoopStackTop == 0);
1827 ASSERT(mach->ContStackTop == 0);
1828 ASSERT(mach->CallStackTop == 0);
1829
1830 mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
1831 mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
1832
1833 if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
1834 mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
1835 mach->Primitives[0] = 0;
1836 }
1837
1838
1839 /* execute declarations (interpolants) */
1840 if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
1841 for (i = 0; i < mach->NumDeclarations; i++) {
1842 PIPE_ALIGN_VAR(16)
1843 union {
1844 struct tgsi_full_declaration decl;
1845 qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16];
1846 } d;
1847 unsigned ea = (unsigned) (mach->Declarations + pc);
1848
1849 spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl));
1850
1851 exec_declaration( mach, &d.decl );
1852 }
1853 }
1854
1855 /* execute instructions, until pc is set to -1 */
1856 while (pc != -1) {
1857 PIPE_ALIGN_VAR(16)
1858 union {
1859 struct tgsi_full_instruction inst;
1860 qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16];
1861 } i;
1862 unsigned ea = (unsigned) (mach->Instructions + pc);
1863
1864 spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst));
1865 exec_instruction( mach, & i.inst, &pc );
1866 }
1867
1868 #if 0
1869 /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
1870 if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1871 /*
1872 * Scale back depth component.
1873 */
1874 for (i = 0; i < 4; i++)
1875 mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
1876 }
1877 #endif
1878
1879 return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
1880 }
1881
1882