src/gallium/auxiliary/tgsi/tgsi_exec.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * TGSI interpreter/executor.
  30  *
  31  * Flow control information:
  32  *
  33  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
  34  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
  35  * care since a condition may be true for some quad components but false
  36  * for other components.
  37  *
  38  * We basically execute all statements (even if they're in the part of
  39  * an IF/ELSE clause that's "not taken") and use a special mask to
  40  * control writing to destination registers.  This is the ExecMask.
  41  * See store_dest().
  42  *
 * The ExecMask is computed from four other masks (CondMask, LoopMask,
 * ContMask and FuncMask; see UPDATE_EXEC_MASK) which are controlled by
 * the flow control instructions (e.g. IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
  46  *
  47  *
  48  * Authors:
  49  *   Michal Krol
  50  *   Brian Paul
  51  */
  52
  53 #include "pipe/p_compiler.h"
  54 #include "pipe/p_state.h"
  55 #include "pipe/p_shader_tokens.h"
  56 #include "tgsi/tgsi_parse.h"
  57 #include "tgsi/tgsi_util.h"
  58 #include "tgsi_exec.h"
  59 #include "util/u_memory.h"
  60 #include "util/u_math.h"
  61
  62 #define FAST_MATH 1
  63
  64 #define TILE_TOP_LEFT     0
  65 #define TILE_TOP_RIGHT    1
  66 #define TILE_BOTTOM_LEFT  2
  67 #define TILE_BOTTOM_RIGHT 3
  68
  69 #define CHAN_X  0
  70 #define CHAN_Y  1
  71 #define CHAN_Z  2
  72 #define CHAN_W  3
  73
  74 /*
  75  * Shorthand locations of various utility registers (_I = Index, _C = Channel)
  76  */
  77 #define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
  78 #define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
  79 #define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
  80 #define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
  81 #define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
  82 #define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
  83 #define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
  84 #define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
  85 #define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
  86 #define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
  87 #define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
  88 #define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
  89 #define TEMP_128_I         TGSI_EXEC_TEMP_128_I
  90 #define TEMP_128_C         TGSI_EXEC_TEMP_128_C
  91 #define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
  92 #define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
  93 #define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
  94 #define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
  95 #define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
  96 #define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
  97 #define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
  98 #define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
  99 #define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
 100 #define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
 101 #define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
 102 #define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
 103 #define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
 104 #define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
 105 #define TEMP_R0            TGSI_EXEC_TEMP_R0
 106
/**
 * Nonzero if bit CHAN is set in the write mask of the instruction's
 * first destination register (FullDstRegisters[0]).
 */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/**
 * Same test as IS_CHANNEL_ENABLED, but against the second destination
 * register (FullDstRegisters[1]).
 */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))

/**
 * Loop header: runs the following statement once for every CHAN in
 * [0, NUM_CHANNELS) that is enabled in the first destination's write mask.
 * CHAN must be an assignable variable declared by the caller.
 * Note: this expands to an unbraced "for ... if ...", so an "else" placed
 * right after the controlled statement would bind to the macro's "if".
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/**
 * Like FOR_EACH_ENABLED_CHANNEL, but filtered by the second destination
 * register's write mask.
 */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
 120
 121
 122 /** The execution mask depends on the conditional mask and the loop mask */
 123 #define UPDATE_EXEC_MASK(MACH) \
 124       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
 125
 126 /**
 127  * Initialize machine state by expanding tokens to full instructions,
 128  * allocating temporary storage, setting up constants, etc.
 129  * After this, we can call tgsi_exec_machine_run() many times.
 130  */
 131 void
 132 tgsi_exec_machine_bind_shader(
 133    struct tgsi_exec_machine *mach,
 134    const struct tgsi_token *tokens,
 135    uint numSamplers,
 136    struct tgsi_sampler **samplers)
 137 {
 138    uint k;
 139    struct tgsi_parse_context parse;
 140    struct tgsi_exec_labels *labels = &mach->Labels;
 141    struct tgsi_full_instruction *instructions;
 142    struct tgsi_full_declaration *declarations;
 143    uint maxInstructions = 10, numInstructions = 0;
 144    uint maxDeclarations = 10, numDeclarations = 0;
 145    uint instno = 0;
 146
 147 #if 0
 148    tgsi_dump(tokens, 0);
 149 #endif
 150
 151    util_init_math();
 152
 153    mach->Tokens = tokens;
 154    mach->Samplers = samplers;
 155
 156    k = tgsi_parse_init (&parse, mach->Tokens);
 157    if (k != TGSI_PARSE_OK) {
 158       debug_printf( "Problem parsing!\n" );
 159       return;
 160    }
 161
 162    mach->Processor = parse.FullHeader.Processor.Processor;
 163    mach->ImmLimit = 0;
 164    labels->count = 0;
 165
 166    declarations = (struct tgsi_full_declaration *)
 167       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
 168
 169    if (!declarations) {
 170       return;
 171    }
 172
 173    instructions = (struct tgsi_full_instruction *)
 174       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
 175
 176    if (!instructions) {
 177       FREE( declarations );
 178       return;
 179    }
 180
 181    while( !tgsi_parse_end_of_tokens( &parse ) ) {
 182       uint pointer = parse.Position;
 183       uint i;
 184
 185       tgsi_parse_token( &parse );
 186       switch( parse.FullToken.Token.Type ) {
 187       case TGSI_TOKEN_TYPE_DECLARATION:
 188          /* save expanded declaration */
 189          if (numDeclarations == maxDeclarations) {
 190             declarations = REALLOC(declarations,
 191                                    maxDeclarations
 192                                    * sizeof(struct tgsi_full_declaration),
 193                                    (maxDeclarations + 10)
 194                                    * sizeof(struct tgsi_full_declaration));
 195             maxDeclarations += 10;
 196          }
 197          memcpy(declarations + numDeclarations,
 198                 &parse.FullToken.FullDeclaration,
 199                 sizeof(declarations[0]));
 200          numDeclarations++;
 201          break;
 202
 203       case TGSI_TOKEN_TYPE_IMMEDIATE:
 204          {
 205             uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
 206             assert( size % 4 == 0 );
 207             assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES );
 208
 209             for( i = 0; i < size; i++ ) {
 210                mach->Imms[mach->ImmLimit + i / 4][i % 4] =
 211                   parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
 212             }
 213             mach->ImmLimit += size / 4;
 214          }
 215          break;
 216
 217       case TGSI_TOKEN_TYPE_INSTRUCTION:
 218          assert( labels->count < MAX_LABELS );
 219
 220          labels->labels[labels->count][0] = instno;
 221          labels->labels[labels->count][1] = pointer;
 222          labels->count++;
 223
 224          /* save expanded instruction */
 225          if (numInstructions == maxInstructions) {
 226             instructions = REALLOC(instructions,
 227                                    maxInstructions
 228                                    * sizeof(struct tgsi_full_instruction),
 229                                    (maxInstructions + 10)
 230                                    * sizeof(struct tgsi_full_instruction));
 231             maxInstructions += 10;
 232          }
 233          memcpy(instructions + numInstructions,
 234                 &parse.FullToken.FullInstruction,
 235                 sizeof(instructions[0]));
 236          numInstructions++;
 237          break;
 238
 239       default:
 240          assert( 0 );
 241       }
 242    }
 243    tgsi_parse_free (&parse);
 244
 245    if (mach->Declarations) {
 246       FREE( mach->Declarations );
 247    }
 248    mach->Declarations = declarations;
 249    mach->NumDeclarations = numDeclarations;
 250
 251    if (mach->Instructions) {
 252       FREE( mach->Instructions );
 253    }
 254    mach->Instructions = instructions;
 255    mach->NumInstructions = numInstructions;
 256 }
 257
 258
 259 void
 260 tgsi_exec_machine_init(
 261    struct tgsi_exec_machine *mach )
 262 {
 263    uint i;
 264
 265    mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps);
 266    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
 267
 268    /* Setup constants. */
 269    for( i = 0; i < 4; i++ ) {
 270       mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
 271       mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
 272       mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
 273       mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
 274       mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
 275       mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
 276       mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
 277       mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
 278       mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
 279       mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
 280    }
 281 }
 282
 283
 284 void
 285 tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach)
 286 {
 287    if (mach->Instructions) {
 288       FREE(mach->Instructions);
 289       mach->Instructions = NULL;
 290       mach->NumInstructions = 0;
 291    }
 292    if (mach->Declarations) {
 293       FREE(mach->Declarations);
 294       mach->Declarations = NULL;
 295       mach->NumDeclarations = 0;
 296    }
 297 }
 298
 299
 300 static void
 301 micro_abs(
 302    union tgsi_exec_channel *dst,
 303    const union tgsi_exec_channel *src )
 304 {
 305    dst->f[0] = fabsf( src->f[0] );
 306    dst->f[1] = fabsf( src->f[1] );
 307    dst->f[2] = fabsf( src->f[2] );
 308    dst->f[3] = fabsf( src->f[3] );
 309 }
 310
 311 static void
 312 micro_add(
 313    union tgsi_exec_channel *dst,
 314    const union tgsi_exec_channel *src0,
 315    const union tgsi_exec_channel *src1 )
 316 {
 317    dst->f[0] = src0->f[0] + src1->f[0];
 318    dst->f[1] = src0->f[1] + src1->f[1];
 319    dst->f[2] = src0->f[2] + src1->f[2];
 320    dst->f[3] = src0->f[3] + src1->f[3];
 321 }
 322
 323 static void
 324 micro_iadd(
 325    union tgsi_exec_channel *dst,
 326    const union tgsi_exec_channel *src0,
 327    const union tgsi_exec_channel *src1 )
 328 {
 329    dst->i[0] = src0->i[0] + src1->i[0];
 330    dst->i[1] = src0->i[1] + src1->i[1];
 331    dst->i[2] = src0->i[2] + src1->i[2];
 332    dst->i[3] = src0->i[3] + src1->i[3];
 333 }
 334
 335 static void
 336 micro_and(
 337    union tgsi_exec_channel *dst,
 338    const union tgsi_exec_channel *src0,
 339    const union tgsi_exec_channel *src1 )
 340 {
 341    dst->u[0] = src0->u[0] & src1->u[0];
 342    dst->u[1] = src0->u[1] & src1->u[1];
 343    dst->u[2] = src0->u[2] & src1->u[2];
 344    dst->u[3] = src0->u[3] & src1->u[3];
 345 }
 346
 347 static void
 348 micro_ceil(
 349    union tgsi_exec_channel *dst,
 350    const union tgsi_exec_channel *src )
 351 {
 352    dst->f[0] = ceilf( src->f[0] );
 353    dst->f[1] = ceilf( src->f[1] );
 354    dst->f[2] = ceilf( src->f[2] );
 355    dst->f[3] = ceilf( src->f[3] );
 356 }
 357
 358 static void
 359 micro_cos(
 360    union tgsi_exec_channel *dst,
 361    const union tgsi_exec_channel *src )
 362 {
 363    dst->f[0] = cosf( src->f[0] );
 364    dst->f[1] = cosf( src->f[1] );
 365    dst->f[2] = cosf( src->f[2] );
 366    dst->f[3] = cosf( src->f[3] );
 367 }
 368
 369 static void
 370 micro_ddx(
 371    union tgsi_exec_channel *dst,
 372    const union tgsi_exec_channel *src )
 373 {
 374    dst->f[0] =
 375    dst->f[1] =
 376    dst->f[2] =
 377    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 378 }
 379
 380 static void
 381 micro_ddy(
 382    union tgsi_exec_channel *dst,
 383    const union tgsi_exec_channel *src )
 384 {
 385    dst->f[0] =
 386    dst->f[1] =
 387    dst->f[2] =
 388    dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
 389 }
 390
 391 static void
 392 micro_div(
 393    union tgsi_exec_channel *dst,
 394    const union tgsi_exec_channel *src0,
 395    const union tgsi_exec_channel *src1 )
 396 {
 397    if (src1->f[0] != 0) {
 398       dst->f[0] = src0->f[0] / src1->f[0];
 399    }
 400    if (src1->f[1] != 0) {
 401       dst->f[1] = src0->f[1] / src1->f[1];
 402    }
 403    if (src1->f[2] != 0) {
 404       dst->f[2] = src0->f[2] / src1->f[2];
 405    }
 406    if (src1->f[3] != 0) {
 407       dst->f[3] = src0->f[3] / src1->f[3];
 408    }
 409 }
 410
 411 static void
 412 micro_udiv(
 413    union tgsi_exec_channel *dst,
 414    const union tgsi_exec_channel *src0,
 415    const union tgsi_exec_channel *src1 )
 416 {
 417    dst->u[0] = src0->u[0] / src1->u[0];
 418    dst->u[1] = src0->u[1] / src1->u[1];
 419    dst->u[2] = src0->u[2] / src1->u[2];
 420    dst->u[3] = src0->u[3] / src1->u[3];
 421 }
 422
 423 static void
 424 micro_eq(
 425    union tgsi_exec_channel *dst,
 426    const union tgsi_exec_channel *src0,
 427    const union tgsi_exec_channel *src1,
 428    const union tgsi_exec_channel *src2,
 429    const union tgsi_exec_channel *src3 )
 430 {
 431    dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
 432    dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
 433    dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
 434    dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
 435 }
 436
 437 static void
 438 micro_ieq(
 439    union tgsi_exec_channel *dst,
 440    const union tgsi_exec_channel *src0,
 441    const union tgsi_exec_channel *src1,
 442    const union tgsi_exec_channel *src2,
 443    const union tgsi_exec_channel *src3 )
 444 {
 445    dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
 446    dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
 447    dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
 448    dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
 449 }
 450
 451 static void
 452 micro_exp2(
 453    union tgsi_exec_channel *dst,
 454    const union tgsi_exec_channel *src)
 455 {
 456 #if FAST_MATH
 457    dst->f[0] = util_fast_exp2( src->f[0] );
 458    dst->f[1] = util_fast_exp2( src->f[1] );
 459    dst->f[2] = util_fast_exp2( src->f[2] );
 460    dst->f[3] = util_fast_exp2( src->f[3] );
 461 #else
 462    dst->f[0] = powf( 2.0f, src->f[0] );
 463    dst->f[1] = powf( 2.0f, src->f[1] );
 464    dst->f[2] = powf( 2.0f, src->f[2] );
 465    dst->f[3] = powf( 2.0f, src->f[3] );
 466 #endif
 467 }
 468
 469 static void
 470 micro_f2ut(
 471    union tgsi_exec_channel *dst,
 472    const union tgsi_exec_channel *src )
 473 {
 474    dst->u[0] = (uint) src->f[0];
 475    dst->u[1] = (uint) src->f[1];
 476    dst->u[2] = (uint) src->f[2];
 477    dst->u[3] = (uint) src->f[3];
 478 }
 479
 480 static void
 481 micro_flr(
 482    union tgsi_exec_channel *dst,
 483    const union tgsi_exec_channel *src )
 484 {
 485    dst->f[0] = floorf( src->f[0] );
 486    dst->f[1] = floorf( src->f[1] );
 487    dst->f[2] = floorf( src->f[2] );
 488    dst->f[3] = floorf( src->f[3] );
 489 }
 490
 491 static void
 492 micro_frc(
 493    union tgsi_exec_channel *dst,
 494    const union tgsi_exec_channel *src )
 495 {
 496    dst->f[0] = src->f[0] - floorf( src->f[0] );
 497    dst->f[1] = src->f[1] - floorf( src->f[1] );
 498    dst->f[2] = src->f[2] - floorf( src->f[2] );
 499    dst->f[3] = src->f[3] - floorf( src->f[3] );
 500 }
 501
 502 static void
 503 micro_ge(
 504    union tgsi_exec_channel *dst,
 505    const union tgsi_exec_channel *src0,
 506    const union tgsi_exec_channel *src1,
 507    const union tgsi_exec_channel *src2,
 508    const union tgsi_exec_channel *src3 )
 509 {
 510    dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
 511    dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
 512    dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
 513    dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
 514 }
 515
 516 static void
 517 micro_i2f(
 518    union tgsi_exec_channel *dst,
 519    const union tgsi_exec_channel *src )
 520 {
 521    dst->f[0] = (float) src->i[0];
 522    dst->f[1] = (float) src->i[1];
 523    dst->f[2] = (float) src->i[2];
 524    dst->f[3] = (float) src->i[3];
 525 }
 526
 527 static void
 528 micro_lg2(
 529    union tgsi_exec_channel *dst,
 530    const union tgsi_exec_channel *src )
 531 {
 532 #if FAST_MATH
 533    dst->f[0] = util_fast_log2( src->f[0] );
 534    dst->f[1] = util_fast_log2( src->f[1] );
 535    dst->f[2] = util_fast_log2( src->f[2] );
 536    dst->f[3] = util_fast_log2( src->f[3] );
 537 #else
 538    dst->f[0] = logf( src->f[0] ) * 1.442695f;
 539    dst->f[1] = logf( src->f[1] ) * 1.442695f;
 540    dst->f[2] = logf( src->f[2] ) * 1.442695f;
 541    dst->f[3] = logf( src->f[3] ) * 1.442695f;
 542 #endif
 543 }
 544
 545 static void
 546 micro_le(
 547    union tgsi_exec_channel *dst,
 548    const union tgsi_exec_channel *src0,
 549    const union tgsi_exec_channel *src1,
 550    const union tgsi_exec_channel *src2,
 551    const union tgsi_exec_channel *src3 )
 552 {
 553    dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
 554    dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
 555    dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
 556    dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
 557 }
 558
 559 static void
 560 micro_lt(
 561    union tgsi_exec_channel *dst,
 562    const union tgsi_exec_channel *src0,
 563    const union tgsi_exec_channel *src1,
 564    const union tgsi_exec_channel *src2,
 565    const union tgsi_exec_channel *src3 )
 566 {
 567    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
 568    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
 569    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
 570    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
 571 }
 572
 573 static void
 574 micro_ilt(
 575    union tgsi_exec_channel *dst,
 576    const union tgsi_exec_channel *src0,
 577    const union tgsi_exec_channel *src1,
 578    const union tgsi_exec_channel *src2,
 579    const union tgsi_exec_channel *src3 )
 580 {
 581    dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
 582    dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
 583    dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
 584    dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
 585 }
 586
 587 static void
 588 micro_ult(
 589    union tgsi_exec_channel *dst,
 590    const union tgsi_exec_channel *src0,
 591    const union tgsi_exec_channel *src1,
 592    const union tgsi_exec_channel *src2,
 593    const union tgsi_exec_channel *src3 )
 594 {
 595    dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
 596    dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
 597    dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
 598    dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
 599 }
 600
 601 static void
 602 micro_max(
 603    union tgsi_exec_channel *dst,
 604    const union tgsi_exec_channel *src0,
 605    const union tgsi_exec_channel *src1 )
 606 {
 607    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
 608    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
 609    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
 610    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
 611 }
 612
 613 static void
 614 micro_imax(
 615    union tgsi_exec_channel *dst,
 616    const union tgsi_exec_channel *src0,
 617    const union tgsi_exec_channel *src1 )
 618 {
 619    dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
 620    dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
 621    dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
 622    dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
 623 }
 624
 625 static void
 626 micro_umax(
 627    union tgsi_exec_channel *dst,
 628    const union tgsi_exec_channel *src0,
 629    const union tgsi_exec_channel *src1 )
 630 {
 631    dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
 632    dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
 633    dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
 634    dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
 635 }
 636
 637 static void
 638 micro_min(
 639    union tgsi_exec_channel *dst,
 640    const union tgsi_exec_channel *src0,
 641    const union tgsi_exec_channel *src1 )
 642 {
 643    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
 644    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
 645    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
 646    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
 647 }
 648
 649 static void
 650 micro_imin(
 651    union tgsi_exec_channel *dst,
 652    const union tgsi_exec_channel *src0,
 653    const union tgsi_exec_channel *src1 )
 654 {
 655    dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
 656    dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
 657    dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
 658    dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
 659 }
 660
 661 static void
 662 micro_umin(
 663    union tgsi_exec_channel *dst,
 664    const union tgsi_exec_channel *src0,
 665    const union tgsi_exec_channel *src1 )
 666 {
 667    dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
 668    dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
 669    dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
 670    dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
 671 }
 672
 673 static void
 674 micro_umod(
 675    union tgsi_exec_channel *dst,
 676    const union tgsi_exec_channel *src0,
 677    const union tgsi_exec_channel *src1 )
 678 {
 679    dst->u[0] = src0->u[0] % src1->u[0];
 680    dst->u[1] = src0->u[1] % src1->u[1];
 681    dst->u[2] = src0->u[2] % src1->u[2];
 682    dst->u[3] = src0->u[3] % src1->u[3];
 683 }
 684
 685 static void
 686 micro_mul(
 687    union tgsi_exec_channel *dst,
 688    const union tgsi_exec_channel *src0,
 689    const union tgsi_exec_channel *src1 )
 690 {
 691    dst->f[0] = src0->f[0] * src1->f[0];
 692    dst->f[1] = src0->f[1] * src1->f[1];
 693    dst->f[2] = src0->f[2] * src1->f[2];
 694    dst->f[3] = src0->f[3] * src1->f[3];
 695 }
 696
 697 static void
 698 micro_imul(
 699    union tgsi_exec_channel *dst,
 700    const union tgsi_exec_channel *src0,
 701    const union tgsi_exec_channel *src1 )
 702 {
 703    dst->i[0] = src0->i[0] * src1->i[0];
 704    dst->i[1] = src0->i[1] * src1->i[1];
 705    dst->i[2] = src0->i[2] * src1->i[2];
 706    dst->i[3] = src0->i[3] * src1->i[3];
 707 }
 708
 709 static void
 710 micro_imul64(
 711    union tgsi_exec_channel *dst0,
 712    union tgsi_exec_channel *dst1,
 713    const union tgsi_exec_channel *src0,
 714    const union tgsi_exec_channel *src1 )
 715 {
 716    dst1->i[0] = src0->i[0] * src1->i[0];
 717    dst1->i[1] = src0->i[1] * src1->i[1];
 718    dst1->i[2] = src0->i[2] * src1->i[2];
 719    dst1->i[3] = src0->i[3] * src1->i[3];
 720    dst0->i[0] = 0;
 721    dst0->i[1] = 0;
 722    dst0->i[2] = 0;
 723    dst0->i[3] = 0;
 724 }
 725
 726 static void
 727 micro_umul64(
 728    union tgsi_exec_channel *dst0,
 729    union tgsi_exec_channel *dst1,
 730    const union tgsi_exec_channel *src0,
 731    const union tgsi_exec_channel *src1 )
 732 {
 733    dst1->u[0] = src0->u[0] * src1->u[0];
 734    dst1->u[1] = src0->u[1] * src1->u[1];
 735    dst1->u[2] = src0->u[2] * src1->u[2];
 736    dst1->u[3] = src0->u[3] * src1->u[3];
 737    dst0->u[0] = 0;
 738    dst0->u[1] = 0;
 739    dst0->u[2] = 0;
 740    dst0->u[3] = 0;
 741 }
 742
 743 static void
 744 micro_movc(
 745    union tgsi_exec_channel *dst,
 746    const union tgsi_exec_channel *src0,
 747    const union tgsi_exec_channel *src1,
 748    const union tgsi_exec_channel *src2 )
 749 {
 750    dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
 751    dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
 752    dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
 753    dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
 754 }
 755
 756 static void
 757 micro_neg(
 758    union tgsi_exec_channel *dst,
 759    const union tgsi_exec_channel *src )
 760 {
 761    dst->f[0] = -src->f[0];
 762    dst->f[1] = -src->f[1];
 763    dst->f[2] = -src->f[2];
 764    dst->f[3] = -src->f[3];
 765 }
 766
 767 static void
 768 micro_ineg(
 769    union tgsi_exec_channel *dst,
 770    const union tgsi_exec_channel *src )
 771 {
 772    dst->i[0] = -src->i[0];
 773    dst->i[1] = -src->i[1];
 774    dst->i[2] = -src->i[2];
 775    dst->i[3] = -src->i[3];
 776 }
 777
 778 static void
 779 micro_not(
 780    union tgsi_exec_channel *dst,
 781    const union tgsi_exec_channel *src )
 782 {
 783    dst->u[0] = ~src->u[0];
 784    dst->u[1] = ~src->u[1];
 785    dst->u[2] = ~src->u[2];
 786    dst->u[3] = ~src->u[3];
 787 }
 788
 789 static void
 790 micro_or(
 791    union tgsi_exec_channel *dst,
 792    const union tgsi_exec_channel *src0,
 793    const union tgsi_exec_channel *src1 )
 794 {
 795    dst->u[0] = src0->u[0] | src1->u[0];
 796    dst->u[1] = src0->u[1] | src1->u[1];
 797    dst->u[2] = src0->u[2] | src1->u[2];
 798    dst->u[3] = src0->u[3] | src1->u[3];
 799 }
 800
 801 static void
 802 micro_pow(
 803    union tgsi_exec_channel *dst,
 804    const union tgsi_exec_channel *src0,
 805    const union tgsi_exec_channel *src1 )
 806 {
 807 #if FAST_MATH
 808    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
 809    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
 810    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
 811    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
 812 #else
 813    dst->f[0] = powf( src0->f[0], src1->f[0] );
 814    dst->f[1] = powf( src0->f[1], src1->f[1] );
 815    dst->f[2] = powf( src0->f[2], src1->f[2] );
 816    dst->f[3] = powf( src0->f[3], src1->f[3] );
 817 #endif
 818 }
 819
 820 static void
 821 micro_rnd(
 822    union tgsi_exec_channel *dst,
 823    const union tgsi_exec_channel *src )
 824 {
 825    dst->f[0] = floorf( src->f[0] + 0.5f );
 826    dst->f[1] = floorf( src->f[1] + 0.5f );
 827    dst->f[2] = floorf( src->f[2] + 0.5f );
 828    dst->f[3] = floorf( src->f[3] + 0.5f );
 829 }
 830
 831 static void
 832 micro_shl(
 833    union tgsi_exec_channel *dst,
 834    const union tgsi_exec_channel *src0,
 835    const union tgsi_exec_channel *src1 )
 836 {
 837    dst->i[0] = src0->i[0] << src1->i[0];
 838    dst->i[1] = src0->i[1] << src1->i[1];
 839    dst->i[2] = src0->i[2] << src1->i[2];
 840    dst->i[3] = src0->i[3] << src1->i[3];
 841 }
 842
 843 static void
 844 micro_ishr(
 845    union tgsi_exec_channel *dst,
 846    const union tgsi_exec_channel *src0,
 847    const union tgsi_exec_channel *src1 )
 848 {
 849    dst->i[0] = src0->i[0] >> src1->i[0];
 850    dst->i[1] = src0->i[1] >> src1->i[1];
 851    dst->i[2] = src0->i[2] >> src1->i[2];
 852    dst->i[3] = src0->i[3] >> src1->i[3];
 853 }
 854
 855 static void
 856 micro_trunc(
 857    union tgsi_exec_channel *dst,
 858    const union tgsi_exec_channel *src0 )
 859 {
 860    dst->f[0] = (float) (int) src0->f[0];
 861    dst->f[1] = (float) (int) src0->f[1];
 862    dst->f[2] = (float) (int) src0->f[2];
 863    dst->f[3] = (float) (int) src0->f[3];
 864 }
 865
 866 static void
 867 micro_ushr(
 868    union tgsi_exec_channel *dst,
 869    const union tgsi_exec_channel *src0,
 870    const union tgsi_exec_channel *src1 )
 871 {
 872    dst->u[0] = src0->u[0] >> src1->u[0];
 873    dst->u[1] = src0->u[1] >> src1->u[1];
 874    dst->u[2] = src0->u[2] >> src1->u[2];
 875    dst->u[3] = src0->u[3] >> src1->u[3];
 876 }
 877
 878 static void
 879 micro_sin(
 880    union tgsi_exec_channel *dst,
 881    const union tgsi_exec_channel *src )
 882 {
 883    dst->f[0] = sinf( src->f[0] );
 884    dst->f[1] = sinf( src->f[1] );
 885    dst->f[2] = sinf( src->f[2] );
 886    dst->f[3] = sinf( src->f[3] );
 887 }
 888
 889 static void
 890 micro_sqrt( union tgsi_exec_channel *dst,
 891             const union tgsi_exec_channel *src )
 892 {
 893    dst->f[0] = sqrtf( src->f[0] );
 894    dst->f[1] = sqrtf( src->f[1] );
 895    dst->f[2] = sqrtf( src->f[2] );
 896    dst->f[3] = sqrtf( src->f[3] );
 897 }
 898
 899 static void
 900 micro_sub(
 901    union tgsi_exec_channel *dst,
 902    const union tgsi_exec_channel *src0,
 903    const union tgsi_exec_channel *src1 )
 904 {
 905    dst->f[0] = src0->f[0] - src1->f[0];
 906    dst->f[1] = src0->f[1] - src1->f[1];
 907    dst->f[2] = src0->f[2] - src1->f[2];
 908    dst->f[3] = src0->f[3] - src1->f[3];
 909 }
 910
 911 static void
 912 micro_u2f(
 913    union tgsi_exec_channel *dst,
 914    const union tgsi_exec_channel *src )
 915 {
 916    dst->f[0] = (float) src->u[0];
 917    dst->f[1] = (float) src->u[1];
 918    dst->f[2] = (float) src->u[2];
 919    dst->f[3] = (float) src->u[3];
 920 }
 921
 922 static void
 923 micro_xor(
 924    union tgsi_exec_channel *dst,
 925    const union tgsi_exec_channel *src0,
 926    const union tgsi_exec_channel *src1 )
 927 {
 928    dst->u[0] = src0->u[0] ^ src1->u[0];
 929    dst->u[1] = src0->u[1] ^ src1->u[1];
 930    dst->u[2] = src0->u[2] ^ src1->u[2];
 931    dst->u[3] = src0->u[3] ^ src1->u[3];
 932 }
 933
 934 static void
 935 fetch_src_file_channel(
 936    const struct tgsi_exec_machine *mach,
 937    const uint file,
 938    const uint swizzle,
 939    const union tgsi_exec_channel *index,
 940    union tgsi_exec_channel *chan )
 941 {
 942    switch( swizzle ) {
 943    case TGSI_EXTSWIZZLE_X:
 944    case TGSI_EXTSWIZZLE_Y:
 945    case TGSI_EXTSWIZZLE_Z:
 946    case TGSI_EXTSWIZZLE_W:
 947       switch( file ) {
 948       case TGSI_FILE_CONSTANT:
 949          assert(mach->Consts);
 950          if (index->i[0] < 0)
 951             chan->f[0] = 0.0f;
 952          else
 953             chan->f[0] = mach->Consts[index->i[0]][swizzle];
 954          if (index->i[1] < 0)
 955             chan->f[1] = 0.0f;
 956          else
 957             chan->f[1] = mach->Consts[index->i[1]][swizzle];
 958          if (index->i[2] < 0)
 959             chan->f[2] = 0.0f;
 960          else
 961             chan->f[2] = mach->Consts[index->i[2]][swizzle];
 962          if (index->i[3] < 0)
 963             chan->f[3] = 0.0f;
 964          else
 965             chan->f[3] = mach->Consts[index->i[3]][swizzle];
 966          break;
 967
 968       case TGSI_FILE_INPUT:
 969          chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
 970          chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
 971          chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
 972          chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
 973          break;
 974
 975       case TGSI_FILE_TEMPORARY:
 976          assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
 977          chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
 978          chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
 979          chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
 980          chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
 981          break;
 982
 983       case TGSI_FILE_IMMEDIATE:
 984          assert( index->i[0] < (int) mach->ImmLimit );
 985          chan->f[0] = mach->Imms[index->i[0]][swizzle];
 986          assert( index->i[1] < (int) mach->ImmLimit );
 987          chan->f[1] = mach->Imms[index->i[1]][swizzle];
 988          assert( index->i[2] < (int) mach->ImmLimit );
 989          chan->f[2] = mach->Imms[index->i[2]][swizzle];
 990          assert( index->i[3] < (int) mach->ImmLimit );
 991          chan->f[3] = mach->Imms[index->i[3]][swizzle];
 992          break;
 993
 994       case TGSI_FILE_ADDRESS:
 995          chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
 996          chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
 997          chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
 998          chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
 999          break;
1000
1001       case TGSI_FILE_OUTPUT:
1002          /* vertex/fragment output vars can be read too */
1003          chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1004          chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1005          chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1006          chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1007          break;
1008
1009       default:
1010          assert( 0 );
1011       }
1012       break;
1013
1014    case TGSI_EXTSWIZZLE_ZERO:
1015       *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
1016       break;
1017
1018    case TGSI_EXTSWIZZLE_ONE:
1019       *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
1020       break;
1021
1022    default:
1023       assert( 0 );
1024    }
1025 }
1026
1027 static void
1028 fetch_source(
1029    const struct tgsi_exec_machine *mach,
1030    union tgsi_exec_channel *chan,
1031    const struct tgsi_full_src_register *reg,
1032    const uint chan_index )
1033 {
1034    union tgsi_exec_channel index;
1035    uint swizzle;
1036
1037    /* We start with a direct index into a register file.
1038     *
1039     *    file[1],
1040     *    where:
1041     *       file = SrcRegister.File
1042     *       [1] = SrcRegister.Index
1043     */
1044    index.i[0] =
1045    index.i[1] =
1046    index.i[2] =
1047    index.i[3] = reg->SrcRegister.Index;
1048
1049    /* There is an extra source register that indirectly subscripts
1050     * a register file. The direct index now becomes an offset
1051     * that is being added to the indirect register.
1052     *
1053     *    file[ind[2].x+1],
1054     *    where:
1055     *       ind = SrcRegisterInd.File
1056     *       [2] = SrcRegisterInd.Index
1057     *       .x = SrcRegisterInd.SwizzleX
1058     */
1059    if (reg->SrcRegister.Indirect) {
1060       union tgsi_exec_channel index2;
1061       union tgsi_exec_channel indir_index;
1062       const uint execmask = mach->ExecMask;
1063       uint i;
1064
1065       /* which address register (always zero now) */
1066       index2.i[0] =
1067       index2.i[1] =
1068       index2.i[2] =
1069       index2.i[3] = reg->SrcRegisterInd.Index;
1070
1071       /* get current value of address register[swizzle] */
1072       swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
1073       fetch_src_file_channel(
1074          mach,
1075          reg->SrcRegisterInd.File,
1076          swizzle,
1077          &index2,
1078          &indir_index );
1079
1080       /* add value of address register to the offset */
1081       index.i[0] += (int) indir_index.f[0];
1082       index.i[1] += (int) indir_index.f[1];
1083       index.i[2] += (int) indir_index.f[2];
1084       index.i[3] += (int) indir_index.f[3];
1085
1086       /* for disabled execution channels, zero-out the index to
1087        * avoid using a potential garbage value.
1088        */
1089       for (i = 0; i < QUAD_SIZE; i++) {
1090          if ((execmask & (1 << i)) == 0)
1091             index.i[i] = 0;
1092       }
1093    }
1094
1095    /* There is an extra source register that is a second
1096     * subscript to a register file. Effectively it means that
1097     * the register file is actually a 2D array of registers.
1098     *
1099     *    file[1][3] == file[1*sizeof(file[1])+3],
1100     *    where:
1101     *       [3] = SrcRegisterDim.Index
1102     */
1103    if (reg->SrcRegister.Dimension) {
1104       /* The size of the first-order array depends on the register file type.
1105        * We need to multiply the index to the first array to get an effective,
1106        * "flat" index that points to the beginning of the second-order array.
1107        */
1108       switch (reg->SrcRegister.File) {
1109       case TGSI_FILE_INPUT:
1110          index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1111          index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1112          index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1113          index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1114          break;
1115       case TGSI_FILE_CONSTANT:
1116          index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1117          index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1118          index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1119          index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1120          break;
1121       default:
1122          assert( 0 );
1123       }
1124
1125       index.i[0] += reg->SrcRegisterDim.Index;
1126       index.i[1] += reg->SrcRegisterDim.Index;
1127       index.i[2] += reg->SrcRegisterDim.Index;
1128       index.i[3] += reg->SrcRegisterDim.Index;
1129
1130       /* Again, the second subscript index can be addressed indirectly
1131        * identically to the first one.
1132        * Nothing stops us from indirectly addressing the indirect register,
1133        * but there is no need for that, so we won't exercise it.
1134        *
1135        *    file[1][ind[4].y+3],
1136        *    where:
1137        *       ind = SrcRegisterDimInd.File
1138        *       [4] = SrcRegisterDimInd.Index
1139        *       .y = SrcRegisterDimInd.SwizzleX
1140        */
1141       if (reg->SrcRegisterDim.Indirect) {
1142          union tgsi_exec_channel index2;
1143          union tgsi_exec_channel indir_index;
1144          const uint execmask = mach->ExecMask;
1145          uint i;
1146
1147          index2.i[0] =
1148          index2.i[1] =
1149          index2.i[2] =
1150          index2.i[3] = reg->SrcRegisterDimInd.Index;
1151
1152          swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
1153          fetch_src_file_channel(
1154             mach,
1155             reg->SrcRegisterDimInd.File,
1156             swizzle,
1157             &index2,
1158             &indir_index );
1159
1160          index.i[0] += (int) indir_index.f[0];
1161          index.i[1] += (int) indir_index.f[1];
1162          index.i[2] += (int) indir_index.f[2];
1163          index.i[3] += (int) indir_index.f[3];
1164
1165          /* for disabled execution channels, zero-out the index to
1166           * avoid using a potential garbage value.
1167           */
1168          for (i = 0; i < QUAD_SIZE; i++) {
1169             if ((execmask & (1 << i)) == 0)
1170                index.i[i] = 0;
1171          }
1172       }
1173
1174       /* If by any chance there was a need for a 3D array of register
1175        * files, we would have to check whether SrcRegisterDim is followed
1176        * by a dimension register and continue the saga.
1177        */
1178    }
1179
1180    swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1181    fetch_src_file_channel(
1182       mach,
1183       reg->SrcRegister.File,
1184       swizzle,
1185       &index,
1186       chan );
1187
1188    switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1189    case TGSI_UTIL_SIGN_CLEAR:
1190       micro_abs( chan, chan );
1191       break;
1192
1193    case TGSI_UTIL_SIGN_SET:
1194       micro_abs( chan, chan );
1195       micro_neg( chan, chan );
1196       break;
1197
1198    case TGSI_UTIL_SIGN_TOGGLE:
1199       micro_neg( chan, chan );
1200       break;
1201
1202    case TGSI_UTIL_SIGN_KEEP:
1203       break;
1204    }
1205
1206    if (reg->SrcRegisterExtMod.Complement) {
1207       micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
1208    }
1209 }
1210
1211 static void
1212 store_dest(
1213    struct tgsi_exec_machine *mach,
1214    const union tgsi_exec_channel *chan,
1215    const struct tgsi_full_dst_register *reg,
1216    const struct tgsi_full_instruction *inst,
1217    uint chan_index )
1218 {
1219    uint i;
1220    union tgsi_exec_channel null;
1221    union tgsi_exec_channel *dst;
1222    uint execmask = mach->ExecMask;
1223
1224    switch (reg->DstRegister.File) {
1225    case TGSI_FILE_NULL:
1226       dst = &null;
1227       break;
1228
1229    case TGSI_FILE_OUTPUT:
1230       dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1231                            + reg->DstRegister.Index].xyzw[chan_index];
1232       break;
1233
1234    case TGSI_FILE_TEMPORARY:
1235       assert( reg->DstRegister.Index < TGSI_EXEC_NUM_TEMPS );
1236       dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
1237       break;
1238
1239    case TGSI_FILE_ADDRESS:
1240       dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
1241       break;
1242
1243    default:
1244       assert( 0 );
1245       return;
1246    }
1247
1248    if (inst->InstructionExtNv.CondFlowEnable) {
1249       union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1250       uint swizzle;
1251       uint shift;
1252       uint mask;
1253       uint test;
1254
1255       /* Only CC0 supported.
1256        */
1257       assert( inst->InstructionExtNv.CondFlowIndex < 1 );
1258
1259       switch (chan_index) {
1260       case CHAN_X:
1261          swizzle = inst->InstructionExtNv.CondSwizzleX;
1262          break;
1263       case CHAN_Y:
1264          swizzle = inst->InstructionExtNv.CondSwizzleY;
1265          break;
1266       case CHAN_Z:
1267          swizzle = inst->InstructionExtNv.CondSwizzleZ;
1268          break;
1269       case CHAN_W:
1270          swizzle = inst->InstructionExtNv.CondSwizzleW;
1271          break;
1272       default:
1273          assert( 0 );
1274          return;
1275       }
1276
1277       switch (swizzle) {
1278       case TGSI_SWIZZLE_X:
1279          shift = TGSI_EXEC_CC_X_SHIFT;
1280          mask = TGSI_EXEC_CC_X_MASK;
1281          break;
1282       case TGSI_SWIZZLE_Y:
1283          shift = TGSI_EXEC_CC_Y_SHIFT;
1284          mask = TGSI_EXEC_CC_Y_MASK;
1285          break;
1286       case TGSI_SWIZZLE_Z:
1287          shift = TGSI_EXEC_CC_Z_SHIFT;
1288          mask = TGSI_EXEC_CC_Z_MASK;
1289          break;
1290       case TGSI_SWIZZLE_W:
1291          shift = TGSI_EXEC_CC_W_SHIFT;
1292          mask = TGSI_EXEC_CC_W_MASK;
1293          break;
1294       default:
1295          assert( 0 );
1296          return;
1297       }
1298
1299       switch (inst->InstructionExtNv.CondMask) {
1300       case TGSI_CC_GT:
1301          test = ~(TGSI_EXEC_CC_GT << shift) & mask;
1302          for (i = 0; i < QUAD_SIZE; i++)
1303             if (cc->u[i] & test)
1304                execmask &= ~(1 << i);
1305          break;
1306
1307       case TGSI_CC_EQ:
1308          test = ~(TGSI_EXEC_CC_EQ << shift) & mask;
1309          for (i = 0; i < QUAD_SIZE; i++)
1310             if (cc->u[i] & test)
1311                execmask &= ~(1 << i);
1312          break;
1313
1314       case TGSI_CC_LT:
1315          test = ~(TGSI_EXEC_CC_LT << shift) & mask;
1316          for (i = 0; i < QUAD_SIZE; i++)
1317             if (cc->u[i] & test)
1318                execmask &= ~(1 << i);
1319          break;
1320
1321       case TGSI_CC_GE:
1322          test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_EQ) << shift) & mask;
1323          for (i = 0; i < QUAD_SIZE; i++)
1324             if (cc->u[i] & test)
1325                execmask &= ~(1 << i);
1326          break;
1327
1328       case TGSI_CC_LE:
1329          test = ~((TGSI_EXEC_CC_LT | TGSI_EXEC_CC_EQ) << shift) & mask;
1330          for (i = 0; i < QUAD_SIZE; i++)
1331             if (cc->u[i] & test)
1332                execmask &= ~(1 << i);
1333          break;
1334
1335       case TGSI_CC_NE:
1336          test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_LT | TGSI_EXEC_CC_UN) << shift) & mask;
1337          for (i = 0; i < QUAD_SIZE; i++)
1338             if (cc->u[i] & test)
1339                execmask &= ~(1 << i);
1340          break;
1341
1342       case TGSI_CC_TR:
1343          break;
1344
1345       case TGSI_CC_FL:
1346          for (i = 0; i < QUAD_SIZE; i++)
1347             execmask &= ~(1 << i);
1348          break;
1349
1350       default:
1351          assert( 0 );
1352          return;
1353       }
1354    }
1355
1356    switch (inst->Instruction.Saturate) {
1357    case TGSI_SAT_NONE:
1358       for (i = 0; i < QUAD_SIZE; i++)
1359          if (execmask & (1 << i))
1360             dst->i[i] = chan->i[i];
1361       break;
1362
1363    case TGSI_SAT_ZERO_ONE:
1364       for (i = 0; i < QUAD_SIZE; i++)
1365          if (execmask & (1 << i)) {
1366             if (chan->f[i] < 0.0f)
1367                dst->f[i] = 0.0f;
1368             else if (chan->f[i] > 1.0f)
1369                dst->f[i] = 1.0f;
1370             else
1371                dst->i[i] = chan->i[i];
1372          }
1373       break;
1374
1375    case TGSI_SAT_MINUS_PLUS_ONE:
1376       for (i = 0; i < QUAD_SIZE; i++)
1377          if (execmask & (1 << i)) {
1378             if (chan->f[i] < -1.0f)
1379                dst->f[i] = -1.0f;
1380             else if (chan->f[i] > 1.0f)
1381                dst->f[i] = 1.0f;
1382             else
1383                dst->i[i] = chan->i[i];
1384          }
1385       break;
1386
1387    default:
1388       assert( 0 );
1389    }
1390
1391    if (inst->InstructionExtNv.CondDstUpdate) {
1392       union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1393       uint shift;
1394       uint mask;
1395
1396       /* Only CC0 supported.
1397        */
1398       assert( inst->InstructionExtNv.CondDstIndex < 1 );
1399
1400       switch (chan_index) {
1401       case CHAN_X:
1402          shift = TGSI_EXEC_CC_X_SHIFT;
1403          mask = ~TGSI_EXEC_CC_X_MASK;
1404          break;
1405       case CHAN_Y:
1406          shift = TGSI_EXEC_CC_Y_SHIFT;
1407          mask = ~TGSI_EXEC_CC_Y_MASK;
1408          break;
1409       case CHAN_Z:
1410          shift = TGSI_EXEC_CC_Z_SHIFT;
1411          mask = ~TGSI_EXEC_CC_Z_MASK;
1412          break;
1413       case CHAN_W:
1414          shift = TGSI_EXEC_CC_W_SHIFT;
1415          mask = ~TGSI_EXEC_CC_W_MASK;
1416          break;
1417       default:
1418          assert( 0 );
1419          return;
1420       }
1421
1422       for (i = 0; i < QUAD_SIZE; i++)
1423          if (execmask & (1 << i)) {
1424             cc->u[i] &= mask;
1425             if (dst->f[i] < 0.0f)
1426                cc->u[i] |= TGSI_EXEC_CC_LT << shift;
1427             else if (dst->f[i] > 0.0f)
1428                cc->u[i] |= TGSI_EXEC_CC_GT << shift;
1429             else if (dst->f[i] == 0.0f)
1430                cc->u[i] |= TGSI_EXEC_CC_EQ << shift;
1431             else
1432                cc->u[i] |= TGSI_EXEC_CC_UN << shift;
1433          }
1434    }
1435 }
1436
/* Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL.  Expands to a fetch_source() call and relies on variables
 * named `mach` and `inst` being in scope at the point of use.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)

/* Store VAL into channel CHAN of destination operand INDEX of the current
 * instruction.  Expands to a store_dest() call and relies on variables
 * named `mach` and `inst` being in scope at the point of use.
 */
#define STORE(VAL,INDEX,CHAN)\
    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
1442
1443
1444 /**
1445  * Execute ARB-style KIL which is predicated by a src register.
1446  * Kill fragment if any of the four values is less than zero.
1447  */
1448 static void
1449 exec_kil(struct tgsi_exec_machine *mach,
1450          const struct tgsi_full_instruction *inst)
1451 {
1452    uint uniquemask;
1453    uint chan_index;
1454    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1455    union tgsi_exec_channel r[1];
1456
1457    /* This mask stores component bits that were already tested. Note that
1458     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1459     * tested. */
1460    uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1461
1462    for (chan_index = 0; chan_index < 4; chan_index++)
1463    {
1464       uint swizzle;
1465       uint i;
1466
1467       /* unswizzle channel */
1468       swizzle = tgsi_util_get_full_src_register_extswizzle (
1469                         &inst->FullSrcRegisters[0],
1470                         chan_index);
1471
1472       /* check if the component has not been already tested */
1473       if (uniquemask & (1 << swizzle))
1474          continue;
1475       uniquemask |= 1 << swizzle;
1476
1477       FETCH(&r[0], 0, chan_index);
1478       for (i = 0; i < 4; i++)
1479          if (r[0].f[i] < 0.0f)
1480             kilmask |= 1 << i;
1481    }
1482
1483    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1484 }
1485
1486 /**
1487  * Execute NVIDIA-style KIL which is predicated by a condition code.
1488  * Kill fragment if the condition code is TRUE.
1489  */
1490 static void
1491 exec_kilp(struct tgsi_exec_machine *mach,
1492           const struct tgsi_full_instruction *inst)
1493 {
1494    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1495
1496    if (inst->InstructionExtNv.CondFlowEnable) {
1497       uint swizzle[4];
1498       uint chan_index;
1499
1500       kilmask = 0x0;
1501
1502       swizzle[0] = inst->InstructionExtNv.CondSwizzleX;
1503       swizzle[1] = inst->InstructionExtNv.CondSwizzleY;
1504       swizzle[2] = inst->InstructionExtNv.CondSwizzleZ;
1505       swizzle[3] = inst->InstructionExtNv.CondSwizzleW;
1506
1507       for (chan_index = 0; chan_index < 4; chan_index++)
1508       {
1509          uint i;
1510
1511          for (i = 0; i < 4; i++) {
1512             /* TODO: evaluate the condition code */
1513             if (0)
1514                kilmask |= 1 << i;
1515          }
1516       }
1517    }
1518    else {
1519       /* "unconditional" kil */
1520       kilmask = mach->ExecMask;
1521    }
1522    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1523 }
1524
1525
1526 /*
1527  * Fetch a texel using STR texture coordinates.
1528  */
1529 static void
1530 fetch_texel( struct tgsi_sampler *sampler,
1531              const union tgsi_exec_channel *s,
1532              const union tgsi_exec_channel *t,
1533              const union tgsi_exec_channel *p,
1534              float lodbias,  /* XXX should be float[4] */
1535              union tgsi_exec_channel *r,
1536              union tgsi_exec_channel *g,
1537              union tgsi_exec_channel *b,
1538              union tgsi_exec_channel *a )
1539 {
1540    uint j;
1541    float rgba[NUM_CHANNELS][QUAD_SIZE];
1542
1543    sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1544
1545    for (j = 0; j < 4; j++) {
1546       r->f[j] = rgba[0][j];
1547       g->f[j] = rgba[1][j];
1548       b->f[j] = rgba[2][j];
1549       a->f[j] = rgba[3][j];
1550    }
1551 }
1552
1553
1554 static void
1555 exec_tex(struct tgsi_exec_machine *mach,
1556          const struct tgsi_full_instruction *inst,
1557          boolean biasLod,
1558          boolean projected)
1559 {
1560    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1561    union tgsi_exec_channel r[8];
1562    uint chan_index;
1563    float lodBias;
1564
1565    /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1566
1567    switch (inst->InstructionExtTexture.Texture) {
1568    case TGSI_TEXTURE_1D:
1569
1570       FETCH(&r[0], 0, CHAN_X);
1571
1572       if (projected) {
1573          FETCH(&r[1], 0, CHAN_W);
1574          micro_div( &r[0], &r[0], &r[1] );
1575       }
1576
1577       if (biasLod) {
1578          FETCH(&r[1], 0, CHAN_W);
1579          lodBias = r[2].f[0];
1580       }
1581       else
1582          lodBias = 0.0;
1583
1584       fetch_texel(mach->Samplers[unit],
1585                   &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
1586                   &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1587       break;
1588
1589    case TGSI_TEXTURE_2D:
1590    case TGSI_TEXTURE_RECT:
1591
1592       FETCH(&r[0], 0, CHAN_X);
1593       FETCH(&r[1], 0, CHAN_Y);
1594       FETCH(&r[2], 0, CHAN_Z);
1595
1596       if (projected) {
1597          FETCH(&r[3], 0, CHAN_W);
1598          micro_div( &r[0], &r[0], &r[3] );
1599          micro_div( &r[1], &r[1], &r[3] );
1600          micro_div( &r[2], &r[2], &r[3] );
1601       }
1602
1603       if (biasLod) {
1604          FETCH(&r[3], 0, CHAN_W);
1605          lodBias = r[3].f[0];
1606       }
1607       else
1608          lodBias = 0.0;
1609
1610       fetch_texel(mach->Samplers[unit],
1611                   &r[0], &r[1], &r[2], lodBias,  /* inputs */
1612                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1613       break;
1614
1615    case TGSI_TEXTURE_3D:
1616    case TGSI_TEXTURE_CUBE:
1617
1618       FETCH(&r[0], 0, CHAN_X);
1619       FETCH(&r[1], 0, CHAN_Y);
1620       FETCH(&r[2], 0, CHAN_Z);
1621
1622       if (projected) {
1623          FETCH(&r[3], 0, CHAN_W);
1624          micro_div( &r[0], &r[0], &r[3] );
1625          micro_div( &r[1], &r[1], &r[3] );
1626          micro_div( &r[2], &r[2], &r[3] );
1627       }
1628
1629       if (biasLod) {
1630          FETCH(&r[3], 0, CHAN_W);
1631          lodBias = r[3].f[0];
1632       }
1633       else
1634          lodBias = 0.0;
1635
1636       fetch_texel(mach->Samplers[unit],
1637                   &r[0], &r[1], &r[2], lodBias,
1638                   &r[0], &r[1], &r[2], &r[3]);
1639       break;
1640
1641    default:
1642       assert (0);
1643    }
1644
1645    FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1646       STORE( &r[chan_index], 0, chan_index );
1647    }
1648 }
1649
1650
1651 /**
1652  * Evaluate a constant-valued coefficient at the position of the
1653  * current quad.
1654  */
1655 static void
1656 eval_constant_coef(
1657    struct tgsi_exec_machine *mach,
1658    unsigned attrib,
1659    unsigned chan )
1660 {
1661    unsigned i;
1662
1663    for( i = 0; i < QUAD_SIZE; i++ ) {
1664       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1665    }
1666 }
1667
1668 /**
1669  * Evaluate a linear-valued coefficient at the position of the
1670  * current quad.
1671  */
1672 static void
1673 eval_linear_coef(
1674    struct tgsi_exec_machine *mach,
1675    unsigned attrib,
1676    unsigned chan )
1677 {
1678    const float x = mach->QuadPos.xyzw[0].f[0];
1679    const float y = mach->QuadPos.xyzw[1].f[0];
1680    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1681    const float dady = mach->InterpCoefs[attrib].dady[chan];
1682    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1683    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1684    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1685    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1686    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1687 }
1688
1689 /**
1690  * Evaluate a perspective-valued coefficient at the position of the
1691  * current quad.
1692  */
1693 static void
1694 eval_perspective_coef(
1695    struct tgsi_exec_machine *mach,
1696    unsigned attrib,
1697    unsigned chan )
1698 {
1699    const float x = mach->QuadPos.xyzw[0].f[0];
1700    const float y = mach->QuadPos.xyzw[1].f[0];
1701    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1702    const float dady = mach->InterpCoefs[attrib].dady[chan];
1703    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1704    const float *w = mach->QuadPos.xyzw[3].f;
1705    /* divide by W here */
1706    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1707    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1708    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1709    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1710 }
1711
1712
/**
 * Signature shared by the eval_*_coef() functions: evaluate channel
 * `chan` of input attribute `attrib` for the current quad.
 * exec_declaration() picks one of them according to the declared
 * interpolation mode.
 */
typedef void (* eval_coef_func)(
   struct tgsi_exec_machine *mach,
   unsigned attrib,
   unsigned chan );
1717
1718 static void
1719 exec_declaration(
1720    struct tgsi_exec_machine *mach,
1721    const struct tgsi_full_declaration *decl )
1722 {
1723    if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
1724       if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1725          unsigned first, last, mask;
1726          eval_coef_func eval;
1727
1728          first = decl->DeclarationRange.First;
1729          last = decl->DeclarationRange.Last;
1730          mask = decl->Declaration.UsageMask;
1731
1732          switch( decl->Declaration.Interpolate ) {
1733          case TGSI_INTERPOLATE_CONSTANT:
1734             eval = eval_constant_coef;
1735             break;
1736
1737          case TGSI_INTERPOLATE_LINEAR:
1738             eval = eval_linear_coef;
1739             break;
1740
1741          case TGSI_INTERPOLATE_PERSPECTIVE:
1742             eval = eval_perspective_coef;
1743             break;
1744
1745          default:
1746             eval = NULL;
1747             assert( 0 );
1748          }
1749
1750          if( mask == TGSI_WRITEMASK_XYZW ) {
1751             unsigned i, j;
1752
1753             for( i = first; i <= last; i++ ) {
1754                for( j = 0; j < NUM_CHANNELS; j++ ) {
1755                   eval( mach, i, j );
1756                }
1757             }
1758          }
1759          else {
1760             unsigned i, j;
1761
1762             for( j = 0; j < NUM_CHANNELS; j++ ) {
1763                if( mask & (1 << j) ) {
1764                   for( i = first; i <= last; i++ ) {
1765                      eval( mach, i, j );
1766                   }
1767                }
1768             }
1769          }
1770       }
1771    }
1772 }
1773
1774 static void
1775 exec_instruction(
1776    struct tgsi_exec_machine *mach,
1777    const struct tgsi_full_instruction *inst,
1778    int *pc )
1779 {
1780    uint chan_index;
1781    union tgsi_exec_channel r[8];
1782
1783    (*pc)++;
1784
1785    switch (inst->Instruction.Opcode) {
1786    case TGSI_OPCODE_ARL:
1787       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1788          FETCH( &r[0], 0, chan_index );
1789          micro_trunc( &r[0], &r[0] );
1790          STORE( &r[0], 0, chan_index );
1791       }
1792       break;
1793
1794    case TGSI_OPCODE_MOV:
1795    case TGSI_OPCODE_SWZ:
1796       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1797          FETCH( &r[0], 0, chan_index );
1798          STORE( &r[0], 0, chan_index );
1799       }
1800       break;
1801
1802    case TGSI_OPCODE_LIT:
1803       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1804          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
1805       }
1806
1807       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1808          FETCH( &r[0], 0, CHAN_X );
1809          if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1810             micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1811             STORE( &r[0], 0, CHAN_Y );
1812          }
1813
1814          if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1815             FETCH( &r[1], 0, CHAN_Y );
1816             micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1817
1818             FETCH( &r[2], 0, CHAN_W );
1819             micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
1820             micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
1821             micro_pow( &r[1], &r[1], &r[2] );
1822             micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1823             STORE( &r[0], 0, CHAN_Z );
1824          }
1825       }
1826
1827       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1828          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1829       }
1830       break;
1831
1832    case TGSI_OPCODE_RCP:
1833    /* TGSI_OPCODE_RECIP */
1834       FETCH( &r[0], 0, CHAN_X );
1835       micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1836       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1837          STORE( &r[0], 0, chan_index );
1838       }
1839       break;
1840
1841    case TGSI_OPCODE_RSQ:
1842    /* TGSI_OPCODE_RECIPSQRT */
1843       FETCH( &r[0], 0, CHAN_X );
1844       micro_sqrt( &r[0], &r[0] );
1845       micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1846       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1847          STORE( &r[0], 0, chan_index );
1848       }
1849       break;
1850
1851    case TGSI_OPCODE_EXP:
1852       FETCH( &r[0], 0, CHAN_X );
1853       micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
1854       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1855          micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
1856          STORE( &r[2], 0, CHAN_X );        /* store r2 */
1857       }
1858       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1859          micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
1860          STORE( &r[2], 0, CHAN_Y );        /* store r2 */
1861       }
1862       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1863          micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
1864          STORE( &r[2], 0, CHAN_Z );        /* store r2 */
1865       }
1866       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1867          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1868       }
1869       break;
1870
1871    case TGSI_OPCODE_LOG:
1872       FETCH( &r[0], 0, CHAN_X );
1873       micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
1874       micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
1875       micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
1876       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1877          STORE( &r[0], 0, CHAN_X );
1878       }
1879       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1880          micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
1881          micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
1882          STORE( &r[0], 0, CHAN_Y );
1883       }
1884       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1885          STORE( &r[1], 0, CHAN_Z );
1886       }
1887       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1888          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1889       }
1890       break;
1891
1892    case TGSI_OPCODE_MUL:
1893       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
1894       {
1895          FETCH(&r[0], 0, chan_index);
1896          FETCH(&r[1], 1, chan_index);
1897
1898          micro_mul( &r[0], &r[0], &r[1] );
1899
1900          STORE(&r[0], 0, chan_index);
1901       }
1902       break;
1903
1904    case TGSI_OPCODE_ADD:
1905       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1906          FETCH( &r[0], 0, chan_index );
1907          FETCH( &r[1], 1, chan_index );
1908          micro_add( &r[0], &r[0], &r[1] );
1909          STORE( &r[0], 0, chan_index );
1910       }
1911       break;
1912
1913    case TGSI_OPCODE_DP3:
1914    /* TGSI_OPCODE_DOT3 */
1915       FETCH( &r[0], 0, CHAN_X );
1916       FETCH( &r[1], 1, CHAN_X );
1917       micro_mul( &r[0], &r[0], &r[1] );
1918
1919       FETCH( &r[1], 0, CHAN_Y );
1920       FETCH( &r[2], 1, CHAN_Y );
1921       micro_mul( &r[1], &r[1], &r[2] );
1922       micro_add( &r[0], &r[0], &r[1] );
1923
1924       FETCH( &r[1], 0, CHAN_Z );
1925       FETCH( &r[2], 1, CHAN_Z );
1926       micro_mul( &r[1], &r[1], &r[2] );
1927       micro_add( &r[0], &r[0], &r[1] );
1928
1929       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1930          STORE( &r[0], 0, chan_index );
1931       }
1932       break;
1933
1934     case TGSI_OPCODE_DP4:
1935     /* TGSI_OPCODE_DOT4 */
1936        FETCH(&r[0], 0, CHAN_X);
1937        FETCH(&r[1], 1, CHAN_X);
1938
1939        micro_mul( &r[0], &r[0], &r[1] );
1940
1941        FETCH(&r[1], 0, CHAN_Y);
1942        FETCH(&r[2], 1, CHAN_Y);
1943
1944        micro_mul( &r[1], &r[1], &r[2] );
1945        micro_add( &r[0], &r[0], &r[1] );
1946
1947        FETCH(&r[1], 0, CHAN_Z);
1948        FETCH(&r[2], 1, CHAN_Z);
1949
1950        micro_mul( &r[1], &r[1], &r[2] );
1951        micro_add( &r[0], &r[0], &r[1] );
1952
1953        FETCH(&r[1], 0, CHAN_W);
1954        FETCH(&r[2], 1, CHAN_W);
1955
1956        micro_mul( &r[1], &r[1], &r[2] );
1957        micro_add( &r[0], &r[0], &r[1] );
1958
1959       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1960          STORE( &r[0], 0, chan_index );
1961       }
1962       break;
1963
1964    case TGSI_OPCODE_DST:
1965       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1966          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
1967       }
1968
1969       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1970          FETCH( &r[0], 0, CHAN_Y );
1971          FETCH( &r[1], 1, CHAN_Y);
1972          micro_mul( &r[0], &r[0], &r[1] );
1973          STORE( &r[0], 0, CHAN_Y );
1974       }
1975
1976       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1977          FETCH( &r[0], 0, CHAN_Z );
1978          STORE( &r[0], 0, CHAN_Z );
1979       }
1980
1981       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1982          FETCH( &r[0], 1, CHAN_W );
1983          STORE( &r[0], 0, CHAN_W );
1984       }
1985       break;
1986
1987    case TGSI_OPCODE_MIN:
1988       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1989          FETCH(&r[0], 0, chan_index);
1990          FETCH(&r[1], 1, chan_index);
1991
1992          /* XXX use micro_min()?? */
1993          micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
1994
1995          STORE(&r[0], 0, chan_index);
1996       }
1997       break;
1998
1999    case TGSI_OPCODE_MAX:
2000       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2001          FETCH(&r[0], 0, chan_index);
2002          FETCH(&r[1], 1, chan_index);
2003
2004          /* XXX use micro_max()?? */
2005          micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
2006
2007          STORE(&r[0], 0, chan_index );
2008       }
2009       break;
2010
2011    case TGSI_OPCODE_SLT:
2012    /* TGSI_OPCODE_SETLT */
2013       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2014          FETCH( &r[0], 0, chan_index );
2015          FETCH( &r[1], 1, chan_index );
2016          micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2017          STORE( &r[0], 0, chan_index );
2018       }
2019       break;
2020
2021    case TGSI_OPCODE_SGE:
2022    /* TGSI_OPCODE_SETGE */
2023       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2024          FETCH( &r[0], 0, chan_index );
2025          FETCH( &r[1], 1, chan_index );
2026          micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2027          STORE( &r[0], 0, chan_index );
2028       }
2029       break;
2030
2031    case TGSI_OPCODE_MAD:
2032    /* TGSI_OPCODE_MADD */
2033       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2034          FETCH( &r[0], 0, chan_index );
2035          FETCH( &r[1], 1, chan_index );
2036          micro_mul( &r[0], &r[0], &r[1] );
2037          FETCH( &r[1], 2, chan_index );
2038          micro_add( &r[0], &r[0], &r[1] );
2039          STORE( &r[0], 0, chan_index );
2040       }
2041       break;
2042
2043    case TGSI_OPCODE_SUB:
2044       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2045          FETCH(&r[0], 0, chan_index);
2046          FETCH(&r[1], 1, chan_index);
2047
2048          micro_sub( &r[0], &r[0], &r[1] );
2049
2050          STORE(&r[0], 0, chan_index);
2051       }
2052       break;
2053
2054    case TGSI_OPCODE_LERP:
2055    /* TGSI_OPCODE_LRP */
2056       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2057          FETCH(&r[0], 0, chan_index);
2058          FETCH(&r[1], 1, chan_index);
2059          FETCH(&r[2], 2, chan_index);
2060
2061          micro_sub( &r[1], &r[1], &r[2] );
2062          micro_mul( &r[0], &r[0], &r[1] );
2063          micro_add( &r[0], &r[0], &r[2] );
2064
2065          STORE(&r[0], 0, chan_index);
2066       }
2067       break;
2068
2069    case TGSI_OPCODE_CND:
2070       assert (0);
2071       break;
2072
2073    case TGSI_OPCODE_CND0:
2074       assert (0);
2075       break;
2076
2077    case TGSI_OPCODE_DOT2ADD:
2078       /* TGSI_OPCODE_DP2A */
2079       FETCH( &r[0], 0, CHAN_X );
2080       FETCH( &r[1], 1, CHAN_X );
2081       micro_mul( &r[0], &r[0], &r[1] );
2082
2083       FETCH( &r[1], 0, CHAN_Y );
2084       FETCH( &r[2], 1, CHAN_Y );
2085       micro_mul( &r[1], &r[1], &r[2] );
2086       micro_add( &r[0], &r[0], &r[1] );
2087
2088       FETCH( &r[2], 2, CHAN_X );
2089       micro_add( &r[0], &r[0], &r[2] );
2090
2091       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2092          STORE( &r[0], 0, chan_index );
2093       }
2094       break;
2095
2096    case TGSI_OPCODE_INDEX:
2097       assert (0);
2098       break;
2099
2100    case TGSI_OPCODE_NEGATE:
2101       assert (0);
2102       break;
2103
2104    case TGSI_OPCODE_FRAC:
2105    /* TGSI_OPCODE_FRC */
2106       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2107          FETCH( &r[0], 0, chan_index );
2108          micro_frc( &r[0], &r[0] );
2109          STORE( &r[0], 0, chan_index );
2110       }
2111       break;
2112
2113    case TGSI_OPCODE_CLAMP:
2114       assert (0);
2115       break;
2116
2117    case TGSI_OPCODE_FLOOR:
2118    /* TGSI_OPCODE_FLR */
2119       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2120          FETCH( &r[0], 0, chan_index );
2121          micro_flr( &r[0], &r[0] );
2122          STORE( &r[0], 0, chan_index );
2123       }
2124       break;
2125
2126    case TGSI_OPCODE_ROUND:
2127       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2128          FETCH( &r[0], 0, chan_index );
2129          micro_rnd( &r[0], &r[0] );
2130          STORE( &r[0], 0, chan_index );
2131       }
2132       break;
2133
2134    case TGSI_OPCODE_EXPBASE2:
2135     /* TGSI_OPCODE_EX2 */
2136       FETCH(&r[0], 0, CHAN_X);
2137
2138 #if FAST_MATH
2139       micro_exp2( &r[0], &r[0] );
2140 #else
2141       micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
2142 #endif
2143
2144       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2145          STORE( &r[0], 0, chan_index );
2146       }
2147       break;
2148
2149    case TGSI_OPCODE_LOGBASE2:
2150    /* TGSI_OPCODE_LG2 */
2151       FETCH( &r[0], 0, CHAN_X );
2152       micro_lg2( &r[0], &r[0] );
2153       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2154          STORE( &r[0], 0, chan_index );
2155       }
2156       break;
2157
2158    case TGSI_OPCODE_POWER:
2159       /* TGSI_OPCODE_POW */
2160       FETCH(&r[0], 0, CHAN_X);
2161       FETCH(&r[1], 1, CHAN_X);
2162
2163       micro_pow( &r[0], &r[0], &r[1] );
2164
2165       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2166          STORE( &r[0], 0, chan_index );
2167       }
2168       break;
2169
2170    case TGSI_OPCODE_CROSSPRODUCT:
2171       /* TGSI_OPCODE_XPD */
2172       FETCH(&r[0], 0, CHAN_Y);
2173       FETCH(&r[1], 1, CHAN_Z);
2174
2175       micro_mul( &r[2], &r[0], &r[1] );
2176
2177       FETCH(&r[3], 0, CHAN_Z);
2178       FETCH(&r[4], 1, CHAN_Y);
2179
2180       micro_mul( &r[5], &r[3], &r[4] );
2181       micro_sub( &r[2], &r[2], &r[5] );
2182
2183       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2184          STORE( &r[2], 0, CHAN_X );
2185       }
2186
2187       FETCH(&r[2], 1, CHAN_X);
2188
2189       micro_mul( &r[3], &r[3], &r[2] );
2190
2191       FETCH(&r[5], 0, CHAN_X);
2192
2193       micro_mul( &r[1], &r[1], &r[5] );
2194       micro_sub( &r[3], &r[3], &r[1] );
2195
2196       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2197          STORE( &r[3], 0, CHAN_Y );
2198       }
2199
2200       micro_mul( &r[5], &r[5], &r[4] );
2201       micro_mul( &r[0], &r[0], &r[2] );
2202       micro_sub( &r[5], &r[5], &r[0] );
2203
2204       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2205          STORE( &r[5], 0, CHAN_Z );
2206       }
2207
2208       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2209          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2210       }
2211       break;
2212
2213     case TGSI_OPCODE_MULTIPLYMATRIX:
2214        assert (0);
2215        break;
2216
2217     case TGSI_OPCODE_ABS:
2218        FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2219           FETCH(&r[0], 0, chan_index);
2220
2221           micro_abs( &r[0], &r[0] );
2222
2223           STORE(&r[0], 0, chan_index);
2224        }
2225        break;
2226
2227    case TGSI_OPCODE_RCC:
2228       assert (0);
2229       break;
2230
2231    case TGSI_OPCODE_DPH:
2232       FETCH(&r[0], 0, CHAN_X);
2233       FETCH(&r[1], 1, CHAN_X);
2234
2235       micro_mul( &r[0], &r[0], &r[1] );
2236
2237       FETCH(&r[1], 0, CHAN_Y);
2238       FETCH(&r[2], 1, CHAN_Y);
2239
2240       micro_mul( &r[1], &r[1], &r[2] );
2241       micro_add( &r[0], &r[0], &r[1] );
2242
2243       FETCH(&r[1], 0, CHAN_Z);
2244       FETCH(&r[2], 1, CHAN_Z);
2245
2246       micro_mul( &r[1], &r[1], &r[2] );
2247       micro_add( &r[0], &r[0], &r[1] );
2248
2249       FETCH(&r[1], 1, CHAN_W);
2250
2251       micro_add( &r[0], &r[0], &r[1] );
2252
2253       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2254          STORE( &r[0], 0, chan_index );
2255       }
2256       break;
2257
2258    case TGSI_OPCODE_COS:
2259       FETCH(&r[0], 0, CHAN_X);
2260
2261       micro_cos( &r[0], &r[0] );
2262
2263       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2264          STORE( &r[0], 0, chan_index );
2265       }
2266       break;
2267
2268    case TGSI_OPCODE_DDX:
2269       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2270          FETCH( &r[0], 0, chan_index );
2271          micro_ddx( &r[0], &r[0] );
2272          STORE( &r[0], 0, chan_index );
2273       }
2274       break;
2275
2276    case TGSI_OPCODE_DDY:
2277       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2278          FETCH( &r[0], 0, chan_index );
2279          micro_ddy( &r[0], &r[0] );
2280          STORE( &r[0], 0, chan_index );
2281       }
2282       break;
2283
2284    case TGSI_OPCODE_KILP:
2285       exec_kilp (mach, inst);
2286       break;
2287
2288    case TGSI_OPCODE_KIL:
2289       exec_kil (mach, inst);
2290       break;
2291
2292    case TGSI_OPCODE_PK2H:
2293       assert (0);
2294       break;
2295
2296    case TGSI_OPCODE_PK2US:
2297       assert (0);
2298       break;
2299
2300    case TGSI_OPCODE_PK4B:
2301       assert (0);
2302       break;
2303
2304    case TGSI_OPCODE_PK4UB:
2305       assert (0);
2306       break;
2307
2308    case TGSI_OPCODE_RFL:
2309       assert (0);
2310       break;
2311
2312    case TGSI_OPCODE_SEQ:
2313       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2314          FETCH( &r[0], 0, chan_index );
2315          FETCH( &r[1], 1, chan_index );
2316          micro_eq( &r[0], &r[0], &r[1],
2317                    &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
2318                    &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2319          STORE( &r[0], 0, chan_index );
2320       }
2321       break;
2322
2323    case TGSI_OPCODE_SFL:
2324       assert (0);
2325       break;
2326
2327    case TGSI_OPCODE_SGT:
2328       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2329          FETCH( &r[0], 0, chan_index );
2330          FETCH( &r[1], 1, chan_index );
2331          micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2332          STORE( &r[0], 0, chan_index );
2333       }
2334       break;
2335
2336    case TGSI_OPCODE_SIN:
2337       FETCH( &r[0], 0, CHAN_X );
2338       micro_sin( &r[0], &r[0] );
2339       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2340          STORE( &r[0], 0, chan_index );
2341       }
2342       break;
2343
2344    case TGSI_OPCODE_SLE:
2345       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2346          FETCH( &r[0], 0, chan_index );
2347          FETCH( &r[1], 1, chan_index );
2348          micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2349          STORE( &r[0], 0, chan_index );
2350       }
2351       break;
2352
2353    case TGSI_OPCODE_SNE:
2354       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2355          FETCH( &r[0], 0, chan_index );
2356          FETCH( &r[1], 1, chan_index );
2357          micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2358          STORE( &r[0], 0, chan_index );
2359       }
2360       break;
2361
2362    case TGSI_OPCODE_STR:
2363       assert (0);
2364       break;
2365
2366    case TGSI_OPCODE_TEX:
2367       /* simple texture lookup */
2368       /* src[0] = texcoord */
2369       /* src[1] = sampler unit */
2370       exec_tex(mach, inst, FALSE, FALSE);
2371       break;
2372
2373    case TGSI_OPCODE_TXB:
2374       /* Texture lookup with lod bias */
2375       /* src[0] = texcoord (src[0].w = LOD bias) */
2376       /* src[1] = sampler unit */
2377       exec_tex(mach, inst, TRUE, FALSE);
2378       break;
2379
2380    case TGSI_OPCODE_TXD:
2381       /* Texture lookup with explicit partial derivatives */
2382       /* src[0] = texcoord */
2383       /* src[1] = d[strq]/dx */
2384       /* src[2] = d[strq]/dy */
2385       /* src[3] = sampler unit */
2386       assert (0);
2387       break;
2388
2389    case TGSI_OPCODE_TXL:
2390       /* Texture lookup with explicit LOD */
2391       /* src[0] = texcoord (src[0].w = LOD) */
2392       /* src[1] = sampler unit */
2393       exec_tex(mach, inst, TRUE, FALSE);
2394       break;
2395
2396    case TGSI_OPCODE_TXP:
2397       /* Texture lookup with projection */
2398       /* src[0] = texcoord (src[0].w = projection) */
2399       /* src[1] = sampler unit */
2400       exec_tex(mach, inst, FALSE, TRUE);
2401       break;
2402
2403    case TGSI_OPCODE_UP2H:
2404       assert (0);
2405       break;
2406
2407    case TGSI_OPCODE_UP2US:
2408       assert (0);
2409       break;
2410
2411    case TGSI_OPCODE_UP4B:
2412       assert (0);
2413       break;
2414
2415    case TGSI_OPCODE_UP4UB:
2416       assert (0);
2417       break;
2418
2419    case TGSI_OPCODE_X2D:
2420       assert (0);
2421       break;
2422
2423    case TGSI_OPCODE_ARA:
2424       assert (0);
2425       break;
2426
2427    case TGSI_OPCODE_ARR:
2428       assert (0);
2429       break;
2430
2431    case TGSI_OPCODE_BRA:
2432       assert (0);
2433       break;
2434
2435    case TGSI_OPCODE_CAL:
2436       /* skip the call if no execution channels are enabled */
2437       if (mach->ExecMask) {
2438          /* do the call */
2439
2440          /* push the Cond, Loop, Cont stacks */
2441          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2442          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2443          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2444          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2445          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2446          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2447
2448          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2449          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2450
2451          /* note that PC was already incremented above */
2452          mach->CallStack[mach->CallStackTop++] = *pc;
2453          *pc = inst->InstructionExtLabel.Label;
2454       }
2455       break;
2456
2457    case TGSI_OPCODE_RET:
2458       mach->FuncMask &= ~mach->ExecMask;
2459       UPDATE_EXEC_MASK(mach);
2460
2461       if (mach->FuncMask == 0x0) {
2462          /* really return now (otherwise, keep executing) */
2463
2464          if (mach->CallStackTop == 0) {
2465             /* returning from main() */
2466             *pc = -1;
2467             return;
2468          }
2469          *pc = mach->CallStack[--mach->CallStackTop];
2470
2471          /* pop the Cond, Loop, Cont stacks */
2472          assert(mach->CondStackTop > 0);
2473          mach->CondMask = mach->CondStack[--mach->CondStackTop];
2474          assert(mach->LoopStackTop > 0);
2475          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
2476          assert(mach->ContStackTop > 0);
2477          mach->ContMask = mach->ContStack[--mach->ContStackTop];
2478          assert(mach->FuncStackTop > 0);
2479          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2480
2481          UPDATE_EXEC_MASK(mach);
2482       }
2483       break;
2484
2485    case TGSI_OPCODE_SSG:
2486       assert (0);
2487       break;
2488
2489    case TGSI_OPCODE_CMP:
2490       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2491          FETCH(&r[0], 0, chan_index);
2492          FETCH(&r[1], 1, chan_index);
2493          FETCH(&r[2], 2, chan_index);
2494
2495          micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
2496
2497          STORE(&r[0], 0, chan_index);
2498       }
2499       break;
2500
2501    case TGSI_OPCODE_SCS:
2502       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2503          FETCH( &r[0], 0, CHAN_X );
2504       }
2505       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
2506          micro_cos( &r[1], &r[0] );
2507          STORE( &r[1], 0, CHAN_X );
2508       }
2509       if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2510          micro_sin( &r[1], &r[0] );
2511          STORE( &r[1], 0, CHAN_Y );
2512       }
2513       if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2514          STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2515       }
2516       if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2517          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2518       }
2519       break;
2520
2521    case TGSI_OPCODE_NRM:
2522       /* 3-component vector normalize */
2523       {
2524          union tgsi_exec_channel tmp, dot;
2525
2526          /* tmp = dp3(src0, src0): */
2527          FETCH( &r[0], 0, CHAN_X );
2528          micro_mul( &tmp, &r[0], &r[0] );
2529
2530          FETCH( &r[1], 0, CHAN_Y );
2531          micro_mul( &dot, &r[1], &r[1] );
2532          micro_add( &tmp, &tmp, &dot );
2533
2534          FETCH( &r[2], 0, CHAN_Z );
2535          micro_mul( &dot, &r[2], &r[2] );
2536          micro_add( &tmp, &tmp, &dot );
2537
2538          /* tmp = 1 / sqrt(tmp) */
2539          micro_sqrt( &tmp, &tmp );
2540          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2541
2542          /* note: w channel is undefined */
2543          FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2544             /* chan = chan * tmp */
2545             micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2546             STORE( &r[chan_index], 0, chan_index );
2547          }
2548       }
2549       break;
2550
2551    case TGSI_OPCODE_NRM4:
2552       /* 4-component vector normalize */
2553       {
2554          union tgsi_exec_channel tmp, dot;
2555
2556          /* tmp = dp4(src0, src0): */
2557          FETCH( &r[0], 0, CHAN_X );
2558          micro_mul( &tmp, &r[0], &r[0] );
2559
2560          FETCH( &r[1], 0, CHAN_Y );
2561          micro_mul( &dot, &r[1], &r[1] );
2562          micro_add( &tmp, &tmp, &dot );
2563
2564          FETCH( &r[2], 0, CHAN_Z );
2565          micro_mul( &dot, &r[2], &r[2] );
2566          micro_add( &tmp, &tmp, &dot );
2567
2568          FETCH( &r[3], 0, CHAN_W );
2569          micro_mul( &dot, &r[3], &r[3] );
2570          micro_add( &tmp, &tmp, &dot );
2571
2572          /* tmp = 1 / sqrt(tmp) */
2573          micro_sqrt( &tmp, &tmp );
2574          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2575
2576          FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2577             /* chan = chan * tmp */
2578             micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2579             STORE( &r[chan_index], 0, chan_index );
2580          }
2581       }
2582       break;
2583
2584    case TGSI_OPCODE_DIV:
2585       assert( 0 );
2586       break;
2587
2588    case TGSI_OPCODE_DP2:
2589       FETCH( &r[0], 0, CHAN_X );
2590       FETCH( &r[1], 1, CHAN_X );
2591       micro_mul( &r[0], &r[0], &r[1] );
2592
2593       FETCH( &r[1], 0, CHAN_Y );
2594       FETCH( &r[2], 1, CHAN_Y );
2595       micro_mul( &r[1], &r[1], &r[2] );
2596       micro_add( &r[0], &r[0], &r[1] );
2597
2598       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2599          STORE( &r[0], 0, chan_index );
2600       }
2601       break;
2602
2603    case TGSI_OPCODE_IF:
2604       /* push CondMask */
2605       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2606       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2607       FETCH( &r[0], 0, CHAN_X );
2608       /* update CondMask */
2609       if( ! r[0].u[0] ) {
2610          mach->CondMask &= ~0x1;
2611       }
2612       if( ! r[0].u[1] ) {
2613          mach->CondMask &= ~0x2;
2614       }
2615       if( ! r[0].u[2] ) {
2616          mach->CondMask &= ~0x4;
2617       }
2618       if( ! r[0].u[3] ) {
2619          mach->CondMask &= ~0x8;
2620       }
2621       UPDATE_EXEC_MASK(mach);
2622       /* Todo: If CondMask==0, jump to ELSE */
2623       break;
2624
2625    case TGSI_OPCODE_ELSE:
2626       /* invert CondMask wrt previous mask */
2627       {
2628          uint prevMask;
2629          assert(mach->CondStackTop > 0);
2630          prevMask = mach->CondStack[mach->CondStackTop - 1];
2631          mach->CondMask = ~mach->CondMask & prevMask;
2632          UPDATE_EXEC_MASK(mach);
2633          /* Todo: If CondMask==0, jump to ENDIF */
2634       }
2635       break;
2636
2637    case TGSI_OPCODE_ENDIF:
2638       /* pop CondMask */
2639       assert(mach->CondStackTop > 0);
2640       mach->CondMask = mach->CondStack[--mach->CondStackTop];
2641       UPDATE_EXEC_MASK(mach);
2642       break;
2643
2644    case TGSI_OPCODE_END:
2645       /* halt execution */
2646       *pc = -1;
2647       break;
2648
2649    case TGSI_OPCODE_REP:
2650       assert (0);
2651       break;
2652
2653    case TGSI_OPCODE_ENDREP:
2654        assert (0);
2655        break;
2656
2657    case TGSI_OPCODE_PUSHA:
2658       assert (0);
2659       break;
2660
2661    case TGSI_OPCODE_POPA:
2662       assert (0);
2663       break;
2664
2665    case TGSI_OPCODE_CEIL:
2666       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2667          FETCH( &r[0], 0, chan_index );
2668          micro_ceil( &r[0], &r[0] );
2669          STORE( &r[0], 0, chan_index );
2670       }
2671       break;
2672
2673    case TGSI_OPCODE_I2F:
2674       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2675          FETCH( &r[0], 0, chan_index );
2676          micro_i2f( &r[0], &r[0] );
2677          STORE( &r[0], 0, chan_index );
2678       }
2679       break;
2680
2681    case TGSI_OPCODE_NOT:
2682       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2683          FETCH( &r[0], 0, chan_index );
2684          micro_not( &r[0], &r[0] );
2685          STORE( &r[0], 0, chan_index );
2686       }
2687       break;
2688
2689    case TGSI_OPCODE_TRUNC:
2690       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2691          FETCH( &r[0], 0, chan_index );
2692          micro_trunc( &r[0], &r[0] );
2693          STORE( &r[0], 0, chan_index );
2694       }
2695       break;
2696
2697    case TGSI_OPCODE_SHL:
2698       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2699          FETCH( &r[0], 0, chan_index );
2700          FETCH( &r[1], 1, chan_index );
2701          micro_shl( &r[0], &r[0], &r[1] );
2702          STORE( &r[0], 0, chan_index );
2703       }
2704       break;
2705
2706    case TGSI_OPCODE_SHR:
2707       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2708          FETCH( &r[0], 0, chan_index );
2709          FETCH( &r[1], 1, chan_index );
2710          micro_ishr( &r[0], &r[0], &r[1] );
2711          STORE( &r[0], 0, chan_index );
2712       }
2713       break;
2714
2715    case TGSI_OPCODE_AND:
2716       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2717          FETCH( &r[0], 0, chan_index );
2718          FETCH( &r[1], 1, chan_index );
2719          micro_and( &r[0], &r[0], &r[1] );
2720          STORE( &r[0], 0, chan_index );
2721       }
2722       break;
2723
2724    case TGSI_OPCODE_OR:
2725       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2726          FETCH( &r[0], 0, chan_index );
2727          FETCH( &r[1], 1, chan_index );
2728          micro_or( &r[0], &r[0], &r[1] );
2729          STORE( &r[0], 0, chan_index );
2730       }
2731       break;
2732
2733    case TGSI_OPCODE_MOD:
2734       assert (0);
2735       break;
2736
2737    case TGSI_OPCODE_XOR:
2738       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2739          FETCH( &r[0], 0, chan_index );
2740          FETCH( &r[1], 1, chan_index );
2741          micro_xor( &r[0], &r[0], &r[1] );
2742          STORE( &r[0], 0, chan_index );
2743       }
2744       break;
2745
2746    case TGSI_OPCODE_SAD:
2747       assert (0);
2748       break;
2749
2750    case TGSI_OPCODE_TXF:
2751       assert (0);
2752       break;
2753
2754    case TGSI_OPCODE_TXQ:
2755       assert (0);
2756       break;
2757
2758    case TGSI_OPCODE_EMIT:
2759       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
2760       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
2761       break;
2762
2763    case TGSI_OPCODE_ENDPRIM:
2764       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
2765       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
2766       break;
2767
2768    case TGSI_OPCODE_LOOP:
2769       /* fall-through (for now) */
2770    case TGSI_OPCODE_BGNLOOP2:
2771       /* push LoopMask and ContMasks */
2772       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2773       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2774       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2775       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2776       break;
2777
2778    case TGSI_OPCODE_ENDLOOP:
2779       /* fall-through (for now at least) */
2780    case TGSI_OPCODE_ENDLOOP2:
2781       /* Restore ContMask, but don't pop */
2782       assert(mach->ContStackTop > 0);
2783       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
2784       UPDATE_EXEC_MASK(mach);
2785       if (mach->ExecMask) {
2786          /* repeat loop: jump to instruction just past BGNLOOP */
2787          *pc = inst->InstructionExtLabel.Label + 1;
2788       }
2789       else {
2790          /* exit loop: pop LoopMask */
2791          assert(mach->LoopStackTop > 0);
2792          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
2793          /* pop ContMask */
2794          assert(mach->ContStackTop > 0);
2795          mach->ContMask = mach->ContStack[--mach->ContStackTop];
2796       }
2797       UPDATE_EXEC_MASK(mach);
2798       break;
2799
2800    case TGSI_OPCODE_BRK:
2801       /* turn off loop channels for each enabled exec channel */
2802       mach->LoopMask &= ~mach->ExecMask;
2803       /* Todo: if mach->LoopMask == 0, jump to end of loop */
2804       UPDATE_EXEC_MASK(mach);
2805       break;
2806
2807    case TGSI_OPCODE_CONT:
2808       /* turn off cont channels for each enabled exec channel */
2809       mach->ContMask &= ~mach->ExecMask;
2810       /* Todo: if mach->LoopMask == 0, jump to end of loop */
2811       UPDATE_EXEC_MASK(mach);
2812       break;
2813
2814    case TGSI_OPCODE_BGNSUB:
2815       /* no-op */
2816       break;
2817
2818    case TGSI_OPCODE_ENDSUB:
2819       /* no-op */
2820       break;
2821
2822    case TGSI_OPCODE_NOISE1:
2823       assert( 0 );
2824       break;
2825
2826    case TGSI_OPCODE_NOISE2:
2827       assert( 0 );
2828       break;
2829
2830    case TGSI_OPCODE_NOISE3:
2831       assert( 0 );
2832       break;
2833
2834    case TGSI_OPCODE_NOISE4:
2835       assert( 0 );
2836       break;
2837
2838    case TGSI_OPCODE_NOP:
2839       break;
2840
2841    default:
2842       assert( 0 );
2843    }
2844 }
2845
2846
2847 /**
2848  * Run TGSI interpreter.
2849  * \return bitmask of "alive" quad components
2850  */
2851 uint
2852 tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
2853 {
2854    uint i;
2855    int pc = 0;
2856
2857    mach->CondMask = 0xf;
2858    mach->LoopMask = 0xf;
2859    mach->ContMask = 0xf;
2860    mach->FuncMask = 0xf;
2861    mach->ExecMask = 0xf;
2862
2863    mach->CondStackTop = 0; /* temporarily subvert this assertion */
2864    assert(mach->CondStackTop == 0);
2865    assert(mach->LoopStackTop == 0);
2866    assert(mach->ContStackTop == 0);
2867    assert(mach->CallStackTop == 0);
2868
2869    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
2870    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
2871
2872    if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
2873       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
2874       mach->Primitives[0] = 0;
2875    }
2876
2877    for (i = 0; i < QUAD_SIZE; i++) {
2878       mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
2879          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
2880          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
2881          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
2882          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
2883    }
2884
2885    /* execute declarations (interpolants) */
2886    for (i = 0; i < mach->NumDeclarations; i++) {
2887       exec_declaration( mach, mach->Declarations+i );
2888    }
2889
2890    /* execute instructions, until pc is set to -1 */
2891    while (pc != -1) {
2892       assert(pc < (int) mach->NumInstructions);
2893       exec_instruction( mach, mach->Instructions + pc, &pc );
2894    }
2895
2896 #if 0
2897    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
2898    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2899       /*
2900        * Scale back depth component.
2901        */
2902       for (i = 0; i < 4; i++)
2903          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
2904    }
2905 #endif
2906
2907    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
2908 }
2909
2910