src/gallium/auxiliary/tgsi/tgsi_exec.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * TGSI interpreter/executor.
  30  *
  31  * Flow control information:
  32  *
  33  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
  34  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
  35  * care since a condition may be true for some quad components but false
  36  * for other components.
  37  *
  38  * We basically execute all statements (even if they're in the part of
  39  * an IF/ELSE clause that's "not taken") and use a special mask to
  40  * control writing to destination registers.  This is the ExecMask.
  41  * See store_dest().
  42  *
 * The ExecMask is computed from four other masks (CondMask, LoopMask,
 * ContMask and FuncMask; see UPDATE_EXEC_MASK) which are controlled by the
 * flow control instructions (e.g. IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
  46  *
  47  *
  48  * Authors:
  49  *   Michal Krol
  50  *   Brian Paul
  51  */
  52
  53 #include "pipe/p_compiler.h"
  54 #include "pipe/p_state.h"
  55 #include "pipe/p_shader_tokens.h"
  56 #include "tgsi/tgsi_parse.h"
  57 #include "tgsi/tgsi_util.h"
  58 #include "tgsi_exec.h"
  59 #include "util/u_memory.h"
  60 #include "util/u_math.h"
  61
  62 #define FAST_MATH 1
  63
  64 #define TILE_TOP_LEFT     0
  65 #define TILE_TOP_RIGHT    1
  66 #define TILE_BOTTOM_LEFT  2
  67 #define TILE_BOTTOM_RIGHT 3
  68
  69 #define CHAN_X  0
  70 #define CHAN_Y  1
  71 #define CHAN_Z  2
  72 #define CHAN_W  3
  73
  74 /*
  75  * Shorthand locations of various utility registers (_I = Index, _C = Channel)
  76  */
  77 #define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
  78 #define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
  79 #define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
  80 #define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
  81 #define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
  82 #define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
  83 #define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
  84 #define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
  85 #define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
  86 #define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
  87 #define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
  88 #define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
  89 #define TEMP_128_I         TGSI_EXEC_TEMP_128_I
  90 #define TEMP_128_C         TGSI_EXEC_TEMP_128_C
  91 #define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
  92 #define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
  93 #define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
  94 #define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
  95 #define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
  96 #define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
  97 #define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
  98 #define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
  99 #define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
 100 #define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
 101 #define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
 102 #define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
 103 #define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
 104 #define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
 105 #define TEMP_R0            TGSI_EXEC_TEMP_R0
 106
/**
 * Non-zero when bit CHAN is set in the write mask of destination
 * register 0 of instruction INST.
 */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
 109
/**
 * Non-zero when bit CHAN is set in the write mask of destination
 * register 1 (the second destination) of instruction INST.
 */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
 112
/**
 * Loop header: steps CHAN from 0 to NUM_CHANNELS - 1 and executes the
 * statement that follows only for channels that IS_CHANNEL_ENABLED()
 * reports as set for INST.
 *
 * Expands to a "for" followed by an unbraced "if", so an "else" written
 * directly after the loop body would bind to that hidden "if".
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))
 116
/**
 * Loop header: steps CHAN from 0 to NUM_CHANNELS - 1 and executes the
 * statement that follows only for channels that IS_CHANNEL_ENABLED2()
 * reports as set for INST (i.e. tested against destination register 1).
 *
 * Expands to a "for" followed by an unbraced "if", so an "else" written
 * directly after the loop body would bind to that hidden "if".
 */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
 120
 121
 122 /** The execution mask depends on the conditional mask and the loop mask */
 123 #define UPDATE_EXEC_MASK(MACH) \
 124       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
 125
 126 /**
 127  * Initialize machine state by expanding tokens to full instructions,
 128  * allocating temporary storage, setting up constants, etc.
 129  * After this, we can call tgsi_exec_machine_run() many times.
 130  */
 131 void
 132 tgsi_exec_machine_bind_shader(
 133    struct tgsi_exec_machine *mach,
 134    const struct tgsi_token *tokens,
 135    uint numSamplers,
 136    struct tgsi_sampler **samplers)
 137 {
 138    uint k;
 139    struct tgsi_parse_context parse;
 140    struct tgsi_exec_labels *labels = &mach->Labels;
 141    struct tgsi_full_instruction *instructions;
 142    struct tgsi_full_declaration *declarations;
 143    uint maxInstructions = 10, numInstructions = 0;
 144    uint maxDeclarations = 10, numDeclarations = 0;
 145    uint instno = 0;
 146
 147 #if 0
 148    tgsi_dump(tokens, 0);
 149 #endif
 150
 151    util_init_math();
 152
 153    mach->Tokens = tokens;
 154    mach->Samplers = samplers;
 155
 156    k = tgsi_parse_init (&parse, mach->Tokens);
 157    if (k != TGSI_PARSE_OK) {
 158       debug_printf( "Problem parsing!\n" );
 159       return;
 160    }
 161
 162    mach->Processor = parse.FullHeader.Processor.Processor;
 163    mach->ImmLimit = 0;
 164    labels->count = 0;
 165
 166    declarations = (struct tgsi_full_declaration *)
 167       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
 168
 169    if (!declarations) {
 170       return;
 171    }
 172
 173    instructions = (struct tgsi_full_instruction *)
 174       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
 175
 176    if (!instructions) {
 177       FREE( declarations );
 178       return;
 179    }
 180
 181    while( !tgsi_parse_end_of_tokens( &parse ) ) {
 182       uint pointer = parse.Position;
 183       uint i;
 184
 185       tgsi_parse_token( &parse );
 186       switch( parse.FullToken.Token.Type ) {
 187       case TGSI_TOKEN_TYPE_DECLARATION:
 188          /* save expanded declaration */
 189          if (numDeclarations == maxDeclarations) {
 190             declarations = REALLOC(declarations,
 191                                    maxDeclarations
 192                                    * sizeof(struct tgsi_full_declaration),
 193                                    (maxDeclarations + 10)
 194                                    * sizeof(struct tgsi_full_declaration));
 195             maxDeclarations += 10;
 196          }
 197          memcpy(declarations + numDeclarations,
 198                 &parse.FullToken.FullDeclaration,
 199                 sizeof(declarations[0]));
 200          numDeclarations++;
 201          break;
 202
 203       case TGSI_TOKEN_TYPE_IMMEDIATE:
 204          {
 205             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
 206             assert( size % 4 == 0 );
 207             assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES );
 208
 209             for( i = 0; i < size; i++ ) {
 210                mach->Imms[mach->ImmLimit + i / 4][i % 4] =
 211                   parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
 212             }
 213             mach->ImmLimit += size / 4;
 214          }
 215          break;
 216
 217       case TGSI_TOKEN_TYPE_INSTRUCTION:
 218          assert( labels->count < MAX_LABELS );
 219
 220          labels->labels[labels->count][0] = instno;
 221          labels->labels[labels->count][1] = pointer;
 222          labels->count++;
 223
 224          /* save expanded instruction */
 225          if (numInstructions == maxInstructions) {
 226             instructions = REALLOC(instructions,
 227                                    maxInstructions
 228                                    * sizeof(struct tgsi_full_instruction),
 229                                    (maxInstructions + 10)
 230                                    * sizeof(struct tgsi_full_instruction));
 231             maxInstructions += 10;
 232          }
 233          memcpy(instructions + numInstructions,
 234                 &parse.FullToken.FullInstruction,
 235                 sizeof(instructions[0]));
 236          numInstructions++;
 237          break;
 238
 239       default:
 240          assert( 0 );
 241       }
 242    }
 243    tgsi_parse_free (&parse);
 244
 245    if (mach->Declarations) {
 246       FREE( mach->Declarations );
 247    }
 248    mach->Declarations = declarations;
 249    mach->NumDeclarations = numDeclarations;
 250
 251    if (mach->Instructions) {
 252       FREE( mach->Instructions );
 253    }
 254    mach->Instructions = instructions;
 255    mach->NumInstructions = numInstructions;
 256 }
 257
 258
 259 void
 260 tgsi_exec_machine_init(
 261    struct tgsi_exec_machine *mach )
 262 {
 263    uint i;
 264
 265    mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps);
 266    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
 267
 268    /* Setup constants. */
 269    for( i = 0; i < 4; i++ ) {
 270       mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
 271       mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
 272       mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
 273       mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
 274       mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
 275       mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
 276       mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
 277       mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
 278       mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
 279       mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
 280    }
 281 }
 282
 283
 284 void
 285 tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach)
 286 {
 287    if (mach->Instructions) {
 288       FREE(mach->Instructions);
 289       mach->Instructions = NULL;
 290       mach->NumInstructions = 0;
 291    }
 292    if (mach->Declarations) {
 293       FREE(mach->Declarations);
 294       mach->Declarations = NULL;
 295       mach->NumDeclarations = 0;
 296    }
 297 }
 298
 299
 300 static void
 301 micro_abs(
 302    union tgsi_exec_channel *dst,
 303    const union tgsi_exec_channel *src )
 304 {
 305    dst->f[0] = fabsf( src->f[0] );
 306    dst->f[1] = fabsf( src->f[1] );
 307    dst->f[2] = fabsf( src->f[2] );
 308    dst->f[3] = fabsf( src->f[3] );
 309 }
 310
 311 static void
 312 micro_add(
 313    union tgsi_exec_channel *dst,
 314    const union tgsi_exec_channel *src0,
 315    const union tgsi_exec_channel *src1 )
 316 {
 317    dst->f[0] = src0->f[0] + src1->f[0];
 318    dst->f[1] = src0->f[1] + src1->f[1];
 319    dst->f[2] = src0->f[2] + src1->f[2];
 320    dst->f[3] = src0->f[3] + src1->f[3];
 321 }
 322
 323 #if 0
 324 static void
 325 micro_iadd(
 326    union tgsi_exec_channel *dst,
 327    const union tgsi_exec_channel *src0,
 328    const union tgsi_exec_channel *src1 )
 329 {
 330    dst->i[0] = src0->i[0] + src1->i[0];
 331    dst->i[1] = src0->i[1] + src1->i[1];
 332    dst->i[2] = src0->i[2] + src1->i[2];
 333    dst->i[3] = src0->i[3] + src1->i[3];
 334 }
 335 #endif
 336
 337 static void
 338 micro_and(
 339    union tgsi_exec_channel *dst,
 340    const union tgsi_exec_channel *src0,
 341    const union tgsi_exec_channel *src1 )
 342 {
 343    dst->u[0] = src0->u[0] & src1->u[0];
 344    dst->u[1] = src0->u[1] & src1->u[1];
 345    dst->u[2] = src0->u[2] & src1->u[2];
 346    dst->u[3] = src0->u[3] & src1->u[3];
 347 }
 348
 349 static void
 350 micro_ceil(
 351    union tgsi_exec_channel *dst,
 352    const union tgsi_exec_channel *src )
 353 {
 354    dst->f[0] = ceilf( src->f[0] );
 355    dst->f[1] = ceilf( src->f[1] );
 356    dst->f[2] = ceilf( src->f[2] );
 357    dst->f[3] = ceilf( src->f[3] );
 358 }
 359
 360 static void
 361 micro_cos(
 362    union tgsi_exec_channel *dst,
 363    const union tgsi_exec_channel *src )
 364 {
 365    dst->f[0] = cosf( src->f[0] );
 366    dst->f[1] = cosf( src->f[1] );
 367    dst->f[2] = cosf( src->f[2] );
 368    dst->f[3] = cosf( src->f[3] );
 369 }
 370
 371 static void
 372 micro_ddx(
 373    union tgsi_exec_channel *dst,
 374    const union tgsi_exec_channel *src )
 375 {
 376    dst->f[0] =
 377    dst->f[1] =
 378    dst->f[2] =
 379    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 380 }
 381
 382 static void
 383 micro_ddy(
 384    union tgsi_exec_channel *dst,
 385    const union tgsi_exec_channel *src )
 386 {
 387    dst->f[0] =
 388    dst->f[1] =
 389    dst->f[2] =
 390    dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
 391 }
 392
 393 static void
 394 micro_div(
 395    union tgsi_exec_channel *dst,
 396    const union tgsi_exec_channel *src0,
 397    const union tgsi_exec_channel *src1 )
 398 {
 399    if (src1->f[0] != 0) {
 400       dst->f[0] = src0->f[0] / src1->f[0];
 401    }
 402    if (src1->f[1] != 0) {
 403       dst->f[1] = src0->f[1] / src1->f[1];
 404    }
 405    if (src1->f[2] != 0) {
 406       dst->f[2] = src0->f[2] / src1->f[2];
 407    }
 408    if (src1->f[3] != 0) {
 409       dst->f[3] = src0->f[3] / src1->f[3];
 410    }
 411 }
 412
 413 #if 0
 414 static void
 415 micro_udiv(
 416    union tgsi_exec_channel *dst,
 417    const union tgsi_exec_channel *src0,
 418    const union tgsi_exec_channel *src1 )
 419 {
 420    dst->u[0] = src0->u[0] / src1->u[0];
 421    dst->u[1] = src0->u[1] / src1->u[1];
 422    dst->u[2] = src0->u[2] / src1->u[2];
 423    dst->u[3] = src0->u[3] / src1->u[3];
 424 }
 425 #endif
 426
 427 static void
 428 micro_eq(
 429    union tgsi_exec_channel *dst,
 430    const union tgsi_exec_channel *src0,
 431    const union tgsi_exec_channel *src1,
 432    const union tgsi_exec_channel *src2,
 433    const union tgsi_exec_channel *src3 )
 434 {
 435    dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
 436    dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
 437    dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
 438    dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
 439 }
 440
 441 #if 0
 442 static void
 443 micro_ieq(
 444    union tgsi_exec_channel *dst,
 445    const union tgsi_exec_channel *src0,
 446    const union tgsi_exec_channel *src1,
 447    const union tgsi_exec_channel *src2,
 448    const union tgsi_exec_channel *src3 )
 449 {
 450    dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
 451    dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
 452    dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
 453    dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
 454 }
 455 #endif
 456
 457 static void
 458 micro_exp2(
 459    union tgsi_exec_channel *dst,
 460    const union tgsi_exec_channel *src)
 461 {
 462 #if FAST_MATH
 463    dst->f[0] = util_fast_exp2( src->f[0] );
 464    dst->f[1] = util_fast_exp2( src->f[1] );
 465    dst->f[2] = util_fast_exp2( src->f[2] );
 466    dst->f[3] = util_fast_exp2( src->f[3] );
 467 #else
 468    dst->f[0] = powf( 2.0f, src->f[0] );
 469    dst->f[1] = powf( 2.0f, src->f[1] );
 470    dst->f[2] = powf( 2.0f, src->f[2] );
 471    dst->f[3] = powf( 2.0f, src->f[3] );
 472 #endif
 473 }
 474
 475 #if 0
 476 static void
 477 micro_f2ut(
 478    union tgsi_exec_channel *dst,
 479    const union tgsi_exec_channel *src )
 480 {
 481    dst->u[0] = (uint) src->f[0];
 482    dst->u[1] = (uint) src->f[1];
 483    dst->u[2] = (uint) src->f[2];
 484    dst->u[3] = (uint) src->f[3];
 485 }
 486 #endif
 487
 488 static void
 489 micro_float_clamp(union tgsi_exec_channel *dst,
 490                   const union tgsi_exec_channel *src)
 491 {
 492    uint i;
 493
 494    for (i = 0; i < 4; i++) {
 495       if (src->f[i] > 0.0f) {
 496          if (src->f[i] > 1.884467e+019f)
 497             dst->f[i] = 1.884467e+019f;
 498          else if (src->f[i] < 5.42101e-020f)
 499             dst->f[i] = 5.42101e-020f;
 500          else
 501             dst->f[i] = src->f[i];
 502       }
 503       else {
 504          if (src->f[i] < -1.884467e+019f)
 505             dst->f[i] = -1.884467e+019f;
 506          else if (src->f[i] > -5.42101e-020f)
 507             dst->f[i] = -5.42101e-020f;
 508          else
 509             dst->f[i] = src->f[i];
 510       }
 511    }
 512 }
 513
 514 static void
 515 micro_flr(
 516    union tgsi_exec_channel *dst,
 517    const union tgsi_exec_channel *src )
 518 {
 519    dst->f[0] = floorf( src->f[0] );
 520    dst->f[1] = floorf( src->f[1] );
 521    dst->f[2] = floorf( src->f[2] );
 522    dst->f[3] = floorf( src->f[3] );
 523 }
 524
 525 static void
 526 micro_frc(
 527    union tgsi_exec_channel *dst,
 528    const union tgsi_exec_channel *src )
 529 {
 530    dst->f[0] = src->f[0] - floorf( src->f[0] );
 531    dst->f[1] = src->f[1] - floorf( src->f[1] );
 532    dst->f[2] = src->f[2] - floorf( src->f[2] );
 533    dst->f[3] = src->f[3] - floorf( src->f[3] );
 534 }
 535
 536 static void
 537 micro_i2f(
 538    union tgsi_exec_channel *dst,
 539    const union tgsi_exec_channel *src )
 540 {
 541    dst->f[0] = (float) src->i[0];
 542    dst->f[1] = (float) src->i[1];
 543    dst->f[2] = (float) src->i[2];
 544    dst->f[3] = (float) src->i[3];
 545 }
 546
 547 static void
 548 micro_lg2(
 549    union tgsi_exec_channel *dst,
 550    const union tgsi_exec_channel *src )
 551 {
 552 #if FAST_MATH
 553    dst->f[0] = util_fast_log2( src->f[0] );
 554    dst->f[1] = util_fast_log2( src->f[1] );
 555    dst->f[2] = util_fast_log2( src->f[2] );
 556    dst->f[3] = util_fast_log2( src->f[3] );
 557 #else
 558    dst->f[0] = logf( src->f[0] ) * 1.442695f;
 559    dst->f[1] = logf( src->f[1] ) * 1.442695f;
 560    dst->f[2] = logf( src->f[2] ) * 1.442695f;
 561    dst->f[3] = logf( src->f[3] ) * 1.442695f;
 562 #endif
 563 }
 564
 565 static void
 566 micro_le(
 567    union tgsi_exec_channel *dst,
 568    const union tgsi_exec_channel *src0,
 569    const union tgsi_exec_channel *src1,
 570    const union tgsi_exec_channel *src2,
 571    const union tgsi_exec_channel *src3 )
 572 {
 573    dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
 574    dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
 575    dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
 576    dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
 577 }
 578
 579 static void
 580 micro_lt(
 581    union tgsi_exec_channel *dst,
 582    const union tgsi_exec_channel *src0,
 583    const union tgsi_exec_channel *src1,
 584    const union tgsi_exec_channel *src2,
 585    const union tgsi_exec_channel *src3 )
 586 {
 587    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
 588    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
 589    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
 590    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
 591 }
 592
 593 #if 0
 594 static void
 595 micro_ilt(
 596    union tgsi_exec_channel *dst,
 597    const union tgsi_exec_channel *src0,
 598    const union tgsi_exec_channel *src1,
 599    const union tgsi_exec_channel *src2,
 600    const union tgsi_exec_channel *src3 )
 601 {
 602    dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
 603    dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
 604    dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
 605    dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
 606 }
 607 #endif
 608
 609 #if 0
 610 static void
 611 micro_ult(
 612    union tgsi_exec_channel *dst,
 613    const union tgsi_exec_channel *src0,
 614    const union tgsi_exec_channel *src1,
 615    const union tgsi_exec_channel *src2,
 616    const union tgsi_exec_channel *src3 )
 617 {
 618    dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
 619    dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
 620    dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
 621    dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
 622 }
 623 #endif
 624
 625 static void
 626 micro_max(
 627    union tgsi_exec_channel *dst,
 628    const union tgsi_exec_channel *src0,
 629    const union tgsi_exec_channel *src1 )
 630 {
 631    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
 632    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
 633    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
 634    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
 635 }
 636
 637 #if 0
 638 static void
 639 micro_imax(
 640    union tgsi_exec_channel *dst,
 641    const union tgsi_exec_channel *src0,
 642    const union tgsi_exec_channel *src1 )
 643 {
 644    dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
 645    dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
 646    dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
 647    dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
 648 }
 649 #endif
 650
 651 #if 0
 652 static void
 653 micro_umax(
 654    union tgsi_exec_channel *dst,
 655    const union tgsi_exec_channel *src0,
 656    const union tgsi_exec_channel *src1 )
 657 {
 658    dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
 659    dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
 660    dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
 661    dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
 662 }
 663 #endif
 664
 665 static void
 666 micro_min(
 667    union tgsi_exec_channel *dst,
 668    const union tgsi_exec_channel *src0,
 669    const union tgsi_exec_channel *src1 )
 670 {
 671    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
 672    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
 673    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
 674    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
 675 }
 676
 677 #if 0
 678 static void
 679 micro_imin(
 680    union tgsi_exec_channel *dst,
 681    const union tgsi_exec_channel *src0,
 682    const union tgsi_exec_channel *src1 )
 683 {
 684    dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
 685    dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
 686    dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
 687    dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
 688 }
 689 #endif
 690
 691 #if 0
 692 static void
 693 micro_umin(
 694    union tgsi_exec_channel *dst,
 695    const union tgsi_exec_channel *src0,
 696    const union tgsi_exec_channel *src1 )
 697 {
 698    dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
 699    dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
 700    dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
 701    dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
 702 }
 703 #endif
 704
 705 #if 0
 706 static void
 707 micro_umod(
 708    union tgsi_exec_channel *dst,
 709    const union tgsi_exec_channel *src0,
 710    const union tgsi_exec_channel *src1 )
 711 {
 712    dst->u[0] = src0->u[0] % src1->u[0];
 713    dst->u[1] = src0->u[1] % src1->u[1];
 714    dst->u[2] = src0->u[2] % src1->u[2];
 715    dst->u[3] = src0->u[3] % src1->u[3];
 716 }
 717 #endif
 718
 719 static void
 720 micro_mul(
 721    union tgsi_exec_channel *dst,
 722    const union tgsi_exec_channel *src0,
 723    const union tgsi_exec_channel *src1 )
 724 {
 725    dst->f[0] = src0->f[0] * src1->f[0];
 726    dst->f[1] = src0->f[1] * src1->f[1];
 727    dst->f[2] = src0->f[2] * src1->f[2];
 728    dst->f[3] = src0->f[3] * src1->f[3];
 729 }
 730
 731 #if 0
 732 static void
 733 micro_imul(
 734    union tgsi_exec_channel *dst,
 735    const union tgsi_exec_channel *src0,
 736    const union tgsi_exec_channel *src1 )
 737 {
 738    dst->i[0] = src0->i[0] * src1->i[0];
 739    dst->i[1] = src0->i[1] * src1->i[1];
 740    dst->i[2] = src0->i[2] * src1->i[2];
 741    dst->i[3] = src0->i[3] * src1->i[3];
 742 }
 743 #endif
 744
 745 #if 0
 746 static void
 747 micro_imul64(
 748    union tgsi_exec_channel *dst0,
 749    union tgsi_exec_channel *dst1,
 750    const union tgsi_exec_channel *src0,
 751    const union tgsi_exec_channel *src1 )
 752 {
 753    dst1->i[0] = src0->i[0] * src1->i[0];
 754    dst1->i[1] = src0->i[1] * src1->i[1];
 755    dst1->i[2] = src0->i[2] * src1->i[2];
 756    dst1->i[3] = src0->i[3] * src1->i[3];
 757    dst0->i[0] = 0;
 758    dst0->i[1] = 0;
 759    dst0->i[2] = 0;
 760    dst0->i[3] = 0;
 761 }
 762 #endif
 763
 764 #if 0
 765 static void
 766 micro_umul64(
 767    union tgsi_exec_channel *dst0,
 768    union tgsi_exec_channel *dst1,
 769    const union tgsi_exec_channel *src0,
 770    const union tgsi_exec_channel *src1 )
 771 {
 772    dst1->u[0] = src0->u[0] * src1->u[0];
 773    dst1->u[1] = src0->u[1] * src1->u[1];
 774    dst1->u[2] = src0->u[2] * src1->u[2];
 775    dst1->u[3] = src0->u[3] * src1->u[3];
 776    dst0->u[0] = 0;
 777    dst0->u[1] = 0;
 778    dst0->u[2] = 0;
 779    dst0->u[3] = 0;
 780 }
 781 #endif
 782
 783
 784 #if 0
 785 static void
 786 micro_movc(
 787    union tgsi_exec_channel *dst,
 788    const union tgsi_exec_channel *src0,
 789    const union tgsi_exec_channel *src1,
 790    const union tgsi_exec_channel *src2 )
 791 {
 792    dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
 793    dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
 794    dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
 795    dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
 796 }
 797 #endif
 798
 799 static void
 800 micro_neg(
 801    union tgsi_exec_channel *dst,
 802    const union tgsi_exec_channel *src )
 803 {
 804    dst->f[0] = -src->f[0];
 805    dst->f[1] = -src->f[1];
 806    dst->f[2] = -src->f[2];
 807    dst->f[3] = -src->f[3];
 808 }
 809
 810 #if 0
 811 static void
 812 micro_ineg(
 813    union tgsi_exec_channel *dst,
 814    const union tgsi_exec_channel *src )
 815 {
 816    dst->i[0] = -src->i[0];
 817    dst->i[1] = -src->i[1];
 818    dst->i[2] = -src->i[2];
 819    dst->i[3] = -src->i[3];
 820 }
 821 #endif
 822
 823 static void
 824 micro_not(
 825    union tgsi_exec_channel *dst,
 826    const union tgsi_exec_channel *src )
 827 {
 828    dst->u[0] = ~src->u[0];
 829    dst->u[1] = ~src->u[1];
 830    dst->u[2] = ~src->u[2];
 831    dst->u[3] = ~src->u[3];
 832 }
 833
 834 static void
 835 micro_or(
 836    union tgsi_exec_channel *dst,
 837    const union tgsi_exec_channel *src0,
 838    const union tgsi_exec_channel *src1 )
 839 {
 840    dst->u[0] = src0->u[0] | src1->u[0];
 841    dst->u[1] = src0->u[1] | src1->u[1];
 842    dst->u[2] = src0->u[2] | src1->u[2];
 843    dst->u[3] = src0->u[3] | src1->u[3];
 844 }
 845
 846 static void
 847 micro_pow(
 848    union tgsi_exec_channel *dst,
 849    const union tgsi_exec_channel *src0,
 850    const union tgsi_exec_channel *src1 )
 851 {
 852 #if FAST_MATH
 853    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
 854    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
 855    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
 856    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
 857 #else
 858    dst->f[0] = powf( src0->f[0], src1->f[0] );
 859    dst->f[1] = powf( src0->f[1], src1->f[1] );
 860    dst->f[2] = powf( src0->f[2], src1->f[2] );
 861    dst->f[3] = powf( src0->f[3], src1->f[3] );
 862 #endif
 863 }
 864
 865 static void
 866 micro_rnd(
 867    union tgsi_exec_channel *dst,
 868    const union tgsi_exec_channel *src )
 869 {
 870    dst->f[0] = floorf( src->f[0] + 0.5f );
 871    dst->f[1] = floorf( src->f[1] + 0.5f );
 872    dst->f[2] = floorf( src->f[2] + 0.5f );
 873    dst->f[3] = floorf( src->f[3] + 0.5f );
 874 }
 875
 876 static void
 877 micro_sgn(
 878    union tgsi_exec_channel *dst,
 879    const union tgsi_exec_channel *src )
 880 {
 881    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
 882    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
 883    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
 884    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
 885 }
 886
 887 static void
 888 micro_shl(
 889    union tgsi_exec_channel *dst,
 890    const union tgsi_exec_channel *src0,
 891    const union tgsi_exec_channel *src1 )
 892 {
 893    dst->i[0] = src0->i[0] << src1->i[0];
 894    dst->i[1] = src0->i[1] << src1->i[1];
 895    dst->i[2] = src0->i[2] << src1->i[2];
 896    dst->i[3] = src0->i[3] << src1->i[3];
 897 }
 898
 899 static void
 900 micro_ishr(
 901    union tgsi_exec_channel *dst,
 902    const union tgsi_exec_channel *src0,
 903    const union tgsi_exec_channel *src1 )
 904 {
 905    dst->i[0] = src0->i[0] >> src1->i[0];
 906    dst->i[1] = src0->i[1] >> src1->i[1];
 907    dst->i[2] = src0->i[2] >> src1->i[2];
 908    dst->i[3] = src0->i[3] >> src1->i[3];
 909 }
 910
 911 static void
 912 micro_trunc(
 913    union tgsi_exec_channel *dst,
 914    const union tgsi_exec_channel *src0 )
 915 {
 916    dst->f[0] = (float) (int) src0->f[0];
 917    dst->f[1] = (float) (int) src0->f[1];
 918    dst->f[2] = (float) (int) src0->f[2];
 919    dst->f[3] = (float) (int) src0->f[3];
 920 }
 921
 922 #if 0
 923 static void
 924 micro_ushr(
 925    union tgsi_exec_channel *dst,
 926    const union tgsi_exec_channel *src0,
 927    const union tgsi_exec_channel *src1 )
 928 {
 929    dst->u[0] = src0->u[0] >> src1->u[0];
 930    dst->u[1] = src0->u[1] >> src1->u[1];
 931    dst->u[2] = src0->u[2] >> src1->u[2];
 932    dst->u[3] = src0->u[3] >> src1->u[3];
 933 }
 934 #endif
 935
 936 static void
 937 micro_sin(
 938    union tgsi_exec_channel *dst,
 939    const union tgsi_exec_channel *src )
 940 {
 941    dst->f[0] = sinf( src->f[0] );
 942    dst->f[1] = sinf( src->f[1] );
 943    dst->f[2] = sinf( src->f[2] );
 944    dst->f[3] = sinf( src->f[3] );
 945 }
 946
 947 static void
 948 micro_sqrt( union tgsi_exec_channel *dst,
 949             const union tgsi_exec_channel *src )
 950 {
 951    dst->f[0] = sqrtf( src->f[0] );
 952    dst->f[1] = sqrtf( src->f[1] );
 953    dst->f[2] = sqrtf( src->f[2] );
 954    dst->f[3] = sqrtf( src->f[3] );
 955 }
 956
 957 static void
 958 micro_sub(
 959    union tgsi_exec_channel *dst,
 960    const union tgsi_exec_channel *src0,
 961    const union tgsi_exec_channel *src1 )
 962 {
 963    dst->f[0] = src0->f[0] - src1->f[0];
 964    dst->f[1] = src0->f[1] - src1->f[1];
 965    dst->f[2] = src0->f[2] - src1->f[2];
 966    dst->f[3] = src0->f[3] - src1->f[3];
 967 }
 968
 969 #if 0
 970 static void
 971 micro_u2f(
 972    union tgsi_exec_channel *dst,
 973    const union tgsi_exec_channel *src )
 974 {
 975    dst->f[0] = (float) src->u[0];
 976    dst->f[1] = (float) src->u[1];
 977    dst->f[2] = (float) src->u[2];
 978    dst->f[3] = (float) src->u[3];
 979 }
 980 #endif
 981
 982 static void
 983 micro_xor(
 984    union tgsi_exec_channel *dst,
 985    const union tgsi_exec_channel *src0,
 986    const union tgsi_exec_channel *src1 )
 987 {
 988    dst->u[0] = src0->u[0] ^ src1->u[0];
 989    dst->u[1] = src0->u[1] ^ src1->u[1];
 990    dst->u[2] = src0->u[2] ^ src1->u[2];
 991    dst->u[3] = src0->u[3] ^ src1->u[3];
 992 }
 993
 994 static void
 995 fetch_src_file_channel(
 996    const struct tgsi_exec_machine *mach,
 997    const uint file,
 998    const uint swizzle,
 999    const union tgsi_exec_channel *index,
1000    union tgsi_exec_channel *chan )
1001 {
1002    switch( swizzle ) {
1003    case TGSI_EXTSWIZZLE_X:
1004    case TGSI_EXTSWIZZLE_Y:
1005    case TGSI_EXTSWIZZLE_Z:
1006    case TGSI_EXTSWIZZLE_W:
1007       switch( file ) {
1008       case TGSI_FILE_CONSTANT:
1009          assert(mach->Consts);
1010          if (index->i[0] < 0)
1011             chan->f[0] = 0.0f;
1012          else
1013             chan->f[0] = mach->Consts[index->i[0]][swizzle];
1014          if (index->i[1] < 0)
1015             chan->f[1] = 0.0f;
1016          else
1017             chan->f[1] = mach->Consts[index->i[1]][swizzle];
1018          if (index->i[2] < 0)
1019             chan->f[2] = 0.0f;
1020          else
1021             chan->f[2] = mach->Consts[index->i[2]][swizzle];
1022          if (index->i[3] < 0)
1023             chan->f[3] = 0.0f;
1024          else
1025             chan->f[3] = mach->Consts[index->i[3]][swizzle];
1026          break;
1027
1028       case TGSI_FILE_INPUT:
1029          chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
1030          chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
1031          chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
1032          chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
1033          break;
1034
1035       case TGSI_FILE_TEMPORARY:
1036          assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
1037          chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1038          chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1039          chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1040          chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1041          break;
1042
1043       case TGSI_FILE_IMMEDIATE:
1044          assert( index->i[0] < (int) mach->ImmLimit );
1045          chan->f[0] = mach->Imms[index->i[0]][swizzle];
1046          assert( index->i[1] < (int) mach->ImmLimit );
1047          chan->f[1] = mach->Imms[index->i[1]][swizzle];
1048          assert( index->i[2] < (int) mach->ImmLimit );
1049          chan->f[2] = mach->Imms[index->i[2]][swizzle];
1050          assert( index->i[3] < (int) mach->ImmLimit );
1051          chan->f[3] = mach->Imms[index->i[3]][swizzle];
1052          break;
1053
1054       case TGSI_FILE_ADDRESS:
1055          chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1056          chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1057          chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1058          chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1059          break;
1060
1061       case TGSI_FILE_OUTPUT:
1062          /* vertex/fragment output vars can be read too */
1063          chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1064          chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1065          chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1066          chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1067          break;
1068
1069       default:
1070          assert( 0 );
1071       }
1072       break;
1073
1074    case TGSI_EXTSWIZZLE_ZERO:
1075       *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
1076       break;
1077
1078    case TGSI_EXTSWIZZLE_ONE:
1079       *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
1080       break;
1081
1082    default:
1083       assert( 0 );
1084    }
1085 }
1086
1087 static void
1088 fetch_source(
1089    const struct tgsi_exec_machine *mach,
1090    union tgsi_exec_channel *chan,
1091    const struct tgsi_full_src_register *reg,
1092    const uint chan_index )
1093 {
1094    union tgsi_exec_channel index;
1095    uint swizzle;
1096
1097    /* We start with a direct index into a register file.
1098     *
1099     *    file[1],
1100     *    where:
1101     *       file = SrcRegister.File
1102     *       [1] = SrcRegister.Index
1103     */
1104    index.i[0] =
1105    index.i[1] =
1106    index.i[2] =
1107    index.i[3] = reg->SrcRegister.Index;
1108
1109    /* There is an extra source register that indirectly subscripts
1110     * a register file. The direct index now becomes an offset
1111     * that is being added to the indirect register.
1112     *
1113     *    file[ind[2].x+1],
1114     *    where:
1115     *       ind = SrcRegisterInd.File
1116     *       [2] = SrcRegisterInd.Index
1117     *       .x = SrcRegisterInd.SwizzleX
1118     */
1119    if (reg->SrcRegister.Indirect) {
1120       union tgsi_exec_channel index2;
1121       union tgsi_exec_channel indir_index;
1122       const uint execmask = mach->ExecMask;
1123       uint i;
1124
1125       /* which address register (always zero now) */
1126       index2.i[0] =
1127       index2.i[1] =
1128       index2.i[2] =
1129       index2.i[3] = reg->SrcRegisterInd.Index;
1130
1131       /* get current value of address register[swizzle] */
1132       swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
1133       fetch_src_file_channel(
1134          mach,
1135          reg->SrcRegisterInd.File,
1136          swizzle,
1137          &index2,
1138          &indir_index );
1139
1140       /* add value of address register to the offset */
1141       index.i[0] += (int) indir_index.f[0];
1142       index.i[1] += (int) indir_index.f[1];
1143       index.i[2] += (int) indir_index.f[2];
1144       index.i[3] += (int) indir_index.f[3];
1145
1146       /* for disabled execution channels, zero-out the index to
1147        * avoid using a potential garbage value.
1148        */
1149       for (i = 0; i < QUAD_SIZE; i++) {
1150          if ((execmask & (1 << i)) == 0)
1151             index.i[i] = 0;
1152       }
1153    }
1154
1155    /* There is an extra source register that is a second
1156     * subscript to a register file. Effectively it means that
1157     * the register file is actually a 2D array of registers.
1158     *
1159     *    file[1][3] == file[1*sizeof(file[1])+3],
1160     *    where:
1161     *       [3] = SrcRegisterDim.Index
1162     */
1163    if (reg->SrcRegister.Dimension) {
1164       /* The size of the first-order array depends on the register file type.
1165        * We need to multiply the index to the first array to get an effective,
1166        * "flat" index that points to the beginning of the second-order array.
1167        */
1168       switch (reg->SrcRegister.File) {
1169       case TGSI_FILE_INPUT:
1170          index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1171          index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1172          index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1173          index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1174          break;
1175       case TGSI_FILE_CONSTANT:
1176          index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1177          index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1178          index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1179          index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1180          break;
1181       default:
1182          assert( 0 );
1183       }
1184
1185       index.i[0] += reg->SrcRegisterDim.Index;
1186       index.i[1] += reg->SrcRegisterDim.Index;
1187       index.i[2] += reg->SrcRegisterDim.Index;
1188       index.i[3] += reg->SrcRegisterDim.Index;
1189
1190       /* Again, the second subscript index can be addressed indirectly
1191        * identically to the first one.
1192        * Nothing stops us from indirectly addressing the indirect register,
1193        * but there is no need for that, so we won't exercise it.
1194        *
1195        *    file[1][ind[4].y+3],
1196        *    where:
1197        *       ind = SrcRegisterDimInd.File
1198        *       [4] = SrcRegisterDimInd.Index
1199        *       .y = SrcRegisterDimInd.SwizzleX
1200        */
1201       if (reg->SrcRegisterDim.Indirect) {
1202          union tgsi_exec_channel index2;
1203          union tgsi_exec_channel indir_index;
1204          const uint execmask = mach->ExecMask;
1205          uint i;
1206
1207          index2.i[0] =
1208          index2.i[1] =
1209          index2.i[2] =
1210          index2.i[3] = reg->SrcRegisterDimInd.Index;
1211
1212          swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
1213          fetch_src_file_channel(
1214             mach,
1215             reg->SrcRegisterDimInd.File,
1216             swizzle,
1217             &index2,
1218             &indir_index );
1219
1220          index.i[0] += (int) indir_index.f[0];
1221          index.i[1] += (int) indir_index.f[1];
1222          index.i[2] += (int) indir_index.f[2];
1223          index.i[3] += (int) indir_index.f[3];
1224
1225          /* for disabled execution channels, zero-out the index to
1226           * avoid using a potential garbage value.
1227           */
1228          for (i = 0; i < QUAD_SIZE; i++) {
1229             if ((execmask & (1 << i)) == 0)
1230                index.i[i] = 0;
1231          }
1232       }
1233
1234       /* If by any chance there was a need for a 3D array of register
1235        * files, we would have to check whether SrcRegisterDim is followed
1236        * by a dimension register and continue the saga.
1237        */
1238    }
1239
1240    swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1241    fetch_src_file_channel(
1242       mach,
1243       reg->SrcRegister.File,
1244       swizzle,
1245       &index,
1246       chan );
1247
1248    switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1249    case TGSI_UTIL_SIGN_CLEAR:
1250       micro_abs( chan, chan );
1251       break;
1252
1253    case TGSI_UTIL_SIGN_SET:
1254       micro_abs( chan, chan );
1255       micro_neg( chan, chan );
1256       break;
1257
1258    case TGSI_UTIL_SIGN_TOGGLE:
1259       micro_neg( chan, chan );
1260       break;
1261
1262    case TGSI_UTIL_SIGN_KEEP:
1263       break;
1264    }
1265
1266    if (reg->SrcRegisterExtMod.Complement) {
1267       micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
1268    }
1269 }
1270
1271 static void
1272 store_dest(
1273    struct tgsi_exec_machine *mach,
1274    const union tgsi_exec_channel *chan,
1275    const struct tgsi_full_dst_register *reg,
1276    const struct tgsi_full_instruction *inst,
1277    uint chan_index )
1278 {
1279    uint i;
1280    union tgsi_exec_channel null;
1281    union tgsi_exec_channel *dst;
1282    uint execmask = mach->ExecMask;
1283
1284    switch (reg->DstRegister.File) {
1285    case TGSI_FILE_NULL:
1286       dst = &null;
1287       break;
1288
1289    case TGSI_FILE_OUTPUT:
1290       dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1291                            + reg->DstRegister.Index].xyzw[chan_index];
1292       break;
1293
1294    case TGSI_FILE_TEMPORARY:
1295       assert( reg->DstRegister.Index < TGSI_EXEC_NUM_TEMPS );
1296       dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
1297       break;
1298
1299    case TGSI_FILE_ADDRESS:
1300       dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
1301       break;
1302
1303    default:
1304       assert( 0 );
1305       return;
1306    }
1307
1308    if (inst->InstructionExtNv.CondFlowEnable) {
1309       union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1310       uint swizzle;
1311       uint shift;
1312       uint mask;
1313       uint test;
1314
1315       /* Only CC0 supported.
1316        */
1317       assert( inst->InstructionExtNv.CondFlowIndex < 1 );
1318
1319       switch (chan_index) {
1320       case CHAN_X:
1321          swizzle = inst->InstructionExtNv.CondSwizzleX;
1322          break;
1323       case CHAN_Y:
1324          swizzle = inst->InstructionExtNv.CondSwizzleY;
1325          break;
1326       case CHAN_Z:
1327          swizzle = inst->InstructionExtNv.CondSwizzleZ;
1328          break;
1329       case CHAN_W:
1330          swizzle = inst->InstructionExtNv.CondSwizzleW;
1331          break;
1332       default:
1333          assert( 0 );
1334          return;
1335       }
1336
1337       switch (swizzle) {
1338       case TGSI_SWIZZLE_X:
1339          shift = TGSI_EXEC_CC_X_SHIFT;
1340          mask = TGSI_EXEC_CC_X_MASK;
1341          break;
1342       case TGSI_SWIZZLE_Y:
1343          shift = TGSI_EXEC_CC_Y_SHIFT;
1344          mask = TGSI_EXEC_CC_Y_MASK;
1345          break;
1346       case TGSI_SWIZZLE_Z:
1347          shift = TGSI_EXEC_CC_Z_SHIFT;
1348          mask = TGSI_EXEC_CC_Z_MASK;
1349          break;
1350       case TGSI_SWIZZLE_W:
1351          shift = TGSI_EXEC_CC_W_SHIFT;
1352          mask = TGSI_EXEC_CC_W_MASK;
1353          break;
1354       default:
1355          assert( 0 );
1356          return;
1357       }
1358
1359       switch (inst->InstructionExtNv.CondMask) {
1360       case TGSI_CC_GT:
1361          test = ~(TGSI_EXEC_CC_GT << shift) & mask;
1362          for (i = 0; i < QUAD_SIZE; i++)
1363             if (cc->u[i] & test)
1364                execmask &= ~(1 << i);
1365          break;
1366
1367       case TGSI_CC_EQ:
1368          test = ~(TGSI_EXEC_CC_EQ << shift) & mask;
1369          for (i = 0; i < QUAD_SIZE; i++)
1370             if (cc->u[i] & test)
1371                execmask &= ~(1 << i);
1372          break;
1373
1374       case TGSI_CC_LT:
1375          test = ~(TGSI_EXEC_CC_LT << shift) & mask;
1376          for (i = 0; i < QUAD_SIZE; i++)
1377             if (cc->u[i] & test)
1378                execmask &= ~(1 << i);
1379          break;
1380
1381       case TGSI_CC_GE:
1382          test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_EQ) << shift) & mask;
1383          for (i = 0; i < QUAD_SIZE; i++)
1384             if (cc->u[i] & test)
1385                execmask &= ~(1 << i);
1386          break;
1387
1388       case TGSI_CC_LE:
1389          test = ~((TGSI_EXEC_CC_LT | TGSI_EXEC_CC_EQ) << shift) & mask;
1390          for (i = 0; i < QUAD_SIZE; i++)
1391             if (cc->u[i] & test)
1392                execmask &= ~(1 << i);
1393          break;
1394
1395       case TGSI_CC_NE:
1396          test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_LT | TGSI_EXEC_CC_UN) << shift) & mask;
1397          for (i = 0; i < QUAD_SIZE; i++)
1398             if (cc->u[i] & test)
1399                execmask &= ~(1 << i);
1400          break;
1401
1402       case TGSI_CC_TR:
1403          break;
1404
1405       case TGSI_CC_FL:
1406          for (i = 0; i < QUAD_SIZE; i++)
1407             execmask &= ~(1 << i);
1408          break;
1409
1410       default:
1411          assert( 0 );
1412          return;
1413       }
1414    }
1415
1416    switch (inst->Instruction.Saturate) {
1417    case TGSI_SAT_NONE:
1418       for (i = 0; i < QUAD_SIZE; i++)
1419          if (execmask & (1 << i))
1420             dst->i[i] = chan->i[i];
1421       break;
1422
1423    case TGSI_SAT_ZERO_ONE:
1424       for (i = 0; i < QUAD_SIZE; i++)
1425          if (execmask & (1 << i)) {
1426             if (chan->f[i] < 0.0f)
1427                dst->f[i] = 0.0f;
1428             else if (chan->f[i] > 1.0f)
1429                dst->f[i] = 1.0f;
1430             else
1431                dst->i[i] = chan->i[i];
1432          }
1433       break;
1434
1435    case TGSI_SAT_MINUS_PLUS_ONE:
1436       for (i = 0; i < QUAD_SIZE; i++)
1437          if (execmask & (1 << i)) {
1438             if (chan->f[i] < -1.0f)
1439                dst->f[i] = -1.0f;
1440             else if (chan->f[i] > 1.0f)
1441                dst->f[i] = 1.0f;
1442             else
1443                dst->i[i] = chan->i[i];
1444          }
1445       break;
1446
1447    default:
1448       assert( 0 );
1449    }
1450
1451    if (inst->InstructionExtNv.CondDstUpdate) {
1452       union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1453       uint shift;
1454       uint mask;
1455
1456       /* Only CC0 supported.
1457        */
1458       assert( inst->InstructionExtNv.CondDstIndex < 1 );
1459
1460       switch (chan_index) {
1461       case CHAN_X:
1462          shift = TGSI_EXEC_CC_X_SHIFT;
1463          mask = ~TGSI_EXEC_CC_X_MASK;
1464          break;
1465       case CHAN_Y:
1466          shift = TGSI_EXEC_CC_Y_SHIFT;
1467          mask = ~TGSI_EXEC_CC_Y_MASK;
1468          break;
1469       case CHAN_Z:
1470          shift = TGSI_EXEC_CC_Z_SHIFT;
1471          mask = ~TGSI_EXEC_CC_Z_MASK;
1472          break;
1473       case CHAN_W:
1474          shift = TGSI_EXEC_CC_W_SHIFT;
1475          mask = ~TGSI_EXEC_CC_W_MASK;
1476          break;
1477       default:
1478          assert( 0 );
1479          return;
1480       }
1481
1482       for (i = 0; i < QUAD_SIZE; i++)
1483          if (execmask & (1 << i)) {
1484             cc->u[i] &= mask;
1485             if (dst->f[i] < 0.0f)
1486                cc->u[i] |= TGSI_EXEC_CC_LT << shift;
1487             else if (dst->f[i] > 0.0f)
1488                cc->u[i] |= TGSI_EXEC_CC_GT << shift;
1489             else if (dst->f[i] == 0.0f)
1490                cc->u[i] |= TGSI_EXEC_CC_EQ << shift;
1491             else
1492                cc->u[i] |= TGSI_EXEC_CC_UN << shift;
1493          }
1494    }
1495 }
1496
/* Fetch channel CHANNEL of source operand number SRC of the current
 * instruction into *DST. Requires `mach` and `inst` in the calling scope.
 */
#define FETCH(DST, SRC, CHANNEL) \
   fetch_source( mach, (DST), &inst->FullSrcRegisters[SRC], (CHANNEL) )

/* Store *VALUE to channel CHANNEL of destination operand number DEST of
 * the current instruction. Requires `mach` and `inst` in the calling scope.
 */
#define STORE(VALUE, DEST, CHANNEL) \
   store_dest( mach, (VALUE), &inst->FullDstRegisters[DEST], inst, (CHANNEL) )
1502
1503
1504 /**
1505  * Execute ARB-style KIL which is predicated by a src register.
1506  * Kill fragment if any of the four values is less than zero.
1507  */
1508 static void
1509 exec_kil(struct tgsi_exec_machine *mach,
1510          const struct tgsi_full_instruction *inst)
1511 {
1512    uint uniquemask;
1513    uint chan_index;
1514    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1515    union tgsi_exec_channel r[1];
1516
1517    /* This mask stores component bits that were already tested. Note that
1518     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1519     * tested. */
1520    uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1521
1522    for (chan_index = 0; chan_index < 4; chan_index++)
1523    {
1524       uint swizzle;
1525       uint i;
1526
1527       /* unswizzle channel */
1528       swizzle = tgsi_util_get_full_src_register_extswizzle (
1529                         &inst->FullSrcRegisters[0],
1530                         chan_index);
1531
1532       /* check if the component has not been already tested */
1533       if (uniquemask & (1 << swizzle))
1534          continue;
1535       uniquemask |= 1 << swizzle;
1536
1537       FETCH(&r[0], 0, chan_index);
1538       for (i = 0; i < 4; i++)
1539          if (r[0].f[i] < 0.0f)
1540             kilmask |= 1 << i;
1541    }
1542
1543    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1544 }
1545
1546 /**
1547  * Execute NVIDIA-style KIL which is predicated by a condition code.
1548  * Kill fragment if the condition code is TRUE.
1549  */
1550 static void
1551 exec_kilp(struct tgsi_exec_machine *mach,
1552           const struct tgsi_full_instruction *inst)
1553 {
1554    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1555
1556    if (inst->InstructionExtNv.CondFlowEnable) {
1557       uint swizzle[4];
1558       uint chan_index;
1559
1560       kilmask = 0x0;
1561
1562       swizzle[0] = inst->InstructionExtNv.CondSwizzleX;
1563       swizzle[1] = inst->InstructionExtNv.CondSwizzleY;
1564       swizzle[2] = inst->InstructionExtNv.CondSwizzleZ;
1565       swizzle[3] = inst->InstructionExtNv.CondSwizzleW;
1566
1567       for (chan_index = 0; chan_index < 4; chan_index++)
1568       {
1569          uint i;
1570
1571          for (i = 0; i < 4; i++) {
1572             /* TODO: evaluate the condition code */
1573             if (0)
1574                kilmask |= 1 << i;
1575          }
1576       }
1577    }
1578    else {
1579       /* "unconditional" kil */
1580       kilmask = mach->ExecMask;
1581    }
1582    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1583 }
1584
1585
1586 /*
1587  * Fetch a four texture samples using STR texture coordinates.
1588  */
1589 static void
1590 fetch_texel( struct tgsi_sampler *sampler,
1591              const union tgsi_exec_channel *s,
1592              const union tgsi_exec_channel *t,
1593              const union tgsi_exec_channel *p,
1594              float lodbias,  /* XXX should be float[4] */
1595              union tgsi_exec_channel *r,
1596              union tgsi_exec_channel *g,
1597              union tgsi_exec_channel *b,
1598              union tgsi_exec_channel *a )
1599 {
1600    uint j;
1601    float rgba[NUM_CHANNELS][QUAD_SIZE];
1602
1603    sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1604
1605    for (j = 0; j < 4; j++) {
1606       r->f[j] = rgba[0][j];
1607       g->f[j] = rgba[1][j];
1608       b->f[j] = rgba[2][j];
1609       a->f[j] = rgba[3][j];
1610    }
1611 }
1612
1613
1614 static void
1615 exec_tex(struct tgsi_exec_machine *mach,
1616          const struct tgsi_full_instruction *inst,
1617          boolean biasLod,
1618          boolean projected)
1619 {
1620    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1621    union tgsi_exec_channel r[4];
1622    uint chan_index;
1623    float lodBias;
1624
1625    /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1626
1627    switch (inst->InstructionExtTexture.Texture) {
1628    case TGSI_TEXTURE_1D:
1629    case TGSI_TEXTURE_SHADOW1D:
1630
1631       FETCH(&r[0], 0, CHAN_X);
1632
1633       if (projected) {
1634          FETCH(&r[1], 0, CHAN_W);
1635          micro_div( &r[0], &r[0], &r[1] );
1636       }
1637
1638       if (biasLod) {
1639          FETCH(&r[1], 0, CHAN_W);
1640          lodBias = r[2].f[0];
1641       }
1642       else
1643          lodBias = 0.0;
1644
1645       fetch_texel(mach->Samplers[unit],
1646                   &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
1647                   &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1648       break;
1649
1650    case TGSI_TEXTURE_2D:
1651    case TGSI_TEXTURE_RECT:
1652    case TGSI_TEXTURE_SHADOW2D:
1653    case TGSI_TEXTURE_SHADOWRECT:
1654
1655       FETCH(&r[0], 0, CHAN_X);
1656       FETCH(&r[1], 0, CHAN_Y);
1657       FETCH(&r[2], 0, CHAN_Z);
1658
1659       if (projected) {
1660          FETCH(&r[3], 0, CHAN_W);
1661          micro_div( &r[0], &r[0], &r[3] );
1662          micro_div( &r[1], &r[1], &r[3] );
1663          micro_div( &r[2], &r[2], &r[3] );
1664       }
1665
1666       if (biasLod) {
1667          FETCH(&r[3], 0, CHAN_W);
1668          lodBias = r[3].f[0];
1669       }
1670       else
1671          lodBias = 0.0;
1672
1673       fetch_texel(mach->Samplers[unit],
1674                   &r[0], &r[1], &r[2], lodBias,  /* inputs */
1675                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1676       break;
1677
1678    case TGSI_TEXTURE_3D:
1679    case TGSI_TEXTURE_CUBE:
1680
1681       FETCH(&r[0], 0, CHAN_X);
1682       FETCH(&r[1], 0, CHAN_Y);
1683       FETCH(&r[2], 0, CHAN_Z);
1684
1685       if (projected) {
1686          FETCH(&r[3], 0, CHAN_W);
1687          micro_div( &r[0], &r[0], &r[3] );
1688          micro_div( &r[1], &r[1], &r[3] );
1689          micro_div( &r[2], &r[2], &r[3] );
1690       }
1691
1692       if (biasLod) {
1693          FETCH(&r[3], 0, CHAN_W);
1694          lodBias = r[3].f[0];
1695       }
1696       else
1697          lodBias = 0.0;
1698
1699       fetch_texel(mach->Samplers[unit],
1700                   &r[0], &r[1], &r[2], lodBias,
1701                   &r[0], &r[1], &r[2], &r[3]);
1702       break;
1703
1704    default:
1705       assert (0);
1706    }
1707
1708    FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1709       STORE( &r[chan_index], 0, chan_index );
1710    }
1711 }
1712
1713
1714 /**
1715  * Evaluate a constant-valued coefficient at the position of the
1716  * current quad.
1717  */
1718 static void
1719 eval_constant_coef(
1720    struct tgsi_exec_machine *mach,
1721    unsigned attrib,
1722    unsigned chan )
1723 {
1724    unsigned i;
1725
1726    for( i = 0; i < QUAD_SIZE; i++ ) {
1727       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1728    }
1729 }
1730
1731 /**
1732  * Evaluate a linear-valued coefficient at the position of the
1733  * current quad.
1734  */
1735 static void
1736 eval_linear_coef(
1737    struct tgsi_exec_machine *mach,
1738    unsigned attrib,
1739    unsigned chan )
1740 {
1741    const float x = mach->QuadPos.xyzw[0].f[0];
1742    const float y = mach->QuadPos.xyzw[1].f[0];
1743    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1744    const float dady = mach->InterpCoefs[attrib].dady[chan];
1745    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1746    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1747    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1748    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1749    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1750 }
1751
1752 /**
1753  * Evaluate a perspective-valued coefficient at the position of the
1754  * current quad.
1755  */
1756 static void
1757 eval_perspective_coef(
1758    struct tgsi_exec_machine *mach,
1759    unsigned attrib,
1760    unsigned chan )
1761 {
1762    const float x = mach->QuadPos.xyzw[0].f[0];
1763    const float y = mach->QuadPos.xyzw[1].f[0];
1764    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1765    const float dady = mach->InterpCoefs[attrib].dady[chan];
1766    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1767    const float *w = mach->QuadPos.xyzw[3].f;
1768    /* divide by W here */
1769    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1770    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1771    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1772    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1773 }
1774
1775
1776 typedef void (* eval_coef_func)(
1777    struct tgsi_exec_machine *mach,
1778    unsigned attrib,
1779    unsigned chan );
1780
1781 static void
1782 exec_declaration(
1783    struct tgsi_exec_machine *mach,
1784    const struct tgsi_full_declaration *decl )
1785 {
1786    if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
1787       if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1788          unsigned first, last, mask;
1789          eval_coef_func eval;
1790
1791          first = decl->DeclarationRange.First;
1792          last = decl->DeclarationRange.Last;
1793          mask = decl->Declaration.UsageMask;
1794
1795          switch( decl->Declaration.Interpolate ) {
1796          case TGSI_INTERPOLATE_CONSTANT:
1797             eval = eval_constant_coef;
1798             break;
1799
1800          case TGSI_INTERPOLATE_LINEAR:
1801             eval = eval_linear_coef;
1802             break;
1803
1804          case TGSI_INTERPOLATE_PERSPECTIVE:
1805             eval = eval_perspective_coef;
1806             break;
1807
1808          default:
1809             eval = NULL;
1810             assert( 0 );
1811          }
1812
1813          if( mask == TGSI_WRITEMASK_XYZW ) {
1814             unsigned i, j;
1815
1816             for( i = first; i <= last; i++ ) {
1817                for( j = 0; j < NUM_CHANNELS; j++ ) {
1818                   eval( mach, i, j );
1819                }
1820             }
1821          }
1822          else {
1823             unsigned i, j;
1824
1825             for( j = 0; j < NUM_CHANNELS; j++ ) {
1826                if( mask & (1 << j) ) {
1827                   for( i = first; i <= last; i++ ) {
1828                      eval( mach, i, j );
1829                   }
1830                }
1831             }
1832          }
1833       }
1834    }
1835 }
1836
1837 static void
1838 exec_instruction(
1839    struct tgsi_exec_machine *mach,
1840    const struct tgsi_full_instruction *inst,
1841    int *pc )
1842 {
1843    uint chan_index;
1844    union tgsi_exec_channel r[8];
1845
1846    (*pc)++;
1847
1848    switch (inst->Instruction.Opcode) {
1849    case TGSI_OPCODE_ARL:
1850    /* TGSI_OPCODE_FLOOR */
1851    /* TGSI_OPCODE_FLR */
1852       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1853          FETCH( &r[0], 0, chan_index );
1854          micro_flr( &r[0], &r[0] );
1855          STORE( &r[0], 0, chan_index );
1856       }
1857       break;
1858
1859    case TGSI_OPCODE_MOV:
1860    case TGSI_OPCODE_SWZ:
1861       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1862          FETCH( &r[0], 0, chan_index );
1863          STORE( &r[0], 0, chan_index );
1864       }
1865       break;
1866
1867    case TGSI_OPCODE_LIT:
1868       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1869          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
1870       }
1871
1872       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1873          FETCH( &r[0], 0, CHAN_X );
1874          if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1875             micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1876             STORE( &r[0], 0, CHAN_Y );
1877          }
1878
1879          if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1880             FETCH( &r[1], 0, CHAN_Y );
1881             micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1882
1883             FETCH( &r[2], 0, CHAN_W );
1884             micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
1885             micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
1886             micro_pow( &r[1], &r[1], &r[2] );
1887             micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1888             STORE( &r[0], 0, CHAN_Z );
1889          }
1890       }
1891
1892       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1893          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1894       }
1895       break;
1896
1897    case TGSI_OPCODE_RCP:
1898    /* TGSI_OPCODE_RECIP */
1899       FETCH( &r[0], 0, CHAN_X );
1900       micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1901       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1902          STORE( &r[0], 0, chan_index );
1903       }
1904       break;
1905
1906    case TGSI_OPCODE_RSQ:
1907    /* TGSI_OPCODE_RECIPSQRT */
1908       FETCH( &r[0], 0, CHAN_X );
1909       micro_abs( &r[0], &r[0] );
1910       micro_sqrt( &r[0], &r[0] );
1911       micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1912       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1913          STORE( &r[0], 0, chan_index );
1914       }
1915       break;
1916
1917    case TGSI_OPCODE_EXP:
1918       FETCH( &r[0], 0, CHAN_X );
1919       micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
1920       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1921          micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
1922          STORE( &r[2], 0, CHAN_X );        /* store r2 */
1923       }
1924       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1925          micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
1926          STORE( &r[2], 0, CHAN_Y );        /* store r2 */
1927       }
1928       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1929          micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
1930          STORE( &r[2], 0, CHAN_Z );        /* store r2 */
1931       }
1932       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1933          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1934       }
1935       break;
1936
1937    case TGSI_OPCODE_LOG:
1938       FETCH( &r[0], 0, CHAN_X );
1939       micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
1940       micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
1941       micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
1942       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1943          STORE( &r[0], 0, CHAN_X );
1944       }
1945       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1946          micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
1947          micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
1948          STORE( &r[0], 0, CHAN_Y );
1949       }
1950       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1951          STORE( &r[1], 0, CHAN_Z );
1952       }
1953       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1954          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1955       }
1956       break;
1957
1958    case TGSI_OPCODE_MUL:
1959       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
1960       {
1961          FETCH(&r[0], 0, chan_index);
1962          FETCH(&r[1], 1, chan_index);
1963
1964          micro_mul( &r[0], &r[0], &r[1] );
1965
1966          STORE(&r[0], 0, chan_index);
1967       }
1968       break;
1969
1970    case TGSI_OPCODE_ADD:
1971       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1972          FETCH( &r[0], 0, chan_index );
1973          FETCH( &r[1], 1, chan_index );
1974          micro_add( &r[0], &r[0], &r[1] );
1975          STORE( &r[0], 0, chan_index );
1976       }
1977       break;
1978
1979    case TGSI_OPCODE_DP3:
1980    /* TGSI_OPCODE_DOT3 */
1981       FETCH( &r[0], 0, CHAN_X );
1982       FETCH( &r[1], 1, CHAN_X );
1983       micro_mul( &r[0], &r[0], &r[1] );
1984
1985       FETCH( &r[1], 0, CHAN_Y );
1986       FETCH( &r[2], 1, CHAN_Y );
1987       micro_mul( &r[1], &r[1], &r[2] );
1988       micro_add( &r[0], &r[0], &r[1] );
1989
1990       FETCH( &r[1], 0, CHAN_Z );
1991       FETCH( &r[2], 1, CHAN_Z );
1992       micro_mul( &r[1], &r[1], &r[2] );
1993       micro_add( &r[0], &r[0], &r[1] );
1994
1995       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1996          STORE( &r[0], 0, chan_index );
1997       }
1998       break;
1999
2000     case TGSI_OPCODE_DP4:
2001     /* TGSI_OPCODE_DOT4 */
2002        FETCH(&r[0], 0, CHAN_X);
2003        FETCH(&r[1], 1, CHAN_X);
2004
2005        micro_mul( &r[0], &r[0], &r[1] );
2006
2007        FETCH(&r[1], 0, CHAN_Y);
2008        FETCH(&r[2], 1, CHAN_Y);
2009
2010        micro_mul( &r[1], &r[1], &r[2] );
2011        micro_add( &r[0], &r[0], &r[1] );
2012
2013        FETCH(&r[1], 0, CHAN_Z);
2014        FETCH(&r[2], 1, CHAN_Z);
2015
2016        micro_mul( &r[1], &r[1], &r[2] );
2017        micro_add( &r[0], &r[0], &r[1] );
2018
2019        FETCH(&r[1], 0, CHAN_W);
2020        FETCH(&r[2], 1, CHAN_W);
2021
2022        micro_mul( &r[1], &r[1], &r[2] );
2023        micro_add( &r[0], &r[0], &r[1] );
2024
2025       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2026          STORE( &r[0], 0, chan_index );
2027       }
2028       break;
2029
2030    case TGSI_OPCODE_DST:
2031       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2032          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2033       }
2034
2035       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2036          FETCH( &r[0], 0, CHAN_Y );
2037          FETCH( &r[1], 1, CHAN_Y);
2038          micro_mul( &r[0], &r[0], &r[1] );
2039          STORE( &r[0], 0, CHAN_Y );
2040       }
2041
2042       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2043          FETCH( &r[0], 0, CHAN_Z );
2044          STORE( &r[0], 0, CHAN_Z );
2045       }
2046
2047       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2048          FETCH( &r[0], 1, CHAN_W );
2049          STORE( &r[0], 0, CHAN_W );
2050       }
2051       break;
2052
2053    case TGSI_OPCODE_MIN:
2054       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2055          FETCH(&r[0], 0, chan_index);
2056          FETCH(&r[1], 1, chan_index);
2057
2058          /* XXX use micro_min()?? */
2059          micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
2060
2061          STORE(&r[0], 0, chan_index);
2062       }
2063       break;
2064
2065    case TGSI_OPCODE_MAX:
2066       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2067          FETCH(&r[0], 0, chan_index);
2068          FETCH(&r[1], 1, chan_index);
2069
2070          /* XXX use micro_max()?? */
2071          micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
2072
2073          STORE(&r[0], 0, chan_index );
2074       }
2075       break;
2076
2077    case TGSI_OPCODE_SLT:
2078    /* TGSI_OPCODE_SETLT */
2079       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2080          FETCH( &r[0], 0, chan_index );
2081          FETCH( &r[1], 1, chan_index );
2082          micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2083          STORE( &r[0], 0, chan_index );
2084       }
2085       break;
2086
2087    case TGSI_OPCODE_SGE:
2088    /* TGSI_OPCODE_SETGE */
2089       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2090          FETCH( &r[0], 0, chan_index );
2091          FETCH( &r[1], 1, chan_index );
2092          micro_le( &r[0], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2093          STORE( &r[0], 0, chan_index );
2094       }
2095       break;
2096
2097    case TGSI_OPCODE_MAD:
2098    /* TGSI_OPCODE_MADD */
2099       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2100          FETCH( &r[0], 0, chan_index );
2101          FETCH( &r[1], 1, chan_index );
2102          micro_mul( &r[0], &r[0], &r[1] );
2103          FETCH( &r[1], 2, chan_index );
2104          micro_add( &r[0], &r[0], &r[1] );
2105          STORE( &r[0], 0, chan_index );
2106       }
2107       break;
2108
2109    case TGSI_OPCODE_SUB:
2110       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2111          FETCH(&r[0], 0, chan_index);
2112          FETCH(&r[1], 1, chan_index);
2113
2114          micro_sub( &r[0], &r[0], &r[1] );
2115
2116          STORE(&r[0], 0, chan_index);
2117       }
2118       break;
2119
2120    case TGSI_OPCODE_LERP:
2121    /* TGSI_OPCODE_LRP */
2122       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2123          FETCH(&r[0], 0, chan_index);
2124          FETCH(&r[1], 1, chan_index);
2125          FETCH(&r[2], 2, chan_index);
2126
2127          micro_sub( &r[1], &r[1], &r[2] );
2128          micro_mul( &r[0], &r[0], &r[1] );
2129          micro_add( &r[0], &r[0], &r[2] );
2130
2131          STORE(&r[0], 0, chan_index);
2132       }
2133       break;
2134
2135    case TGSI_OPCODE_CND:
2136       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2137          FETCH(&r[0], 0, chan_index);
2138          FETCH(&r[1], 1, chan_index);
2139          FETCH(&r[2], 2, chan_index);
2140          micro_lt(&r[0], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2141          STORE(&r[0], 0, chan_index);
2142       }
2143       break;
2144
2145    case TGSI_OPCODE_CND0:
2146       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2147          FETCH(&r[0], 0, chan_index);
2148          FETCH(&r[1], 1, chan_index);
2149          FETCH(&r[2], 2, chan_index);
2150          micro_le(&r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[2], &r[0], &r[1]);
2151          STORE(&r[0], 0, chan_index);
2152       }
2153       break;
2154
2155    case TGSI_OPCODE_DOT2ADD:
2156    /* TGSI_OPCODE_DP2A */
2157       FETCH( &r[0], 0, CHAN_X );
2158       FETCH( &r[1], 1, CHAN_X );
2159       micro_mul( &r[0], &r[0], &r[1] );
2160
2161       FETCH( &r[1], 0, CHAN_Y );
2162       FETCH( &r[2], 1, CHAN_Y );
2163       micro_mul( &r[1], &r[1], &r[2] );
2164       micro_add( &r[0], &r[0], &r[1] );
2165
2166       FETCH( &r[2], 2, CHAN_X );
2167       micro_add( &r[0], &r[0], &r[2] );
2168
2169       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2170          STORE( &r[0], 0, chan_index );
2171       }
2172       break;
2173
2174    case TGSI_OPCODE_INDEX:
2175       /* XXX: considered for removal */
2176       assert (0);
2177       break;
2178
2179    case TGSI_OPCODE_NEGATE:
2180       /* XXX: considered for removal */
2181       assert (0);
2182       break;
2183
2184    case TGSI_OPCODE_FRAC:
2185    /* TGSI_OPCODE_FRC */
2186       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2187          FETCH( &r[0], 0, chan_index );
2188          micro_frc( &r[0], &r[0] );
2189          STORE( &r[0], 0, chan_index );
2190       }
2191       break;
2192
2193    case TGSI_OPCODE_CLAMP:
2194       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2195          FETCH(&r[0], 0, chan_index);
2196          FETCH(&r[1], 1, chan_index);
2197          micro_max(&r[0], &r[0], &r[1]);
2198          FETCH(&r[1], 2, chan_index);
2199          micro_min(&r[0], &r[0], &r[1]);
2200          STORE(&r[0], 0, chan_index);
2201       }
2202       break;
2203
2204    case TGSI_OPCODE_ROUND:
2205    case TGSI_OPCODE_ARR:
2206       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2207          FETCH( &r[0], 0, chan_index );
2208          micro_rnd( &r[0], &r[0] );
2209          STORE( &r[0], 0, chan_index );
2210       }
2211       break;
2212
2213    case TGSI_OPCODE_EXPBASE2:
2214    /* TGSI_OPCODE_EX2 */
2215       FETCH(&r[0], 0, CHAN_X);
2216
2217 #if FAST_MATH
2218       micro_exp2( &r[0], &r[0] );
2219 #else
2220       micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
2221 #endif
2222
2223       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2224          STORE( &r[0], 0, chan_index );
2225       }
2226       break;
2227
2228    case TGSI_OPCODE_LOGBASE2:
2229    /* TGSI_OPCODE_LG2 */
2230       FETCH( &r[0], 0, CHAN_X );
2231       micro_lg2( &r[0], &r[0] );
2232       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2233          STORE( &r[0], 0, chan_index );
2234       }
2235       break;
2236
2237    case TGSI_OPCODE_POWER:
2238    /* TGSI_OPCODE_POW */
2239       FETCH(&r[0], 0, CHAN_X);
2240       FETCH(&r[1], 1, CHAN_X);
2241
2242       micro_pow( &r[0], &r[0], &r[1] );
2243
2244       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2245          STORE( &r[0], 0, chan_index );
2246       }
2247       break;
2248
2249    case TGSI_OPCODE_CROSSPRODUCT:
2250    /* TGSI_OPCODE_XPD */
2251       FETCH(&r[0], 0, CHAN_Y);
2252       FETCH(&r[1], 1, CHAN_Z);
2253
2254       micro_mul( &r[2], &r[0], &r[1] );
2255
2256       FETCH(&r[3], 0, CHAN_Z);
2257       FETCH(&r[4], 1, CHAN_Y);
2258
2259       micro_mul( &r[5], &r[3], &r[4] );
2260       micro_sub( &r[2], &r[2], &r[5] );
2261
2262       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2263          STORE( &r[2], 0, CHAN_X );
2264       }
2265
2266       FETCH(&r[2], 1, CHAN_X);
2267
2268       micro_mul( &r[3], &r[3], &r[2] );
2269
2270       FETCH(&r[5], 0, CHAN_X);
2271
2272       micro_mul( &r[1], &r[1], &r[5] );
2273       micro_sub( &r[3], &r[3], &r[1] );
2274
2275       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2276          STORE( &r[3], 0, CHAN_Y );
2277       }
2278
2279       micro_mul( &r[5], &r[5], &r[4] );
2280       micro_mul( &r[0], &r[0], &r[2] );
2281       micro_sub( &r[5], &r[5], &r[0] );
2282
2283       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2284          STORE( &r[5], 0, CHAN_Z );
2285       }
2286
2287       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2288          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2289       }
2290       break;
2291
2292     case TGSI_OPCODE_MULTIPLYMATRIX:
2293        /* XXX: considered for removal */
2294        assert (0);
2295        break;
2296
2297     case TGSI_OPCODE_ABS:
2298        FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2299           FETCH(&r[0], 0, chan_index);
2300
2301           micro_abs( &r[0], &r[0] );
2302
2303           STORE(&r[0], 0, chan_index);
2304        }
2305        break;
2306
2307    case TGSI_OPCODE_RCC:
2308       FETCH(&r[0], 0, CHAN_X);
2309       micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2310       micro_float_clamp(&r[0], &r[0]);
2311       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2312          STORE(&r[0], 0, chan_index);
2313       }
2314       break;
2315
2316    case TGSI_OPCODE_DPH:
2317       FETCH(&r[0], 0, CHAN_X);
2318       FETCH(&r[1], 1, CHAN_X);
2319
2320       micro_mul( &r[0], &r[0], &r[1] );
2321
2322       FETCH(&r[1], 0, CHAN_Y);
2323       FETCH(&r[2], 1, CHAN_Y);
2324
2325       micro_mul( &r[1], &r[1], &r[2] );
2326       micro_add( &r[0], &r[0], &r[1] );
2327
2328       FETCH(&r[1], 0, CHAN_Z);
2329       FETCH(&r[2], 1, CHAN_Z);
2330
2331       micro_mul( &r[1], &r[1], &r[2] );
2332       micro_add( &r[0], &r[0], &r[1] );
2333
2334       FETCH(&r[1], 1, CHAN_W);
2335
2336       micro_add( &r[0], &r[0], &r[1] );
2337
2338       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2339          STORE( &r[0], 0, chan_index );
2340       }
2341       break;
2342
2343    case TGSI_OPCODE_COS:
2344       FETCH(&r[0], 0, CHAN_X);
2345
2346       micro_cos( &r[0], &r[0] );
2347
2348       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2349          STORE( &r[0], 0, chan_index );
2350       }
2351       break;
2352
2353    case TGSI_OPCODE_DDX:
2354       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2355          FETCH( &r[0], 0, chan_index );
2356          micro_ddx( &r[0], &r[0] );
2357          STORE( &r[0], 0, chan_index );
2358       }
2359       break;
2360
2361    case TGSI_OPCODE_DDY:
2362       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2363          FETCH( &r[0], 0, chan_index );
2364          micro_ddy( &r[0], &r[0] );
2365          STORE( &r[0], 0, chan_index );
2366       }
2367       break;
2368
2369    case TGSI_OPCODE_KILP:
2370       exec_kilp (mach, inst);
2371       break;
2372
2373    case TGSI_OPCODE_KIL:
2374       exec_kil (mach, inst);
2375       break;
2376
2377    case TGSI_OPCODE_PK2H:
2378       assert (0);
2379       break;
2380
2381    case TGSI_OPCODE_PK2US:
2382       assert (0);
2383       break;
2384
2385    case TGSI_OPCODE_PK4B:
2386       assert (0);
2387       break;
2388
2389    case TGSI_OPCODE_PK4UB:
2390       assert (0);
2391       break;
2392
2393    case TGSI_OPCODE_RFL:
2394       assert (0);
2395       break;
2396
2397    case TGSI_OPCODE_SEQ:
2398       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2399          FETCH( &r[0], 0, chan_index );
2400          FETCH( &r[1], 1, chan_index );
2401          micro_eq( &r[0], &r[0], &r[1],
2402                    &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
2403                    &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2404          STORE( &r[0], 0, chan_index );
2405       }
2406       break;
2407
2408    case TGSI_OPCODE_SFL:
2409       assert (0);
2410       break;
2411
2412    case TGSI_OPCODE_SGT:
2413       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2414          FETCH( &r[0], 0, chan_index );
2415          FETCH( &r[1], 1, chan_index );
2416          micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2417          STORE( &r[0], 0, chan_index );
2418       }
2419       break;
2420
2421    case TGSI_OPCODE_SIN:
2422       FETCH( &r[0], 0, CHAN_X );
2423       micro_sin( &r[0], &r[0] );
2424       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2425          STORE( &r[0], 0, chan_index );
2426       }
2427       break;
2428
2429    case TGSI_OPCODE_SLE:
2430       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2431          FETCH( &r[0], 0, chan_index );
2432          FETCH( &r[1], 1, chan_index );
2433          micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2434          STORE( &r[0], 0, chan_index );
2435       }
2436       break;
2437
2438    case TGSI_OPCODE_SNE:
2439       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2440          FETCH( &r[0], 0, chan_index );
2441          FETCH( &r[1], 1, chan_index );
2442          micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2443          STORE( &r[0], 0, chan_index );
2444       }
2445       break;
2446
2447    case TGSI_OPCODE_STR:
2448       assert (0);
2449       break;
2450
2451    case TGSI_OPCODE_TEX:
2452       /* simple texture lookup */
2453       /* src[0] = texcoord */
2454       /* src[1] = sampler unit */
2455       exec_tex(mach, inst, FALSE, FALSE);
2456       break;
2457
2458    case TGSI_OPCODE_TXB:
2459       /* Texture lookup with lod bias */
2460       /* src[0] = texcoord (src[0].w = LOD bias) */
2461       /* src[1] = sampler unit */
2462       exec_tex(mach, inst, TRUE, FALSE);
2463       break;
2464
2465    case TGSI_OPCODE_TXD:
2466       /* Texture lookup with explicit partial derivatives */
2467       /* src[0] = texcoord */
2468       /* src[1] = d[strq]/dx */
2469       /* src[2] = d[strq]/dy */
2470       /* src[3] = sampler unit */
2471       assert (0);
2472       break;
2473
2474    case TGSI_OPCODE_TXL:
2475       /* Texture lookup with explicit LOD */
2476       /* src[0] = texcoord (src[0].w = LOD) */
2477       /* src[1] = sampler unit */
2478       exec_tex(mach, inst, TRUE, FALSE);
2479       break;
2480
2481    case TGSI_OPCODE_TXP:
2482       /* Texture lookup with projection */
2483       /* src[0] = texcoord (src[0].w = projection) */
2484       /* src[1] = sampler unit */
2485       exec_tex(mach, inst, FALSE, TRUE);
2486       break;
2487
2488    case TGSI_OPCODE_UP2H:
2489       assert (0);
2490       break;
2491
2492    case TGSI_OPCODE_UP2US:
2493       assert (0);
2494       break;
2495
2496    case TGSI_OPCODE_UP4B:
2497       assert (0);
2498       break;
2499
2500    case TGSI_OPCODE_UP4UB:
2501       assert (0);
2502       break;
2503
2504    case TGSI_OPCODE_X2D:
2505       assert (0);
2506       break;
2507
2508    case TGSI_OPCODE_ARA:
2509       assert (0);
2510       break;
2511
2512    case TGSI_OPCODE_BRA:
2513       assert (0);
2514       break;
2515
2516    case TGSI_OPCODE_CAL:
2517       /* skip the call if no execution channels are enabled */
2518       if (mach->ExecMask) {
2519          /* do the call */
2520
2521          /* push the Cond, Loop, Cont stacks */
2522          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2523          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2524          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2525          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2526          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2527          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2528
2529          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2530          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2531
2532          /* note that PC was already incremented above */
2533          mach->CallStack[mach->CallStackTop++] = *pc;
2534          *pc = inst->InstructionExtLabel.Label;
2535       }
2536       break;
2537
2538    case TGSI_OPCODE_RET:
2539       mach->FuncMask &= ~mach->ExecMask;
2540       UPDATE_EXEC_MASK(mach);
2541
2542       if (mach->FuncMask == 0x0) {
2543          /* really return now (otherwise, keep executing) */
2544
2545          if (mach->CallStackTop == 0) {
2546             /* returning from main() */
2547             *pc = -1;
2548             return;
2549          }
2550          *pc = mach->CallStack[--mach->CallStackTop];
2551
2552          /* pop the Cond, Loop, Cont stacks */
2553          assert(mach->CondStackTop > 0);
2554          mach->CondMask = mach->CondStack[--mach->CondStackTop];
2555          assert(mach->LoopStackTop > 0);
2556          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
2557          assert(mach->ContStackTop > 0);
2558          mach->ContMask = mach->ContStack[--mach->ContStackTop];
2559          assert(mach->FuncStackTop > 0);
2560          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2561
2562          UPDATE_EXEC_MASK(mach);
2563       }
2564       break;
2565
2566    case TGSI_OPCODE_SSG:
2567    /* TGSI_OPCODE_SGN */
2568       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2569          FETCH( &r[0], 0, chan_index );
2570          micro_sgn( &r[0], &r[0] );
2571          STORE( &r[0], 0, chan_index );
2572       }
2573       break;
2574
2575    case TGSI_OPCODE_CMP:
2576       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2577          FETCH(&r[0], 0, chan_index);
2578          FETCH(&r[1], 1, chan_index);
2579          FETCH(&r[2], 2, chan_index);
2580
2581          micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
2582
2583          STORE(&r[0], 0, chan_index);
2584       }
2585       break;
2586
2587    case TGSI_OPCODE_SCS:
2588       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2589          FETCH( &r[0], 0, CHAN_X );
2590       }
2591       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
2592          micro_cos( &r[1], &r[0] );
2593          STORE( &r[1], 0, CHAN_X );
2594       }
2595       if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2596          micro_sin( &r[1], &r[0] );
2597          STORE( &r[1], 0, CHAN_Y );
2598       }
2599       if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2600          STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2601       }
2602       if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2603          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2604       }
2605       break;
2606
2607    case TGSI_OPCODE_NRM:
2608       /* 3-component vector normalize */
2609       {
2610          union tgsi_exec_channel tmp, dot;
2611
2612          /* tmp = dp3(src0, src0): */
2613          FETCH( &r[0], 0, CHAN_X );
2614          micro_mul( &tmp, &r[0], &r[0] );
2615
2616          FETCH( &r[1], 0, CHAN_Y );
2617          micro_mul( &dot, &r[1], &r[1] );
2618          micro_add( &tmp, &tmp, &dot );
2619
2620          FETCH( &r[2], 0, CHAN_Z );
2621          micro_mul( &dot, &r[2], &r[2] );
2622          micro_add( &tmp, &tmp, &dot );
2623
2624          /* tmp = 1 / sqrt(tmp) */
2625          micro_sqrt( &tmp, &tmp );
2626          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2627
2628          /* note: w channel is undefined */
2629          FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2630             /* chan = chan * tmp */
2631             micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2632             STORE( &r[chan_index], 0, chan_index );
2633          }
2634       }
2635       break;
2636
2637    case TGSI_OPCODE_NRM4:
2638       /* 4-component vector normalize */
2639       {
2640          union tgsi_exec_channel tmp, dot;
2641
2642          /* tmp = dp4(src0, src0): */
2643          FETCH( &r[0], 0, CHAN_X );
2644          micro_mul( &tmp, &r[0], &r[0] );
2645
2646          FETCH( &r[1], 0, CHAN_Y );
2647          micro_mul( &dot, &r[1], &r[1] );
2648          micro_add( &tmp, &tmp, &dot );
2649
2650          FETCH( &r[2], 0, CHAN_Z );
2651          micro_mul( &dot, &r[2], &r[2] );
2652          micro_add( &tmp, &tmp, &dot );
2653
2654          FETCH( &r[3], 0, CHAN_W );
2655          micro_mul( &dot, &r[3], &r[3] );
2656          micro_add( &tmp, &tmp, &dot );
2657
2658          /* tmp = 1 / sqrt(tmp) */
2659          micro_sqrt( &tmp, &tmp );
2660          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2661
2662          FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2663             /* chan = chan * tmp */
2664             micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2665             STORE( &r[chan_index], 0, chan_index );
2666          }
2667       }
2668       break;
2669
2670    case TGSI_OPCODE_DIV:
2671       assert( 0 );
2672       break;
2673
2674    case TGSI_OPCODE_DP2:
2675       FETCH( &r[0], 0, CHAN_X );
2676       FETCH( &r[1], 1, CHAN_X );
2677       micro_mul( &r[0], &r[0], &r[1] );
2678
2679       FETCH( &r[1], 0, CHAN_Y );
2680       FETCH( &r[2], 1, CHAN_Y );
2681       micro_mul( &r[1], &r[1], &r[2] );
2682       micro_add( &r[0], &r[0], &r[1] );
2683
2684       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2685          STORE( &r[0], 0, chan_index );
2686       }
2687       break;
2688
2689    case TGSI_OPCODE_IF:
2690       /* push CondMask */
2691       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2692       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2693       FETCH( &r[0], 0, CHAN_X );
2694       /* update CondMask */
2695       if( ! r[0].u[0] ) {
2696          mach->CondMask &= ~0x1;
2697       }
2698       if( ! r[0].u[1] ) {
2699          mach->CondMask &= ~0x2;
2700       }
2701       if( ! r[0].u[2] ) {
2702          mach->CondMask &= ~0x4;
2703       }
2704       if( ! r[0].u[3] ) {
2705          mach->CondMask &= ~0x8;
2706       }
2707       UPDATE_EXEC_MASK(mach);
2708       /* Todo: If CondMask==0, jump to ELSE */
2709       break;
2710
2711    case TGSI_OPCODE_ELSE:
2712       /* invert CondMask wrt previous mask */
2713       {
2714          uint prevMask;
2715          assert(mach->CondStackTop > 0);
2716          prevMask = mach->CondStack[mach->CondStackTop - 1];
2717          mach->CondMask = ~mach->CondMask & prevMask;
2718          UPDATE_EXEC_MASK(mach);
2719          /* Todo: If CondMask==0, jump to ENDIF */
2720       }
2721       break;
2722
2723    case TGSI_OPCODE_ENDIF:
2724       /* pop CondMask */
2725       assert(mach->CondStackTop > 0);
2726       mach->CondMask = mach->CondStack[--mach->CondStackTop];
2727       UPDATE_EXEC_MASK(mach);
2728       break;
2729
2730    case TGSI_OPCODE_END:
2731       /* halt execution */
2732       *pc = -1;
2733       break;
2734
2735    case TGSI_OPCODE_REP:
2736       assert (0);
2737       break;
2738
2739    case TGSI_OPCODE_ENDREP:
2740        assert (0);
2741        break;
2742
2743    case TGSI_OPCODE_PUSHA:
2744       assert (0);
2745       break;
2746
2747    case TGSI_OPCODE_POPA:
2748       assert (0);
2749       break;
2750
2751    case TGSI_OPCODE_CEIL:
2752       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2753          FETCH( &r[0], 0, chan_index );
2754          micro_ceil( &r[0], &r[0] );
2755          STORE( &r[0], 0, chan_index );
2756       }
2757       break;
2758
2759    case TGSI_OPCODE_I2F:
2760       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2761          FETCH( &r[0], 0, chan_index );
2762          micro_i2f( &r[0], &r[0] );
2763          STORE( &r[0], 0, chan_index );
2764       }
2765       break;
2766
2767    case TGSI_OPCODE_NOT:
2768       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2769          FETCH( &r[0], 0, chan_index );
2770          micro_not( &r[0], &r[0] );
2771          STORE( &r[0], 0, chan_index );
2772       }
2773       break;
2774
2775    case TGSI_OPCODE_TRUNC:
2776       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2777          FETCH( &r[0], 0, chan_index );
2778          micro_trunc( &r[0], &r[0] );
2779          STORE( &r[0], 0, chan_index );
2780       }
2781       break;
2782
2783    case TGSI_OPCODE_SHL:
2784       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2785          FETCH( &r[0], 0, chan_index );
2786          FETCH( &r[1], 1, chan_index );
2787          micro_shl( &r[0], &r[0], &r[1] );
2788          STORE( &r[0], 0, chan_index );
2789       }
2790       break;
2791
2792    case TGSI_OPCODE_SHR:
2793       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2794          FETCH( &r[0], 0, chan_index );
2795          FETCH( &r[1], 1, chan_index );
2796          micro_ishr( &r[0], &r[0], &r[1] );
2797          STORE( &r[0], 0, chan_index );
2798       }
2799       break;
2800
2801    case TGSI_OPCODE_AND:
2802       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2803          FETCH( &r[0], 0, chan_index );
2804          FETCH( &r[1], 1, chan_index );
2805          micro_and( &r[0], &r[0], &r[1] );
2806          STORE( &r[0], 0, chan_index );
2807       }
2808       break;
2809
2810    case TGSI_OPCODE_OR:
2811       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2812          FETCH( &r[0], 0, chan_index );
2813          FETCH( &r[1], 1, chan_index );
2814          micro_or( &r[0], &r[0], &r[1] );
2815          STORE( &r[0], 0, chan_index );
2816       }
2817       break;
2818
2819    case TGSI_OPCODE_MOD:
2820       assert (0);
2821       break;
2822
2823    case TGSI_OPCODE_XOR:
2824       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2825          FETCH( &r[0], 0, chan_index );
2826          FETCH( &r[1], 1, chan_index );
2827          micro_xor( &r[0], &r[0], &r[1] );
2828          STORE( &r[0], 0, chan_index );
2829       }
2830       break;
2831
2832    case TGSI_OPCODE_SAD:
2833       assert (0);
2834       break;
2835
2836    case TGSI_OPCODE_TXF:
2837       assert (0);
2838       break;
2839
2840    case TGSI_OPCODE_TXQ:
2841       assert (0);
2842       break;
2843
2844    case TGSI_OPCODE_EMIT:
2845       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
2846       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
2847       break;
2848
2849    case TGSI_OPCODE_ENDPRIM:
2850       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
2851       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
2852       break;
2853
2854    case TGSI_OPCODE_LOOP:
2855       /* fall-through (for now) */
2856    case TGSI_OPCODE_BGNLOOP2:
2857       /* push LoopMask and ContMasks */
2858       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2859       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2860       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2861       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2862       break;
2863
2864    case TGSI_OPCODE_ENDLOOP:
2865       /* fall-through (for now at least) */
2866    case TGSI_OPCODE_ENDLOOP2:
2867       /* Restore ContMask, but don't pop */
2868       assert(mach->ContStackTop > 0);
2869       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
2870       UPDATE_EXEC_MASK(mach);
2871       if (mach->ExecMask) {
2872          /* repeat loop: jump to instruction just past BGNLOOP */
2873          *pc = inst->InstructionExtLabel.Label + 1;
2874       }
2875       else {
2876          /* exit loop: pop LoopMask */
2877          assert(mach->LoopStackTop > 0);
2878          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
2879          /* pop ContMask */
2880          assert(mach->ContStackTop > 0);
2881          mach->ContMask = mach->ContStack[--mach->ContStackTop];
2882       }
2883       UPDATE_EXEC_MASK(mach);
2884       break;
2885
2886    case TGSI_OPCODE_BRK:
2887       /* turn off loop channels for each enabled exec channel */
2888       mach->LoopMask &= ~mach->ExecMask;
2889       /* Todo: if mach->LoopMask == 0, jump to end of loop */
2890       UPDATE_EXEC_MASK(mach);
2891       break;
2892
2893    case TGSI_OPCODE_CONT:
2894       /* turn off cont channels for each enabled exec channel */
2895       mach->ContMask &= ~mach->ExecMask;
2896       /* Todo: if mach->LoopMask == 0, jump to end of loop */
2897       UPDATE_EXEC_MASK(mach);
2898       break;
2899
2900    case TGSI_OPCODE_BGNSUB:
2901       /* no-op */
2902       break;
2903
2904    case TGSI_OPCODE_ENDSUB:
2905       /* no-op */
2906       break;
2907
2908    case TGSI_OPCODE_NOISE1:
2909       assert( 0 );
2910       break;
2911
2912    case TGSI_OPCODE_NOISE2:
2913       assert( 0 );
2914       break;
2915
2916    case TGSI_OPCODE_NOISE3:
2917       assert( 0 );
2918       break;
2919
2920    case TGSI_OPCODE_NOISE4:
2921       assert( 0 );
2922       break;
2923
2924    case TGSI_OPCODE_NOP:
2925       break;
2926
2927    default:
2928       assert( 0 );
2929    }
2930 }
2931
2932
2933 /**
2934  * Run TGSI interpreter.
2935  * \return bitmask of "alive" quad components
2936  */
2937 uint
2938 tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
2939 {
2940    uint i;
2941    int pc = 0;
2942
2943    mach->CondMask = 0xf;
2944    mach->LoopMask = 0xf;
2945    mach->ContMask = 0xf;
2946    mach->FuncMask = 0xf;
2947    mach->ExecMask = 0xf;
2948
2949    mach->CondStackTop = 0; /* temporarily subvert this assertion */
2950    assert(mach->CondStackTop == 0);
2951    assert(mach->LoopStackTop == 0);
2952    assert(mach->ContStackTop == 0);
2953    assert(mach->CallStackTop == 0);
2954
2955    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
2956    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
2957
2958    if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
2959       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
2960       mach->Primitives[0] = 0;
2961    }
2962
2963    for (i = 0; i < QUAD_SIZE; i++) {
2964       mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
2965          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
2966          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
2967          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
2968          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
2969    }
2970
2971    /* execute declarations (interpolants) */
2972    for (i = 0; i < mach->NumDeclarations; i++) {
2973       exec_declaration( mach, mach->Declarations+i );
2974    }
2975
2976    /* execute instructions, until pc is set to -1 */
2977    while (pc != -1) {
2978       assert(pc < (int) mach->NumInstructions);
2979       exec_instruction( mach, mach->Instructions + pc, &pc );
2980    }
2981
2982 #if 0
2983    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
2984    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2985       /*
2986        * Scale back depth component.
2987        */
2988       for (i = 0; i < 4; i++)
2989          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
2990    }
2991 #endif
2992
2993    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
2994 }