tgsi: Cleanup interpreter code for other existing integer opcodes.
[mesa.git] src/gallium/auxiliary/tgsi/tgsi_exec.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel),
34  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35  * care because a condition may be true for some quad components but
36  * false for others.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers. This is the ExecMask.
41 * See store_dest().
42 *
43  * The ExecMask is computed from four other masks (CondMask, LoopMask,
44  * ContMask and FuncMask) which are controlled by the flow control
45  * instructions (namely IF/ELSE/ENDIF, LOOP/ENDLOOP, CONT, CAL and RET).
46 *
47 *
48 * Authors:
49 * Michal Krol
50 * Brian Paul
51 */
52
53 #include "pipe/p_compiler.h"
54 #include "pipe/p_state.h"
55 #include "pipe/p_shader_tokens.h"
56 #include "tgsi/tgsi_dump.h"
57 #include "tgsi/tgsi_parse.h"
58 #include "tgsi/tgsi_util.h"
59 #include "tgsi_exec.h"
60 #include "util/u_memory.h"
61 #include "util/u_math.h"
62
63 #define FAST_MATH 1
64
65 #define TILE_TOP_LEFT 0
66 #define TILE_TOP_RIGHT 1
67 #define TILE_BOTTOM_LEFT 2
68 #define TILE_BOTTOM_RIGHT 3
69
70 #define CHAN_X 0
71 #define CHAN_Y 1
72 #define CHAN_Z 2
73 #define CHAN_W 3
74
75 /*
76 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
77 */
78 #define TEMP_0_I TGSI_EXEC_TEMP_00000000_I
79 #define TEMP_0_C TGSI_EXEC_TEMP_00000000_C
80 #define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I
81 #define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C
82 #define TEMP_80_I TGSI_EXEC_TEMP_80000000_I
83 #define TEMP_80_C TGSI_EXEC_TEMP_80000000_C
84 #define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I
85 #define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C
86 #define TEMP_1_I TGSI_EXEC_TEMP_ONE_I
87 #define TEMP_1_C TGSI_EXEC_TEMP_ONE_C
88 #define TEMP_2_I TGSI_EXEC_TEMP_TWO_I
89 #define TEMP_2_C TGSI_EXEC_TEMP_TWO_C
90 #define TEMP_128_I TGSI_EXEC_TEMP_128_I
91 #define TEMP_128_C TGSI_EXEC_TEMP_128_C
92 #define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I
93 #define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C
94 #define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I
95 #define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C
96 #define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I
97 #define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C
98 #define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
99 #define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
100 #define TEMP_CC_I TGSI_EXEC_TEMP_CC_I
101 #define TEMP_CC_C TGSI_EXEC_TEMP_CC_C
102 #define TEMP_3_I TGSI_EXEC_TEMP_THREE_I
103 #define TEMP_3_C TGSI_EXEC_TEMP_THREE_C
104 #define TEMP_HALF_I TGSI_EXEC_TEMP_HALF_I
105 #define TEMP_HALF_C TGSI_EXEC_TEMP_HALF_C
106 #define TEMP_R0 TGSI_EXEC_TEMP_R0
107 #define TEMP_P0 TGSI_EXEC_TEMP_P0
108
109 #define IS_CHANNEL_ENABLED(INST, CHAN)\
110 ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
111
112 #define IS_CHANNEL_ENABLED2(INST, CHAN)\
113 ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))
114
115 #define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
116 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
117 if (IS_CHANNEL_ENABLED( INST, CHAN ))
118
119 #define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
120 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
121 if (IS_CHANNEL_ENABLED2( INST, CHAN ))
122
123
124 /** The execution mask is the AND of the conditional, loop, continue and function masks */
125 #define UPDATE_EXEC_MASK(MACH) \
126 MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
127
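/* Illustrative sketch only (never compiled): how a conditional is expected
 * to combine a per-channel condition with CondMask before refreshing
 * ExecMask.  The helper below is not part of the interpreter; it simply
 * demonstrates the mask protocol described in the header comment.
 */
#if 0
static void
example_apply_condition(struct tgsi_exec_machine *mach,
                        const union tgsi_exec_channel *cond)
{
   uint i;

   /* clear CondMask bits for quad components whose condition is zero */
   for (i = 0; i < 4; i++) {
      if (!cond->u[i]) {
         mach->CondMask &= ~(1 << i);
      }
   }

   /* ExecMask = CondMask & LoopMask & ContMask & FuncMask */
   UPDATE_EXEC_MASK(mach);
}
#endif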
128
129 static const union tgsi_exec_channel ZeroVec =
130 { { 0.0, 0.0, 0.0, 0.0 } };
131
132
133 #ifdef DEBUG
134 static void
135 check_inf_or_nan(const union tgsi_exec_channel *chan)
136 {
137 assert(!util_is_inf_or_nan(chan->f[0]));
138 assert(!util_is_inf_or_nan(chan->f[1]));
139 assert(!util_is_inf_or_nan(chan->f[2]));
140 assert(!util_is_inf_or_nan(chan->f[3]));
141 }
142 #endif
143
144
145 #ifdef DEBUG
146 static void
147 print_chan(const char *msg, const union tgsi_exec_channel *chan)
148 {
149 debug_printf("%s = {%f, %f, %f, %f}\n",
150 msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
151 }
152 #endif
153
154
155 #ifdef DEBUG
156 static void
157 print_temp(const struct tgsi_exec_machine *mach, uint index)
158 {
159 const struct tgsi_exec_vector *tmp = &mach->Temps[index];
160 int i;
161 debug_printf("Temp[%u] =\n", index);
162 for (i = 0; i < 4; i++) {
163 debug_printf(" %c: { %f, %f, %f, %f }\n",
164 "XYZW"[i],
165 tmp->xyzw[i].f[0],
166 tmp->xyzw[i].f[1],
167 tmp->xyzw[i].f[2],
168 tmp->xyzw[i].f[3]);
169 }
170 }
171 #endif
172
173
174 /**
175 * Check if there's a potential src/dst register data dependency when
176 * using SOA execution.
177 * Example:
178 * MOV T, T.yxwz;
179 * This would expand into:
180 * MOV t0, t1;
181 * MOV t1, t0;
182 * MOV t2, t3;
183 * MOV t3, t2;
184 * The second instruction will have the wrong value for t0 if executed as-is.
185 */
186 boolean
187 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
188 {
189 uint i, chan;
190
191 uint writemask = inst->Dst[0].Register.WriteMask;
192 if (writemask == TGSI_WRITEMASK_X ||
193 writemask == TGSI_WRITEMASK_Y ||
194 writemask == TGSI_WRITEMASK_Z ||
195 writemask == TGSI_WRITEMASK_W ||
196 writemask == TGSI_WRITEMASK_NONE) {
197 /* no chance of data dependency */
198 return FALSE;
199 }
200
201 /* loop over src regs */
202 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
203 if ((inst->Src[i].Register.File ==
204 inst->Dst[0].Register.File) &&
205 (inst->Src[i].Register.Index ==
206 inst->Dst[0].Register.Index)) {
207 /* loop over dest channels */
208 uint channelsWritten = 0x0;
209 FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
210 /* check if we're reading a channel that's been written */
211 uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
212 if (channelsWritten & (1 << swizzle)) {
213 return TRUE;
214 }
215
216 channelsWritten |= (1 << chan);
217 }
218 }
219 }
220 return FALSE;
221 }
222
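/* Illustrative sketch only (never compiled): how a caller might use the
 * dependency check above to flag instructions whose SOA expansion would
 * read a channel that the same instruction has already overwritten.
 */
#if 0
static void
example_warn_on_soa_dependency(const struct tgsi_full_instruction *inst)
{
   if (tgsi_check_soa_dependencies(inst)) {
      debug_printf("TGSI: warning: opcode %u reads a channel it also writes;"
                   " results may be wrong under SOA execution\n",
                   inst->Instruction.Opcode);
   }
}
#endif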
223
224 /**
225 * Initialize machine state by expanding tokens to full instructions,
226 * allocating temporary storage, setting up constants, etc.
227 * After this, we can call tgsi_exec_machine_run() many times.
228 */
229 void
230 tgsi_exec_machine_bind_shader(
231 struct tgsi_exec_machine *mach,
232 const struct tgsi_token *tokens,
233 uint numSamplers,
234 struct tgsi_sampler **samplers)
235 {
236 uint k;
237 struct tgsi_parse_context parse;
238 struct tgsi_exec_labels *labels = &mach->Labels;
239 struct tgsi_full_instruction *instructions;
240 struct tgsi_full_declaration *declarations;
241 uint maxInstructions = 10, numInstructions = 0;
242 uint maxDeclarations = 10, numDeclarations = 0;
243 uint instno = 0;
244
245 #if 0
246 tgsi_dump(tokens, 0);
247 #endif
248
249 util_init_math();
250
251 mach->Tokens = tokens;
252 mach->Samplers = samplers;
253
254 k = tgsi_parse_init (&parse, mach->Tokens);
255 if (k != TGSI_PARSE_OK) {
256 debug_printf( "Problem parsing!\n" );
257 return;
258 }
259
260 mach->Processor = parse.FullHeader.Processor.Processor;
261 mach->ImmLimit = 0;
262 labels->count = 0;
263
264 declarations = (struct tgsi_full_declaration *)
265 MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
266
267 if (!declarations) {
268 return;
269 }
270
271 instructions = (struct tgsi_full_instruction *)
272 MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
273
274 if (!instructions) {
275 FREE( declarations );
276 return;
277 }
278
279 while( !tgsi_parse_end_of_tokens( &parse ) ) {
280 uint pointer = parse.Position;
281 uint i;
282
283 tgsi_parse_token( &parse );
284 switch( parse.FullToken.Token.Type ) {
285 case TGSI_TOKEN_TYPE_DECLARATION:
286 /* save expanded declaration */
287 if (numDeclarations == maxDeclarations) {
288 declarations = REALLOC(declarations,
289 maxDeclarations
290 * sizeof(struct tgsi_full_declaration),
291 (maxDeclarations + 10)
292 * sizeof(struct tgsi_full_declaration));
293 maxDeclarations += 10;
294 }
295 if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
296 unsigned reg;
297 for (reg = parse.FullToken.FullDeclaration.Range.First;
298 reg <= parse.FullToken.FullDeclaration.Range.Last;
299 ++reg) {
300 ++mach->NumOutputs;
301 }
302 }
303 memcpy(declarations + numDeclarations,
304 &parse.FullToken.FullDeclaration,
305 sizeof(declarations[0]));
306 numDeclarations++;
307 break;
308
309 case TGSI_TOKEN_TYPE_IMMEDIATE:
310 {
311 uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
312 assert( size <= 4 );
313 assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
314
315 for( i = 0; i < size; i++ ) {
316 mach->Imms[mach->ImmLimit][i] =
317 parse.FullToken.FullImmediate.u[i].Float;
318 }
319 mach->ImmLimit += 1;
320 }
321 break;
322
323 case TGSI_TOKEN_TYPE_INSTRUCTION:
324 assert( labels->count < MAX_LABELS );
325
326 labels->labels[labels->count][0] = instno;
327 labels->labels[labels->count][1] = pointer;
328 labels->count++;
329
330 /* save expanded instruction */
331 if (numInstructions == maxInstructions) {
332 instructions = REALLOC(instructions,
333 maxInstructions
334 * sizeof(struct tgsi_full_instruction),
335 (maxInstructions + 10)
336 * sizeof(struct tgsi_full_instruction));
337 maxInstructions += 10;
338 }
339
340 memcpy(instructions + numInstructions,
341 &parse.FullToken.FullInstruction,
342 sizeof(instructions[0]));
343
344 numInstructions++;
345 break;
346
347 case TGSI_TOKEN_TYPE_PROPERTY:
348 break;
349
350 default:
351 assert( 0 );
352 }
353 }
354 tgsi_parse_free (&parse);
355
356 if (mach->Declarations) {
357 FREE( mach->Declarations );
358 }
359 mach->Declarations = declarations;
360 mach->NumDeclarations = numDeclarations;
361
362 if (mach->Instructions) {
363 FREE( mach->Instructions );
364 }
365 mach->Instructions = instructions;
366 mach->NumInstructions = numInstructions;
367 }
368
369
370 struct tgsi_exec_machine *
371 tgsi_exec_machine_create( void )
372 {
373 struct tgsi_exec_machine *mach;
374 uint i;
375
376 mach = align_malloc( sizeof *mach, 16 );
377 if (!mach)
378 goto fail;
379
380 memset(mach, 0, sizeof(*mach));
381
382 mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
383 mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
384 mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
385
386 /* Setup constants. */
387 for( i = 0; i < 4; i++ ) {
388 mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
389 mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
390 mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
391 mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
392 mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
393 mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
394 mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
395 mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
396 mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
397 mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
398 }
399
400 #ifdef DEBUG
401 /* silence warnings */
402 (void) print_chan;
403 (void) print_temp;
404 #endif
405
406 return mach;
407
408 fail:
409 align_free(mach);
410 return NULL;
411 }
412
413
414 void
415 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
416 {
417 if (mach) {
418 FREE(mach->Instructions);
419 FREE(mach->Declarations);
420 }
421
422 align_free(mach);
423 }
424
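/* Illustrative sketch only (never compiled): the expected lifecycle of an
 * execution machine.  Setting up constants, inputs and interpolation
 * coefficients is the caller's job and is only hinted at here; see
 * tgsi_exec.h for the full interface, including tgsi_exec_machine_run().
 */
#if 0
static void
example_run_shader(const struct tgsi_token *tokens,
                   struct tgsi_sampler **samplers,
                   uint num_samplers)
{
   struct tgsi_exec_machine *mach = tgsi_exec_machine_create();

   if (!mach)
      return;

   /* expand tokens into full instructions/declarations */
   tgsi_exec_machine_bind_shader(mach, tokens, num_samplers, samplers);

   /* ... point mach->Consts at constant buffers, fill mach->Inputs ... */

   tgsi_exec_machine_run(mach);

   /* ... read back mach->Outputs ... */

   tgsi_exec_machine_destroy(mach);
}
#endif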
425
426 static void
427 micro_abs(
428 union tgsi_exec_channel *dst,
429 const union tgsi_exec_channel *src )
430 {
431 dst->f[0] = fabsf( src->f[0] );
432 dst->f[1] = fabsf( src->f[1] );
433 dst->f[2] = fabsf( src->f[2] );
434 dst->f[3] = fabsf( src->f[3] );
435 }
436
437 static void
438 micro_add(
439 union tgsi_exec_channel *dst,
440 const union tgsi_exec_channel *src0,
441 const union tgsi_exec_channel *src1 )
442 {
443 dst->f[0] = src0->f[0] + src1->f[0];
444 dst->f[1] = src0->f[1] + src1->f[1];
445 dst->f[2] = src0->f[2] + src1->f[2];
446 dst->f[3] = src0->f[3] + src1->f[3];
447 }
448
449 static void
450 micro_ceil(
451 union tgsi_exec_channel *dst,
452 const union tgsi_exec_channel *src )
453 {
454 dst->f[0] = ceilf( src->f[0] );
455 dst->f[1] = ceilf( src->f[1] );
456 dst->f[2] = ceilf( src->f[2] );
457 dst->f[3] = ceilf( src->f[3] );
458 }
459
460 static void
461 micro_cos(
462 union tgsi_exec_channel *dst,
463 const union tgsi_exec_channel *src )
464 {
465 dst->f[0] = cosf( src->f[0] );
466 dst->f[1] = cosf( src->f[1] );
467 dst->f[2] = cosf( src->f[2] );
468 dst->f[3] = cosf( src->f[3] );
469 }
470
471 static void
472 micro_ddx(
473 union tgsi_exec_channel *dst,
474 const union tgsi_exec_channel *src )
475 {
476 dst->f[0] =
477 dst->f[1] =
478 dst->f[2] =
479 dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
480 }
481
482 static void
483 micro_ddy(
484 union tgsi_exec_channel *dst,
485 const union tgsi_exec_channel *src )
486 {
487 dst->f[0] =
488 dst->f[1] =
489 dst->f[2] =
490 dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
491 }
492
493 static void
494 micro_div(
495 union tgsi_exec_channel *dst,
496 const union tgsi_exec_channel *src0,
497 const union tgsi_exec_channel *src1 )
498 {
499 if (src1->f[0] != 0) {
500 dst->f[0] = src0->f[0] / src1->f[0];
501 }
502 if (src1->f[1] != 0) {
503 dst->f[1] = src0->f[1] / src1->f[1];
504 }
505 if (src1->f[2] != 0) {
506 dst->f[2] = src0->f[2] / src1->f[2];
507 }
508 if (src1->f[3] != 0) {
509 dst->f[3] = src0->f[3] / src1->f[3];
510 }
511 }
512
513 static void
514 micro_eq(
515 union tgsi_exec_channel *dst,
516 const union tgsi_exec_channel *src0,
517 const union tgsi_exec_channel *src1,
518 const union tgsi_exec_channel *src2,
519 const union tgsi_exec_channel *src3 )
520 {
521 dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
522 dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
523 dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
524 dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
525 }
526
527 static void
528 micro_exp2(
529 union tgsi_exec_channel *dst,
530 const union tgsi_exec_channel *src)
531 {
532 #if FAST_MATH
533 dst->f[0] = util_fast_exp2( src->f[0] );
534 dst->f[1] = util_fast_exp2( src->f[1] );
535 dst->f[2] = util_fast_exp2( src->f[2] );
536 dst->f[3] = util_fast_exp2( src->f[3] );
537 #else
538
539 #if DEBUG
540    /* Inf is a legal result here, but it would trip the Inf/NaN assertions in store_dest(), so clamp the input first. */
541 uint i;
542 union tgsi_exec_channel clamped;
543
544 for (i = 0; i < 4; i++) {
545 if (src->f[i] > 127.99999f) {
546 clamped.f[i] = 127.99999f;
547 } else if (src->f[i] < -126.99999f) {
548 clamped.f[i] = -126.99999f;
549 } else {
550 clamped.f[i] = src->f[i];
551 }
552 }
553 src = &clamped;
554 #endif
555
556 dst->f[0] = powf( 2.0f, src->f[0] );
557 dst->f[1] = powf( 2.0f, src->f[1] );
558 dst->f[2] = powf( 2.0f, src->f[2] );
559 dst->f[3] = powf( 2.0f, src->f[3] );
560 #endif
561 }
562
563 static void
564 micro_float_clamp(union tgsi_exec_channel *dst,
565 const union tgsi_exec_channel *src)
566 {
567 uint i;
568
569 for (i = 0; i < 4; i++) {
570 if (src->f[i] > 0.0f) {
571 if (src->f[i] > 1.884467e+019f)
572 dst->f[i] = 1.884467e+019f;
573 else if (src->f[i] < 5.42101e-020f)
574 dst->f[i] = 5.42101e-020f;
575 else
576 dst->f[i] = src->f[i];
577 }
578 else {
579 if (src->f[i] < -1.884467e+019f)
580 dst->f[i] = -1.884467e+019f;
581 else if (src->f[i] > -5.42101e-020f)
582 dst->f[i] = -5.42101e-020f;
583 else
584 dst->f[i] = src->f[i];
585 }
586 }
587 }
588
589 static void
590 micro_flr(
591 union tgsi_exec_channel *dst,
592 const union tgsi_exec_channel *src )
593 {
594 dst->f[0] = floorf( src->f[0] );
595 dst->f[1] = floorf( src->f[1] );
596 dst->f[2] = floorf( src->f[2] );
597 dst->f[3] = floorf( src->f[3] );
598 }
599
600 static void
601 micro_frc(
602 union tgsi_exec_channel *dst,
603 const union tgsi_exec_channel *src )
604 {
605 dst->f[0] = src->f[0] - floorf( src->f[0] );
606 dst->f[1] = src->f[1] - floorf( src->f[1] );
607 dst->f[2] = src->f[2] - floorf( src->f[2] );
608 dst->f[3] = src->f[3] - floorf( src->f[3] );
609 }
610
611 static void
612 micro_lg2(
613 union tgsi_exec_channel *dst,
614 const union tgsi_exec_channel *src )
615 {
616 #if FAST_MATH
617 dst->f[0] = util_fast_log2( src->f[0] );
618 dst->f[1] = util_fast_log2( src->f[1] );
619 dst->f[2] = util_fast_log2( src->f[2] );
620 dst->f[3] = util_fast_log2( src->f[3] );
621 #else
622 dst->f[0] = logf( src->f[0] ) * 1.442695f;
623 dst->f[1] = logf( src->f[1] ) * 1.442695f;
624 dst->f[2] = logf( src->f[2] ) * 1.442695f;
625 dst->f[3] = logf( src->f[3] ) * 1.442695f;
626 #endif
627 }
628
629 static void
630 micro_le(
631 union tgsi_exec_channel *dst,
632 const union tgsi_exec_channel *src0,
633 const union tgsi_exec_channel *src1,
634 const union tgsi_exec_channel *src2,
635 const union tgsi_exec_channel *src3 )
636 {
637 dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
638 dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
639 dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
640 dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
641 }
642
643 static void
644 micro_lt(
645 union tgsi_exec_channel *dst,
646 const union tgsi_exec_channel *src0,
647 const union tgsi_exec_channel *src1,
648 const union tgsi_exec_channel *src2,
649 const union tgsi_exec_channel *src3 )
650 {
651 dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
652 dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
653 dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
654 dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
655 }
656
657 static void
658 micro_max(
659 union tgsi_exec_channel *dst,
660 const union tgsi_exec_channel *src0,
661 const union tgsi_exec_channel *src1 )
662 {
663 dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
664 dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
665 dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
666 dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
667 }
668
669 static void
670 micro_min(
671 union tgsi_exec_channel *dst,
672 const union tgsi_exec_channel *src0,
673 const union tgsi_exec_channel *src1 )
674 {
675 dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
676 dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
677 dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
678 dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
679 }
680
681 #if 0
682 static void
683 micro_umod(
684 union tgsi_exec_channel *dst,
685 const union tgsi_exec_channel *src0,
686 const union tgsi_exec_channel *src1 )
687 {
688 dst->u[0] = src0->u[0] % src1->u[0];
689 dst->u[1] = src0->u[1] % src1->u[1];
690 dst->u[2] = src0->u[2] % src1->u[2];
691 dst->u[3] = src0->u[3] % src1->u[3];
692 }
693 #endif
694
695 static void
696 micro_mul(
697 union tgsi_exec_channel *dst,
698 const union tgsi_exec_channel *src0,
699 const union tgsi_exec_channel *src1 )
700 {
701 dst->f[0] = src0->f[0] * src1->f[0];
702 dst->f[1] = src0->f[1] * src1->f[1];
703 dst->f[2] = src0->f[2] * src1->f[2];
704 dst->f[3] = src0->f[3] * src1->f[3];
705 }
706
707 #if 0
708 static void
709 micro_imul64(
710 union tgsi_exec_channel *dst0,
711 union tgsi_exec_channel *dst1,
712 const union tgsi_exec_channel *src0,
713 const union tgsi_exec_channel *src1 )
714 {
715 dst1->i[0] = src0->i[0] * src1->i[0];
716 dst1->i[1] = src0->i[1] * src1->i[1];
717 dst1->i[2] = src0->i[2] * src1->i[2];
718 dst1->i[3] = src0->i[3] * src1->i[3];
719 dst0->i[0] = 0;
720 dst0->i[1] = 0;
721 dst0->i[2] = 0;
722 dst0->i[3] = 0;
723 }
724 #endif
725
726 #if 0
727 static void
728 micro_umul64(
729 union tgsi_exec_channel *dst0,
730 union tgsi_exec_channel *dst1,
731 const union tgsi_exec_channel *src0,
732 const union tgsi_exec_channel *src1 )
733 {
734 dst1->u[0] = src0->u[0] * src1->u[0];
735 dst1->u[1] = src0->u[1] * src1->u[1];
736 dst1->u[2] = src0->u[2] * src1->u[2];
737 dst1->u[3] = src0->u[3] * src1->u[3];
738 dst0->u[0] = 0;
739 dst0->u[1] = 0;
740 dst0->u[2] = 0;
741 dst0->u[3] = 0;
742 }
743 #endif
744
745
746 #if 0
747 static void
748 micro_movc(
749 union tgsi_exec_channel *dst,
750 const union tgsi_exec_channel *src0,
751 const union tgsi_exec_channel *src1,
752 const union tgsi_exec_channel *src2 )
753 {
754 dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
755 dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
756 dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
757 dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
758 }
759 #endif
760
761 static void
762 micro_neg(
763 union tgsi_exec_channel *dst,
764 const union tgsi_exec_channel *src )
765 {
766 dst->f[0] = -src->f[0];
767 dst->f[1] = -src->f[1];
768 dst->f[2] = -src->f[2];
769 dst->f[3] = -src->f[3];
770 }
771
772 static void
773 micro_pow(
774 union tgsi_exec_channel *dst,
775 const union tgsi_exec_channel *src0,
776 const union tgsi_exec_channel *src1 )
777 {
778 #if FAST_MATH
779 dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
780 dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
781 dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
782 dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
783 #else
784 dst->f[0] = powf( src0->f[0], src1->f[0] );
785 dst->f[1] = powf( src0->f[1], src1->f[1] );
786 dst->f[2] = powf( src0->f[2], src1->f[2] );
787 dst->f[3] = powf( src0->f[3], src1->f[3] );
788 #endif
789 }
790
791 static void
792 micro_rnd(
793 union tgsi_exec_channel *dst,
794 const union tgsi_exec_channel *src )
795 {
796 dst->f[0] = floorf( src->f[0] + 0.5f );
797 dst->f[1] = floorf( src->f[1] + 0.5f );
798 dst->f[2] = floorf( src->f[2] + 0.5f );
799 dst->f[3] = floorf( src->f[3] + 0.5f );
800 }
801
802 static void
803 micro_sgn(
804 union tgsi_exec_channel *dst,
805 const union tgsi_exec_channel *src )
806 {
807 dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
808 dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
809 dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
810 dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
811 }
812
813 static void
814 micro_trunc(
815 union tgsi_exec_channel *dst,
816 const union tgsi_exec_channel *src0 )
817 {
818 dst->f[0] = (float) (int) src0->f[0];
819 dst->f[1] = (float) (int) src0->f[1];
820 dst->f[2] = (float) (int) src0->f[2];
821 dst->f[3] = (float) (int) src0->f[3];
822 }
823
824 static void
825 micro_sin(
826 union tgsi_exec_channel *dst,
827 const union tgsi_exec_channel *src )
828 {
829 dst->f[0] = sinf( src->f[0] );
830 dst->f[1] = sinf( src->f[1] );
831 dst->f[2] = sinf( src->f[2] );
832 dst->f[3] = sinf( src->f[3] );
833 }
834
835 static void
836 micro_sqrt( union tgsi_exec_channel *dst,
837 const union tgsi_exec_channel *src )
838 {
839 dst->f[0] = sqrtf( src->f[0] );
840 dst->f[1] = sqrtf( src->f[1] );
841 dst->f[2] = sqrtf( src->f[2] );
842 dst->f[3] = sqrtf( src->f[3] );
843 }
844
845 static void
846 micro_sub(
847 union tgsi_exec_channel *dst,
848 const union tgsi_exec_channel *src0,
849 const union tgsi_exec_channel *src1 )
850 {
851 dst->f[0] = src0->f[0] - src1->f[0];
852 dst->f[1] = src0->f[1] - src1->f[1];
853 dst->f[2] = src0->f[2] - src1->f[2];
854 dst->f[3] = src0->f[3] - src1->f[3];
855 }
856
857 static void
858 fetch_src_file_channel(
859 const struct tgsi_exec_machine *mach,
860 const uint file,
861 const uint swizzle,
862 const union tgsi_exec_channel *index,
863 union tgsi_exec_channel *chan )
864 {
865 switch( swizzle ) {
866 case TGSI_SWIZZLE_X:
867 case TGSI_SWIZZLE_Y:
868 case TGSI_SWIZZLE_Z:
869 case TGSI_SWIZZLE_W:
870 switch( file ) {
871 case TGSI_FILE_CONSTANT:
872 assert(mach->Consts);
873 if (index->i[0] < 0)
874 chan->f[0] = 0.0f;
875 else
876 chan->f[0] = mach->Consts[index->i[0]][swizzle];
877 if (index->i[1] < 0)
878 chan->f[1] = 0.0f;
879 else
880 chan->f[1] = mach->Consts[index->i[1]][swizzle];
881 if (index->i[2] < 0)
882 chan->f[2] = 0.0f;
883 else
884 chan->f[2] = mach->Consts[index->i[2]][swizzle];
885 if (index->i[3] < 0)
886 chan->f[3] = 0.0f;
887 else
888 chan->f[3] = mach->Consts[index->i[3]][swizzle];
889 break;
890
891 case TGSI_FILE_INPUT:
892 case TGSI_FILE_SYSTEM_VALUE:
893 chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
894 chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
895 chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
896 chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
897 break;
898
899 case TGSI_FILE_TEMPORARY:
900 assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
901 chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
902 chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
903 chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
904 chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
905 break;
906
907 case TGSI_FILE_IMMEDIATE:
908 assert( index->i[0] < (int) mach->ImmLimit );
909 chan->f[0] = mach->Imms[index->i[0]][swizzle];
910 assert( index->i[1] < (int) mach->ImmLimit );
911 chan->f[1] = mach->Imms[index->i[1]][swizzle];
912 assert( index->i[2] < (int) mach->ImmLimit );
913 chan->f[2] = mach->Imms[index->i[2]][swizzle];
914 assert( index->i[3] < (int) mach->ImmLimit );
915 chan->f[3] = mach->Imms[index->i[3]][swizzle];
916 break;
917
918 case TGSI_FILE_ADDRESS:
919 chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
920 chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
921 chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
922 chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
923 break;
924
925 case TGSI_FILE_PREDICATE:
926 assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
927 assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
928 assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
929 assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
930 chan->u[0] = mach->Predicates[0].xyzw[swizzle].u[0];
931 chan->u[1] = mach->Predicates[0].xyzw[swizzle].u[1];
932 chan->u[2] = mach->Predicates[0].xyzw[swizzle].u[2];
933 chan->u[3] = mach->Predicates[0].xyzw[swizzle].u[3];
934 break;
935
936 case TGSI_FILE_OUTPUT:
937 /* vertex/fragment output vars can be read too */
938 chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
939 chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
940 chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
941 chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
942 break;
943
944 default:
945 assert( 0 );
946 }
947 break;
948
949 default:
950 assert( 0 );
951 }
952 }
953
954 static void
955 fetch_source(
956 const struct tgsi_exec_machine *mach,
957 union tgsi_exec_channel *chan,
958 const struct tgsi_full_src_register *reg,
959 const uint chan_index )
960 {
961 union tgsi_exec_channel index;
962 uint swizzle;
963
964 /* We start with a direct index into a register file.
965 *
966 * file[1],
967 * where:
968 * file = Register.File
969 * [1] = Register.Index
970 */
971 index.i[0] =
972 index.i[1] =
973 index.i[2] =
974 index.i[3] = reg->Register.Index;
975
976 /* There is an extra source register that indirectly subscripts
977 * a register file. The direct index now becomes an offset
978 * that is being added to the indirect register.
979 *
980 * file[ind[2].x+1],
981 * where:
982 * ind = Indirect.File
983 * [2] = Indirect.Index
984 * .x = Indirect.SwizzleX
985 */
986 if (reg->Register.Indirect) {
987 union tgsi_exec_channel index2;
988 union tgsi_exec_channel indir_index;
989 const uint execmask = mach->ExecMask;
990 uint i;
991
992 /* which address register (always zero now) */
993 index2.i[0] =
994 index2.i[1] =
995 index2.i[2] =
996 index2.i[3] = reg->Indirect.Index;
997
998 /* get current value of address register[swizzle] */
999 swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1000 fetch_src_file_channel(
1001 mach,
1002 reg->Indirect.File,
1003 swizzle,
1004 &index2,
1005 &indir_index );
1006
1007 /* add value of address register to the offset */
1008 index.i[0] += (int) indir_index.f[0];
1009 index.i[1] += (int) indir_index.f[1];
1010 index.i[2] += (int) indir_index.f[2];
1011 index.i[3] += (int) indir_index.f[3];
1012
1013 /* for disabled execution channels, zero-out the index to
1014 * avoid using a potential garbage value.
1015 */
1016 for (i = 0; i < QUAD_SIZE; i++) {
1017 if ((execmask & (1 << i)) == 0)
1018 index.i[i] = 0;
1019 }
1020 }
1021
1022 /* There is an extra source register that is a second
1023 * subscript to a register file. Effectively it means that
1024 * the register file is actually a 2D array of registers.
1025 *
1026     *    file[1][3] == file[1 * <size of first-order array> + 3],
1027 * where:
1028 * [3] = Dimension.Index
1029 */
1030 if (reg->Register.Dimension) {
1031 /* The size of the first-order array depends on the register file type.
1032       * We need to multiply the first subscript by that size to get an effective,
1033 * "flat" index that points to the beginning of the second-order array.
1034 */
1035 switch (reg->Register.File) {
1036 case TGSI_FILE_INPUT:
1037 case TGSI_FILE_SYSTEM_VALUE:
1038 index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1039 index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1040 index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1041 index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1042 break;
1043 case TGSI_FILE_CONSTANT:
1044 index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1045 index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1046 index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1047 index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1048 break;
1049 default:
1050 assert( 0 );
1051 }
1052
1053 index.i[0] += reg->Dimension.Index;
1054 index.i[1] += reg->Dimension.Index;
1055 index.i[2] += reg->Dimension.Index;
1056 index.i[3] += reg->Dimension.Index;
1057
1058 /* Again, the second subscript index can be addressed indirectly
1059 * identically to the first one.
1060 * Nothing stops us from indirectly addressing the indirect register,
1061 * but there is no need for that, so we won't exercise it.
1062 *
1063 * file[1][ind[4].y+3],
1064 * where:
1065 * ind = DimIndirect.File
1066 * [4] = DimIndirect.Index
1067 * .y = DimIndirect.SwizzleX
1068 */
1069 if (reg->Dimension.Indirect) {
1070 union tgsi_exec_channel index2;
1071 union tgsi_exec_channel indir_index;
1072 const uint execmask = mach->ExecMask;
1073 uint i;
1074
1075 index2.i[0] =
1076 index2.i[1] =
1077 index2.i[2] =
1078 index2.i[3] = reg->DimIndirect.Index;
1079
1080 swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1081 fetch_src_file_channel(
1082 mach,
1083 reg->DimIndirect.File,
1084 swizzle,
1085 &index2,
1086 &indir_index );
1087
1088 index.i[0] += (int) indir_index.f[0];
1089 index.i[1] += (int) indir_index.f[1];
1090 index.i[2] += (int) indir_index.f[2];
1091 index.i[3] += (int) indir_index.f[3];
1092
1093 /* for disabled execution channels, zero-out the index to
1094 * avoid using a potential garbage value.
1095 */
1096 for (i = 0; i < QUAD_SIZE; i++) {
1097 if ((execmask & (1 << i)) == 0)
1098 index.i[i] = 0;
1099 }
1100 }
1101
1102 /* If by any chance there was a need for a 3D array of register
1103 * files, we would have to check whether Dimension is followed
1104 * by a dimension register and continue the saga.
1105 */
1106 }
1107
1108 swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1109 fetch_src_file_channel(
1110 mach,
1111 reg->Register.File,
1112 swizzle,
1113 &index,
1114 chan );
1115
1116 switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1117 case TGSI_UTIL_SIGN_CLEAR:
1118 micro_abs( chan, chan );
1119 break;
1120
1121 case TGSI_UTIL_SIGN_SET:
1122 micro_abs( chan, chan );
1123 micro_neg( chan, chan );
1124 break;
1125
1126 case TGSI_UTIL_SIGN_TOGGLE:
1127 micro_neg( chan, chan );
1128 break;
1129
1130 case TGSI_UTIL_SIGN_KEEP:
1131 break;
1132 }
1133 }
1134
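/* Worked example for the two-dimensional case above: a CONST-file source of
 * the form file[1][3] resolves to the flat element index
 *    1 * TGSI_EXEC_MAX_CONST_BUFFER + 3
 * in every execution channel, before any indirect offsets fetched from the
 * address file are added.
 */
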
1135 static void
1136 store_dest(
1137 struct tgsi_exec_machine *mach,
1138 const union tgsi_exec_channel *chan,
1139 const struct tgsi_full_dst_register *reg,
1140 const struct tgsi_full_instruction *inst,
1141 uint chan_index )
1142 {
1143 uint i;
1144 union tgsi_exec_channel null;
1145 union tgsi_exec_channel *dst;
1146 uint execmask = mach->ExecMask;
1147 int offset = 0; /* indirection offset */
1148 int index;
1149
1150 #ifdef DEBUG
1151 check_inf_or_nan(chan);
1152 #endif
1153
1154 /* There is an extra source register that indirectly subscripts
1155 * a register file. The direct index now becomes an offset
1156 * that is being added to the indirect register.
1157 *
1158 * file[ind[2].x+1],
1159 * where:
1160 * ind = Indirect.File
1161 * [2] = Indirect.Index
1162 * .x = Indirect.SwizzleX
1163 */
1164 if (reg->Register.Indirect) {
1165 union tgsi_exec_channel index;
1166 union tgsi_exec_channel indir_index;
1167 uint swizzle;
1168
1169 /* which address register (always zero for now) */
1170 index.i[0] =
1171 index.i[1] =
1172 index.i[2] =
1173 index.i[3] = reg->Indirect.Index;
1174
1175 /* get current value of address register[swizzle] */
1176 swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1177
1178 /* fetch values from the address/indirection register */
1179 fetch_src_file_channel(
1180 mach,
1181 reg->Indirect.File,
1182 swizzle,
1183 &index,
1184 &indir_index );
1185
1186 /* save indirection offset */
1187 offset = (int) indir_index.f[0];
1188 }
1189
1190 switch (reg->Register.File) {
1191 case TGSI_FILE_NULL:
1192 dst = &null;
1193 break;
1194
1195 case TGSI_FILE_OUTPUT:
1196 index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1197 + reg->Register.Index;
1198 dst = &mach->Outputs[offset + index].xyzw[chan_index];
1199 #if 0
1200 if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1201 fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1202 for (i = 0; i < QUAD_SIZE; i++)
1203 if (execmask & (1 << i))
1204 fprintf(stderr, "%f, ", chan->f[i]);
1205 fprintf(stderr, ")\n");
1206 }
1207 #endif
1208 break;
1209
1210 case TGSI_FILE_TEMPORARY:
1211 index = reg->Register.Index;
1212 assert( index < TGSI_EXEC_NUM_TEMPS );
1213 dst = &mach->Temps[offset + index].xyzw[chan_index];
1214 break;
1215
1216 case TGSI_FILE_ADDRESS:
1217 index = reg->Register.Index;
1218 dst = &mach->Addrs[index].xyzw[chan_index];
1219 break;
1220
1221 case TGSI_FILE_LOOP:
1222 assert(reg->Register.Index == 0);
1223 assert(mach->LoopCounterStackTop > 0);
1224 assert(chan_index == CHAN_X);
1225 dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1226 break;
1227
1228 case TGSI_FILE_PREDICATE:
1229 index = reg->Register.Index;
1230 assert(index < TGSI_EXEC_NUM_PREDS);
1231 dst = &mach->Predicates[index].xyzw[chan_index];
1232 break;
1233
1234 default:
1235 assert( 0 );
1236 return;
1237 }
1238
1239 if (inst->Instruction.Predicate) {
1240 uint swizzle;
1241 union tgsi_exec_channel *pred;
1242
1243 switch (chan_index) {
1244 case CHAN_X:
1245 swizzle = inst->Predicate.SwizzleX;
1246 break;
1247 case CHAN_Y:
1248 swizzle = inst->Predicate.SwizzleY;
1249 break;
1250 case CHAN_Z:
1251 swizzle = inst->Predicate.SwizzleZ;
1252 break;
1253 case CHAN_W:
1254 swizzle = inst->Predicate.SwizzleW;
1255 break;
1256 default:
1257 assert(0);
1258 return;
1259 }
1260
1261 assert(inst->Predicate.Index == 0);
1262
1263 pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1264
1265 if (inst->Predicate.Negate) {
1266 for (i = 0; i < QUAD_SIZE; i++) {
1267 if (pred->u[i]) {
1268 execmask &= ~(1 << i);
1269 }
1270 }
1271 } else {
1272 for (i = 0; i < QUAD_SIZE; i++) {
1273 if (!pred->u[i]) {
1274 execmask &= ~(1 << i);
1275 }
1276 }
1277 }
1278 }
1279
1280 switch (inst->Instruction.Saturate) {
1281 case TGSI_SAT_NONE:
1282 for (i = 0; i < QUAD_SIZE; i++)
1283 if (execmask & (1 << i))
1284 dst->i[i] = chan->i[i];
1285 break;
1286
1287 case TGSI_SAT_ZERO_ONE:
1288 for (i = 0; i < QUAD_SIZE; i++)
1289 if (execmask & (1 << i)) {
1290 if (chan->f[i] < 0.0f)
1291 dst->f[i] = 0.0f;
1292 else if (chan->f[i] > 1.0f)
1293 dst->f[i] = 1.0f;
1294 else
1295 dst->i[i] = chan->i[i];
1296 }
1297 break;
1298
1299 case TGSI_SAT_MINUS_PLUS_ONE:
1300 for (i = 0; i < QUAD_SIZE; i++)
1301 if (execmask & (1 << i)) {
1302 if (chan->f[i] < -1.0f)
1303 dst->f[i] = -1.0f;
1304 else if (chan->f[i] > 1.0f)
1305 dst->f[i] = 1.0f;
1306 else
1307 dst->i[i] = chan->i[i];
1308 }
1309 break;
1310
1311 default:
1312 assert( 0 );
1313 }
1314 }
1315
1316 #define FETCH(VAL,INDEX,CHAN)\
1317 fetch_source (mach, VAL, &inst->Src[INDEX], CHAN)
1318
1319 #define STORE(VAL,INDEX,CHAN)\
1320 store_dest (mach, VAL, &inst->Dst[INDEX], inst, CHAN )
1321
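/* Example: FETCH(&r[0], 1, CHAN_Y) reads the Y channel of source operand 1
 * of the current instruction into r[0], applying swizzles, negate/absolute
 * modifiers and indirect addressing; STORE(&r[0], 0, CHAN_X) writes r[0] to
 * the X channel of destination 0, honoring ExecMask, predication and the
 * saturate mode via store_dest().
 */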
1322
1323 /**
1324 * Execute ARB-style KIL which is predicated by a src register.
1325 * Kill fragment if any of the four values is less than zero.
1326 */
1327 static void
1328 exec_kil(struct tgsi_exec_machine *mach,
1329 const struct tgsi_full_instruction *inst)
1330 {
1331 uint uniquemask;
1332 uint chan_index;
1333 uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1334 union tgsi_exec_channel r[1];
1335
1336 /* This mask stores component bits that were already tested. */
1337 uniquemask = 0;
1338
1339 for (chan_index = 0; chan_index < 4; chan_index++)
1340 {
1341 uint swizzle;
1342 uint i;
1343
1344 /* unswizzle channel */
1345 swizzle = tgsi_util_get_full_src_register_swizzle (
1346 &inst->Src[0],
1347 chan_index);
1348
1349       /* skip if this component has already been tested */
1350 if (uniquemask & (1 << swizzle))
1351 continue;
1352 uniquemask |= 1 << swizzle;
1353
1354 FETCH(&r[0], 0, chan_index);
1355 for (i = 0; i < 4; i++)
1356 if (r[0].f[i] < 0.0f)
1357 kilmask |= 1 << i;
1358 }
1359
1360 mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1361 }
1362
1363 /**
1364 * Execute NVIDIA-style KIL which is predicated by a condition code.
1365 * Kill fragment if the condition code is TRUE.
1366 */
1367 static void
1368 exec_kilp(struct tgsi_exec_machine *mach,
1369 const struct tgsi_full_instruction *inst)
1370 {
1371 uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1372
1373 /* "unconditional" kil */
1374 kilmask = mach->ExecMask;
1375 mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1376 }
1377
1378 static void
1379 emit_vertex(struct tgsi_exec_machine *mach)
1380 {
1381 /* FIXME: check for exec mask correctly
1382 unsigned i;
1383 for (i = 0; i < QUAD_SIZE; ++i) {
1384 if ((mach->ExecMask & (1 << i)))
1385 */
1386 if (mach->ExecMask) {
1387 mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1388 mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1389 }
1390 }
1391
1392 static void
1393 emit_primitive(struct tgsi_exec_machine *mach)
1394 {
1395 unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1396 /* FIXME: check for exec mask correctly
1397 unsigned i;
1398 for (i = 0; i < QUAD_SIZE; ++i) {
1399 if ((mach->ExecMask & (1 << i)))
1400 */
1401 if (mach->ExecMask) {
1402 ++(*prim_count);
1403 debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1404 mach->Primitives[*prim_count] = 0;
1405 }
1406 }
1407
1408 /*
1409  * Fetch four texture samples using STR texture coordinates.
1410 */
1411 static void
1412 fetch_texel( struct tgsi_sampler *sampler,
1413 const union tgsi_exec_channel *s,
1414 const union tgsi_exec_channel *t,
1415 const union tgsi_exec_channel *p,
1416 float lodbias, /* XXX should be float[4] */
1417 union tgsi_exec_channel *r,
1418 union tgsi_exec_channel *g,
1419 union tgsi_exec_channel *b,
1420 union tgsi_exec_channel *a )
1421 {
1422 uint j;
1423 float rgba[NUM_CHANNELS][QUAD_SIZE];
1424
1425 sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1426
1427 for (j = 0; j < 4; j++) {
1428 r->f[j] = rgba[0][j];
1429 g->f[j] = rgba[1][j];
1430 b->f[j] = rgba[2][j];
1431 a->f[j] = rgba[3][j];
1432 }
1433 }
1434
1435
1436 static void
1437 exec_tex(struct tgsi_exec_machine *mach,
1438 const struct tgsi_full_instruction *inst,
1439 boolean biasLod,
1440 boolean projected)
1441 {
1442 const uint unit = inst->Src[1].Register.Index;
1443 union tgsi_exec_channel r[4];
1444 uint chan_index;
1445 float lodBias;
1446
1447 /* debug_printf("Sampler %u unit %u\n", sampler, unit); */
1448
1449 switch (inst->Texture.Texture) {
1450 case TGSI_TEXTURE_1D:
1451 case TGSI_TEXTURE_SHADOW1D:
1452
1453 FETCH(&r[0], 0, CHAN_X);
1454
1455 if (projected) {
1456 FETCH(&r[1], 0, CHAN_W);
1457 micro_div( &r[0], &r[0], &r[1] );
1458 }
1459
1460 if (biasLod) {
1461 FETCH(&r[1], 0, CHAN_W);
1462          lodBias = r[1].f[0];
1463 }
1464 else
1465 lodBias = 0.0;
1466
1467 fetch_texel(mach->Samplers[unit],
1468 &r[0], &ZeroVec, &ZeroVec, lodBias, /* S, T, P, BIAS */
1469 &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1470 break;
1471
1472 case TGSI_TEXTURE_2D:
1473 case TGSI_TEXTURE_RECT:
1474 case TGSI_TEXTURE_SHADOW2D:
1475 case TGSI_TEXTURE_SHADOWRECT:
1476
1477 FETCH(&r[0], 0, CHAN_X);
1478 FETCH(&r[1], 0, CHAN_Y);
1479 FETCH(&r[2], 0, CHAN_Z);
1480
1481 if (projected) {
1482 FETCH(&r[3], 0, CHAN_W);
1483 micro_div( &r[0], &r[0], &r[3] );
1484 micro_div( &r[1], &r[1], &r[3] );
1485 micro_div( &r[2], &r[2], &r[3] );
1486 }
1487
1488 if (biasLod) {
1489 FETCH(&r[3], 0, CHAN_W);
1490 lodBias = r[3].f[0];
1491 }
1492 else
1493 lodBias = 0.0;
1494
1495 fetch_texel(mach->Samplers[unit],
1496 &r[0], &r[1], &r[2], lodBias, /* inputs */
1497 &r[0], &r[1], &r[2], &r[3]); /* outputs */
1498 break;
1499
1500 case TGSI_TEXTURE_3D:
1501 case TGSI_TEXTURE_CUBE:
1502
1503 FETCH(&r[0], 0, CHAN_X);
1504 FETCH(&r[1], 0, CHAN_Y);
1505 FETCH(&r[2], 0, CHAN_Z);
1506
1507 if (projected) {
1508 FETCH(&r[3], 0, CHAN_W);
1509 micro_div( &r[0], &r[0], &r[3] );
1510 micro_div( &r[1], &r[1], &r[3] );
1511 micro_div( &r[2], &r[2], &r[3] );
1512 }
1513
1514 if (biasLod) {
1515 FETCH(&r[3], 0, CHAN_W);
1516 lodBias = r[3].f[0];
1517 }
1518 else
1519 lodBias = 0.0;
1520
1521 fetch_texel(mach->Samplers[unit],
1522 &r[0], &r[1], &r[2], lodBias,
1523 &r[0], &r[1], &r[2], &r[3]);
1524 break;
1525
1526 default:
1527 assert (0);
1528 }
1529
1530 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1531 STORE( &r[chan_index], 0, chan_index );
1532 }
1533 }
1534
1535 static void
1536 exec_txd(struct tgsi_exec_machine *mach,
1537 const struct tgsi_full_instruction *inst)
1538 {
1539 const uint unit = inst->Src[3].Register.Index;
1540 union tgsi_exec_channel r[4];
1541 uint chan_index;
1542
1543 /*
1544 * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1545 */
1546
1547 switch (inst->Texture.Texture) {
1548 case TGSI_TEXTURE_1D:
1549 case TGSI_TEXTURE_SHADOW1D:
1550
1551 FETCH(&r[0], 0, CHAN_X);
1552
1553 fetch_texel(mach->Samplers[unit],
1554 &r[0], &ZeroVec, &ZeroVec, 0.0f, /* S, T, P, BIAS */
1555 &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1556 break;
1557
1558 case TGSI_TEXTURE_2D:
1559 case TGSI_TEXTURE_RECT:
1560 case TGSI_TEXTURE_SHADOW2D:
1561 case TGSI_TEXTURE_SHADOWRECT:
1562
1563 FETCH(&r[0], 0, CHAN_X);
1564 FETCH(&r[1], 0, CHAN_Y);
1565 FETCH(&r[2], 0, CHAN_Z);
1566
1567 fetch_texel(mach->Samplers[unit],
1568 &r[0], &r[1], &r[2], 0.0f, /* inputs */
1569 &r[0], &r[1], &r[2], &r[3]); /* outputs */
1570 break;
1571
1572 case TGSI_TEXTURE_3D:
1573 case TGSI_TEXTURE_CUBE:
1574
1575 FETCH(&r[0], 0, CHAN_X);
1576 FETCH(&r[1], 0, CHAN_Y);
1577 FETCH(&r[2], 0, CHAN_Z);
1578
1579 fetch_texel(mach->Samplers[unit],
1580 &r[0], &r[1], &r[2], 0.0f,
1581 &r[0], &r[1], &r[2], &r[3]);
1582 break;
1583
1584 default:
1585 assert(0);
1586 }
1587
1588 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1589 STORE(&r[chan_index], 0, chan_index);
1590 }
1591 }
1592
1593
1594 /**
1595 * Evaluate a constant-valued coefficient at the position of the
1596 * current quad.
1597 */
1598 static void
1599 eval_constant_coef(
1600 struct tgsi_exec_machine *mach,
1601 unsigned attrib,
1602 unsigned chan )
1603 {
1604 unsigned i;
1605
1606 for( i = 0; i < QUAD_SIZE; i++ ) {
1607 mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1608 }
1609 }
1610
1611 /**
1612 * Evaluate a linear-valued coefficient at the position of the
1613 * current quad.
1614 */
1615 static void
1616 eval_linear_coef(
1617 struct tgsi_exec_machine *mach,
1618 unsigned attrib,
1619 unsigned chan )
1620 {
1621 const float x = mach->QuadPos.xyzw[0].f[0];
1622 const float y = mach->QuadPos.xyzw[1].f[0];
1623 const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1624 const float dady = mach->InterpCoefs[attrib].dady[chan];
1625 const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1626 mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1627 mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1628 mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1629 mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1630 }
1631
1632 /**
1633 * Evaluate a perspective-valued coefficient at the position of the
1634 * current quad.
1635 */
1636 static void
1637 eval_perspective_coef(
1638 struct tgsi_exec_machine *mach,
1639 unsigned attrib,
1640 unsigned chan )
1641 {
1642 const float x = mach->QuadPos.xyzw[0].f[0];
1643 const float y = mach->QuadPos.xyzw[1].f[0];
1644 const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1645 const float dady = mach->InterpCoefs[attrib].dady[chan];
1646 const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1647 const float *w = mach->QuadPos.xyzw[3].f;
1648 /* divide by W here */
1649 mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1650 mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1651 mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1652 mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1653 }
1654
1655
1656 typedef void (* eval_coef_func)(
1657 struct tgsi_exec_machine *mach,
1658 unsigned attrib,
1659 unsigned chan );
1660
1661 static void
1662 exec_declaration(struct tgsi_exec_machine *mach,
1663 const struct tgsi_full_declaration *decl)
1664 {
1665 if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1666 if (decl->Declaration.File == TGSI_FILE_INPUT ||
1667 decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1668 uint first, last, mask;
1669
1670 first = decl->Range.First;
1671 last = decl->Range.Last;
1672 mask = decl->Declaration.UsageMask;
1673
1674 if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
1675 assert(decl->Semantic.Index == 0);
1676 assert(first == last);
1677 assert(mask == TGSI_WRITEMASK_XYZW);
1678
1679 mach->Inputs[first] = mach->QuadPos;
1680 } else if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1681 uint i;
1682
1683 assert(decl->Semantic.Index == 0);
1684 assert(first == last);
1685
1686 for (i = 0; i < QUAD_SIZE; i++) {
1687 mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1688 }
1689 } else {
1690 eval_coef_func eval;
1691 uint i, j;
1692
1693 switch (decl->Declaration.Interpolate) {
1694 case TGSI_INTERPOLATE_CONSTANT:
1695 eval = eval_constant_coef;
1696 break;
1697
1698 case TGSI_INTERPOLATE_LINEAR:
1699 eval = eval_linear_coef;
1700 break;
1701
1702 case TGSI_INTERPOLATE_PERSPECTIVE:
1703 eval = eval_perspective_coef;
1704 break;
1705
1706 default:
1707 assert(0);
1708 return;
1709 }
1710
1711 for (j = 0; j < NUM_CHANNELS; j++) {
1712 if (mask & (1 << j)) {
1713 for (i = first; i <= last; i++) {
1714 eval(mach, i, j);
1715 }
1716 }
1717 }
1718 }
1719 }
1720 }
1721 }
1722
1723 typedef void (* micro_op)(union tgsi_exec_channel *dst,
1724 const union tgsi_exec_channel *src);
1725
1726 static void
1727 exec_vector_unary(struct tgsi_exec_machine *mach,
1728 const struct tgsi_full_instruction *inst,
1729 micro_op op)
1730 {
1731 unsigned int chan;
1732 struct tgsi_exec_vector dst;
1733
1734 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1735 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1736 union tgsi_exec_channel src;
1737
1738 fetch_source(mach, &src, &inst->Src[0], chan);
1739 op(&dst.xyzw[chan], &src);
1740 }
1741 }
1742 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1743 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1744 store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
1745 }
1746 }
1747 }
1748
1749 static void
1750 exec_vector_binary(struct tgsi_exec_machine *mach,
1751 const struct tgsi_full_instruction *inst,
1752 micro_op op)
1753 {
1754 unsigned int chan;
1755 struct tgsi_exec_vector dst;
1756
1757 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1758 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1759 union tgsi_exec_channel src[2];
1760
1761 fetch_source(mach, &src[0], &inst->Src[0], chan);
1762 fetch_source(mach, &src[1], &inst->Src[1], chan);
1763 op(&dst.xyzw[chan], src);
1764 }
1765 }
1766 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1767 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1768 store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
1769 }
1770 }
1771 }
1772
1773 static void
1774 exec_vector_trinary(struct tgsi_exec_machine *mach,
1775 const struct tgsi_full_instruction *inst,
1776 micro_op op)
1777 {
1778 unsigned int chan;
1779 struct tgsi_exec_vector dst;
1780
1781 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1782 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1783 union tgsi_exec_channel src[3];
1784
1785 fetch_source(mach, &src[0], &inst->Src[0], chan);
1786 fetch_source(mach, &src[1], &inst->Src[1], chan);
1787 fetch_source(mach, &src[2], &inst->Src[2], chan);
1788 op(&dst.xyzw[chan], src);
1789 }
1790 }
1791 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1792 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1793 store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
1794 }
1795 }
1796 }
1797
1798 static void
1799 micro_i2f(union tgsi_exec_channel *dst,
1800 const union tgsi_exec_channel *src)
1801 {
1802 dst->f[0] = (float)src->i[0];
1803 dst->f[1] = (float)src->i[1];
1804 dst->f[2] = (float)src->i[2];
1805 dst->f[3] = (float)src->i[3];
1806 }
1807
1808 static void
1809 micro_not(union tgsi_exec_channel *dst,
1810 const union tgsi_exec_channel *src)
1811 {
1812 dst->u[0] = ~src->u[0];
1813 dst->u[1] = ~src->u[1];
1814 dst->u[2] = ~src->u[2];
1815 dst->u[3] = ~src->u[3];
1816 }
1817
1818 static void
1819 micro_shl(union tgsi_exec_channel *dst,
1820 const union tgsi_exec_channel *src)
1821 {
1822 dst->u[0] = src[0].u[0] << src[1].u[0];
1823 dst->u[1] = src[0].u[1] << src[1].u[1];
1824 dst->u[2] = src[0].u[2] << src[1].u[2];
1825 dst->u[3] = src[0].u[3] << src[1].u[3];
1826 }
1827
1828 static void
1829 micro_and(union tgsi_exec_channel *dst,
1830 const union tgsi_exec_channel *src)
1831 {
1832 dst->u[0] = src[0].u[0] & src[1].u[0];
1833 dst->u[1] = src[0].u[1] & src[1].u[1];
1834 dst->u[2] = src[0].u[2] & src[1].u[2];
1835 dst->u[3] = src[0].u[3] & src[1].u[3];
1836 }
1837
1838 static void
1839 micro_or(union tgsi_exec_channel *dst,
1840 const union tgsi_exec_channel *src)
1841 {
1842 dst->u[0] = src[0].u[0] | src[1].u[0];
1843 dst->u[1] = src[0].u[1] | src[1].u[1];
1844 dst->u[2] = src[0].u[2] | src[1].u[2];
1845 dst->u[3] = src[0].u[3] | src[1].u[3];
1846 }
1847
1848 static void
1849 micro_xor(union tgsi_exec_channel *dst,
1850 const union tgsi_exec_channel *src)
1851 {
1852 dst->u[0] = src[0].u[0] ^ src[1].u[0];
1853 dst->u[1] = src[0].u[1] ^ src[1].u[1];
1854 dst->u[2] = src[0].u[2] ^ src[1].u[2];
1855 dst->u[3] = src[0].u[3] ^ src[1].u[3];
1856 }
1857
1858 static void
1859 micro_f2i(union tgsi_exec_channel *dst,
1860 const union tgsi_exec_channel *src)
1861 {
1862 dst->i[0] = (int)src->f[0];
1863 dst->i[1] = (int)src->f[1];
1864 dst->i[2] = (int)src->f[2];
1865 dst->i[3] = (int)src->f[3];
1866 }
1867
1868 static void
1869 micro_idiv(union tgsi_exec_channel *dst,
1870 const union tgsi_exec_channel *src)
1871 {
1872 dst->i[0] = src[0].i[0] / src[1].i[0];
1873 dst->i[1] = src[0].i[1] / src[1].i[1];
1874 dst->i[2] = src[0].i[2] / src[1].i[2];
1875 dst->i[3] = src[0].i[3] / src[1].i[3];
1876 }
1877
1878 static void
1879 micro_imax(union tgsi_exec_channel *dst,
1880 const union tgsi_exec_channel *src)
1881 {
1882 dst->i[0] = src[0].i[0] > src[1].i[0] ? src[0].i[0] : src[1].i[0];
1883 dst->i[1] = src[0].i[1] > src[1].i[1] ? src[0].i[1] : src[1].i[1];
1884 dst->i[2] = src[0].i[2] > src[1].i[2] ? src[0].i[2] : src[1].i[2];
1885 dst->i[3] = src[0].i[3] > src[1].i[3] ? src[0].i[3] : src[1].i[3];
1886 }
1887
1888 static void
1889 micro_imin(union tgsi_exec_channel *dst,
1890 const union tgsi_exec_channel *src)
1891 {
1892 dst->i[0] = src[0].i[0] < src[1].i[0] ? src[0].i[0] : src[1].i[0];
1893 dst->i[1] = src[0].i[1] < src[1].i[1] ? src[0].i[1] : src[1].i[1];
1894 dst->i[2] = src[0].i[2] < src[1].i[2] ? src[0].i[2] : src[1].i[2];
1895 dst->i[3] = src[0].i[3] < src[1].i[3] ? src[0].i[3] : src[1].i[3];
1896 }
1897
1898 static void
1899 micro_ineg(union tgsi_exec_channel *dst,
1900 const union tgsi_exec_channel *src)
1901 {
1902 dst->i[0] = -src->i[0];
1903 dst->i[1] = -src->i[1];
1904 dst->i[2] = -src->i[2];
1905 dst->i[3] = -src->i[3];
1906 }
1907
1908 static void
1909 micro_isge(union tgsi_exec_channel *dst,
1910 const union tgsi_exec_channel *src)
1911 {
1912 dst->i[0] = src[0].i[0] >= src[1].i[0] ? -1 : 0;
1913 dst->i[1] = src[0].i[1] >= src[1].i[1] ? -1 : 0;
1914 dst->i[2] = src[0].i[2] >= src[1].i[2] ? -1 : 0;
1915 dst->i[3] = src[0].i[3] >= src[1].i[3] ? -1 : 0;
1916 }
1917
1918 static void
1919 micro_ishr(union tgsi_exec_channel *dst,
1920 const union tgsi_exec_channel *src)
1921 {
1922 dst->i[0] = src[0].i[0] >> src[1].i[0];
1923 dst->i[1] = src[0].i[1] >> src[1].i[1];
1924 dst->i[2] = src[0].i[2] >> src[1].i[2];
1925 dst->i[3] = src[0].i[3] >> src[1].i[3];
1926 }
1927
1928 static void
1929 micro_islt(union tgsi_exec_channel *dst,
1930 const union tgsi_exec_channel *src)
1931 {
1932 dst->i[0] = src[0].i[0] < src[1].i[0] ? -1 : 0;
1933 dst->i[1] = src[0].i[1] < src[1].i[1] ? -1 : 0;
1934 dst->i[2] = src[0].i[2] < src[1].i[2] ? -1 : 0;
1935 dst->i[3] = src[0].i[3] < src[1].i[3] ? -1 : 0;
1936 }
1937
1938 static void
1939 micro_f2u(union tgsi_exec_channel *dst,
1940 const union tgsi_exec_channel *src)
1941 {
1942 dst->u[0] = (uint)src->f[0];
1943 dst->u[1] = (uint)src->f[1];
1944 dst->u[2] = (uint)src->f[2];
1945 dst->u[3] = (uint)src->f[3];
1946 }
1947
1948 static void
1949 micro_u2f(union tgsi_exec_channel *dst,
1950 const union tgsi_exec_channel *src)
1951 {
1952 dst->f[0] = (float)src->u[0];
1953 dst->f[1] = (float)src->u[1];
1954 dst->f[2] = (float)src->u[2];
1955 dst->f[3] = (float)src->u[3];
1956 }
1957
1958 static void
1959 micro_uadd(union tgsi_exec_channel *dst,
1960 const union tgsi_exec_channel *src)
1961 {
1962 dst->u[0] = src[0].u[0] + src[1].u[0];
1963 dst->u[1] = src[0].u[1] + src[1].u[1];
1964 dst->u[2] = src[0].u[2] + src[1].u[2];
1965 dst->u[3] = src[0].u[3] + src[1].u[3];
1966 }
1967
1968 static void
1969 micro_udiv(union tgsi_exec_channel *dst,
1970 const union tgsi_exec_channel *src)
1971 {
1972 dst->u[0] = src[0].u[0] / src[1].u[0];
1973 dst->u[1] = src[0].u[1] / src[1].u[1];
1974 dst->u[2] = src[0].u[2] / src[1].u[2];
1975 dst->u[3] = src[0].u[3] / src[1].u[3];
1976 }
1977
1978 static void
1979 micro_umad(union tgsi_exec_channel *dst,
1980 const union tgsi_exec_channel *src)
1981 {
1982 dst->u[0] = src[0].u[0] * src[1].u[0] + src[2].u[0];
1983 dst->u[1] = src[0].u[1] * src[1].u[1] + src[2].u[1];
1984 dst->u[2] = src[0].u[2] * src[1].u[2] + src[2].u[2];
1985 dst->u[3] = src[0].u[3] * src[1].u[3] + src[2].u[3];
1986 }
1987
1988 static void
1989 micro_umax(union tgsi_exec_channel *dst,
1990 const union tgsi_exec_channel *src)
1991 {
1992 dst->u[0] = src[0].u[0] > src[1].u[0] ? src[0].u[0] : src[1].u[0];
1993 dst->u[1] = src[0].u[1] > src[1].u[1] ? src[0].u[1] : src[1].u[1];
1994 dst->u[2] = src[0].u[2] > src[1].u[2] ? src[0].u[2] : src[1].u[2];
1995 dst->u[3] = src[0].u[3] > src[1].u[3] ? src[0].u[3] : src[1].u[3];
1996 }
1997
1998 static void
1999 micro_umin(union tgsi_exec_channel *dst,
2000 const union tgsi_exec_channel *src)
2001 {
2002 dst->u[0] = src[0].u[0] < src[1].u[0] ? src[0].u[0] : src[1].u[0];
2003 dst->u[1] = src[0].u[1] < src[1].u[1] ? src[0].u[1] : src[1].u[1];
2004 dst->u[2] = src[0].u[2] < src[1].u[2] ? src[0].u[2] : src[1].u[2];
2005 dst->u[3] = src[0].u[3] < src[1].u[3] ? src[0].u[3] : src[1].u[3];
2006 }
2007
2008 static void
2009 micro_umul(union tgsi_exec_channel *dst,
2010 const union tgsi_exec_channel *src)
2011 {
2012 dst->u[0] = src[0].u[0] * src[1].u[0];
2013 dst->u[1] = src[0].u[1] * src[1].u[1];
2014 dst->u[2] = src[0].u[2] * src[1].u[2];
2015 dst->u[3] = src[0].u[3] * src[1].u[3];
2016 }
2017
2018 static void
2019 micro_useq(union tgsi_exec_channel *dst,
2020 const union tgsi_exec_channel *src)
2021 {
2022 dst->u[0] = src[0].u[0] == src[1].u[0] ? ~0 : 0;
2023 dst->u[1] = src[0].u[1] == src[1].u[1] ? ~0 : 0;
2024 dst->u[2] = src[0].u[2] == src[1].u[2] ? ~0 : 0;
2025 dst->u[3] = src[0].u[3] == src[1].u[3] ? ~0 : 0;
2026 }
2027
2028 static void
2029 micro_usge(union tgsi_exec_channel *dst,
2030 const union tgsi_exec_channel *src)
2031 {
2032 dst->u[0] = src[0].u[0] >= src[1].u[0] ? ~0 : 0;
2033 dst->u[1] = src[0].u[1] >= src[1].u[1] ? ~0 : 0;
2034 dst->u[2] = src[0].u[2] >= src[1].u[2] ? ~0 : 0;
2035 dst->u[3] = src[0].u[3] >= src[1].u[3] ? ~0 : 0;
2036 }
2037
2038 static void
2039 micro_ushr(union tgsi_exec_channel *dst,
2040 const union tgsi_exec_channel *src)
2041 {
2042 dst->u[0] = src[0].u[0] >> src[1].u[0];
2043 dst->u[1] = src[0].u[1] >> src[1].u[1];
2044 dst->u[2] = src[0].u[2] >> src[1].u[2];
2045 dst->u[3] = src[0].u[3] >> src[1].u[3];
2046 }
2047
2048 static void
2049 micro_uslt(union tgsi_exec_channel *dst,
2050 const union tgsi_exec_channel *src)
2051 {
2052 dst->u[0] = src[0].u[0] < src[1].u[0] ? ~0 : 0;
2053 dst->u[1] = src[0].u[1] < src[1].u[1] ? ~0 : 0;
2054 dst->u[2] = src[0].u[2] < src[1].u[2] ? ~0 : 0;
2055 dst->u[3] = src[0].u[3] < src[1].u[3] ? ~0 : 0;
2056 }
2057
2058 static void
2059 micro_usne(union tgsi_exec_channel *dst,
2060 const union tgsi_exec_channel *src)
2061 {
2062 dst->u[0] = src[0].u[0] != src[1].u[0] ? ~0 : 0;
2063 dst->u[1] = src[0].u[1] != src[1].u[1] ? ~0 : 0;
2064 dst->u[2] = src[0].u[2] != src[1].u[2] ? ~0 : 0;
2065 dst->u[3] = src[0].u[3] != src[1].u[3] ? ~0 : 0;
2066 }
2067
2068 static void
2069 exec_instruction(
2070 struct tgsi_exec_machine *mach,
2071 const struct tgsi_full_instruction *inst,
2072 int *pc )
2073 {
2074 uint chan_index;
2075 union tgsi_exec_channel r[10];
2076 union tgsi_exec_channel d[8];
2077
2078 (*pc)++;
2079
2080 switch (inst->Instruction.Opcode) {
2081 case TGSI_OPCODE_ARL:
2082 case TGSI_OPCODE_FLR:
2083 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2084 FETCH( &r[0], 0, chan_index );
2085 micro_flr(&d[chan_index], &r[0]);
2086 }
2087 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2088 STORE(&d[chan_index], 0, chan_index);
2089 }
2090 break;
2091
2092 case TGSI_OPCODE_MOV:
2093 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2094 FETCH(&d[chan_index], 0, chan_index);
2095 }
2096 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2097 STORE(&d[chan_index], 0, chan_index);
2098 }
2099 break;
2100
2101 case TGSI_OPCODE_LIT:
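      /* Lighting coefficients, as computed by the code below:
       *   dst.x = 1
       *   dst.y = max(src.x, 0)
       *   dst.z = (src.x > 0) ? pow(max(src.y, 0), clamp(src.w, -128, 128)) : 0
       *   dst.w = 1
       */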
2102 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2103 FETCH( &r[0], 0, CHAN_X );
2104 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2105 micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2106 }
2107
2108 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2109 FETCH( &r[1], 0, CHAN_Y );
2110 micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2111
2112 FETCH( &r[2], 0, CHAN_W );
2113 micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2114 micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2115 micro_pow( &r[1], &r[1], &r[2] );
2116 micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2117 }
2118
2119 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2120 STORE(&d[CHAN_Y], 0, CHAN_Y);
2121 }
2122 if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2123 STORE(&d[CHAN_Z], 0, CHAN_Z);
2124 }
2125 }
2126 if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2127 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2128 }
2129 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2130 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2131 }
2132 break;
2133
2134 case TGSI_OPCODE_RCP:
2135 /* TGSI_OPCODE_RECIP */
2136 FETCH( &r[0], 0, CHAN_X );
2137 micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2138 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2139 STORE( &r[0], 0, chan_index );
2140 }
2141 break;
2142
2143 case TGSI_OPCODE_RSQ:
2144 /* TGSI_OPCODE_RECIPSQRT */
2145 FETCH( &r[0], 0, CHAN_X );
2146 micro_abs( &r[0], &r[0] );
2147 micro_sqrt( &r[0], &r[0] );
2148 micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2149 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2150 STORE( &r[0], 0, chan_index );
2151 }
2152 break;
2153
2154 case TGSI_OPCODE_EXP:
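      /* Partial exponential, as computed by the code below:
       *   dst.x = 2^floor(src.x)
       *   dst.y = src.x - floor(src.x)
       *   dst.z = 2^src.x
       *   dst.w = 1
       */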
2155 FETCH( &r[0], 0, CHAN_X );
2156 micro_flr( &r[1], &r[0] ); /* r1 = floor(r0) */
2157 if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2158 micro_exp2( &r[2], &r[1] ); /* r2 = 2 ^ r1 */
2159 STORE( &r[2], 0, CHAN_X ); /* store r2 */
2160 }
2161 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2162 micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2163 STORE( &r[2], 0, CHAN_Y ); /* store r2 */
2164 }
2165 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2166 micro_exp2( &r[2], &r[0] ); /* r2 = 2 ^ r0 */
2167 STORE( &r[2], 0, CHAN_Z ); /* store r2 */
2168 }
2169 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2170 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2171 }
2172 break;
2173
2174 case TGSI_OPCODE_LOG:
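      /* Partial logarithm, as computed by the code below:
       *   dst.x = floor(lg2(|src.x|))
       *   dst.y = |src.x| / 2^floor(lg2(|src.x|))
       *   dst.z = lg2(|src.x|)
       *   dst.w = 1
       */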
2175 FETCH( &r[0], 0, CHAN_X );
2176 micro_abs( &r[2], &r[0] ); /* r2 = abs(r0) */
2177 micro_lg2( &r[1], &r[2] ); /* r1 = lg2(r2) */
2178 micro_flr( &r[0], &r[1] ); /* r0 = floor(r1) */
2179 if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2180 STORE( &r[0], 0, CHAN_X );
2181 }
2182 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2183 micro_exp2( &r[0], &r[0] ); /* r0 = 2 ^ r0 */
2184 micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2185 STORE( &r[0], 0, CHAN_Y );
2186 }
2187 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2188 STORE( &r[1], 0, CHAN_Z );
2189 }
2190 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2191 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2192 }
2193 break;
2194
2195 case TGSI_OPCODE_MUL:
2196 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2197 FETCH(&r[0], 0, chan_index);
2198 FETCH(&r[1], 1, chan_index);
2199 micro_mul(&d[chan_index], &r[0], &r[1]);
2200 }
2201 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2202 STORE(&d[chan_index], 0, chan_index);
2203 }
2204 break;
2205
2206 case TGSI_OPCODE_ADD:
2207 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2208 FETCH( &r[0], 0, chan_index );
2209 FETCH( &r[1], 1, chan_index );
2210 micro_add(&d[chan_index], &r[0], &r[1]);
2211 }
2212 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2213 STORE(&d[chan_index], 0, chan_index);
2214 }
2215 break;
2216
2217 case TGSI_OPCODE_DP3:
2218 /* TGSI_OPCODE_DOT3 */
2219 FETCH( &r[0], 0, CHAN_X );
2220 FETCH( &r[1], 1, CHAN_X );
2221 micro_mul( &r[0], &r[0], &r[1] );
2222
2223 FETCH( &r[1], 0, CHAN_Y );
2224 FETCH( &r[2], 1, CHAN_Y );
2225 micro_mul( &r[1], &r[1], &r[2] );
2226 micro_add( &r[0], &r[0], &r[1] );
2227
2228 FETCH( &r[1], 0, CHAN_Z );
2229 FETCH( &r[2], 1, CHAN_Z );
2230 micro_mul( &r[1], &r[1], &r[2] );
2231 micro_add( &r[0], &r[0], &r[1] );
2232
2233 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2234 STORE( &r[0], 0, chan_index );
2235 }
2236 break;
2237
2238 case TGSI_OPCODE_DP4:
2239 /* TGSI_OPCODE_DOT4 */
2240 FETCH(&r[0], 0, CHAN_X);
2241 FETCH(&r[1], 1, CHAN_X);
2242
2243 micro_mul( &r[0], &r[0], &r[1] );
2244
2245 FETCH(&r[1], 0, CHAN_Y);
2246 FETCH(&r[2], 1, CHAN_Y);
2247
2248 micro_mul( &r[1], &r[1], &r[2] );
2249 micro_add( &r[0], &r[0], &r[1] );
2250
2251 FETCH(&r[1], 0, CHAN_Z);
2252 FETCH(&r[2], 1, CHAN_Z);
2253
2254 micro_mul( &r[1], &r[1], &r[2] );
2255 micro_add( &r[0], &r[0], &r[1] );
2256
2257 FETCH(&r[1], 0, CHAN_W);
2258 FETCH(&r[2], 1, CHAN_W);
2259
2260 micro_mul( &r[1], &r[1], &r[2] );
2261 micro_add( &r[0], &r[0], &r[1] );
2262
2263 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2264 STORE( &r[0], 0, chan_index );
2265 }
2266 break;
2267
2268 case TGSI_OPCODE_DST:
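      /* Distance vector, as computed by the code below:
       *   dst.x = 1
       *   dst.y = src0.y * src1.y
       *   dst.z = src0.z
       *   dst.w = src1.w
       */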
2269 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2270 FETCH( &r[0], 0, CHAN_Y );
2271 FETCH( &r[1], 1, CHAN_Y);
2272 micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2273 }
2274 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2275 FETCH(&d[CHAN_Z], 0, CHAN_Z);
2276 }
2277 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2278 FETCH(&d[CHAN_W], 1, CHAN_W);
2279 }
2280
2281 if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2282 STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2283 }
2284 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2285 STORE(&d[CHAN_Y], 0, CHAN_Y);
2286 }
2287 if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2288 STORE(&d[CHAN_Z], 0, CHAN_Z);
2289 }
2290 if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2291 STORE(&d[CHAN_W], 0, CHAN_W);
2292 }
2293 break;
2294
2295 case TGSI_OPCODE_MIN:
2296 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2297 FETCH(&r[0], 0, chan_index);
2298 FETCH(&r[1], 1, chan_index);
2299
2300 /* XXX use micro_min()?? */
2301 micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2302 }
2303 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2304 STORE(&d[chan_index], 0, chan_index);
2305 }
2306 break;
2307
2308 case TGSI_OPCODE_MAX:
2309 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2310 FETCH(&r[0], 0, chan_index);
2311 FETCH(&r[1], 1, chan_index);
2312
2313 /* XXX use micro_max()?? */
2314 micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2315 }
2316 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2317 STORE(&d[chan_index], 0, chan_index);
2318 }
2319 break;
2320
2321 case TGSI_OPCODE_SLT:
2322 /* TGSI_OPCODE_SETLT */
2323 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2324 FETCH( &r[0], 0, chan_index );
2325 FETCH( &r[1], 1, chan_index );
2326 micro_lt(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2327 }
2328 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2329 STORE(&d[chan_index], 0, chan_index);
2330 }
2331 break;
2332
2333 case TGSI_OPCODE_SGE:
2334 /* TGSI_OPCODE_SETGE */
2335 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2336 FETCH( &r[0], 0, chan_index );
2337 FETCH( &r[1], 1, chan_index );
2338 micro_le(&d[chan_index], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2339 }
2340 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2341 STORE(&d[chan_index], 0, chan_index);
2342 }
2343 break;
2344
2345 case TGSI_OPCODE_MAD:
2346 /* TGSI_OPCODE_MADD */
2347 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2348 FETCH( &r[0], 0, chan_index );
2349 FETCH( &r[1], 1, chan_index );
2350 micro_mul( &r[0], &r[0], &r[1] );
2351 FETCH( &r[1], 2, chan_index );
2352 micro_add(&d[chan_index], &r[0], &r[1]);
2353 }
2354 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2355 STORE(&d[chan_index], 0, chan_index);
2356 }
2357 break;
2358
2359 case TGSI_OPCODE_SUB:
2360 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2361 FETCH(&r[0], 0, chan_index);
2362 FETCH(&r[1], 1, chan_index);
2363 micro_sub(&d[chan_index], &r[0], &r[1]);
2364 }
2365 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2366 STORE(&d[chan_index], 0, chan_index);
2367 }
2368 break;
2369
2370 case TGSI_OPCODE_LRP:
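      /* Linear interpolation.  The usual form
       *   dst = src0 * src1 + (1 - src0) * src2
       * is evaluated below as the equivalent
       *   dst = src0 * (src1 - src2) + src2
       * which saves one multiply per channel.
       */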
2371 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2372 FETCH(&r[0], 0, chan_index);
2373 FETCH(&r[1], 1, chan_index);
2374 FETCH(&r[2], 2, chan_index);
2375 micro_sub( &r[1], &r[1], &r[2] );
2376 micro_mul( &r[0], &r[0], &r[1] );
2377 micro_add(&d[chan_index], &r[0], &r[2]);
2378 }
2379 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2380 STORE(&d[chan_index], 0, chan_index);
2381 }
2382 break;
2383
2384 case TGSI_OPCODE_CND:
2385 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2386 FETCH(&r[0], 0, chan_index);
2387 FETCH(&r[1], 1, chan_index);
2388 FETCH(&r[2], 2, chan_index);
2389 micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2390 }
2391 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2392 STORE(&d[chan_index], 0, chan_index);
2393 }
2394 break;
2395
2396 case TGSI_OPCODE_DP2A:
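      /* 2-component dot product plus add, replicated to all enabled
       * channels:
       *   dst = src0.x * src1.x + src0.y * src1.y + src2.x
       */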
2397 FETCH( &r[0], 0, CHAN_X );
2398 FETCH( &r[1], 1, CHAN_X );
2399 micro_mul( &r[0], &r[0], &r[1] );
2400
2401 FETCH( &r[1], 0, CHAN_Y );
2402 FETCH( &r[2], 1, CHAN_Y );
2403 micro_mul( &r[1], &r[1], &r[2] );
2404 micro_add( &r[0], &r[0], &r[1] );
2405
2406 FETCH( &r[2], 2, CHAN_X );
2407 micro_add( &r[0], &r[0], &r[2] );
2408
2409 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2410 STORE( &r[0], 0, chan_index );
2411 }
2412 break;
2413
2414 case TGSI_OPCODE_FRC:
2415 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2416 FETCH( &r[0], 0, chan_index );
2417 micro_frc(&d[chan_index], &r[0]);
2418 }
2419 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2420 STORE(&d[chan_index], 0, chan_index);
2421 }
2422 break;
2423
2424 case TGSI_OPCODE_CLAMP:
2425 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2426 FETCH(&r[0], 0, chan_index);
2427 FETCH(&r[1], 1, chan_index);
2428 micro_max(&r[0], &r[0], &r[1]);
2429 FETCH(&r[1], 2, chan_index);
2430 micro_min(&d[chan_index], &r[0], &r[1]);
2431 }
2432 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2433 STORE(&d[chan_index], 0, chan_index);
2434 }
2435 break;
2436
2437 case TGSI_OPCODE_ROUND:
2438 case TGSI_OPCODE_ARR:
2439 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2440 FETCH( &r[0], 0, chan_index );
2441 micro_rnd(&d[chan_index], &r[0]);
2442 }
2443 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2444 STORE(&d[chan_index], 0, chan_index);
2445 }
2446 break;
2447
2448 case TGSI_OPCODE_EX2:
2449 FETCH(&r[0], 0, CHAN_X);
2450
2451 micro_exp2( &r[0], &r[0] );
2452
2453 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2454 STORE( &r[0], 0, chan_index );
2455 }
2456 break;
2457
2458 case TGSI_OPCODE_LG2:
2459 FETCH( &r[0], 0, CHAN_X );
2460 micro_lg2( &r[0], &r[0] );
2461 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2462 STORE( &r[0], 0, chan_index );
2463 }
2464 break;
2465
2466 case TGSI_OPCODE_POW:
2467 FETCH(&r[0], 0, CHAN_X);
2468 FETCH(&r[1], 1, CHAN_X);
2469
2470 micro_pow( &r[0], &r[0], &r[1] );
2471
2472 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2473 STORE( &r[0], 0, chan_index );
2474 }
2475 break;
2476
2477 case TGSI_OPCODE_XPD:
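      /* Cross product, as computed by the code below:
       *   dst.x = src0.y * src1.z - src0.z * src1.y
       *   dst.y = src0.z * src1.x - src0.x * src1.z
       *   dst.z = src0.x * src1.y - src0.y * src1.x
       *   dst.w = 1
       */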
2478 FETCH(&r[0], 0, CHAN_Y);
2479 FETCH(&r[1], 1, CHAN_Z);
2480
2481 micro_mul( &r[2], &r[0], &r[1] );
2482
2483 FETCH(&r[3], 0, CHAN_Z);
2484 FETCH(&r[4], 1, CHAN_Y);
2485
2486 micro_mul( &r[5], &r[3], &r[4] );
2487 micro_sub(&d[CHAN_X], &r[2], &r[5]);
2488
2489 FETCH(&r[2], 1, CHAN_X);
2490
2491 micro_mul( &r[3], &r[3], &r[2] );
2492
2493 FETCH(&r[5], 0, CHAN_X);
2494
2495 micro_mul( &r[1], &r[1], &r[5] );
2496 micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2497
2498 micro_mul( &r[5], &r[5], &r[4] );
2499 micro_mul( &r[0], &r[0], &r[2] );
2500 micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2501
2502 if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2503 STORE(&d[CHAN_X], 0, CHAN_X);
2504 }
2505 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2506 STORE(&d[CHAN_Y], 0, CHAN_Y);
2507 }
2508 if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2509 STORE(&d[CHAN_Z], 0, CHAN_Z);
2510 }
2511 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2512 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2513 }
2514 break;
2515
2516 case TGSI_OPCODE_ABS:
2517 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2518 FETCH(&r[0], 0, chan_index);
2519 micro_abs(&d[chan_index], &r[0]);
2520 }
2521 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2522 STORE(&d[chan_index], 0, chan_index);
2523 }
2524 break;
2525
2526 case TGSI_OPCODE_RCC:
2527 FETCH(&r[0], 0, CHAN_X);
2528 micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2529 micro_float_clamp(&r[0], &r[0]);
2530 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2531 STORE(&r[0], 0, chan_index);
2532 }
2533 break;
2534
2535 case TGSI_OPCODE_DPH:
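      /* Homogeneous dot product: a 3-component dot product with src1.w
       * added in, i.e.
       *   dst = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
       * replicated to all enabled channels.
       */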
2536 FETCH(&r[0], 0, CHAN_X);
2537 FETCH(&r[1], 1, CHAN_X);
2538
2539 micro_mul( &r[0], &r[0], &r[1] );
2540
2541 FETCH(&r[1], 0, CHAN_Y);
2542 FETCH(&r[2], 1, CHAN_Y);
2543
2544 micro_mul( &r[1], &r[1], &r[2] );
2545 micro_add( &r[0], &r[0], &r[1] );
2546
2547 FETCH(&r[1], 0, CHAN_Z);
2548 FETCH(&r[2], 1, CHAN_Z);
2549
2550 micro_mul( &r[1], &r[1], &r[2] );
2551 micro_add( &r[0], &r[0], &r[1] );
2552
2553 FETCH(&r[1], 1, CHAN_W);
2554
2555 micro_add( &r[0], &r[0], &r[1] );
2556
2557 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2558 STORE( &r[0], 0, chan_index );
2559 }
2560 break;
2561
2562 case TGSI_OPCODE_COS:
2563 FETCH(&r[0], 0, CHAN_X);
2564
2565 micro_cos( &r[0], &r[0] );
2566
2567 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2568 STORE( &r[0], 0, chan_index );
2569 }
2570 break;
2571
2572 case TGSI_OPCODE_DDX:
2573 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2574 FETCH( &r[0], 0, chan_index );
2575 micro_ddx(&d[chan_index], &r[0]);
2576 }
2577 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2578 STORE(&d[chan_index], 0, chan_index);
2579 }
2580 break;
2581
2582 case TGSI_OPCODE_DDY:
2583 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2584 FETCH( &r[0], 0, chan_index );
2585 micro_ddy(&d[chan_index], &r[0]);
2586 }
2587 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2588 STORE(&d[chan_index], 0, chan_index);
2589 }
2590 break;
2591
2592 case TGSI_OPCODE_KILP:
2593 exec_kilp (mach, inst);
2594 break;
2595
2596 case TGSI_OPCODE_KIL:
2597 exec_kil (mach, inst);
2598 break;
2599
2600 case TGSI_OPCODE_PK2H:
2601 assert (0);
2602 break;
2603
2604 case TGSI_OPCODE_PK2US:
2605 assert (0);
2606 break;
2607
2608 case TGSI_OPCODE_PK4B:
2609 assert (0);
2610 break;
2611
2612 case TGSI_OPCODE_PK4UB:
2613 assert (0);
2614 break;
2615
2616 case TGSI_OPCODE_RFL:
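      /* Reflection vector.  Reading the code below, with src0 acting as
       * the axis and src1 as the direction to reflect:
       *   dst.xyz = (2 * dot(src0.xyz, src1.xyz) / dot(src0.xyz, src0.xyz)) * src0.xyz - src1.xyz
       *   dst.w = 1
       */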
2617 if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2618 IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2619 IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2620 /* r0 = dp3(src0, src0) */
2621 FETCH(&r[2], 0, CHAN_X);
2622 micro_mul(&r[0], &r[2], &r[2]);
2623 FETCH(&r[4], 0, CHAN_Y);
2624 micro_mul(&r[8], &r[4], &r[4]);
2625 micro_add(&r[0], &r[0], &r[8]);
2626 FETCH(&r[6], 0, CHAN_Z);
2627 micro_mul(&r[8], &r[6], &r[6]);
2628 micro_add(&r[0], &r[0], &r[8]);
2629
2630 /* r1 = dp3(src0, src1) */
2631 FETCH(&r[3], 1, CHAN_X);
2632 micro_mul(&r[1], &r[2], &r[3]);
2633 FETCH(&r[5], 1, CHAN_Y);
2634 micro_mul(&r[8], &r[4], &r[5]);
2635 micro_add(&r[1], &r[1], &r[8]);
2636 FETCH(&r[7], 1, CHAN_Z);
2637 micro_mul(&r[8], &r[6], &r[7]);
2638 micro_add(&r[1], &r[1], &r[8]);
2639
2640 /* r1 = 2 * r1 / r0 */
2641 micro_add(&r[1], &r[1], &r[1]);
2642 micro_div(&r[1], &r[1], &r[0]);
2643
2644 if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2645 micro_mul(&r[2], &r[2], &r[1]);
2646 micro_sub(&r[2], &r[2], &r[3]);
2647 STORE(&r[2], 0, CHAN_X);
2648 }
2649 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2650 micro_mul(&r[4], &r[4], &r[1]);
2651 micro_sub(&r[4], &r[4], &r[5]);
2652 STORE(&r[4], 0, CHAN_Y);
2653 }
2654 if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2655 micro_mul(&r[6], &r[6], &r[1]);
2656 micro_sub(&r[6], &r[6], &r[7]);
2657 STORE(&r[6], 0, CHAN_Z);
2658 }
2659 }
2660 if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2661 STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2662 }
2663 break;
2664
2665 case TGSI_OPCODE_SEQ:
2666 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2667 FETCH( &r[0], 0, chan_index );
2668 FETCH( &r[1], 1, chan_index );
2669 micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2670 }
2671 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2672 STORE(&d[chan_index], 0, chan_index);
2673 }
2674 break;
2675
2676 case TGSI_OPCODE_SFL:
2677 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2678 STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2679 }
2680 break;
2681
2682 case TGSI_OPCODE_SGT:
2683 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2684 FETCH( &r[0], 0, chan_index );
2685 FETCH( &r[1], 1, chan_index );
2686 micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
2687 }
2688 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2689 STORE(&d[chan_index], 0, chan_index);
2690 }
2691 break;
2692
2693 case TGSI_OPCODE_SIN:
2694 FETCH( &r[0], 0, CHAN_X );
2695 micro_sin( &r[0], &r[0] );
2696 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2697 STORE( &r[0], 0, chan_index );
2698 }
2699 break;
2700
2701 case TGSI_OPCODE_SLE:
2702 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2703 FETCH( &r[0], 0, chan_index );
2704 FETCH( &r[1], 1, chan_index );
2705 micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2706 }
2707 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2708 STORE(&d[chan_index], 0, chan_index);
2709 }
2710 break;
2711
2712 case TGSI_OPCODE_SNE:
2713 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2714 FETCH( &r[0], 0, chan_index );
2715 FETCH( &r[1], 1, chan_index );
2716 micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
2717 }
2718 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2719 STORE(&d[chan_index], 0, chan_index);
2720 }
2721 break;
2722
2723 case TGSI_OPCODE_STR:
2724 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2725 STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2726 }
2727 break;
2728
2729 case TGSI_OPCODE_TEX:
2730 /* simple texture lookup */
2731 /* src[0] = texcoord */
2732 /* src[1] = sampler unit */
2733 exec_tex(mach, inst, FALSE, FALSE);
2734 break;
2735
2736 case TGSI_OPCODE_TXB:
2737 /* Texture lookup with lod bias */
2738 /* src[0] = texcoord (src[0].w = LOD bias) */
2739 /* src[1] = sampler unit */
2740 exec_tex(mach, inst, TRUE, FALSE);
2741 break;
2742
2743 case TGSI_OPCODE_TXD:
2744       /* Texture lookup with explicit partial derivatives */
2745 /* src[0] = texcoord */
2746 /* src[1] = d[strq]/dx */
2747 /* src[2] = d[strq]/dy */
2748 /* src[3] = sampler unit */
2749 exec_txd(mach, inst);
2750 break;
2751
2752 case TGSI_OPCODE_TXL:
2753       /* Texture lookup with explicit LOD */
2754 /* src[0] = texcoord (src[0].w = LOD) */
2755 /* src[1] = sampler unit */
2756 exec_tex(mach, inst, TRUE, FALSE);
2757 break;
2758
2759 case TGSI_OPCODE_TXP:
2760 /* Texture lookup with projection */
2761 /* src[0] = texcoord (src[0].w = projection) */
2762 /* src[1] = sampler unit */
2763 exec_tex(mach, inst, FALSE, TRUE);
2764 break;
2765
2766 case TGSI_OPCODE_UP2H:
2767 assert (0);
2768 break;
2769
2770 case TGSI_OPCODE_UP2US:
2771 assert (0);
2772 break;
2773
2774 case TGSI_OPCODE_UP4B:
2775 assert (0);
2776 break;
2777
2778 case TGSI_OPCODE_UP4UB:
2779 assert (0);
2780 break;
2781
2782 case TGSI_OPCODE_X2D:
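      /* 2D coordinate transformation, as computed by the code below:
       *   dst.x = dst.z = src0.x + src1.x * src2.x + src1.y * src2.y
       *   dst.y = dst.w = src0.y + src1.x * src2.z + src1.y * src2.w
       */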
2783 FETCH(&r[0], 1, CHAN_X);
2784 FETCH(&r[1], 1, CHAN_Y);
2785 if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2786 IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2787 FETCH(&r[2], 2, CHAN_X);
2788 micro_mul(&r[2], &r[2], &r[0]);
2789 FETCH(&r[3], 2, CHAN_Y);
2790 micro_mul(&r[3], &r[3], &r[1]);
2791 micro_add(&r[2], &r[2], &r[3]);
2792 FETCH(&r[3], 0, CHAN_X);
2793 micro_add(&d[CHAN_X], &r[2], &r[3]);
2794
2795 }
2796 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2797 IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2798 FETCH(&r[2], 2, CHAN_Z);
2799 micro_mul(&r[2], &r[2], &r[0]);
2800 FETCH(&r[3], 2, CHAN_W);
2801 micro_mul(&r[3], &r[3], &r[1]);
2802 micro_add(&r[2], &r[2], &r[3]);
2803 FETCH(&r[3], 0, CHAN_Y);
2804 micro_add(&d[CHAN_Y], &r[2], &r[3]);
2805
2806 }
2807 if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2808 STORE(&d[CHAN_X], 0, CHAN_X);
2809 }
2810 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2811 STORE(&d[CHAN_Y], 0, CHAN_Y);
2812 }
2813 if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2814 STORE(&d[CHAN_X], 0, CHAN_Z);
2815 }
2816 if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2817 STORE(&d[CHAN_Y], 0, CHAN_W);
2818 }
2819 break;
2820
2821 case TGSI_OPCODE_ARA:
2822 assert (0);
2823 break;
2824
2825 case TGSI_OPCODE_BRA:
2826 assert (0);
2827 break;
2828
2829 case TGSI_OPCODE_CAL:
2830 /* skip the call if no execution channels are enabled */
2831 if (mach->ExecMask) {
2832 /* do the call */
2833
2834 /* First, record the depths of the execution stacks.
2835 * This is important for deeply nested/looped return statements.
2836 * We have to unwind the stacks by the correct amount. For a
2837 * real code generator, we could determine the number of entries
2838 * to pop off each stack with simple static analysis and avoid
2839 * implementing this data structure at run time.
2840 */
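         /* For example, a RET taken inside an IF or LOOP nested in the
          * callee must unwind back to exactly these depths; the matching
          * code in TGSI_OPCODE_RET below restores CondStackTop,
          * LoopStackTop and ContStackTop from this record before jumping
          * to ReturnAddr.
          */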
2841 mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2842 mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2843 mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2844 /* note that PC was already incremented above */
2845 mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2846
2847 mach->CallStackTop++;
2848
2849 /* Second, push the Cond, Loop, Cont, Func stacks */
2850 assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2851 mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2852 assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2853 mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2854 assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2855 mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2856 assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2857 mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2858
2859 /* Finally, jump to the subroutine */
2860 *pc = inst->Label.Label;
2861 }
2862 break;
2863
2864 case TGSI_OPCODE_RET:
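      /* Return is handled per channel: each currently active channel
       * clears its FuncMask bit, and the actual jump back to the caller
       * is taken only once every channel of the quad has returned
       * (FuncMask == 0).
       */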
2865 mach->FuncMask &= ~mach->ExecMask;
2866 UPDATE_EXEC_MASK(mach);
2867
2868 if (mach->FuncMask == 0x0) {
2869          /* really return now (otherwise, keep executing) */
2870
2871 if (mach->CallStackTop == 0) {
2872 /* returning from main() */
2873 *pc = -1;
2874 return;
2875 }
2876
2877 assert(mach->CallStackTop > 0);
2878 mach->CallStackTop--;
2879
2880 mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2881 mach->CondMask = mach->CondStack[mach->CondStackTop];
2882
2883 mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2884 mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2885
2886 mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2887 mach->ContMask = mach->ContStack[mach->ContStackTop];
2888
2889 assert(mach->FuncStackTop > 0);
2890 mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2891
2892 *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
2893
2894 UPDATE_EXEC_MASK(mach);
2895 }
2896 break;
2897
2898 case TGSI_OPCODE_SSG:
2899 /* TGSI_OPCODE_SGN */
2900 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2901 FETCH( &r[0], 0, chan_index );
2902 micro_sgn(&d[chan_index], &r[0]);
2903 }
2904 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2905 STORE(&d[chan_index], 0, chan_index);
2906 }
2907 break;
2908
2909 case TGSI_OPCODE_CMP:
2910 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2911 FETCH(&r[0], 0, chan_index);
2912 FETCH(&r[1], 1, chan_index);
2913 FETCH(&r[2], 2, chan_index);
2914 micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
2915 }
2916 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2917 STORE(&d[chan_index], 0, chan_index);
2918 }
2919 break;
2920
2921 case TGSI_OPCODE_SCS:
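      /* Sine/cosine of src.x, as computed by the code below:
       *   dst = (cos(src.x), sin(src.x), 0, 1)
       */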
2922 if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2923 FETCH( &r[0], 0, CHAN_X );
2924 if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2925 micro_cos(&r[1], &r[0]);
2926 STORE(&r[1], 0, CHAN_X);
2927 }
2928 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2929 micro_sin(&r[1], &r[0]);
2930 STORE(&r[1], 0, CHAN_Y);
2931 }
2932 }
2933 if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2934 STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2935 }
2936 if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2937 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2938 }
2939 break;
2940
2941 case TGSI_OPCODE_NRM:
2942 /* 3-component vector normalize */
2943 if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2944 IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2945 IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2946 /* r3 = sqrt(dp3(src0, src0)) */
2947 FETCH(&r[0], 0, CHAN_X);
2948 micro_mul(&r[3], &r[0], &r[0]);
2949 FETCH(&r[1], 0, CHAN_Y);
2950 micro_mul(&r[4], &r[1], &r[1]);
2951 micro_add(&r[3], &r[3], &r[4]);
2952 FETCH(&r[2], 0, CHAN_Z);
2953 micro_mul(&r[4], &r[2], &r[2]);
2954 micro_add(&r[3], &r[3], &r[4]);
2955 micro_sqrt(&r[3], &r[3]);
2956
2957 if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2958 micro_div(&r[0], &r[0], &r[3]);
2959 STORE(&r[0], 0, CHAN_X);
2960 }
2961 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2962 micro_div(&r[1], &r[1], &r[3]);
2963 STORE(&r[1], 0, CHAN_Y);
2964 }
2965 if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2966 micro_div(&r[2], &r[2], &r[3]);
2967 STORE(&r[2], 0, CHAN_Z);
2968 }
2969 }
2970 if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2971 STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2972 }
2973 break;
2974
2975 case TGSI_OPCODE_NRM4:
2976 /* 4-component vector normalize */
2977 {
2978 union tgsi_exec_channel tmp, dot;
2979
2980 /* tmp = dp4(src0, src0): */
2981 FETCH( &r[0], 0, CHAN_X );
2982 micro_mul( &tmp, &r[0], &r[0] );
2983
2984 FETCH( &r[1], 0, CHAN_Y );
2985 micro_mul( &dot, &r[1], &r[1] );
2986 micro_add( &tmp, &tmp, &dot );
2987
2988 FETCH( &r[2], 0, CHAN_Z );
2989 micro_mul( &dot, &r[2], &r[2] );
2990 micro_add( &tmp, &tmp, &dot );
2991
2992 FETCH( &r[3], 0, CHAN_W );
2993 micro_mul( &dot, &r[3], &r[3] );
2994 micro_add( &tmp, &tmp, &dot );
2995
2996 /* tmp = 1 / sqrt(tmp) */
2997 micro_sqrt( &tmp, &tmp );
2998 micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2999
3000 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3001 /* chan = chan * tmp */
3002 micro_mul( &r[chan_index], &tmp, &r[chan_index] );
3003 STORE( &r[chan_index], 0, chan_index );
3004 }
3005 }
3006 break;
3007
3008 case TGSI_OPCODE_DIV:
3009 assert( 0 );
3010 break;
3011
3012 case TGSI_OPCODE_DP2:
3013 FETCH( &r[0], 0, CHAN_X );
3014 FETCH( &r[1], 1, CHAN_X );
3015 micro_mul( &r[0], &r[0], &r[1] );
3016
3017 FETCH( &r[1], 0, CHAN_Y );
3018 FETCH( &r[2], 1, CHAN_Y );
3019 micro_mul( &r[1], &r[1], &r[2] );
3020 micro_add( &r[0], &r[0], &r[1] );
3021
3022 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3023 STORE( &r[0], 0, chan_index );
3024 }
3025 break;
3026
3027 case TGSI_OPCODE_IF:
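      /* Per-channel conditional: bit i of CondMask corresponds to quad
       * component i (r[0].u[i]) and is cleared when that component's
       * condition value is zero.
       */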
3028 /* push CondMask */
3029 assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3030 mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3031 FETCH( &r[0], 0, CHAN_X );
3032 /* update CondMask */
3033 if( ! r[0].u[0] ) {
3034 mach->CondMask &= ~0x1;
3035 }
3036 if( ! r[0].u[1] ) {
3037 mach->CondMask &= ~0x2;
3038 }
3039 if( ! r[0].u[2] ) {
3040 mach->CondMask &= ~0x4;
3041 }
3042 if( ! r[0].u[3] ) {
3043 mach->CondMask &= ~0x8;
3044 }
3045 UPDATE_EXEC_MASK(mach);
3046 /* Todo: If CondMask==0, jump to ELSE */
3047 break;
3048
3049 case TGSI_OPCODE_ELSE:
3050 /* invert CondMask wrt previous mask */
3051 {
3052 uint prevMask;
3053 assert(mach->CondStackTop > 0);
3054 prevMask = mach->CondStack[mach->CondStackTop - 1];
3055 mach->CondMask = ~mach->CondMask & prevMask;
3056 UPDATE_EXEC_MASK(mach);
3057 /* Todo: If CondMask==0, jump to ENDIF */
3058 }
3059 break;
3060
3061 case TGSI_OPCODE_ENDIF:
3062 /* pop CondMask */
3063 assert(mach->CondStackTop > 0);
3064 mach->CondMask = mach->CondStack[--mach->CondStackTop];
3065 UPDATE_EXEC_MASK(mach);
3066 break;
3067
3068 case TGSI_OPCODE_END:
3069 /* halt execution */
3070 *pc = -1;
3071 break;
3072
3073 case TGSI_OPCODE_REP:
3074 assert (0);
3075 break;
3076
3077 case TGSI_OPCODE_ENDREP:
3078 assert (0);
3079 break;
3080
3081 case TGSI_OPCODE_PUSHA:
3082 assert (0);
3083 break;
3084
3085 case TGSI_OPCODE_POPA:
3086 assert (0);
3087 break;
3088
3089 case TGSI_OPCODE_CEIL:
3090 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3091 FETCH( &r[0], 0, chan_index );
3092 micro_ceil(&d[chan_index], &r[0]);
3093 }
3094 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3095 STORE(&d[chan_index], 0, chan_index);
3096 }
3097 break;
3098
3099 case TGSI_OPCODE_I2F:
3100 exec_vector_unary(mach, inst, micro_i2f);
3101 break;
3102
3103 case TGSI_OPCODE_NOT:
3104 exec_vector_unary(mach, inst, micro_not);
3105 break;
3106
3107 case TGSI_OPCODE_TRUNC:
3108 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3109 FETCH( &r[0], 0, chan_index );
3110 micro_trunc(&d[chan_index], &r[0]);
3111 }
3112 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3113 STORE(&d[chan_index], 0, chan_index);
3114 }
3115 break;
3116
3117 case TGSI_OPCODE_SHL:
3118 exec_vector_binary(mach, inst, micro_shl);
3119 break;
3120
3121 case TGSI_OPCODE_AND:
3122 exec_vector_binary(mach, inst, micro_and);
3123 break;
3124
3125 case TGSI_OPCODE_OR:
3126 exec_vector_binary(mach, inst, micro_or);
3127 break;
3128
3129 case TGSI_OPCODE_MOD:
3130 assert (0);
3131 break;
3132
3133 case TGSI_OPCODE_XOR:
3134 exec_vector_binary(mach, inst, micro_xor);
3135 break;
3136
3137 case TGSI_OPCODE_SAD:
3138 assert (0);
3139 break;
3140
3141 case TGSI_OPCODE_TXF:
3142 assert (0);
3143 break;
3144
3145 case TGSI_OPCODE_TXQ:
3146 assert (0);
3147 break;
3148
3149 case TGSI_OPCODE_EMIT:
3150 emit_vertex(mach);
3151 break;
3152
3153 case TGSI_OPCODE_ENDPRIM:
3154 emit_primitive(mach);
3155 break;
3156
3157 case TGSI_OPCODE_BGNFOR:
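      /* Loop-counter layout, as this code appears to use it (not a spec
       * quote): the entry pushed on LoopCounterStack holds
       *   .x = current loop value, written to the dst register each
       *        iteration,
       *   .y = remaining iteration count (loop continues while .y > 0),
       *   .z = step added to .x by ENDFOR.
       */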
3158 assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3159 for (chan_index = 0; chan_index < 3; chan_index++) {
3160 FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3161 }
3162 ++mach->LoopCounterStackTop;
3163 STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3164 /* update LoopMask */
3165 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3166 mach->LoopMask &= ~0x1;
3167 }
3168 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3169 mach->LoopMask &= ~0x2;
3170 }
3171 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3172 mach->LoopMask &= ~0x4;
3173 }
3174 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3175 mach->LoopMask &= ~0x8;
3176 }
3177 /* TODO: if mach->LoopMask == 0, jump to end of loop */
3178 UPDATE_EXEC_MASK(mach);
3179 /* fall-through (for now) */
3180 case TGSI_OPCODE_BGNLOOP:
3181 /* push LoopMask and ContMasks */
3182 assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3183 mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3184 assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3185 mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3186 assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3187 mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3188 break;
3189
3190 case TGSI_OPCODE_ENDFOR:
3191 assert(mach->LoopCounterStackTop > 0);
3192 micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3193 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3194 &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3195 /* update LoopMask */
3196 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3197 mach->LoopMask &= ~0x1;
3198 }
3199 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3200 mach->LoopMask &= ~0x2;
3201 }
3202 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3203 mach->LoopMask &= ~0x4;
3204 }
3205 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3206 mach->LoopMask &= ~0x8;
3207 }
3208 micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3209 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3210 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3211 assert(mach->LoopLabelStackTop > 0);
3212 inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3213       STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3214 /* Restore ContMask, but don't pop */
3215 assert(mach->ContStackTop > 0);
3216 mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3217 UPDATE_EXEC_MASK(mach);
3218 if (mach->ExecMask) {
3219 /* repeat loop: jump to instruction just past BGNLOOP */
3220 assert(mach->LoopLabelStackTop > 0);
3221 *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3222 }
3223 else {
3224 /* exit loop: pop LoopMask */
3225 assert(mach->LoopStackTop > 0);
3226 mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3227 /* pop ContMask */
3228 assert(mach->ContStackTop > 0);
3229 mach->ContMask = mach->ContStack[--mach->ContStackTop];
3230 assert(mach->LoopLabelStackTop > 0);
3231 --mach->LoopLabelStackTop;
3232 assert(mach->LoopCounterStackTop > 0);
3233 --mach->LoopCounterStackTop;
3234 }
3235 UPDATE_EXEC_MASK(mach);
3236 break;
3237
3238 case TGSI_OPCODE_ENDLOOP:
3239 /* Restore ContMask, but don't pop */
3240 assert(mach->ContStackTop > 0);
3241 mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3242 UPDATE_EXEC_MASK(mach);
3243 if (mach->ExecMask) {
3244 /* repeat loop: jump to instruction just past BGNLOOP */
3245 assert(mach->LoopLabelStackTop > 0);
3246 *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3247 }
3248 else {
3249 /* exit loop: pop LoopMask */
3250 assert(mach->LoopStackTop > 0);
3251 mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3252 /* pop ContMask */
3253 assert(mach->ContStackTop > 0);
3254 mach->ContMask = mach->ContStack[--mach->ContStackTop];
3255 assert(mach->LoopLabelStackTop > 0);
3256 --mach->LoopLabelStackTop;
3257 }
3258 UPDATE_EXEC_MASK(mach);
3259 break;
3260
3261 case TGSI_OPCODE_BRK:
3262 /* turn off loop channels for each enabled exec channel */
3263 mach->LoopMask &= ~mach->ExecMask;
3264 /* Todo: if mach->LoopMask == 0, jump to end of loop */
3265 UPDATE_EXEC_MASK(mach);
3266 break;
3267
3268 case TGSI_OPCODE_CONT:
3269 /* turn off cont channels for each enabled exec channel */
3270 mach->ContMask &= ~mach->ExecMask;
3271       /* Todo: if mach->ContMask == 0, jump to end of loop */
3272 UPDATE_EXEC_MASK(mach);
3273 break;
3274
3275 case TGSI_OPCODE_BGNSUB:
3276 /* no-op */
3277 break;
3278
3279 case TGSI_OPCODE_ENDSUB:
3280 /*
3281 * XXX: This really should be a no-op. We should never reach this opcode.
3282 */
3283
3284 assert(mach->CallStackTop > 0);
3285 mach->CallStackTop--;
3286
3287 mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3288 mach->CondMask = mach->CondStack[mach->CondStackTop];
3289
3290 mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3291 mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3292
3293 mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3294 mach->ContMask = mach->ContStack[mach->ContStackTop];
3295
3296 assert(mach->FuncStackTop > 0);
3297 mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3298
3299 *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3300
3301 UPDATE_EXEC_MASK(mach);
3302 break;
3303
3304 case TGSI_OPCODE_NOP:
3305 break;
3306
3307 case TGSI_OPCODE_BREAKC:
3308 FETCH(&r[0], 0, CHAN_X);
3309       /* update LoopMask */
3310 if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3311 mach->LoopMask &= ~0x1;
3312 }
3313 if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3314 mach->LoopMask &= ~0x2;
3315 }
3316 if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3317 mach->LoopMask &= ~0x4;
3318 }
3319 if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3320 mach->LoopMask &= ~0x8;
3321 }
3322 /* Todo: if mach->LoopMask == 0, jump to end of loop */
3323 UPDATE_EXEC_MASK(mach);
3324 break;
3325
3326 case TGSI_OPCODE_F2I:
3327 exec_vector_unary(mach, inst, micro_f2i);
3328 break;
3329
3330 case TGSI_OPCODE_IDIV:
3331 exec_vector_binary(mach, inst, micro_idiv);
3332 break;
3333
3334 case TGSI_OPCODE_IMAX:
3335 exec_vector_binary(mach, inst, micro_imax);
3336 break;
3337
3338 case TGSI_OPCODE_IMIN:
3339 exec_vector_binary(mach, inst, micro_imin);
3340 break;
3341
3342 case TGSI_OPCODE_INEG:
3343 exec_vector_unary(mach, inst, micro_ineg);
3344 break;
3345
3346 case TGSI_OPCODE_ISGE:
3347 exec_vector_binary(mach, inst, micro_isge);
3348 break;
3349
3350 case TGSI_OPCODE_ISHR:
3351 exec_vector_binary(mach, inst, micro_ishr);
3352 break;
3353
3354 case TGSI_OPCODE_ISLT:
3355 exec_vector_binary(mach, inst, micro_islt);
3356 break;
3357
3358 case TGSI_OPCODE_F2U:
3359 exec_vector_unary(mach, inst, micro_f2u);
3360 break;
3361
3362 case TGSI_OPCODE_U2F:
3363 exec_vector_unary(mach, inst, micro_u2f);
3364 break;
3365
3366 case TGSI_OPCODE_UADD:
3367 exec_vector_binary(mach, inst, micro_uadd);
3368 break;
3369
3370 case TGSI_OPCODE_UDIV:
3371 exec_vector_binary(mach, inst, micro_udiv);
3372 break;
3373
3374 case TGSI_OPCODE_UMAD:
3375 exec_vector_trinary(mach, inst, micro_umad);
3376 break;
3377
3378 case TGSI_OPCODE_UMAX:
3379 exec_vector_binary(mach, inst, micro_umax);
3380 break;
3381
3382 case TGSI_OPCODE_UMIN:
3383 exec_vector_binary(mach, inst, micro_umin);
3384 break;
3385
3386 case TGSI_OPCODE_UMUL:
3387 exec_vector_binary(mach, inst, micro_umul);
3388 break;
3389
3390 case TGSI_OPCODE_USEQ:
3391 exec_vector_binary(mach, inst, micro_useq);
3392 break;
3393
3394 case TGSI_OPCODE_USGE:
3395 exec_vector_binary(mach, inst, micro_usge);
3396 break;
3397
3398 case TGSI_OPCODE_USHR:
3399 exec_vector_binary(mach, inst, micro_ushr);
3400 break;
3401
3402 case TGSI_OPCODE_USLT:
3403 exec_vector_binary(mach, inst, micro_uslt);
3404 break;
3405
3406 case TGSI_OPCODE_USNE:
3407 exec_vector_binary(mach, inst, micro_usne);
3408 break;
3409
3410 default:
3411 assert( 0 );
3412 }
3413 }
3414
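/* Set to 1 to dump every executed instruction along with any temporary
 * or output registers it modified (see the DEBUG_EXECUTION blocks in
 * tgsi_exec_machine_run() below).
 */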
3415 #define DEBUG_EXECUTION 0
3416
3417
3418 /**
3419 * Run TGSI interpreter.
3420 * \return bitmask of "alive" quad components
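 *         (the complement of TEMP_KILMASK: a set bit means that quad
 *         component was not discarded by KIL/KILP)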
3421 */
3422 uint
3423 tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3424 {
3425 uint i;
3426 int pc = 0;
3427
3428 mach->CondMask = 0xf;
3429 mach->LoopMask = 0xf;
3430 mach->ContMask = 0xf;
3431 mach->FuncMask = 0xf;
3432 mach->ExecMask = 0xf;
3433
3434 assert(mach->CondStackTop == 0);
3435 assert(mach->LoopStackTop == 0);
3436 assert(mach->ContStackTop == 0);
3437 assert(mach->CallStackTop == 0);
3438
3439 mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3440 mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3441
3442 if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3443 mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3444 mach->Primitives[0] = 0;
3445 }
3446
3447 for (i = 0; i < QUAD_SIZE; i++) {
3448 mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3449 (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3450 (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3451 (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3452 (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3453 }
3454
3455 /* execute declarations (interpolants) */
3456 for (i = 0; i < mach->NumDeclarations; i++) {
3457 exec_declaration( mach, mach->Declarations+i );
3458 }
3459
3460 {
3461 #if DEBUG_EXECUTION
3462 struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3463 struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3464 uint inst = 1;
3465
3466 memcpy(temps, mach->Temps, sizeof(temps));
3467 memcpy(outputs, mach->Outputs, sizeof(outputs));
3468 #endif
3469
3470 /* execute instructions, until pc is set to -1 */
3471 while (pc != -1) {
3472
3473 #if DEBUG_EXECUTION
3474 uint i;
3475
3476 tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3477 #endif
3478
3479 assert(pc < (int) mach->NumInstructions);
3480 exec_instruction(mach, mach->Instructions + pc, &pc);
3481
3482 #if DEBUG_EXECUTION
3483 for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3484 if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3485 uint j;
3486
3487 memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3488 debug_printf("TEMP[%2u] = ", i);
3489 for (j = 0; j < 4; j++) {
3490 if (j > 0) {
3491 debug_printf(" ");
3492 }
3493 debug_printf("(%6f, %6f, %6f, %6f)\n",
3494 temps[i].xyzw[0].f[j],
3495 temps[i].xyzw[1].f[j],
3496 temps[i].xyzw[2].f[j],
3497 temps[i].xyzw[3].f[j]);
3498 }
3499 }
3500 }
3501 for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3502 if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3503 uint j;
3504
3505 memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3506 debug_printf("OUT[%2u] = ", i);
3507 for (j = 0; j < 4; j++) {
3508 if (j > 0) {
3509 debug_printf(" ");
3510 }
3511 debug_printf("{%6f, %6f, %6f, %6f}\n",
3512 outputs[i].xyzw[0].f[j],
3513 outputs[i].xyzw[1].f[j],
3514 outputs[i].xyzw[2].f[j],
3515 outputs[i].xyzw[3].f[j]);
3516 }
3517 }
3518 }
3519 #endif
3520 }
3521 }
3522
3523 #if 0
3524 /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3525 if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3526 /*
3527 * Scale back depth component.
3528 */
3529 for (i = 0; i < 4; i++)
3530 mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3531 }
3532 #endif
3533
3534 assert(mach->CondStackTop == 0);
3535 assert(mach->LoopStackTop == 0);
3536 assert(mach->ContStackTop == 0);
3537 assert(mach->CallStackTop == 0);
3538
3539 return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3540 }