src/gallium/auxiliary/tgsi/tgsi_exec.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * TGSI interpreter/executor.
  30  *
  31  * Flow control information:
  32  *
  33  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
  34  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
  35  * care since a condition may be true for some quad components but false
  36  * for other components.
  37  *
  38  * We basically execute all statements (even if they're in the part of
  39  * an IF/ELSE clause that's "not taken") and use a special mask to
  40  * control writing to destination registers.  This is the ExecMask.
  41  * See store_dest().
  42  *
 * The ExecMask is computed from four other masks (CondMask, LoopMask,
 * ContMask and FuncMask) which are controlled by the flow control
 * instructions (e.g. IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
 * See UPDATE_EXEC_MASK().
  46  *
  47  *
  48  * Authors:
  49  *   Michal Krol
  50  *   Brian Paul
  51  */
  52
  53 #include "pipe/p_compiler.h"
  54 #include "pipe/p_state.h"
  55 #include "pipe/p_shader_tokens.h"
  56 #include "tgsi/tgsi_parse.h"
  57 #include "tgsi/tgsi_util.h"
  58 #include "tgsi_exec.h"
  59 #include "util/u_memory.h"
  60 #include "util/u_math.h"
  61
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() call the
 * util_fast_*() routines instead of the libm functions.
 */
#define FAST_MATH 1

/*
 * Lane indices within a channel, named after the position of the pixel
 * in a 2x2 quad.  micro_ddx()/micro_ddy() use them to form differences.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3

/* Indices of the X, Y, Z and W components. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
  73
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 *
 * Each _I/_C pair selects mach->Temps[_I].xyzw[_C].
 * tgsi_exec_machine_init() fills the constant-valued ones
 * (0, 7F, 80, FF, 1, 2, 128, M128, 3 and HALF) for all four lanes.
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
 106
/**
 * Non-zero when bit CHAN is set in the write mask of INST's first
 * destination register (FullDstRegisters[0]).  INST is an instruction
 * object, not a pointer.
 */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/** Same test as IS_CHANNEL_ENABLED, for the second destination register. */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))

/**
 * Loop header: runs the statement that follows once for every CHAN in
 * [0, NUM_CHANNELS) that is enabled in the first destination's write mask.
 * CHAN must be an assignable variable.  The expansion ends in an
 * unbraced 'if', so an 'else' written right after the body would bind to it.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/** As FOR_EACH_ENABLED_CHANNEL, using the second destination's write mask. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
 120
 121
 122 /** The execution mask depends on the conditional mask and the loop mask */
 123 #define UPDATE_EXEC_MASK(MACH) \
 124       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
 125
 126 /**
 127  * Initialize machine state by expanding tokens to full instructions,
 128  * allocating temporary storage, setting up constants, etc.
 129  * After this, we can call tgsi_exec_machine_run() many times.
 130  */
 131 void
 132 tgsi_exec_machine_bind_shader(
 133    struct tgsi_exec_machine *mach,
 134    const struct tgsi_token *tokens,
 135    uint numSamplers,
 136    struct tgsi_sampler *samplers)
 137 {
 138    uint k;
 139    struct tgsi_parse_context parse;
 140    struct tgsi_exec_labels *labels = &mach->Labels;
 141    struct tgsi_full_instruction *instructions;
 142    struct tgsi_full_declaration *declarations;
 143    uint maxInstructions = 10, numInstructions = 0;
 144    uint maxDeclarations = 10, numDeclarations = 0;
 145    uint instno = 0;
 146
 147 #if 0
 148    tgsi_dump(tokens, 0);
 149 #endif
 150
 151    util_init_math();
 152
 153    mach->Tokens = tokens;
 154    mach->Samplers = samplers;
 155
 156    k = tgsi_parse_init (&parse, mach->Tokens);
 157    if (k != TGSI_PARSE_OK) {
 158       debug_printf( "Problem parsing!\n" );
 159       return;
 160    }
 161
 162    mach->Processor = parse.FullHeader.Processor.Processor;
 163    mach->ImmLimit = 0;
 164    labels->count = 0;
 165
 166    declarations = (struct tgsi_full_declaration *)
 167       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
 168
 169    if (!declarations) {
 170       return;
 171    }
 172
 173    instructions = (struct tgsi_full_instruction *)
 174       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
 175
 176    if (!instructions) {
 177       FREE( declarations );
 178       return;
 179    }
 180
 181    while( !tgsi_parse_end_of_tokens( &parse ) ) {
 182       uint pointer = parse.Position;
 183       uint i;
 184
 185       tgsi_parse_token( &parse );
 186       switch( parse.FullToken.Token.Type ) {
 187       case TGSI_TOKEN_TYPE_DECLARATION:
 188          /* save expanded declaration */
 189          if (numDeclarations == maxDeclarations) {
 190             declarations = REALLOC(declarations,
 191                                    maxDeclarations
 192                                    * sizeof(struct tgsi_full_declaration),
 193                                    (maxDeclarations + 10)
 194                                    * sizeof(struct tgsi_full_declaration));
 195             maxDeclarations += 10;
 196          }
 197          memcpy(declarations + numDeclarations,
 198                 &parse.FullToken.FullDeclaration,
 199                 sizeof(declarations[0]));
 200          numDeclarations++;
 201          break;
 202
 203       case TGSI_TOKEN_TYPE_IMMEDIATE:
 204          {
 205             uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
 206             assert( size % 4 == 0 );
 207             assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES );
 208
 209             for( i = 0; i < size; i++ ) {
 210                mach->Imms[mach->ImmLimit + i / 4][i % 4] =
 211                   parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
 212             }
 213             mach->ImmLimit += size / 4;
 214          }
 215          break;
 216
 217       case TGSI_TOKEN_TYPE_INSTRUCTION:
 218          assert( labels->count < MAX_LABELS );
 219
 220          labels->labels[labels->count][0] = instno;
 221          labels->labels[labels->count][1] = pointer;
 222          labels->count++;
 223
 224          /* save expanded instruction */
 225          if (numInstructions == maxInstructions) {
 226             instructions = REALLOC(instructions,
 227                                    maxInstructions
 228                                    * sizeof(struct tgsi_full_instruction),
 229                                    (maxInstructions + 10)
 230                                    * sizeof(struct tgsi_full_instruction));
 231             maxInstructions += 10;
 232          }
 233          memcpy(instructions + numInstructions,
 234                 &parse.FullToken.FullInstruction,
 235                 sizeof(instructions[0]));
 236          numInstructions++;
 237          break;
 238
 239       default:
 240          assert( 0 );
 241       }
 242    }
 243    tgsi_parse_free (&parse);
 244
 245    if (mach->Declarations) {
 246       FREE( mach->Declarations );
 247    }
 248    mach->Declarations = declarations;
 249    mach->NumDeclarations = numDeclarations;
 250
 251    if (mach->Instructions) {
 252       FREE( mach->Instructions );
 253    }
 254    mach->Instructions = instructions;
 255    mach->NumInstructions = numInstructions;
 256 }
 257
 258
 259 void
 260 tgsi_exec_machine_init(
 261    struct tgsi_exec_machine *mach )
 262 {
 263    uint i;
 264
 265    mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps);
 266    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
 267
 268    /* Setup constants. */
 269    for( i = 0; i < 4; i++ ) {
 270       mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
 271       mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
 272       mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
 273       mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
 274       mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
 275       mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
 276       mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
 277       mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
 278       mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
 279       mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
 280    }
 281 }
 282
 283
 284 void
 285 tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach)
 286 {
 287    if (mach->Instructions) {
 288       FREE(mach->Instructions);
 289       mach->Instructions = NULL;
 290       mach->NumInstructions = 0;
 291    }
 292    if (mach->Declarations) {
 293       FREE(mach->Declarations);
 294       mach->Declarations = NULL;
 295       mach->NumDeclarations = 0;
 296    }
 297 }
 298
 299
 300 static void
 301 micro_abs(
 302    union tgsi_exec_channel *dst,
 303    const union tgsi_exec_channel *src )
 304 {
 305    dst->f[0] = fabsf( src->f[0] );
 306    dst->f[1] = fabsf( src->f[1] );
 307    dst->f[2] = fabsf( src->f[2] );
 308    dst->f[3] = fabsf( src->f[3] );
 309 }
 310
 311 static void
 312 micro_add(
 313    union tgsi_exec_channel *dst,
 314    const union tgsi_exec_channel *src0,
 315    const union tgsi_exec_channel *src1 )
 316 {
 317    dst->f[0] = src0->f[0] + src1->f[0];
 318    dst->f[1] = src0->f[1] + src1->f[1];
 319    dst->f[2] = src0->f[2] + src1->f[2];
 320    dst->f[3] = src0->f[3] + src1->f[3];
 321 }
 322
 323 static void
 324 micro_iadd(
 325    union tgsi_exec_channel *dst,
 326    const union tgsi_exec_channel *src0,
 327    const union tgsi_exec_channel *src1 )
 328 {
 329    dst->i[0] = src0->i[0] + src1->i[0];
 330    dst->i[1] = src0->i[1] + src1->i[1];
 331    dst->i[2] = src0->i[2] + src1->i[2];
 332    dst->i[3] = src0->i[3] + src1->i[3];
 333 }
 334
 335 static void
 336 micro_and(
 337    union tgsi_exec_channel *dst,
 338    const union tgsi_exec_channel *src0,
 339    const union tgsi_exec_channel *src1 )
 340 {
 341    dst->u[0] = src0->u[0] & src1->u[0];
 342    dst->u[1] = src0->u[1] & src1->u[1];
 343    dst->u[2] = src0->u[2] & src1->u[2];
 344    dst->u[3] = src0->u[3] & src1->u[3];
 345 }
 346
 347 static void
 348 micro_ceil(
 349    union tgsi_exec_channel *dst,
 350    const union tgsi_exec_channel *src )
 351 {
 352    dst->f[0] = ceilf( src->f[0] );
 353    dst->f[1] = ceilf( src->f[1] );
 354    dst->f[2] = ceilf( src->f[2] );
 355    dst->f[3] = ceilf( src->f[3] );
 356 }
 357
 358 static void
 359 micro_cos(
 360    union tgsi_exec_channel *dst,
 361    const union tgsi_exec_channel *src )
 362 {
 363    dst->f[0] = cosf( src->f[0] );
 364    dst->f[1] = cosf( src->f[1] );
 365    dst->f[2] = cosf( src->f[2] );
 366    dst->f[3] = cosf( src->f[3] );
 367 }
 368
 369 static void
 370 micro_ddx(
 371    union tgsi_exec_channel *dst,
 372    const union tgsi_exec_channel *src )
 373 {
 374    dst->f[0] =
 375    dst->f[1] =
 376    dst->f[2] =
 377    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 378 }
 379
 380 static void
 381 micro_ddy(
 382    union tgsi_exec_channel *dst,
 383    const union tgsi_exec_channel *src )
 384 {
 385    dst->f[0] =
 386    dst->f[1] =
 387    dst->f[2] =
 388    dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
 389 }
 390
 391 static void
 392 micro_div(
 393    union tgsi_exec_channel *dst,
 394    const union tgsi_exec_channel *src0,
 395    const union tgsi_exec_channel *src1 )
 396 {
 397    if (src1->f[0] != 0) {
 398       dst->f[0] = src0->f[0] / src1->f[0];
 399    }
 400    if (src1->f[1] != 0) {
 401       dst->f[1] = src0->f[1] / src1->f[1];
 402    }
 403    if (src1->f[2] != 0) {
 404       dst->f[2] = src0->f[2] / src1->f[2];
 405    }
 406    if (src1->f[3] != 0) {
 407       dst->f[3] = src0->f[3] / src1->f[3];
 408    }
 409 }
 410
 411 static void
 412 micro_udiv(
 413    union tgsi_exec_channel *dst,
 414    const union tgsi_exec_channel *src0,
 415    const union tgsi_exec_channel *src1 )
 416 {
 417    dst->u[0] = src0->u[0] / src1->u[0];
 418    dst->u[1] = src0->u[1] / src1->u[1];
 419    dst->u[2] = src0->u[2] / src1->u[2];
 420    dst->u[3] = src0->u[3] / src1->u[3];
 421 }
 422
 423 static void
 424 micro_eq(
 425    union tgsi_exec_channel *dst,
 426    const union tgsi_exec_channel *src0,
 427    const union tgsi_exec_channel *src1,
 428    const union tgsi_exec_channel *src2,
 429    const union tgsi_exec_channel *src3 )
 430 {
 431    dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
 432    dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
 433    dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
 434    dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
 435 }
 436
 437 static void
 438 micro_ieq(
 439    union tgsi_exec_channel *dst,
 440    const union tgsi_exec_channel *src0,
 441    const union tgsi_exec_channel *src1,
 442    const union tgsi_exec_channel *src2,
 443    const union tgsi_exec_channel *src3 )
 444 {
 445    dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
 446    dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
 447    dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
 448    dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
 449 }
 450
 451 static void
 452 micro_exp2(
 453    union tgsi_exec_channel *dst,
 454    const union tgsi_exec_channel *src)
 455 {
 456 #if FAST_MATH
 457    dst->f[0] = util_fast_exp2( src->f[0] );
 458    dst->f[1] = util_fast_exp2( src->f[1] );
 459    dst->f[2] = util_fast_exp2( src->f[2] );
 460    dst->f[3] = util_fast_exp2( src->f[3] );
 461 #else
 462    dst->f[0] = powf( 2.0f, src->f[0] );
 463    dst->f[1] = powf( 2.0f, src->f[1] );
 464    dst->f[2] = powf( 2.0f, src->f[2] );
 465    dst->f[3] = powf( 2.0f, src->f[3] );
 466 #endif
 467 }
 468
 469 static void
 470 micro_f2it(
 471    union tgsi_exec_channel *dst,
 472    const union tgsi_exec_channel *src )
 473 {
 474    dst->i[0] = (int) src->f[0];
 475    dst->i[1] = (int) src->f[1];
 476    dst->i[2] = (int) src->f[2];
 477    dst->i[3] = (int) src->f[3];
 478 }
 479
 480 static void
 481 micro_f2ut(
 482    union tgsi_exec_channel *dst,
 483    const union tgsi_exec_channel *src )
 484 {
 485    dst->u[0] = (uint) src->f[0];
 486    dst->u[1] = (uint) src->f[1];
 487    dst->u[2] = (uint) src->f[2];
 488    dst->u[3] = (uint) src->f[3];
 489 }
 490
 491 static void
 492 micro_flr(
 493    union tgsi_exec_channel *dst,
 494    const union tgsi_exec_channel *src )
 495 {
 496    dst->f[0] = floorf( src->f[0] );
 497    dst->f[1] = floorf( src->f[1] );
 498    dst->f[2] = floorf( src->f[2] );
 499    dst->f[3] = floorf( src->f[3] );
 500 }
 501
 502 static void
 503 micro_frc(
 504    union tgsi_exec_channel *dst,
 505    const union tgsi_exec_channel *src )
 506 {
 507    dst->f[0] = src->f[0] - floorf( src->f[0] );
 508    dst->f[1] = src->f[1] - floorf( src->f[1] );
 509    dst->f[2] = src->f[2] - floorf( src->f[2] );
 510    dst->f[3] = src->f[3] - floorf( src->f[3] );
 511 }
 512
 513 static void
 514 micro_ge(
 515    union tgsi_exec_channel *dst,
 516    const union tgsi_exec_channel *src0,
 517    const union tgsi_exec_channel *src1,
 518    const union tgsi_exec_channel *src2,
 519    const union tgsi_exec_channel *src3 )
 520 {
 521    dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
 522    dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
 523    dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
 524    dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
 525 }
 526
 527 static void
 528 micro_i2f(
 529    union tgsi_exec_channel *dst,
 530    const union tgsi_exec_channel *src )
 531 {
 532    dst->f[0] = (float) src->i[0];
 533    dst->f[1] = (float) src->i[1];
 534    dst->f[2] = (float) src->i[2];
 535    dst->f[3] = (float) src->i[3];
 536 }
 537
 538 static void
 539 micro_lg2(
 540    union tgsi_exec_channel *dst,
 541    const union tgsi_exec_channel *src )
 542 {
 543 #if FAST_MATH
 544    dst->f[0] = util_fast_log2( src->f[0] );
 545    dst->f[1] = util_fast_log2( src->f[1] );
 546    dst->f[2] = util_fast_log2( src->f[2] );
 547    dst->f[3] = util_fast_log2( src->f[3] );
 548 #else
 549    dst->f[0] = logf( src->f[0] ) * 1.442695f;
 550    dst->f[1] = logf( src->f[1] ) * 1.442695f;
 551    dst->f[2] = logf( src->f[2] ) * 1.442695f;
 552    dst->f[3] = logf( src->f[3] ) * 1.442695f;
 553 #endif
 554 }
 555
 556 static void
 557 micro_le(
 558    union tgsi_exec_channel *dst,
 559    const union tgsi_exec_channel *src0,
 560    const union tgsi_exec_channel *src1,
 561    const union tgsi_exec_channel *src2,
 562    const union tgsi_exec_channel *src3 )
 563 {
 564    dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
 565    dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
 566    dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
 567    dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
 568 }
 569
 570 static void
 571 micro_lt(
 572    union tgsi_exec_channel *dst,
 573    const union tgsi_exec_channel *src0,
 574    const union tgsi_exec_channel *src1,
 575    const union tgsi_exec_channel *src2,
 576    const union tgsi_exec_channel *src3 )
 577 {
 578    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
 579    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
 580    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
 581    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
 582 }
 583
 584 static void
 585 micro_ilt(
 586    union tgsi_exec_channel *dst,
 587    const union tgsi_exec_channel *src0,
 588    const union tgsi_exec_channel *src1,
 589    const union tgsi_exec_channel *src2,
 590    const union tgsi_exec_channel *src3 )
 591 {
 592    dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
 593    dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
 594    dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
 595    dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
 596 }
 597
 598 static void
 599 micro_ult(
 600    union tgsi_exec_channel *dst,
 601    const union tgsi_exec_channel *src0,
 602    const union tgsi_exec_channel *src1,
 603    const union tgsi_exec_channel *src2,
 604    const union tgsi_exec_channel *src3 )
 605 {
 606    dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
 607    dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
 608    dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
 609    dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
 610 }
 611
 612 static void
 613 micro_max(
 614    union tgsi_exec_channel *dst,
 615    const union tgsi_exec_channel *src0,
 616    const union tgsi_exec_channel *src1 )
 617 {
 618    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
 619    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
 620    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
 621    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
 622 }
 623
 624 static void
 625 micro_imax(
 626    union tgsi_exec_channel *dst,
 627    const union tgsi_exec_channel *src0,
 628    const union tgsi_exec_channel *src1 )
 629 {
 630    dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
 631    dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
 632    dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
 633    dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
 634 }
 635
 636 static void
 637 micro_umax(
 638    union tgsi_exec_channel *dst,
 639    const union tgsi_exec_channel *src0,
 640    const union tgsi_exec_channel *src1 )
 641 {
 642    dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
 643    dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
 644    dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
 645    dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
 646 }
 647
 648 static void
 649 micro_min(
 650    union tgsi_exec_channel *dst,
 651    const union tgsi_exec_channel *src0,
 652    const union tgsi_exec_channel *src1 )
 653 {
 654    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
 655    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
 656    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
 657    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
 658 }
 659
 660 static void
 661 micro_imin(
 662    union tgsi_exec_channel *dst,
 663    const union tgsi_exec_channel *src0,
 664    const union tgsi_exec_channel *src1 )
 665 {
 666    dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
 667    dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
 668    dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
 669    dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
 670 }
 671
 672 static void
 673 micro_umin(
 674    union tgsi_exec_channel *dst,
 675    const union tgsi_exec_channel *src0,
 676    const union tgsi_exec_channel *src1 )
 677 {
 678    dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
 679    dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
 680    dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
 681    dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
 682 }
 683
 684 static void
 685 micro_umod(
 686    union tgsi_exec_channel *dst,
 687    const union tgsi_exec_channel *src0,
 688    const union tgsi_exec_channel *src1 )
 689 {
 690    dst->u[0] = src0->u[0] % src1->u[0];
 691    dst->u[1] = src0->u[1] % src1->u[1];
 692    dst->u[2] = src0->u[2] % src1->u[2];
 693    dst->u[3] = src0->u[3] % src1->u[3];
 694 }
 695
 696 static void
 697 micro_mul(
 698    union tgsi_exec_channel *dst,
 699    const union tgsi_exec_channel *src0,
 700    const union tgsi_exec_channel *src1 )
 701 {
 702    dst->f[0] = src0->f[0] * src1->f[0];
 703    dst->f[1] = src0->f[1] * src1->f[1];
 704    dst->f[2] = src0->f[2] * src1->f[2];
 705    dst->f[3] = src0->f[3] * src1->f[3];
 706 }
 707
 708 static void
 709 micro_imul(
 710    union tgsi_exec_channel *dst,
 711    const union tgsi_exec_channel *src0,
 712    const union tgsi_exec_channel *src1 )
 713 {
 714    dst->i[0] = src0->i[0] * src1->i[0];
 715    dst->i[1] = src0->i[1] * src1->i[1];
 716    dst->i[2] = src0->i[2] * src1->i[2];
 717    dst->i[3] = src0->i[3] * src1->i[3];
 718 }
 719
 720 static void
 721 micro_imul64(
 722    union tgsi_exec_channel *dst0,
 723    union tgsi_exec_channel *dst1,
 724    const union tgsi_exec_channel *src0,
 725    const union tgsi_exec_channel *src1 )
 726 {
 727    dst1->i[0] = src0->i[0] * src1->i[0];
 728    dst1->i[1] = src0->i[1] * src1->i[1];
 729    dst1->i[2] = src0->i[2] * src1->i[2];
 730    dst1->i[3] = src0->i[3] * src1->i[3];
 731    dst0->i[0] = 0;
 732    dst0->i[1] = 0;
 733    dst0->i[2] = 0;
 734    dst0->i[3] = 0;
 735 }
 736
 737 static void
 738 micro_umul64(
 739    union tgsi_exec_channel *dst0,
 740    union tgsi_exec_channel *dst1,
 741    const union tgsi_exec_channel *src0,
 742    const union tgsi_exec_channel *src1 )
 743 {
 744    dst1->u[0] = src0->u[0] * src1->u[0];
 745    dst1->u[1] = src0->u[1] * src1->u[1];
 746    dst1->u[2] = src0->u[2] * src1->u[2];
 747    dst1->u[3] = src0->u[3] * src1->u[3];
 748    dst0->u[0] = 0;
 749    dst0->u[1] = 0;
 750    dst0->u[2] = 0;
 751    dst0->u[3] = 0;
 752 }
 753
 754 static void
 755 micro_movc(
 756    union tgsi_exec_channel *dst,
 757    const union tgsi_exec_channel *src0,
 758    const union tgsi_exec_channel *src1,
 759    const union tgsi_exec_channel *src2 )
 760 {
 761    dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
 762    dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
 763    dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
 764    dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
 765 }
 766
 767 static void
 768 micro_neg(
 769    union tgsi_exec_channel *dst,
 770    const union tgsi_exec_channel *src )
 771 {
 772    dst->f[0] = -src->f[0];
 773    dst->f[1] = -src->f[1];
 774    dst->f[2] = -src->f[2];
 775    dst->f[3] = -src->f[3];
 776 }
 777
 778 static void
 779 micro_ineg(
 780    union tgsi_exec_channel *dst,
 781    const union tgsi_exec_channel *src )
 782 {
 783    dst->i[0] = -src->i[0];
 784    dst->i[1] = -src->i[1];
 785    dst->i[2] = -src->i[2];
 786    dst->i[3] = -src->i[3];
 787 }
 788
 789 static void
 790 micro_not(
 791    union tgsi_exec_channel *dst,
 792    const union tgsi_exec_channel *src )
 793 {
 794    dst->u[0] = ~src->u[0];
 795    dst->u[1] = ~src->u[1];
 796    dst->u[2] = ~src->u[2];
 797    dst->u[3] = ~src->u[3];
 798 }
 799
 800 static void
 801 micro_or(
 802    union tgsi_exec_channel *dst,
 803    const union tgsi_exec_channel *src0,
 804    const union tgsi_exec_channel *src1 )
 805 {
 806    dst->u[0] = src0->u[0] | src1->u[0];
 807    dst->u[1] = src0->u[1] | src1->u[1];
 808    dst->u[2] = src0->u[2] | src1->u[2];
 809    dst->u[3] = src0->u[3] | src1->u[3];
 810 }
 811
 812 static void
 813 micro_pow(
 814    union tgsi_exec_channel *dst,
 815    const union tgsi_exec_channel *src0,
 816    const union tgsi_exec_channel *src1 )
 817 {
 818 #if FAST_MATH
 819    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
 820    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
 821    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
 822    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
 823 #else
 824    dst->f[0] = powf( src0->f[0], src1->f[0] );
 825    dst->f[1] = powf( src0->f[1], src1->f[1] );
 826    dst->f[2] = powf( src0->f[2], src1->f[2] );
 827    dst->f[3] = powf( src0->f[3], src1->f[3] );
 828 #endif
 829 }
 830
 831 static void
 832 micro_rnd(
 833    union tgsi_exec_channel *dst,
 834    const union tgsi_exec_channel *src )
 835 {
 836    dst->f[0] = floorf( src->f[0] + 0.5f );
 837    dst->f[1] = floorf( src->f[1] + 0.5f );
 838    dst->f[2] = floorf( src->f[2] + 0.5f );
 839    dst->f[3] = floorf( src->f[3] + 0.5f );
 840 }
 841
 842 static void
 843 micro_shl(
 844    union tgsi_exec_channel *dst,
 845    const union tgsi_exec_channel *src0,
 846    const union tgsi_exec_channel *src1 )
 847 {
 848    dst->i[0] = src0->i[0] << src1->i[0];
 849    dst->i[1] = src0->i[1] << src1->i[1];
 850    dst->i[2] = src0->i[2] << src1->i[2];
 851    dst->i[3] = src0->i[3] << src1->i[3];
 852 }
 853
 854 static void
 855 micro_ishr(
 856    union tgsi_exec_channel *dst,
 857    const union tgsi_exec_channel *src0,
 858    const union tgsi_exec_channel *src1 )
 859 {
 860    dst->i[0] = src0->i[0] >> src1->i[0];
 861    dst->i[1] = src0->i[1] >> src1->i[1];
 862    dst->i[2] = src0->i[2] >> src1->i[2];
 863    dst->i[3] = src0->i[3] >> src1->i[3];
 864 }
 865
 866 static void
 867 micro_trunc(
 868    union tgsi_exec_channel *dst,
 869    const union tgsi_exec_channel *src0 )
 870 {
 871    dst->f[0] = (float) (int) src0->f[0];
 872    dst->f[1] = (float) (int) src0->f[1];
 873    dst->f[2] = (float) (int) src0->f[2];
 874    dst->f[3] = (float) (int) src0->f[3];
 875 }
 876
 877 static void
 878 micro_ushr(
 879    union tgsi_exec_channel *dst,
 880    const union tgsi_exec_channel *src0,
 881    const union tgsi_exec_channel *src1 )
 882 {
 883    dst->u[0] = src0->u[0] >> src1->u[0];
 884    dst->u[1] = src0->u[1] >> src1->u[1];
 885    dst->u[2] = src0->u[2] >> src1->u[2];
 886    dst->u[3] = src0->u[3] >> src1->u[3];
 887 }
 888
 889 static void
 890 micro_sin(
 891    union tgsi_exec_channel *dst,
 892    const union tgsi_exec_channel *src )
 893 {
 894    dst->f[0] = sinf( src->f[0] );
 895    dst->f[1] = sinf( src->f[1] );
 896    dst->f[2] = sinf( src->f[2] );
 897    dst->f[3] = sinf( src->f[3] );
 898 }
 899
 900 static void
 901 micro_sqrt( union tgsi_exec_channel *dst,
 902             const union tgsi_exec_channel *src )
 903 {
 904    dst->f[0] = sqrtf( src->f[0] );
 905    dst->f[1] = sqrtf( src->f[1] );
 906    dst->f[2] = sqrtf( src->f[2] );
 907    dst->f[3] = sqrtf( src->f[3] );
 908 }
 909
 910 static void
 911 micro_sub(
 912    union tgsi_exec_channel *dst,
 913    const union tgsi_exec_channel *src0,
 914    const union tgsi_exec_channel *src1 )
 915 {
 916    dst->f[0] = src0->f[0] - src1->f[0];
 917    dst->f[1] = src0->f[1] - src1->f[1];
 918    dst->f[2] = src0->f[2] - src1->f[2];
 919    dst->f[3] = src0->f[3] - src1->f[3];
 920 }
 921
 922 static void
 923 micro_u2f(
 924    union tgsi_exec_channel *dst,
 925    const union tgsi_exec_channel *src )
 926 {
 927    dst->f[0] = (float) src->u[0];
 928    dst->f[1] = (float) src->u[1];
 929    dst->f[2] = (float) src->u[2];
 930    dst->f[3] = (float) src->u[3];
 931 }
 932
 933 static void
 934 micro_xor(
 935    union tgsi_exec_channel *dst,
 936    const union tgsi_exec_channel *src0,
 937    const union tgsi_exec_channel *src1 )
 938 {
 939    dst->u[0] = src0->u[0] ^ src1->u[0];
 940    dst->u[1] = src0->u[1] ^ src1->u[1];
 941    dst->u[2] = src0->u[2] ^ src1->u[2];
 942    dst->u[3] = src0->u[3] ^ src1->u[3];
 943 }
 944
 945 static void
 946 fetch_src_file_channel(
 947    const struct tgsi_exec_machine *mach,
 948    const uint file,
 949    const uint swizzle,
 950    const union tgsi_exec_channel *index,
 951    union tgsi_exec_channel *chan )
 952 {
 953    switch( swizzle ) {
 954    case TGSI_EXTSWIZZLE_X:
 955    case TGSI_EXTSWIZZLE_Y:
 956    case TGSI_EXTSWIZZLE_Z:
 957    case TGSI_EXTSWIZZLE_W:
 958       switch( file ) {
 959       case TGSI_FILE_CONSTANT:
 960          assert(mach->Consts);
 961          assert(index->i[0] >= 0);
 962          assert(index->i[1] >= 0);
 963          assert(index->i[2] >= 0);
 964          assert(index->i[3] >= 0);
 965          chan->f[0] = mach->Consts[index->i[0]][swizzle];
 966          chan->f[1] = mach->Consts[index->i[1]][swizzle];
 967          chan->f[2] = mach->Consts[index->i[2]][swizzle];
 968          chan->f[3] = mach->Consts[index->i[3]][swizzle];
 969          break;
 970
 971       case TGSI_FILE_INPUT:
 972          chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
 973          chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
 974          chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
 975          chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
 976          break;
 977
 978       case TGSI_FILE_TEMPORARY:
 979          assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
 980          chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
 981          chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
 982          chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
 983          chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
 984          break;
 985
 986       case TGSI_FILE_IMMEDIATE:
 987          assert( index->i[0] < (int) mach->ImmLimit );
 988          chan->f[0] = mach->Imms[index->i[0]][swizzle];
 989          assert( index->i[1] < (int) mach->ImmLimit );
 990          chan->f[1] = mach->Imms[index->i[1]][swizzle];
 991          assert( index->i[2] < (int) mach->ImmLimit );
 992          chan->f[2] = mach->Imms[index->i[2]][swizzle];
 993          assert( index->i[3] < (int) mach->ImmLimit );
 994          chan->f[3] = mach->Imms[index->i[3]][swizzle];
 995          break;
 996
 997       case TGSI_FILE_ADDRESS:
 998          chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
 999          chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1000          chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1001          chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1002          break;
1003
1004       case TGSI_FILE_OUTPUT:
1005          /* vertex/fragment output vars can be read too */
1006          chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1007          chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1008          chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1009          chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1010          break;
1011
1012       default:
1013          assert( 0 );
1014       }
1015       break;
1016
1017    case TGSI_EXTSWIZZLE_ZERO:
1018       *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
1019       break;
1020
1021    case TGSI_EXTSWIZZLE_ONE:
1022       *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
1023       break;
1024
1025    default:
1026       assert( 0 );
1027    }
1028 }
1029
1030 static void
1031 fetch_source(
1032    const struct tgsi_exec_machine *mach,
1033    union tgsi_exec_channel *chan,
1034    const struct tgsi_full_src_register *reg,
1035    const uint chan_index )
1036 {
1037    union tgsi_exec_channel index;
1038    uint swizzle;
1039
1040    /* We start with a direct index into a register file.
1041     *
1042     *    file[1],
1043     *    where:
1044     *       file = SrcRegister.File
1045     *       [1] = SrcRegister.Index
1046     */
1047    index.i[0] =
1048    index.i[1] =
1049    index.i[2] =
1050    index.i[3] = reg->SrcRegister.Index;
1051
1052    /* There is an extra source register that indirectly subscripts
1053     * a register file. The direct index now becomes an offset
1054     * that is being added to the indirect register.
1055     *
1056     *    file[ind[2].x+1],
1057     *    where:
1058     *       ind = SrcRegisterInd.File
1059     *       [2] = SrcRegisterInd.Index
1060     *       .x = SrcRegisterInd.SwizzleX
1061     */
1062    if (reg->SrcRegister.Indirect) {
1063       union tgsi_exec_channel index2;
1064       union tgsi_exec_channel indir_index;
1065       const uint execmask = mach->ExecMask;
1066       uint i;
1067
1068       /* which address register (always zero now) */
1069       index2.i[0] =
1070       index2.i[1] =
1071       index2.i[2] =
1072       index2.i[3] = reg->SrcRegisterInd.Index;
1073
1074       /* get current value of address register[swizzle] */
1075       swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
1076       fetch_src_file_channel(
1077          mach,
1078          reg->SrcRegisterInd.File,
1079          swizzle,
1080          &index2,
1081          &indir_index );
1082
1083       /* add value of address register to the offset */
1084       index.i[0] += indir_index.i[0];
1085       index.i[1] += indir_index.i[1];
1086       index.i[2] += indir_index.i[2];
1087       index.i[3] += indir_index.i[3];
1088
1089       /* for disabled execution channels, zero-out the index to
1090        * avoid using a potential garbage value.
1091        */
1092       for (i = 0; i < QUAD_SIZE; i++) {
1093          if ((execmask & (1 << i)) == 0)
1094             index.i[i] = 0;
1095       }
1096    }
1097
1098    /* There is an extra source register that is a second
1099     * subscript to a register file. Effectively it means that
1100     * the register file is actually a 2D array of registers.
1101     *
1102     *    file[1][3] == file[1*sizeof(file[1])+3],
1103     *    where:
1104     *       [3] = SrcRegisterDim.Index
1105     */
1106    if (reg->SrcRegister.Dimension) {
1107       /* The size of the first-order array depends on the register file type.
1108        * We need to multiply the index to the first array to get an effective,
1109        * "flat" index that points to the beginning of the second-order array.
1110        */
1111       switch (reg->SrcRegister.File) {
1112       case TGSI_FILE_INPUT:
1113          index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1114          index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1115          index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1116          index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1117          break;
1118       case TGSI_FILE_CONSTANT:
1119          index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1120          index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1121          index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1122          index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1123          break;
1124       default:
1125          assert( 0 );
1126       }
1127
1128       index.i[0] += reg->SrcRegisterDim.Index;
1129       index.i[1] += reg->SrcRegisterDim.Index;
1130       index.i[2] += reg->SrcRegisterDim.Index;
1131       index.i[3] += reg->SrcRegisterDim.Index;
1132
1133       /* Again, the second subscript index can be addressed indirectly
1134        * identically to the first one.
1135        * Nothing stops us from indirectly addressing the indirect register,
1136        * but there is no need for that, so we won't exercise it.
1137        *
1138        *    file[1][ind[4].y+3],
1139        *    where:
1140        *       ind = SrcRegisterDimInd.File
1141        *       [4] = SrcRegisterDimInd.Index
1142        *       .y = SrcRegisterDimInd.SwizzleX
1143        */
1144       if (reg->SrcRegisterDim.Indirect) {
1145          union tgsi_exec_channel index2;
1146          union tgsi_exec_channel indir_index;
1147          const uint execmask = mach->ExecMask;
1148          uint i;
1149
1150          index2.i[0] =
1151          index2.i[1] =
1152          index2.i[2] =
1153          index2.i[3] = reg->SrcRegisterDimInd.Index;
1154
1155          swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
1156          fetch_src_file_channel(
1157             mach,
1158             reg->SrcRegisterDimInd.File,
1159             swizzle,
1160             &index2,
1161             &indir_index );
1162
1163          index.i[0] += indir_index.i[0];
1164          index.i[1] += indir_index.i[1];
1165          index.i[2] += indir_index.i[2];
1166          index.i[3] += indir_index.i[3];
1167
1168          /* for disabled execution channels, zero-out the index to
1169           * avoid using a potential garbage value.
1170           */
1171          for (i = 0; i < QUAD_SIZE; i++) {
1172             if ((execmask & (1 << i)) == 0)
1173                index.i[i] = 0;
1174          }
1175       }
1176
1177       /* If by any chance there was a need for a 3D array of register
1178        * files, we would have to check whether SrcRegisterDim is followed
1179        * by a dimension register and continue the saga.
1180        */
1181    }
1182
1183    swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1184    fetch_src_file_channel(
1185       mach,
1186       reg->SrcRegister.File,
1187       swizzle,
1188       &index,
1189       chan );
1190
1191    switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1192    case TGSI_UTIL_SIGN_CLEAR:
1193       micro_abs( chan, chan );
1194       break;
1195
1196    case TGSI_UTIL_SIGN_SET:
1197       micro_abs( chan, chan );
1198       micro_neg( chan, chan );
1199       break;
1200
1201    case TGSI_UTIL_SIGN_TOGGLE:
1202       micro_neg( chan, chan );
1203       break;
1204
1205    case TGSI_UTIL_SIGN_KEEP:
1206       break;
1207    }
1208
1209    if (reg->SrcRegisterExtMod.Complement) {
1210       micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
1211    }
1212 }
1213
1214 static void
1215 store_dest(
1216    struct tgsi_exec_machine *mach,
1217    const union tgsi_exec_channel *chan,
1218    const struct tgsi_full_dst_register *reg,
1219    const struct tgsi_full_instruction *inst,
1220    uint chan_index )
1221 {
1222    uint i;
1223    union tgsi_exec_channel null;
1224    union tgsi_exec_channel *dst;
1225    uint execmask = mach->ExecMask;
1226
1227    switch (reg->DstRegister.File) {
1228    case TGSI_FILE_NULL:
1229       dst = &null;
1230       break;
1231
1232    case TGSI_FILE_OUTPUT:
1233       dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1234                            + reg->DstRegister.Index].xyzw[chan_index];
1235       break;
1236
1237    case TGSI_FILE_TEMPORARY:
1238       assert( reg->DstRegister.Index < TGSI_EXEC_NUM_TEMPS );
1239       dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
1240       break;
1241
1242    case TGSI_FILE_ADDRESS:
1243       dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
1244       break;
1245
1246    default:
1247       assert( 0 );
1248       return;
1249    }
1250
1251    if (inst->InstructionExtNv.CondFlowEnable) {
1252       union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1253       uint swizzle;
1254       uint shift;
1255       uint mask;
1256       uint test;
1257
1258       /* Only CC0 supported.
1259        */
1260       assert( inst->InstructionExtNv.CondFlowIndex < 1 );
1261
1262       switch (chan_index) {
1263       case CHAN_X:
1264          swizzle = inst->InstructionExtNv.CondSwizzleX;
1265          break;
1266       case CHAN_Y:
1267          swizzle = inst->InstructionExtNv.CondSwizzleY;
1268          break;
1269       case CHAN_Z:
1270          swizzle = inst->InstructionExtNv.CondSwizzleZ;
1271          break;
1272       case CHAN_W:
1273          swizzle = inst->InstructionExtNv.CondSwizzleW;
1274          break;
1275       default:
1276          assert( 0 );
1277          return;
1278       }
1279
1280       switch (swizzle) {
1281       case TGSI_SWIZZLE_X:
1282          shift = TGSI_EXEC_CC_X_SHIFT;
1283          mask = TGSI_EXEC_CC_X_MASK;
1284          break;
1285       case TGSI_SWIZZLE_Y:
1286          shift = TGSI_EXEC_CC_Y_SHIFT;
1287          mask = TGSI_EXEC_CC_Y_MASK;
1288          break;
1289       case TGSI_SWIZZLE_Z:
1290          shift = TGSI_EXEC_CC_Z_SHIFT;
1291          mask = TGSI_EXEC_CC_Z_MASK;
1292          break;
1293       case TGSI_SWIZZLE_W:
1294          shift = TGSI_EXEC_CC_W_SHIFT;
1295          mask = TGSI_EXEC_CC_W_MASK;
1296          break;
1297       default:
1298          assert( 0 );
1299          return;
1300       }
1301
1302       switch (inst->InstructionExtNv.CondMask) {
1303       case TGSI_CC_GT:
1304          test = ~(TGSI_EXEC_CC_GT << shift) & mask;
1305          for (i = 0; i < QUAD_SIZE; i++)
1306             if (cc->u[i] & test)
1307                execmask &= ~(1 << i);
1308          break;
1309
1310       case TGSI_CC_EQ:
1311          test = ~(TGSI_EXEC_CC_EQ << shift) & mask;
1312          for (i = 0; i < QUAD_SIZE; i++)
1313             if (cc->u[i] & test)
1314                execmask &= ~(1 << i);
1315          break;
1316
1317       case TGSI_CC_LT:
1318          test = ~(TGSI_EXEC_CC_LT << shift) & mask;
1319          for (i = 0; i < QUAD_SIZE; i++)
1320             if (cc->u[i] & test)
1321                execmask &= ~(1 << i);
1322          break;
1323
1324       case TGSI_CC_GE:
1325          test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_EQ) << shift) & mask;
1326          for (i = 0; i < QUAD_SIZE; i++)
1327             if (cc->u[i] & test)
1328                execmask &= ~(1 << i);
1329          break;
1330
1331       case TGSI_CC_LE:
1332          test = ~((TGSI_EXEC_CC_LT | TGSI_EXEC_CC_EQ) << shift) & mask;
1333          for (i = 0; i < QUAD_SIZE; i++)
1334             if (cc->u[i] & test)
1335                execmask &= ~(1 << i);
1336          break;
1337
1338       case TGSI_CC_NE:
1339          test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_LT | TGSI_EXEC_CC_UN) << shift) & mask;
1340          for (i = 0; i < QUAD_SIZE; i++)
1341             if (cc->u[i] & test)
1342                execmask &= ~(1 << i);
1343          break;
1344
1345       case TGSI_CC_TR:
1346          break;
1347
1348       case TGSI_CC_FL:
1349          for (i = 0; i < QUAD_SIZE; i++)
1350             execmask &= ~(1 << i);
1351          break;
1352
1353       default:
1354          assert( 0 );
1355          return;
1356       }
1357    }
1358
1359    switch (inst->Instruction.Saturate) {
1360    case TGSI_SAT_NONE:
1361       for (i = 0; i < QUAD_SIZE; i++)
1362          if (execmask & (1 << i))
1363             dst->i[i] = chan->i[i];
1364       break;
1365
1366    case TGSI_SAT_ZERO_ONE:
1367       for (i = 0; i < QUAD_SIZE; i++)
1368          if (execmask & (1 << i)) {
1369             if (chan->f[i] < 0.0f)
1370                dst->f[i] = 0.0f;
1371             else if (chan->f[i] > 1.0f)
1372                dst->f[i] = 1.0f;
1373             else
1374                dst->i[i] = chan->i[i];
1375          }
1376       break;
1377
1378    case TGSI_SAT_MINUS_PLUS_ONE:
1379       for (i = 0; i < QUAD_SIZE; i++)
1380          if (execmask & (1 << i)) {
1381             if (chan->f[i] < -1.0f)
1382                dst->f[i] = -1.0f;
1383             else if (chan->f[i] > 1.0f)
1384                dst->f[i] = 1.0f;
1385             else
1386                dst->i[i] = chan->i[i];
1387          }
1388       break;
1389
1390    default:
1391       assert( 0 );
1392    }
1393
1394    if (inst->InstructionExtNv.CondDstUpdate) {
1395       union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1396       uint shift;
1397       uint mask;
1398
1399       /* Only CC0 supported.
1400        */
1401       assert( inst->InstructionExtNv.CondDstIndex < 1 );
1402
1403       switch (chan_index) {
1404       case CHAN_X:
1405          shift = TGSI_EXEC_CC_X_SHIFT;
1406          mask = ~TGSI_EXEC_CC_X_MASK;
1407          break;
1408       case CHAN_Y:
1409          shift = TGSI_EXEC_CC_Y_SHIFT;
1410          mask = ~TGSI_EXEC_CC_Y_MASK;
1411          break;
1412       case CHAN_Z:
1413          shift = TGSI_EXEC_CC_Z_SHIFT;
1414          mask = ~TGSI_EXEC_CC_Z_MASK;
1415          break;
1416       case CHAN_W:
1417          shift = TGSI_EXEC_CC_W_SHIFT;
1418          mask = ~TGSI_EXEC_CC_W_MASK;
1419          break;
1420       default:
1421          assert( 0 );
1422          return;
1423       }
1424
1425       for (i = 0; i < QUAD_SIZE; i++)
1426          if (execmask & (1 << i)) {
1427             cc->u[i] &= mask;
1428             if (dst->f[i] < 0.0f)
1429                cc->u[i] |= TGSI_EXEC_CC_LT << shift;
1430             else if (dst->f[i] > 0.0f)
1431                cc->u[i] |= TGSI_EXEC_CC_GT << shift;
1432             else if (dst->f[i] == 0.0f)
1433                cc->u[i] |= TGSI_EXEC_CC_EQ << shift;
1434             else
1435                cc->u[i] |= TGSI_EXEC_CC_UN << shift;
1436          }
1437    }
1438 }
1439
/*
 * Shorthand for fetch_source(): read channel CHAN of source operand
 * number INDEX of the current instruction into *VAL.
 * Expands to code that uses the identifiers `mach` and `inst`, so both
 * must be in scope wherever the macro is used.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)

/*
 * Shorthand for store_dest(): write *VAL to channel CHAN of destination
 * operand number INDEX of the current instruction.
 * Like FETCH, relies on `mach` and `inst` being in scope.
 */
#define STORE(VAL,INDEX,CHAN)\
    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
1445
1446
1447 /**
1448  * Execute ARB-style KIL which is predicated by a src register.
1449  * Kill fragment if any of the four values is less than zero.
1450  */
1451 static void
1452 exec_kil(struct tgsi_exec_machine *mach,
1453          const struct tgsi_full_instruction *inst)
1454 {
1455    uint uniquemask;
1456    uint chan_index;
1457    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1458    union tgsi_exec_channel r[1];
1459
1460    /* This mask stores component bits that were already tested. Note that
1461     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1462     * tested. */
1463    uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1464
1465    for (chan_index = 0; chan_index < 4; chan_index++)
1466    {
1467       uint swizzle;
1468       uint i;
1469
1470       /* unswizzle channel */
1471       swizzle = tgsi_util_get_full_src_register_extswizzle (
1472                         &inst->FullSrcRegisters[0],
1473                         chan_index);
1474
1475       /* check if the component has not been already tested */
1476       if (uniquemask & (1 << swizzle))
1477          continue;
1478       uniquemask |= 1 << swizzle;
1479
1480       FETCH(&r[0], 0, chan_index);
1481       for (i = 0; i < 4; i++)
1482          if (r[0].f[i] < 0.0f)
1483             kilmask |= 1 << i;
1484    }
1485
1486    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1487 }
1488
1489 /**
1490  * Execute NVIDIA-style KIL which is predicated by a condition code.
1491  * Kill fragment if the condition code is TRUE.
1492  */
1493 static void
1494 exec_kilp(struct tgsi_exec_machine *mach,
1495           const struct tgsi_full_instruction *inst)
1496 {
1497    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1498
1499    if (inst->InstructionExtNv.CondFlowEnable) {
1500       uint swizzle[4];
1501       uint chan_index;
1502
1503       kilmask = 0x0;
1504
1505       swizzle[0] = inst->InstructionExtNv.CondSwizzleX;
1506       swizzle[1] = inst->InstructionExtNv.CondSwizzleY;
1507       swizzle[2] = inst->InstructionExtNv.CondSwizzleZ;
1508       swizzle[3] = inst->InstructionExtNv.CondSwizzleW;
1509
1510       for (chan_index = 0; chan_index < 4; chan_index++)
1511       {
1512          uint i;
1513
1514          for (i = 0; i < 4; i++) {
1515             /* TODO: evaluate the condition code */
1516             if (0)
1517                kilmask |= 1 << i;
1518          }
1519       }
1520    }
1521    else {
1522       /* "unconditional" kil */
1523       kilmask = mach->ExecMask;
1524    }
1525    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1526 }
1527
1528
1529 /*
1530  * Fetch a texel using STR texture coordinates.
1531  */
1532 static void
1533 fetch_texel( struct tgsi_sampler *sampler,
1534              const union tgsi_exec_channel *s,
1535              const union tgsi_exec_channel *t,
1536              const union tgsi_exec_channel *p,
1537              float lodbias,  /* XXX should be float[4] */
1538              union tgsi_exec_channel *r,
1539              union tgsi_exec_channel *g,
1540              union tgsi_exec_channel *b,
1541              union tgsi_exec_channel *a )
1542 {
1543    uint j;
1544    float rgba[NUM_CHANNELS][QUAD_SIZE];
1545
1546    sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1547
1548    for (j = 0; j < 4; j++) {
1549       r->f[j] = rgba[0][j];
1550       g->f[j] = rgba[1][j];
1551       b->f[j] = rgba[2][j];
1552       a->f[j] = rgba[3][j];
1553    }
1554 }
1555
1556
1557 static void
1558 exec_tex(struct tgsi_exec_machine *mach,
1559          const struct tgsi_full_instruction *inst,
1560          boolean biasLod,
1561          boolean projected)
1562 {
1563    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1564    union tgsi_exec_channel r[8];
1565    uint chan_index;
1566    float lodBias;
1567
1568    /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1569
1570    switch (inst->InstructionExtTexture.Texture) {
1571    case TGSI_TEXTURE_1D:
1572
1573       FETCH(&r[0], 0, CHAN_X);
1574
1575       if (projected) {
1576          FETCH(&r[1], 0, CHAN_W);
1577          micro_div( &r[0], &r[0], &r[1] );
1578       }
1579
1580       if (biasLod) {
1581          FETCH(&r[1], 0, CHAN_W);
1582          lodBias = r[2].f[0];
1583       }
1584       else
1585          lodBias = 0.0;
1586
1587       fetch_texel(&mach->Samplers[unit],
1588                   &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
1589                   &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1590       break;
1591
1592    case TGSI_TEXTURE_2D:
1593    case TGSI_TEXTURE_RECT:
1594
1595       FETCH(&r[0], 0, CHAN_X);
1596       FETCH(&r[1], 0, CHAN_Y);
1597       FETCH(&r[2], 0, CHAN_Z);
1598
1599       if (projected) {
1600          FETCH(&r[3], 0, CHAN_W);
1601          micro_div( &r[0], &r[0], &r[3] );
1602          micro_div( &r[1], &r[1], &r[3] );
1603          micro_div( &r[2], &r[2], &r[3] );
1604       }
1605
1606       if (biasLod) {
1607          FETCH(&r[3], 0, CHAN_W);
1608          lodBias = r[3].f[0];
1609       }
1610       else
1611          lodBias = 0.0;
1612
1613       fetch_texel(&mach->Samplers[unit],
1614                   &r[0], &r[1], &r[2], lodBias,  /* inputs */
1615                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1616       break;
1617
1618    case TGSI_TEXTURE_3D:
1619    case TGSI_TEXTURE_CUBE:
1620
1621       FETCH(&r[0], 0, CHAN_X);
1622       FETCH(&r[1], 0, CHAN_Y);
1623       FETCH(&r[2], 0, CHAN_Z);
1624
1625       if (projected) {
1626          FETCH(&r[3], 0, CHAN_W);
1627          micro_div( &r[0], &r[0], &r[3] );
1628          micro_div( &r[1], &r[1], &r[3] );
1629          micro_div( &r[2], &r[2], &r[3] );
1630       }
1631
1632       if (biasLod) {
1633          FETCH(&r[3], 0, CHAN_W);
1634          lodBias = r[3].f[0];
1635       }
1636       else
1637          lodBias = 0.0;
1638
1639       fetch_texel(&mach->Samplers[unit],
1640                   &r[0], &r[1], &r[2], lodBias,
1641                   &r[0], &r[1], &r[2], &r[3]);
1642       break;
1643
1644    default:
1645       assert (0);
1646    }
1647
1648    FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1649       STORE( &r[chan_index], 0, chan_index );
1650    }
1651 }
1652
1653
1654 /**
1655  * Evaluate a constant-valued coefficient at the position of the
1656  * current quad.
1657  */
1658 static void
1659 eval_constant_coef(
1660    struct tgsi_exec_machine *mach,
1661    unsigned attrib,
1662    unsigned chan )
1663 {
1664    unsigned i;
1665
1666    for( i = 0; i < QUAD_SIZE; i++ ) {
1667       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1668    }
1669 }
1670
1671 /**
1672  * Evaluate a linear-valued coefficient at the position of the
1673  * current quad.
1674  */
1675 static void
1676 eval_linear_coef(
1677    struct tgsi_exec_machine *mach,
1678    unsigned attrib,
1679    unsigned chan )
1680 {
1681    const float x = mach->QuadPos.xyzw[0].f[0];
1682    const float y = mach->QuadPos.xyzw[1].f[0];
1683    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1684    const float dady = mach->InterpCoefs[attrib].dady[chan];
1685    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1686    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1687    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1688    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1689    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1690 }
1691
1692 /**
1693  * Evaluate a perspective-valued coefficient at the position of the
1694  * current quad.
1695  */
1696 static void
1697 eval_perspective_coef(
1698    struct tgsi_exec_machine *mach,
1699    unsigned attrib,
1700    unsigned chan )
1701 {
1702    const float x = mach->QuadPos.xyzw[0].f[0];
1703    const float y = mach->QuadPos.xyzw[1].f[0];
1704    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1705    const float dady = mach->InterpCoefs[attrib].dady[chan];
1706    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1707    const float *w = mach->QuadPos.xyzw[3].f;
1708    /* divide by W here */
1709    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1710    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1711    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1712    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1713 }
1714
1715
/**
 * Signature shared by the eval_*_coef() interpolators: evaluate channel
 * `chan` of input attribute `attrib` for the current quad and store the
 * result in the machine's input registers.
 */
typedef void (* eval_coef_func)(
   struct tgsi_exec_machine *mach,
   unsigned attrib,
   unsigned chan );
1720
1721 static void
1722 exec_declaration(
1723    struct tgsi_exec_machine *mach,
1724    const struct tgsi_full_declaration *decl )
1725 {
1726    if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
1727       if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1728          unsigned first, last, mask;
1729          eval_coef_func eval;
1730
1731          first = decl->DeclarationRange.First;
1732          last = decl->DeclarationRange.Last;
1733          mask = decl->Declaration.UsageMask;
1734
1735          switch( decl->Declaration.Interpolate ) {
1736          case TGSI_INTERPOLATE_CONSTANT:
1737             eval = eval_constant_coef;
1738             break;
1739
1740          case TGSI_INTERPOLATE_LINEAR:
1741             eval = eval_linear_coef;
1742             break;
1743
1744          case TGSI_INTERPOLATE_PERSPECTIVE:
1745             eval = eval_perspective_coef;
1746             break;
1747
1748          default:
1749             eval = NULL;
1750             assert( 0 );
1751          }
1752
1753          if( mask == TGSI_WRITEMASK_XYZW ) {
1754             unsigned i, j;
1755
1756             for( i = first; i <= last; i++ ) {
1757                for( j = 0; j < NUM_CHANNELS; j++ ) {
1758                   eval( mach, i, j );
1759                }
1760             }
1761          }
1762          else {
1763             unsigned i, j;
1764
1765             for( j = 0; j < NUM_CHANNELS; j++ ) {
1766                if( mask & (1 << j) ) {
1767                   for( i = first; i <= last; i++ ) {
1768                      eval( mach, i, j );
1769                   }
1770                }
1771             }
1772          }
1773       }
1774    }
1775 }
1776
1777 static void
1778 exec_instruction(
1779    struct tgsi_exec_machine *mach,
1780    const struct tgsi_full_instruction *inst,
1781    int *pc )
1782 {
1783    uint chan_index;
1784    union tgsi_exec_channel r[8];
1785
1786    (*pc)++;
1787
1788    switch (inst->Instruction.Opcode) {
1789    case TGSI_OPCODE_ARL:
1790       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1791          FETCH( &r[0], 0, chan_index );
1792          micro_f2it( &r[0], &r[0] );
1793          STORE( &r[0], 0, chan_index );
1794       }
1795       break;
1796
1797    case TGSI_OPCODE_MOV:
1798    case TGSI_OPCODE_SWZ:
1799       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1800          FETCH( &r[0], 0, chan_index );
1801          STORE( &r[0], 0, chan_index );
1802       }
1803       break;
1804
1805    case TGSI_OPCODE_LIT:
1806       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1807          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
1808       }
1809
1810       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1811          FETCH( &r[0], 0, CHAN_X );
1812          if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1813             micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1814             STORE( &r[0], 0, CHAN_Y );
1815          }
1816
1817          if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1818             FETCH( &r[1], 0, CHAN_Y );
1819             micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1820
1821             FETCH( &r[2], 0, CHAN_W );
1822             micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
1823             micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
1824             micro_pow( &r[1], &r[1], &r[2] );
1825             micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1826             STORE( &r[0], 0, CHAN_Z );
1827          }
1828       }
1829
1830       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1831          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1832       }
1833       break;
1834
1835    case TGSI_OPCODE_RCP:
1836    /* TGSI_OPCODE_RECIP */
1837       FETCH( &r[0], 0, CHAN_X );
1838       micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1839       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1840          STORE( &r[0], 0, chan_index );
1841       }
1842       break;
1843
1844    case TGSI_OPCODE_RSQ:
1845    /* TGSI_OPCODE_RECIPSQRT */
1846       FETCH( &r[0], 0, CHAN_X );
1847       micro_sqrt( &r[0], &r[0] );
1848       micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1849       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1850          STORE( &r[0], 0, chan_index );
1851       }
1852       break;
1853
1854    case TGSI_OPCODE_EXP:
1855       FETCH( &r[0], 0, CHAN_X );
1856       micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
1857       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1858          micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
1859          STORE( &r[2], 0, CHAN_X );        /* store r2 */
1860       }
1861       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1862          micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
1863          STORE( &r[2], 0, CHAN_Y );        /* store r2 */
1864       }
1865       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1866          micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
1867          STORE( &r[2], 0, CHAN_Z );        /* store r2 */
1868       }
1869       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1870          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1871       }
1872       break;
1873
1874    case TGSI_OPCODE_LOG:
1875       FETCH( &r[0], 0, CHAN_X );
1876       micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
1877       micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
1878       micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
1879       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1880          STORE( &r[0], 0, CHAN_X );
1881       }
1882       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1883          micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
1884          micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
1885          STORE( &r[0], 0, CHAN_Y );
1886       }
1887       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1888          STORE( &r[1], 0, CHAN_Z );
1889       }
1890       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1891          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1892       }
1893       break;
1894
1895    case TGSI_OPCODE_MUL:
1896       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
1897       {
1898          FETCH(&r[0], 0, chan_index);
1899          FETCH(&r[1], 1, chan_index);
1900
1901          micro_mul( &r[0], &r[0], &r[1] );
1902
1903          STORE(&r[0], 0, chan_index);
1904       }
1905       break;
1906
1907    case TGSI_OPCODE_ADD:
1908       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1909          FETCH( &r[0], 0, chan_index );
1910          FETCH( &r[1], 1, chan_index );
1911          micro_add( &r[0], &r[0], &r[1] );
1912          STORE( &r[0], 0, chan_index );
1913       }
1914       break;
1915
1916    case TGSI_OPCODE_DP3:
1917    /* TGSI_OPCODE_DOT3 */
1918       FETCH( &r[0], 0, CHAN_X );
1919       FETCH( &r[1], 1, CHAN_X );
1920       micro_mul( &r[0], &r[0], &r[1] );
1921
1922       FETCH( &r[1], 0, CHAN_Y );
1923       FETCH( &r[2], 1, CHAN_Y );
1924       micro_mul( &r[1], &r[1], &r[2] );
1925       micro_add( &r[0], &r[0], &r[1] );
1926
1927       FETCH( &r[1], 0, CHAN_Z );
1928       FETCH( &r[2], 1, CHAN_Z );
1929       micro_mul( &r[1], &r[1], &r[2] );
1930       micro_add( &r[0], &r[0], &r[1] );
1931
1932       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1933          STORE( &r[0], 0, chan_index );
1934       }
1935       break;
1936
1937     case TGSI_OPCODE_DP4:
1938     /* TGSI_OPCODE_DOT4 */
1939        FETCH(&r[0], 0, CHAN_X);
1940        FETCH(&r[1], 1, CHAN_X);
1941
1942        micro_mul( &r[0], &r[0], &r[1] );
1943
1944        FETCH(&r[1], 0, CHAN_Y);
1945        FETCH(&r[2], 1, CHAN_Y);
1946
1947        micro_mul( &r[1], &r[1], &r[2] );
1948        micro_add( &r[0], &r[0], &r[1] );
1949
1950        FETCH(&r[1], 0, CHAN_Z);
1951        FETCH(&r[2], 1, CHAN_Z);
1952
1953        micro_mul( &r[1], &r[1], &r[2] );
1954        micro_add( &r[0], &r[0], &r[1] );
1955
1956        FETCH(&r[1], 0, CHAN_W);
1957        FETCH(&r[2], 1, CHAN_W);
1958
1959        micro_mul( &r[1], &r[1], &r[2] );
1960        micro_add( &r[0], &r[0], &r[1] );
1961
1962       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1963          STORE( &r[0], 0, chan_index );
1964       }
1965       break;
1966
1967    case TGSI_OPCODE_DST:
1968       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1969          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
1970       }
1971
1972       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1973          FETCH( &r[0], 0, CHAN_Y );
1974          FETCH( &r[1], 1, CHAN_Y);
1975          micro_mul( &r[0], &r[0], &r[1] );
1976          STORE( &r[0], 0, CHAN_Y );
1977       }
1978
1979       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1980          FETCH( &r[0], 0, CHAN_Z );
1981          STORE( &r[0], 0, CHAN_Z );
1982       }
1983
1984       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1985          FETCH( &r[0], 1, CHAN_W );
1986          STORE( &r[0], 0, CHAN_W );
1987       }
1988       break;
1989
1990    case TGSI_OPCODE_MIN:
1991       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1992          FETCH(&r[0], 0, chan_index);
1993          FETCH(&r[1], 1, chan_index);
1994
1995          /* XXX use micro_min()?? */
1996          micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
1997
1998          STORE(&r[0], 0, chan_index);
1999       }
2000       break;
2001
2002    case TGSI_OPCODE_MAX:
2003       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2004          FETCH(&r[0], 0, chan_index);
2005          FETCH(&r[1], 1, chan_index);
2006
2007          /* XXX use micro_max()?? */
2008          micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
2009
2010          STORE(&r[0], 0, chan_index );
2011       }
2012       break;
2013
2014    case TGSI_OPCODE_SLT:
2015    /* TGSI_OPCODE_SETLT */
2016       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2017          FETCH( &r[0], 0, chan_index );
2018          FETCH( &r[1], 1, chan_index );
2019          micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2020          STORE( &r[0], 0, chan_index );
2021       }
2022       break;
2023
2024    case TGSI_OPCODE_SGE:
2025    /* TGSI_OPCODE_SETGE */
2026       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2027          FETCH( &r[0], 0, chan_index );
2028          FETCH( &r[1], 1, chan_index );
2029          micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2030          STORE( &r[0], 0, chan_index );
2031       }
2032       break;
2033
2034    case TGSI_OPCODE_MAD:
2035    /* TGSI_OPCODE_MADD */
2036       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2037          FETCH( &r[0], 0, chan_index );
2038          FETCH( &r[1], 1, chan_index );
2039          micro_mul( &r[0], &r[0], &r[1] );
2040          FETCH( &r[1], 2, chan_index );
2041          micro_add( &r[0], &r[0], &r[1] );
2042          STORE( &r[0], 0, chan_index );
2043       }
2044       break;
2045
2046    case TGSI_OPCODE_SUB:
2047       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2048          FETCH(&r[0], 0, chan_index);
2049          FETCH(&r[1], 1, chan_index);
2050
2051          micro_sub( &r[0], &r[0], &r[1] );
2052
2053          STORE(&r[0], 0, chan_index);
2054       }
2055       break;
2056
2057    case TGSI_OPCODE_LERP:
2058    /* TGSI_OPCODE_LRP */
2059       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2060          FETCH(&r[0], 0, chan_index);
2061          FETCH(&r[1], 1, chan_index);
2062          FETCH(&r[2], 2, chan_index);
2063
2064          micro_sub( &r[1], &r[1], &r[2] );
2065          micro_mul( &r[0], &r[0], &r[1] );
2066          micro_add( &r[0], &r[0], &r[2] );
2067
2068          STORE(&r[0], 0, chan_index);
2069       }
2070       break;
2071
2072    case TGSI_OPCODE_CND:
2073       assert (0);
2074       break;
2075
2076    case TGSI_OPCODE_CND0:
2077       assert (0);
2078       break;
2079
2080    case TGSI_OPCODE_DOT2ADD:
2081       /* TGSI_OPCODE_DP2A */
2082       FETCH( &r[0], 0, CHAN_X );
2083       FETCH( &r[1], 1, CHAN_X );
2084       micro_mul( &r[0], &r[0], &r[1] );
2085
2086       FETCH( &r[1], 0, CHAN_Y );
2087       FETCH( &r[2], 1, CHAN_Y );
2088       micro_mul( &r[1], &r[1], &r[2] );
2089       micro_add( &r[0], &r[0], &r[1] );
2090
2091       FETCH( &r[2], 2, CHAN_X );
2092       micro_add( &r[0], &r[0], &r[2] );
2093
2094       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2095          STORE( &r[0], 0, chan_index );
2096       }
2097       break;
2098
2099    case TGSI_OPCODE_INDEX:
2100       assert (0);
2101       break;
2102
2103    case TGSI_OPCODE_NEGATE:
2104       assert (0);
2105       break;
2106
2107    case TGSI_OPCODE_FRAC:
2108    /* TGSI_OPCODE_FRC */
2109       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2110          FETCH( &r[0], 0, chan_index );
2111          micro_frc( &r[0], &r[0] );
2112          STORE( &r[0], 0, chan_index );
2113       }
2114       break;
2115
2116    case TGSI_OPCODE_CLAMP:
2117       assert (0);
2118       break;
2119
2120    case TGSI_OPCODE_FLOOR:
2121    /* TGSI_OPCODE_FLR */
2122       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2123          FETCH( &r[0], 0, chan_index );
2124          micro_flr( &r[0], &r[0] );
2125          STORE( &r[0], 0, chan_index );
2126       }
2127       break;
2128
2129    case TGSI_OPCODE_ROUND:
2130       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2131          FETCH( &r[0], 0, chan_index );
2132          micro_rnd( &r[0], &r[0] );
2133          STORE( &r[0], 0, chan_index );
2134       }
2135       break;
2136
2137    case TGSI_OPCODE_EXPBASE2:
2138     /* TGSI_OPCODE_EX2 */
2139       FETCH(&r[0], 0, CHAN_X);
2140
2141 #if FAST_MATH
2142       micro_exp2( &r[0], &r[0] );
2143 #else
2144       micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
2145 #endif
2146
2147       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2148          STORE( &r[0], 0, chan_index );
2149       }
2150       break;
2151
2152    case TGSI_OPCODE_LOGBASE2:
2153    /* TGSI_OPCODE_LG2 */
2154       FETCH( &r[0], 0, CHAN_X );
2155       micro_lg2( &r[0], &r[0] );
2156       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2157          STORE( &r[0], 0, chan_index );
2158       }
2159       break;
2160
2161    case TGSI_OPCODE_POWER:
2162       /* TGSI_OPCODE_POW */
2163       FETCH(&r[0], 0, CHAN_X);
2164       FETCH(&r[1], 1, CHAN_X);
2165
2166       micro_pow( &r[0], &r[0], &r[1] );
2167
2168       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2169          STORE( &r[0], 0, chan_index );
2170       }
2171       break;
2172
2173    case TGSI_OPCODE_CROSSPRODUCT:
2174       /* TGSI_OPCODE_XPD */
2175       FETCH(&r[0], 0, CHAN_Y);
2176       FETCH(&r[1], 1, CHAN_Z);
2177
2178       micro_mul( &r[2], &r[0], &r[1] );
2179
2180       FETCH(&r[3], 0, CHAN_Z);
2181       FETCH(&r[4], 1, CHAN_Y);
2182
2183       micro_mul( &r[5], &r[3], &r[4] );
2184       micro_sub( &r[2], &r[2], &r[5] );
2185
2186       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2187          STORE( &r[2], 0, CHAN_X );
2188       }
2189
2190       FETCH(&r[2], 1, CHAN_X);
2191
2192       micro_mul( &r[3], &r[3], &r[2] );
2193
2194       FETCH(&r[5], 0, CHAN_X);
2195
2196       micro_mul( &r[1], &r[1], &r[5] );
2197       micro_sub( &r[3], &r[3], &r[1] );
2198
2199       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2200          STORE( &r[3], 0, CHAN_Y );
2201       }
2202
2203       micro_mul( &r[5], &r[5], &r[4] );
2204       micro_mul( &r[0], &r[0], &r[2] );
2205       micro_sub( &r[5], &r[5], &r[0] );
2206
2207       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2208          STORE( &r[5], 0, CHAN_Z );
2209       }
2210
2211       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2212          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2213       }
2214       break;
2215
2216     case TGSI_OPCODE_MULTIPLYMATRIX:
2217        assert (0);
2218        break;
2219
2220     case TGSI_OPCODE_ABS:
2221        FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2222           FETCH(&r[0], 0, chan_index);
2223
2224           micro_abs( &r[0], &r[0] );
2225
2226           STORE(&r[0], 0, chan_index);
2227        }
2228        break;
2229
2230    case TGSI_OPCODE_RCC:
2231       assert (0);
2232       break;
2233
2234    case TGSI_OPCODE_DPH:
2235       FETCH(&r[0], 0, CHAN_X);
2236       FETCH(&r[1], 1, CHAN_X);
2237
2238       micro_mul( &r[0], &r[0], &r[1] );
2239
2240       FETCH(&r[1], 0, CHAN_Y);
2241       FETCH(&r[2], 1, CHAN_Y);
2242
2243       micro_mul( &r[1], &r[1], &r[2] );
2244       micro_add( &r[0], &r[0], &r[1] );
2245
2246       FETCH(&r[1], 0, CHAN_Z);
2247       FETCH(&r[2], 1, CHAN_Z);
2248
2249       micro_mul( &r[1], &r[1], &r[2] );
2250       micro_add( &r[0], &r[0], &r[1] );
2251
2252       FETCH(&r[1], 1, CHAN_W);
2253
2254       micro_add( &r[0], &r[0], &r[1] );
2255
2256       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2257          STORE( &r[0], 0, chan_index );
2258       }
2259       break;
2260
2261    case TGSI_OPCODE_COS:
2262       FETCH(&r[0], 0, CHAN_X);
2263
2264       micro_cos( &r[0], &r[0] );
2265
2266       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2267          STORE( &r[0], 0, chan_index );
2268       }
2269       break;
2270
2271    case TGSI_OPCODE_DDX:
2272       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2273          FETCH( &r[0], 0, chan_index );
2274          micro_ddx( &r[0], &r[0] );
2275          STORE( &r[0], 0, chan_index );
2276       }
2277       break;
2278
2279    case TGSI_OPCODE_DDY:
2280       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2281          FETCH( &r[0], 0, chan_index );
2282          micro_ddy( &r[0], &r[0] );
2283          STORE( &r[0], 0, chan_index );
2284       }
2285       break;
2286
2287    case TGSI_OPCODE_KILP:
2288       exec_kilp (mach, inst);
2289       break;
2290
2291    case TGSI_OPCODE_KIL:
2292       exec_kil (mach, inst);
2293       break;
2294
2295    case TGSI_OPCODE_PK2H:
2296       assert (0);
2297       break;
2298
2299    case TGSI_OPCODE_PK2US:
2300       assert (0);
2301       break;
2302
2303    case TGSI_OPCODE_PK4B:
2304       assert (0);
2305       break;
2306
2307    case TGSI_OPCODE_PK4UB:
2308       assert (0);
2309       break;
2310
2311    case TGSI_OPCODE_RFL:
2312       assert (0);
2313       break;
2314
2315    case TGSI_OPCODE_SEQ:
2316       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2317          FETCH( &r[0], 0, chan_index );
2318          FETCH( &r[1], 1, chan_index );
2319          micro_eq( &r[0], &r[0], &r[1],
2320                    &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
2321                    &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2322          STORE( &r[0], 0, chan_index );
2323       }
2324       break;
2325
2326    case TGSI_OPCODE_SFL:
2327       assert (0);
2328       break;
2329
2330    case TGSI_OPCODE_SGT:
2331       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2332          FETCH( &r[0], 0, chan_index );
2333          FETCH( &r[1], 1, chan_index );
2334          micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2335          STORE( &r[0], 0, chan_index );
2336       }
2337       break;
2338
2339    case TGSI_OPCODE_SIN:
2340       FETCH( &r[0], 0, CHAN_X );
2341       micro_sin( &r[0], &r[0] );
2342       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2343          STORE( &r[0], 0, chan_index );
2344       }
2345       break;
2346
2347    case TGSI_OPCODE_SLE:
2348       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2349          FETCH( &r[0], 0, chan_index );
2350          FETCH( &r[1], 1, chan_index );
2351          micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2352          STORE( &r[0], 0, chan_index );
2353       }
2354       break;
2355
2356    case TGSI_OPCODE_SNE:
2357       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2358          FETCH( &r[0], 0, chan_index );
2359          FETCH( &r[1], 1, chan_index );
2360          micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2361          STORE( &r[0], 0, chan_index );
2362       }
2363       break;
2364
2365    case TGSI_OPCODE_STR:
2366       assert (0);
2367       break;
2368
2369    case TGSI_OPCODE_TEX:
2370       /* simple texture lookup */
2371       /* src[0] = texcoord */
2372       /* src[1] = sampler unit */
2373       exec_tex(mach, inst, FALSE, FALSE);
2374       break;
2375
2376    case TGSI_OPCODE_TXB:
2377       /* Texture lookup with lod bias */
2378       /* src[0] = texcoord (src[0].w = LOD bias) */
2379       /* src[1] = sampler unit */
2380       exec_tex(mach, inst, TRUE, FALSE);
2381       break;
2382
2383    case TGSI_OPCODE_TXD:
2384       /* Texture lookup with explict partial derivatives */
2385       /* src[0] = texcoord */
2386       /* src[1] = d[strq]/dx */
2387       /* src[2] = d[strq]/dy */
2388       /* src[3] = sampler unit */
2389       assert (0);
2390       break;
2391
2392    case TGSI_OPCODE_TXL:
2393       /* Texture lookup with explit LOD */
2394       /* src[0] = texcoord (src[0].w = LOD) */
2395       /* src[1] = sampler unit */
2396       exec_tex(mach, inst, TRUE, FALSE);
2397       break;
2398
2399    case TGSI_OPCODE_TXP:
2400       /* Texture lookup with projection */
2401       /* src[0] = texcoord (src[0].w = projection) */
2402       /* src[1] = sampler unit */
2403       exec_tex(mach, inst, FALSE, TRUE);
2404       break;
2405
2406    case TGSI_OPCODE_UP2H:
2407       assert (0);
2408       break;
2409
2410    case TGSI_OPCODE_UP2US:
2411       assert (0);
2412       break;
2413
2414    case TGSI_OPCODE_UP4B:
2415       assert (0);
2416       break;
2417
2418    case TGSI_OPCODE_UP4UB:
2419       assert (0);
2420       break;
2421
2422    case TGSI_OPCODE_X2D:
2423       assert (0);
2424       break;
2425
2426    case TGSI_OPCODE_ARA:
2427       assert (0);
2428       break;
2429
2430    case TGSI_OPCODE_ARR:
2431       assert (0);
2432       break;
2433
2434    case TGSI_OPCODE_BRA:
2435       assert (0);
2436       break;
2437
2438    case TGSI_OPCODE_CAL:
2439       /* skip the call if no execution channels are enabled */
2440       if (mach->ExecMask) {
2441          /* do the call */
2442
2443          /* push the Cond, Loop, Cont stacks */
2444          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2445          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2446          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2447          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2448          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2449          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2450
2451          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2452          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2453
2454          /* note that PC was already incremented above */
2455          mach->CallStack[mach->CallStackTop++] = *pc;
2456          *pc = inst->InstructionExtLabel.Label;
2457       }
2458       break;
2459
2460    case TGSI_OPCODE_RET:
2461       mach->FuncMask &= ~mach->ExecMask;
2462       UPDATE_EXEC_MASK(mach);
2463
2464       if (mach->FuncMask == 0x0) {
2465          /* really return now (otherwise, keep executing */
2466
2467          if (mach->CallStackTop == 0) {
2468             /* returning from main() */
2469             *pc = -1;
2470             return;
2471          }
2472          *pc = mach->CallStack[--mach->CallStackTop];
2473
2474          /* pop the Cond, Loop, Cont stacks */
2475          assert(mach->CondStackTop > 0);
2476          mach->CondMask = mach->CondStack[--mach->CondStackTop];
2477          assert(mach->LoopStackTop > 0);
2478          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
2479          assert(mach->ContStackTop > 0);
2480          mach->ContMask = mach->ContStack[--mach->ContStackTop];
2481          assert(mach->FuncStackTop > 0);
2482          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2483
2484          UPDATE_EXEC_MASK(mach);
2485       }
2486       break;
2487
2488    case TGSI_OPCODE_SSG:
2489       assert (0);
2490       break;
2491
2492    case TGSI_OPCODE_CMP:
2493       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2494          FETCH(&r[0], 0, chan_index);
2495          FETCH(&r[1], 1, chan_index);
2496          FETCH(&r[2], 2, chan_index);
2497
2498          micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
2499
2500          STORE(&r[0], 0, chan_index);
2501       }
2502       break;
2503
2504    case TGSI_OPCODE_SCS:
2505       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2506          FETCH( &r[0], 0, CHAN_X );
2507       }
2508       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
2509          micro_cos( &r[1], &r[0] );
2510          STORE( &r[1], 0, CHAN_X );
2511       }
2512       if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2513          micro_sin( &r[1], &r[0] );
2514          STORE( &r[1], 0, CHAN_Y );
2515       }
2516       if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2517          STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2518       }
2519       if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2520          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2521       }
2522       break;
2523
2524    case TGSI_OPCODE_NRM:
2525       /* 3-component vector normalize */
2526       {
2527          union tgsi_exec_channel tmp, dot;
2528
2529          /* tmp = dp3(src0, src0): */
2530          FETCH( &r[0], 0, CHAN_X );
2531          micro_mul( &tmp, &r[0], &r[0] );
2532
2533          FETCH( &r[1], 0, CHAN_Y );
2534          micro_mul( &dot, &r[1], &r[1] );
2535          micro_add( &tmp, &tmp, &dot );
2536
2537          FETCH( &r[2], 0, CHAN_Z );
2538          micro_mul( &dot, &r[2], &r[2] );
2539          micro_add( &tmp, &tmp, &dot );
2540
2541          /* tmp = 1 / sqrt(tmp) */
2542          micro_sqrt( &tmp, &tmp );
2543          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2544
2545          /* note: w channel is undefined */
2546          FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2547             /* chan = chan * tmp */
2548             micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2549             STORE( &r[chan_index], 0, chan_index );
2550          }
2551       }
2552       break;
2553
2554    case TGSI_OPCODE_NRM4:
2555       /* 4-component vector normalize */
2556       {
2557          union tgsi_exec_channel tmp, dot;
2558
2559          /* tmp = dp4(src0, src0): */
2560          FETCH( &r[0], 0, CHAN_X );
2561          micro_mul( &tmp, &r[0], &r[0] );
2562
2563          FETCH( &r[1], 0, CHAN_Y );
2564          micro_mul( &dot, &r[1], &r[1] );
2565          micro_add( &tmp, &tmp, &dot );
2566
2567          FETCH( &r[2], 0, CHAN_Z );
2568          micro_mul( &dot, &r[2], &r[2] );
2569          micro_add( &tmp, &tmp, &dot );
2570
2571          FETCH( &r[3], 0, CHAN_W );
2572          micro_mul( &dot, &r[3], &r[3] );
2573          micro_add( &tmp, &tmp, &dot );
2574
2575          /* tmp = 1 / sqrt(tmp) */
2576          micro_sqrt( &tmp, &tmp );
2577          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2578
2579          FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2580             /* chan = chan * tmp */
2581             micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2582             STORE( &r[chan_index], 0, chan_index );
2583          }
2584       }
2585       break;
2586
2587    case TGSI_OPCODE_DIV:
2588       assert( 0 );
2589       break;
2590
2591    case TGSI_OPCODE_DP2:
2592       FETCH( &r[0], 0, CHAN_X );
2593       FETCH( &r[1], 1, CHAN_X );
2594       micro_mul( &r[0], &r[0], &r[1] );
2595
2596       FETCH( &r[1], 0, CHAN_Y );
2597       FETCH( &r[2], 1, CHAN_Y );
2598       micro_mul( &r[1], &r[1], &r[2] );
2599       micro_add( &r[0], &r[0], &r[1] );
2600
2601       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2602          STORE( &r[0], 0, chan_index );
2603       }
2604       break;
2605
2606    case TGSI_OPCODE_IF:
2607       /* push CondMask */
2608       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2609       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2610       FETCH( &r[0], 0, CHAN_X );
2611       /* update CondMask */
2612       if( ! r[0].u[0] ) {
2613          mach->CondMask &= ~0x1;
2614       }
2615       if( ! r[0].u[1] ) {
2616          mach->CondMask &= ~0x2;
2617       }
2618       if( ! r[0].u[2] ) {
2619          mach->CondMask &= ~0x4;
2620       }
2621       if( ! r[0].u[3] ) {
2622          mach->CondMask &= ~0x8;
2623       }
2624       UPDATE_EXEC_MASK(mach);
2625       /* Todo: If CondMask==0, jump to ELSE */
2626       break;
2627
2628    case TGSI_OPCODE_ELSE:
2629       /* invert CondMask wrt previous mask */
2630       {
2631          uint prevMask;
2632          assert(mach->CondStackTop > 0);
2633          prevMask = mach->CondStack[mach->CondStackTop - 1];
2634          mach->CondMask = ~mach->CondMask & prevMask;
2635          UPDATE_EXEC_MASK(mach);
2636          /* Todo: If CondMask==0, jump to ENDIF */
2637       }
2638       break;
2639
2640    case TGSI_OPCODE_ENDIF:
2641       /* pop CondMask */
2642       assert(mach->CondStackTop > 0);
2643       mach->CondMask = mach->CondStack[--mach->CondStackTop];
2644       UPDATE_EXEC_MASK(mach);
2645       break;
2646
2647    case TGSI_OPCODE_END:
2648       /* halt execution */
2649       *pc = -1;
2650       break;
2651
2652    case TGSI_OPCODE_REP:
2653       assert (0);
2654       break;
2655
2656    case TGSI_OPCODE_ENDREP:
2657        assert (0);
2658        break;
2659
2660    case TGSI_OPCODE_PUSHA:
2661       assert (0);
2662       break;
2663
2664    case TGSI_OPCODE_POPA:
2665       assert (0);
2666       break;
2667
2668    case TGSI_OPCODE_CEIL:
2669       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2670          FETCH( &r[0], 0, chan_index );
2671          micro_ceil( &r[0], &r[0] );
2672          STORE( &r[0], 0, chan_index );
2673       }
2674       break;
2675
2676    case TGSI_OPCODE_I2F:
2677       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2678          FETCH( &r[0], 0, chan_index );
2679          micro_i2f( &r[0], &r[0] );
2680          STORE( &r[0], 0, chan_index );
2681       }
2682       break;
2683
2684    case TGSI_OPCODE_NOT:
2685       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2686          FETCH( &r[0], 0, chan_index );
2687          micro_not( &r[0], &r[0] );
2688          STORE( &r[0], 0, chan_index );
2689       }
2690       break;
2691
2692    case TGSI_OPCODE_TRUNC:
2693       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2694          FETCH( &r[0], 0, chan_index );
2695          micro_trunc( &r[0], &r[0] );
2696          STORE( &r[0], 0, chan_index );
2697       }
2698       break;
2699
2700    case TGSI_OPCODE_SHL:
2701       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2702          FETCH( &r[0], 0, chan_index );
2703          FETCH( &r[1], 1, chan_index );
2704          micro_shl( &r[0], &r[0], &r[1] );
2705          STORE( &r[0], 0, chan_index );
2706       }
2707       break;
2708
2709    case TGSI_OPCODE_SHR:
2710       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2711          FETCH( &r[0], 0, chan_index );
2712          FETCH( &r[1], 1, chan_index );
2713          micro_ishr( &r[0], &r[0], &r[1] );
2714          STORE( &r[0], 0, chan_index );
2715       }
2716       break;
2717
2718    case TGSI_OPCODE_AND:
2719       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2720          FETCH( &r[0], 0, chan_index );
2721          FETCH( &r[1], 1, chan_index );
2722          micro_and( &r[0], &r[0], &r[1] );
2723          STORE( &r[0], 0, chan_index );
2724       }
2725       break;
2726
2727    case TGSI_OPCODE_OR:
2728       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2729          FETCH( &r[0], 0, chan_index );
2730          FETCH( &r[1], 1, chan_index );
2731          micro_or( &r[0], &r[0], &r[1] );
2732          STORE( &r[0], 0, chan_index );
2733       }
2734       break;
2735
2736    case TGSI_OPCODE_MOD:
2737       assert (0);
2738       break;
2739
2740    case TGSI_OPCODE_XOR:
2741       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2742          FETCH( &r[0], 0, chan_index );
2743          FETCH( &r[1], 1, chan_index );
2744          micro_xor( &r[0], &r[0], &r[1] );
2745          STORE( &r[0], 0, chan_index );
2746       }
2747       break;
2748
2749    case TGSI_OPCODE_SAD:
2750       assert (0);
2751       break;
2752
2753    case TGSI_OPCODE_TXF:
2754       assert (0);
2755       break;
2756
2757    case TGSI_OPCODE_TXQ:
2758       assert (0);
2759       break;
2760
2761    case TGSI_OPCODE_EMIT:
2762       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
2763       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
2764       break;
2765
2766    case TGSI_OPCODE_ENDPRIM:
2767       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
2768       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
2769       break;
2770
2771    case TGSI_OPCODE_LOOP:
2772       /* fall-through (for now) */
2773    case TGSI_OPCODE_BGNLOOP2:
2774       /* push LoopMask and ContMasks */
2775       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2776       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2777       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2778       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2779       break;
2780
2781    case TGSI_OPCODE_ENDLOOP:
2782       /* fall-through (for now at least) */
2783    case TGSI_OPCODE_ENDLOOP2:
2784       /* Restore ContMask, but don't pop */
2785       assert(mach->ContStackTop > 0);
2786       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
2787       UPDATE_EXEC_MASK(mach);
2788       if (mach->ExecMask) {
2789          /* repeat loop: jump to instruction just past BGNLOOP */
2790          *pc = inst->InstructionExtLabel.Label + 1;
2791       }
2792       else {
2793          /* exit loop: pop LoopMask */
2794          assert(mach->LoopStackTop > 0);
2795          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
2796          /* pop ContMask */
2797          assert(mach->ContStackTop > 0);
2798          mach->ContMask = mach->ContStack[--mach->ContStackTop];
2799       }
2800       UPDATE_EXEC_MASK(mach);
2801       break;
2802
2803    case TGSI_OPCODE_BRK:
2804       /* turn off loop channels for each enabled exec channel */
2805       mach->LoopMask &= ~mach->ExecMask;
2806       /* Todo: if mach->LoopMask == 0, jump to end of loop */
2807       UPDATE_EXEC_MASK(mach);
2808       break;
2809
2810    case TGSI_OPCODE_CONT:
2811       /* turn off cont channels for each enabled exec channel */
2812       mach->ContMask &= ~mach->ExecMask;
2813       /* Todo: if mach->LoopMask == 0, jump to end of loop */
2814       UPDATE_EXEC_MASK(mach);
2815       break;
2816
2817    case TGSI_OPCODE_BGNSUB:
2818       /* no-op */
2819       break;
2820
2821    case TGSI_OPCODE_ENDSUB:
2822       /* no-op */
2823       break;
2824
2825    case TGSI_OPCODE_NOISE1:
2826       assert( 0 );
2827       break;
2828
2829    case TGSI_OPCODE_NOISE2:
2830       assert( 0 );
2831       break;
2832
2833    case TGSI_OPCODE_NOISE3:
2834       assert( 0 );
2835       break;
2836
2837    case TGSI_OPCODE_NOISE4:
2838       assert( 0 );
2839       break;
2840
2841    case TGSI_OPCODE_NOP:
2842       break;
2843
2844    default:
2845       assert( 0 );
2846    }
2847 }
2848
2849
2850 /**
2851  * Run TGSI interpreter.
2852  * \return bitmask of "alive" quad components
2853  */
2854 uint
2855 tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
2856 {
2857    uint i;
2858    int pc = 0;
2859
2860    mach->CondMask = 0xf;
2861    mach->LoopMask = 0xf;
2862    mach->ContMask = 0xf;
2863    mach->FuncMask = 0xf;
2864    mach->ExecMask = 0xf;
2865
2866    mach->CondStackTop = 0; /* temporarily subvert this assertion */
2867    assert(mach->CondStackTop == 0);
2868    assert(mach->LoopStackTop == 0);
2869    assert(mach->ContStackTop == 0);
2870    assert(mach->CallStackTop == 0);
2871
2872    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
2873    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
2874
2875    if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
2876       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
2877       mach->Primitives[0] = 0;
2878    }
2879
2880    for (i = 0; i < QUAD_SIZE; i++) {
2881       mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
2882          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
2883          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
2884          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
2885          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
2886    }
2887
2888    /* execute declarations (interpolants) */
2889    for (i = 0; i < mach->NumDeclarations; i++) {
2890       exec_declaration( mach, mach->Declarations+i );
2891    }
2892
2893    /* execute instructions, until pc is set to -1 */
2894    while (pc != -1) {
2895       assert(pc < (int) mach->NumInstructions);
2896       exec_instruction( mach, mach->Instructions + pc, &pc );
2897    }
2898
2899 #if 0
2900    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
2901    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2902       /*
2903        * Scale back depth component.
2904        */
2905       for (i = 0; i < 4; i++)
2906          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
2907    }
2908 #endif
2909
2910    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
2911 }
2912
2913