src/gallium/auxiliary/tgsi/tgsi_exec.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the
   9  * "Software"), to deal in the Software without restriction, including
  10  * without limitation the rights to use, copy, modify, merge, publish,
  11  * distribute, sub license, and/or sell copies of the Software, and to
  12  * permit persons to whom the Software is furnished to do so, subject to
  13  * the following conditions:
  14  *
  15  * The above copyright notice and this permission notice (including the
  16  * next paragraph) shall be included in all copies or substantial portions
  17  * of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  22  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  26  *
  27  **************************************************************************/
  28
  29 /**
  30  * TGSI interpreter/executor.
  31  *
  32  * Flow control information:
  33  *
  34  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
  35  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
  36  * care since a condition may be true for some quad components but false
  37  * for other components.
  38  *
  39  * We basically execute all statements (even if they're in the part of
  40  * an IF/ELSE clause that's "not taken") and use a special mask to
  41  * control writing to destination registers.  This is the ExecMask.
  42  * See store_dest().
  43  *
 * The ExecMask is computed from several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK) which are
 * controlled by the flow control instructions (e.g. IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
  47  *
  48  *
  49  * Authors:
  50  *   Michal Krol
  51  *   Brian Paul
  52  */
  53
  54 #include "pipe/p_compiler.h"
  55 #include "pipe/p_state.h"
  56 #include "pipe/p_shader_tokens.h"
  57 #include "tgsi/tgsi_dump.h"
  58 #include "tgsi/tgsi_parse.h"
  59 #include "tgsi/tgsi_util.h"
  60 #include "tgsi_exec.h"
  61 #include "util/u_memory.h"
  62 #include "util/u_math.h"
  63
  64
  65 #define FAST_MATH 1
  66
  67 #define TILE_TOP_LEFT     0
  68 #define TILE_TOP_RIGHT    1
  69 #define TILE_BOTTOM_LEFT  2
  70 #define TILE_BOTTOM_RIGHT 3
  71
  72 static void
  73 micro_abs(union tgsi_exec_channel *dst,
  74           const union tgsi_exec_channel *src)
  75 {
  76    dst->f[0] = fabsf(src->f[0]);
  77    dst->f[1] = fabsf(src->f[1]);
  78    dst->f[2] = fabsf(src->f[2]);
  79    dst->f[3] = fabsf(src->f[3]);
  80 }
  81
  82 static void
  83 micro_arl(union tgsi_exec_channel *dst,
  84           const union tgsi_exec_channel *src)
  85 {
  86    dst->i[0] = (int)floorf(src->f[0]);
  87    dst->i[1] = (int)floorf(src->f[1]);
  88    dst->i[2] = (int)floorf(src->f[2]);
  89    dst->i[3] = (int)floorf(src->f[3]);
  90 }
  91
  92 static void
  93 micro_arr(union tgsi_exec_channel *dst,
  94           const union tgsi_exec_channel *src)
  95 {
  96    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
  97    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
  98    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
  99    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
 100 }
 101
 102 static void
 103 micro_ceil(union tgsi_exec_channel *dst,
 104            const union tgsi_exec_channel *src)
 105 {
 106    dst->f[0] = ceilf(src->f[0]);
 107    dst->f[1] = ceilf(src->f[1]);
 108    dst->f[2] = ceilf(src->f[2]);
 109    dst->f[3] = ceilf(src->f[3]);
 110 }
 111
 112 static void
 113 micro_cos(union tgsi_exec_channel *dst,
 114           const union tgsi_exec_channel *src)
 115 {
 116    dst->f[0] = cosf(src->f[0]);
 117    dst->f[1] = cosf(src->f[1]);
 118    dst->f[2] = cosf(src->f[2]);
 119    dst->f[3] = cosf(src->f[3]);
 120 }
 121
 122 static void
 123 micro_ddx(union tgsi_exec_channel *dst,
 124           const union tgsi_exec_channel *src)
 125 {
 126    dst->f[0] =
 127    dst->f[1] =
 128    dst->f[2] =
 129    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 130 }
 131
 132 static void
 133 micro_ddy(union tgsi_exec_channel *dst,
 134           const union tgsi_exec_channel *src)
 135 {
 136    dst->f[0] =
 137    dst->f[1] =
 138    dst->f[2] =
 139    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
 140 }
 141
 142 static void
 143 micro_exp2(union tgsi_exec_channel *dst,
 144            const union tgsi_exec_channel *src)
 145 {
 146 #if FAST_MATH
 147    dst->f[0] = util_fast_exp2(src->f[0]);
 148    dst->f[1] = util_fast_exp2(src->f[1]);
 149    dst->f[2] = util_fast_exp2(src->f[2]);
 150    dst->f[3] = util_fast_exp2(src->f[3]);
 151 #else
 152 #if DEBUG
 153    /* Inf is okay for this instruction, so clamp it to silence assertions. */
 154    uint i;
 155    union tgsi_exec_channel clamped;
 156
 157    for (i = 0; i < 4; i++) {
 158       if (src->f[i] > 127.99999f) {
 159          clamped.f[i] = 127.99999f;
 160       } else if (src->f[i] < -126.99999f) {
 161          clamped.f[i] = -126.99999f;
 162       } else {
 163          clamped.f[i] = src->f[i];
 164       }
 165    }
 166    src = &clamped;
 167 #endif /* DEBUG */
 168
 169    dst->f[0] = powf(2.0f, src->f[0]);
 170    dst->f[1] = powf(2.0f, src->f[1]);
 171    dst->f[2] = powf(2.0f, src->f[2]);
 172    dst->f[3] = powf(2.0f, src->f[3]);
 173 #endif /* FAST_MATH */
 174 }
 175
 176 static void
 177 micro_flr(union tgsi_exec_channel *dst,
 178           const union tgsi_exec_channel *src)
 179 {
 180    dst->f[0] = floorf(src->f[0]);
 181    dst->f[1] = floorf(src->f[1]);
 182    dst->f[2] = floorf(src->f[2]);
 183    dst->f[3] = floorf(src->f[3]);
 184 }
 185
 186 static void
 187 micro_frc(union tgsi_exec_channel *dst,
 188           const union tgsi_exec_channel *src)
 189 {
 190    dst->f[0] = src->f[0] - floorf(src->f[0]);
 191    dst->f[1] = src->f[1] - floorf(src->f[1]);
 192    dst->f[2] = src->f[2] - floorf(src->f[2]);
 193    dst->f[3] = src->f[3] - floorf(src->f[3]);
 194 }
 195
 196 static void
 197 micro_iabs(union tgsi_exec_channel *dst,
 198            const union tgsi_exec_channel *src)
 199 {
 200    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
 201    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
 202    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
 203    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
 204 }
 205
 206 static void
 207 micro_ineg(union tgsi_exec_channel *dst,
 208            const union tgsi_exec_channel *src)
 209 {
 210    dst->i[0] = -src->i[0];
 211    dst->i[1] = -src->i[1];
 212    dst->i[2] = -src->i[2];
 213    dst->i[3] = -src->i[3];
 214 }
 215
 216 static void
 217 micro_lg2(union tgsi_exec_channel *dst,
 218           const union tgsi_exec_channel *src)
 219 {
 220 #if FAST_MATH
 221    dst->f[0] = util_fast_log2(src->f[0]);
 222    dst->f[1] = util_fast_log2(src->f[1]);
 223    dst->f[2] = util_fast_log2(src->f[2]);
 224    dst->f[3] = util_fast_log2(src->f[3]);
 225 #else
 226    dst->f[0] = logf(src->f[0]) * 1.442695f;
 227    dst->f[1] = logf(src->f[1]) * 1.442695f;
 228    dst->f[2] = logf(src->f[2]) * 1.442695f;
 229    dst->f[3] = logf(src->f[3]) * 1.442695f;
 230 #endif
 231 }
 232
 233 static void
 234 micro_lrp(union tgsi_exec_channel *dst,
 235           const union tgsi_exec_channel *src)
 236 {
 237    dst->f[0] = src[0].f[0] * (src[1].f[0] - src[2].f[0]) + src[2].f[0];
 238    dst->f[1] = src[0].f[1] * (src[1].f[1] - src[2].f[1]) + src[2].f[1];
 239    dst->f[2] = src[0].f[2] * (src[1].f[2] - src[2].f[2]) + src[2].f[2];
 240    dst->f[3] = src[0].f[3] * (src[1].f[3] - src[2].f[3]) + src[2].f[3];
 241 }
 242
 243 static void
 244 micro_mad(union tgsi_exec_channel *dst,
 245           const union tgsi_exec_channel *src)
 246 {
 247    dst->f[0] = src[0].f[0] * src[1].f[0] + src[2].f[0];
 248    dst->f[1] = src[0].f[1] * src[1].f[1] + src[2].f[1];
 249    dst->f[2] = src[0].f[2] * src[1].f[2] + src[2].f[2];
 250    dst->f[3] = src[0].f[3] * src[1].f[3] + src[2].f[3];
 251 }
 252
 253 static void
 254 micro_mov(union tgsi_exec_channel *dst,
 255           const union tgsi_exec_channel *src)
 256 {
 257    dst->u[0] = src->u[0];
 258    dst->u[1] = src->u[1];
 259    dst->u[2] = src->u[2];
 260    dst->u[3] = src->u[3];
 261 }
 262
 263 static void
 264 micro_rcp(union tgsi_exec_channel *dst,
 265           const union tgsi_exec_channel *src)
 266 {
 267    dst->f[0] = 1.0f / src->f[0];
 268    dst->f[1] = 1.0f / src->f[1];
 269    dst->f[2] = 1.0f / src->f[2];
 270    dst->f[3] = 1.0f / src->f[3];
 271 }
 272
 273 static void
 274 micro_rnd(union tgsi_exec_channel *dst,
 275           const union tgsi_exec_channel *src)
 276 {
 277    dst->f[0] = floorf(src->f[0] + 0.5f);
 278    dst->f[1] = floorf(src->f[1] + 0.5f);
 279    dst->f[2] = floorf(src->f[2] + 0.5f);
 280    dst->f[3] = floorf(src->f[3] + 0.5f);
 281 }
 282
 283 static void
 284 micro_rsq(union tgsi_exec_channel *dst,
 285           const union tgsi_exec_channel *src)
 286 {
 287    dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
 288    dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
 289    dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
 290    dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
 291 }
 292
 293 static void
 294 micro_seq(union tgsi_exec_channel *dst,
 295           const union tgsi_exec_channel *src)
 296 {
 297    dst->f[0] = src[0].f[0] == src[1].f[0] ? 1.0f : 0.0f;
 298    dst->f[1] = src[0].f[1] == src[1].f[1] ? 1.0f : 0.0f;
 299    dst->f[2] = src[0].f[2] == src[1].f[2] ? 1.0f : 0.0f;
 300    dst->f[3] = src[0].f[3] == src[1].f[3] ? 1.0f : 0.0f;
 301 }
 302
 303 static void
 304 micro_sge(union tgsi_exec_channel *dst,
 305           const union tgsi_exec_channel *src)
 306 {
 307    dst->f[0] = src[0].f[0] >= src[1].f[0] ? 1.0f : 0.0f;
 308    dst->f[1] = src[0].f[1] >= src[1].f[1] ? 1.0f : 0.0f;
 309    dst->f[2] = src[0].f[2] >= src[1].f[2] ? 1.0f : 0.0f;
 310    dst->f[3] = src[0].f[3] >= src[1].f[3] ? 1.0f : 0.0f;
 311 }
 312
 313 static void
 314 micro_sgn(union tgsi_exec_channel *dst,
 315           const union tgsi_exec_channel *src)
 316 {
 317    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
 318    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
 319    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
 320    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
 321 }
 322
 323 static void
 324 micro_sgt(union tgsi_exec_channel *dst,
 325           const union tgsi_exec_channel *src)
 326 {
 327    dst->f[0] = src[0].f[0] > src[1].f[0] ? 1.0f : 0.0f;
 328    dst->f[1] = src[0].f[1] > src[1].f[1] ? 1.0f : 0.0f;
 329    dst->f[2] = src[0].f[2] > src[1].f[2] ? 1.0f : 0.0f;
 330    dst->f[3] = src[0].f[3] > src[1].f[3] ? 1.0f : 0.0f;
 331 }
 332
 333 static void
 334 micro_sin(union tgsi_exec_channel *dst,
 335           const union tgsi_exec_channel *src)
 336 {
 337    dst->f[0] = sinf(src->f[0]);
 338    dst->f[1] = sinf(src->f[1]);
 339    dst->f[2] = sinf(src->f[2]);
 340    dst->f[3] = sinf(src->f[3]);
 341 }
 342
 343 static void
 344 micro_sle(union tgsi_exec_channel *dst,
 345           const union tgsi_exec_channel *src)
 346 {
 347    dst->f[0] = src[0].f[0] <= src[1].f[0] ? 1.0f : 0.0f;
 348    dst->f[1] = src[0].f[1] <= src[1].f[1] ? 1.0f : 0.0f;
 349    dst->f[2] = src[0].f[2] <= src[1].f[2] ? 1.0f : 0.0f;
 350    dst->f[3] = src[0].f[3] <= src[1].f[3] ? 1.0f : 0.0f;
 351 }
 352
 353 static void
 354 micro_slt(union tgsi_exec_channel *dst,
 355           const union tgsi_exec_channel *src)
 356 {
 357    dst->f[0] = src[0].f[0] < src[1].f[0] ? 1.0f : 0.0f;
 358    dst->f[1] = src[0].f[1] < src[1].f[1] ? 1.0f : 0.0f;
 359    dst->f[2] = src[0].f[2] < src[1].f[2] ? 1.0f : 0.0f;
 360    dst->f[3] = src[0].f[3] < src[1].f[3] ? 1.0f : 0.0f;
 361 }
 362
 363 static void
 364 micro_sne(union tgsi_exec_channel *dst,
 365           const union tgsi_exec_channel *src)
 366 {
 367    dst->f[0] = src[0].f[0] != src[1].f[0] ? 1.0f : 0.0f;
 368    dst->f[1] = src[0].f[1] != src[1].f[1] ? 1.0f : 0.0f;
 369    dst->f[2] = src[0].f[2] != src[1].f[2] ? 1.0f : 0.0f;
 370    dst->f[3] = src[0].f[3] != src[1].f[3] ? 1.0f : 0.0f;
 371 }
 372
 373 static void
 374 micro_trunc(union tgsi_exec_channel *dst,
 375             const union tgsi_exec_channel *src)
 376 {
 377    dst->f[0] = (float)(int)src->f[0];
 378    dst->f[1] = (float)(int)src->f[1];
 379    dst->f[2] = (float)(int)src->f[2];
 380    dst->f[3] = (float)(int)src->f[3];
 381 }
 382
 383
 384 #define CHAN_X  0
 385 #define CHAN_Y  1
 386 #define CHAN_Z  2
 387 #define CHAN_W  3
 388
 389 enum tgsi_exec_datatype {
 390    TGSI_EXEC_DATA_FLOAT,
 391    TGSI_EXEC_DATA_INT,
 392    TGSI_EXEC_DATA_UINT
 393 };
 394
 395 /*
 396  * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 397  */
 398 #define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
 399 #define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
 400 #define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
 401 #define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
 402 #define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
 403 #define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
 404 #define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
 405 #define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
 406 #define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
 407 #define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
 408 #define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
 409 #define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
 410 #define TEMP_128_I         TGSI_EXEC_TEMP_128_I
 411 #define TEMP_128_C         TGSI_EXEC_TEMP_128_C
 412 #define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
 413 #define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
 414 #define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
 415 #define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
 416 #define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
 417 #define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
 418 #define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
 419 #define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
 420 #define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
 421 #define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
 422 #define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
 423 #define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
 424 #define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
 425 #define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
 426 #define TEMP_R0            TGSI_EXEC_TEMP_R0
 427 #define TEMP_P0            TGSI_EXEC_TEMP_P0
 428
 429 #define IS_CHANNEL_ENABLED(INST, CHAN)\
 430    ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
 431
 432 #define IS_CHANNEL_ENABLED2(INST, CHAN)\
 433    ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))
 434
 435 #define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
 436    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
 437       if (IS_CHANNEL_ENABLED( INST, CHAN ))
 438
 439 #define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
 440    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
 441       if (IS_CHANNEL_ENABLED2( INST, CHAN ))
 442
 443
 444 /** The execution mask depends on the conditional mask and the loop mask */
 445 #define UPDATE_EXEC_MASK(MACH) \
 446       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
 447
 448
 449 static const union tgsi_exec_channel ZeroVec =
 450    { { 0.0, 0.0, 0.0, 0.0 } };
 451
 452
 453 #define CHECK_INF_OR_NAN(chan) do {\
 454       assert(!util_is_inf_or_nan((chan)->f[0]));\
 455       assert(!util_is_inf_or_nan((chan)->f[1]));\
 456       assert(!util_is_inf_or_nan((chan)->f[2]));\
 457       assert(!util_is_inf_or_nan((chan)->f[3]));\
 458    } while (0)
 459
 460
 461 #ifdef DEBUG
 462 static void
 463 print_chan(const char *msg, const union tgsi_exec_channel *chan)
 464 {
 465    debug_printf("%s = {%f, %f, %f, %f}\n",
 466                 msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
 467 }
 468 #endif
 469
 470
 471 #ifdef DEBUG
 472 static void
 473 print_temp(const struct tgsi_exec_machine *mach, uint index)
 474 {
 475    const struct tgsi_exec_vector *tmp = &mach->Temps[index];
 476    int i;
 477    debug_printf("Temp[%u] =\n", index);
 478    for (i = 0; i < 4; i++) {
 479       debug_printf("  %c: { %f, %f, %f, %f }\n",
 480                    "XYZW"[i],
 481                    tmp->xyzw[i].f[0],
 482                    tmp->xyzw[i].f[1],
 483                    tmp->xyzw[i].f[2],
 484                    tmp->xyzw[i].f[3]);
 485    }
 486 }
 487 #endif
 488
 489
 490 /**
 491  * Check if there's a potential src/dst register data dependency when
 492  * using SOA execution.
 493  * Example:
 494  *   MOV T, T.yxwz;
 495  * This would expand into:
 496  *   MOV t0, t1;
 497  *   MOV t1, t0;
 498  *   MOV t2, t3;
 499  *   MOV t3, t2;
 500  * The second instruction will have the wrong value for t0 if executed as-is.
 501  */
 502 boolean
 503 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
 504 {
 505    uint i, chan;
 506
 507    uint writemask = inst->Dst[0].Register.WriteMask;
 508    if (writemask == TGSI_WRITEMASK_X ||
 509        writemask == TGSI_WRITEMASK_Y ||
 510        writemask == TGSI_WRITEMASK_Z ||
 511        writemask == TGSI_WRITEMASK_W ||
 512        writemask == TGSI_WRITEMASK_NONE) {
 513       /* no chance of data dependency */
 514       return FALSE;
 515    }
 516
 517    /* loop over src regs */
 518    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 519       if ((inst->Src[i].Register.File ==
 520            inst->Dst[0].Register.File) &&
 521           (inst->Src[i].Register.Index ==
 522            inst->Dst[0].Register.Index)) {
 523          /* loop over dest channels */
 524          uint channelsWritten = 0x0;
 525          FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
 526             /* check if we're reading a channel that's been written */
 527             uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
 528             if (channelsWritten & (1 << swizzle)) {
 529                return TRUE;
 530             }
 531
 532             channelsWritten |= (1 << chan);
 533          }
 534       }
 535    }
 536    return FALSE;
 537 }
 538
 539
 540 /**
 541  * Initialize machine state by expanding tokens to full instructions,
 542  * allocating temporary storage, setting up constants, etc.
 543  * After this, we can call tgsi_exec_machine_run() many times.
 544  */
 545 void
 546 tgsi_exec_machine_bind_shader(
 547    struct tgsi_exec_machine *mach,
 548    const struct tgsi_token *tokens,
 549    uint numSamplers,
 550    struct tgsi_sampler **samplers)
 551 {
 552    uint k;
 553    struct tgsi_parse_context parse;
 554    struct tgsi_exec_labels *labels = &mach->Labels;
 555    struct tgsi_full_instruction *instructions;
 556    struct tgsi_full_declaration *declarations;
 557    uint maxInstructions = 10, numInstructions = 0;
 558    uint maxDeclarations = 10, numDeclarations = 0;
 559    uint instno = 0;
 560
 561 #if 0
 562    tgsi_dump(tokens, 0);
 563 #endif
 564
 565    util_init_math();
 566
 567    mach->Tokens = tokens;
 568    mach->Samplers = samplers;
 569
 570    k = tgsi_parse_init (&parse, mach->Tokens);
 571    if (k != TGSI_PARSE_OK) {
 572       debug_printf( "Problem parsing!\n" );
 573       return;
 574    }
 575
 576    mach->Processor = parse.FullHeader.Processor.Processor;
 577    mach->ImmLimit = 0;
 578    labels->count = 0;
 579
 580    declarations = (struct tgsi_full_declaration *)
 581       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
 582
 583    if (!declarations) {
 584       return;
 585    }
 586
 587    instructions = (struct tgsi_full_instruction *)
 588       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
 589
 590    if (!instructions) {
 591       FREE( declarations );
 592       return;
 593    }
 594
 595    while( !tgsi_parse_end_of_tokens( &parse ) ) {
 596       uint pointer = parse.Position;
 597       uint i;
 598
 599       tgsi_parse_token( &parse );
 600       switch( parse.FullToken.Token.Type ) {
 601       case TGSI_TOKEN_TYPE_DECLARATION:
 602          /* save expanded declaration */
 603          if (numDeclarations == maxDeclarations) {
 604             declarations = REALLOC(declarations,
 605                                    maxDeclarations
 606                                    * sizeof(struct tgsi_full_declaration),
 607                                    (maxDeclarations + 10)
 608                                    * sizeof(struct tgsi_full_declaration));
 609             maxDeclarations += 10;
 610          }
 611          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
 612             unsigned reg;
 613             for (reg = parse.FullToken.FullDeclaration.Range.First;
 614                  reg <= parse.FullToken.FullDeclaration.Range.Last;
 615                  ++reg) {
 616                ++mach->NumOutputs;
 617             }
 618          }
 619          memcpy(declarations + numDeclarations,
 620                 &parse.FullToken.FullDeclaration,
 621                 sizeof(declarations[0]));
 622          numDeclarations++;
 623          break;
 624
 625       case TGSI_TOKEN_TYPE_IMMEDIATE:
 626          {
 627             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
 628             assert( size <= 4 );
 629             assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
 630
 631             for( i = 0; i < size; i++ ) {
 632                mach->Imms[mach->ImmLimit][i] =
 633                   parse.FullToken.FullImmediate.u[i].Float;
 634             }
 635             mach->ImmLimit += 1;
 636          }
 637          break;
 638
 639       case TGSI_TOKEN_TYPE_INSTRUCTION:
 640          assert( labels->count < MAX_LABELS );
 641
 642          labels->labels[labels->count][0] = instno;
 643          labels->labels[labels->count][1] = pointer;
 644          labels->count++;
 645
 646          /* save expanded instruction */
 647          if (numInstructions == maxInstructions) {
 648             instructions = REALLOC(instructions,
 649                                    maxInstructions
 650                                    * sizeof(struct tgsi_full_instruction),
 651                                    (maxInstructions + 10)
 652                                    * sizeof(struct tgsi_full_instruction));
 653             maxInstructions += 10;
 654          }
 655
 656          memcpy(instructions + numInstructions,
 657                 &parse.FullToken.FullInstruction,
 658                 sizeof(instructions[0]));
 659
 660          numInstructions++;
 661          break;
 662
 663       case TGSI_TOKEN_TYPE_PROPERTY:
 664          break;
 665
 666       default:
 667          assert( 0 );
 668       }
 669    }
 670    tgsi_parse_free (&parse);
 671
 672    if (mach->Declarations) {
 673       FREE( mach->Declarations );
 674    }
 675    mach->Declarations = declarations;
 676    mach->NumDeclarations = numDeclarations;
 677
 678    if (mach->Instructions) {
 679       FREE( mach->Instructions );
 680    }
 681    mach->Instructions = instructions;
 682    mach->NumInstructions = numInstructions;
 683 }
 684
 685
 686 struct tgsi_exec_machine *
 687 tgsi_exec_machine_create( void )
 688 {
 689    struct tgsi_exec_machine *mach;
 690    uint i;
 691
 692    mach = align_malloc( sizeof *mach, 16 );
 693    if (!mach)
 694       goto fail;
 695
 696    memset(mach, 0, sizeof(*mach));
 697
 698    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
 699    mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
 700    mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
 701
 702    /* Setup constants. */
 703    for( i = 0; i < 4; i++ ) {
 704       mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
 705       mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
 706       mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
 707       mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
 708       mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
 709       mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
 710       mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
 711       mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
 712       mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
 713       mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
 714    }
 715
 716 #ifdef DEBUG
 717    /* silence warnings */
 718    (void) print_chan;
 719    (void) print_temp;
 720 #endif
 721
 722    return mach;
 723
 724 fail:
 725    align_free(mach);
 726    return NULL;
 727 }
 728
 729
 730 void
 731 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
 732 {
 733    if (mach) {
 734       FREE(mach->Instructions);
 735       FREE(mach->Declarations);
 736    }
 737
 738    align_free(mach);
 739 }
 740
 741 static void
 742 micro_add(
 743    union tgsi_exec_channel *dst,
 744    const union tgsi_exec_channel *src0,
 745    const union tgsi_exec_channel *src1 )
 746 {
 747    dst->f[0] = src0->f[0] + src1->f[0];
 748    dst->f[1] = src0->f[1] + src1->f[1];
 749    dst->f[2] = src0->f[2] + src1->f[2];
 750    dst->f[3] = src0->f[3] + src1->f[3];
 751 }
 752
 753 static void
 754 micro_div(
 755    union tgsi_exec_channel *dst,
 756    const union tgsi_exec_channel *src0,
 757    const union tgsi_exec_channel *src1 )
 758 {
 759    if (src1->f[0] != 0) {
 760       dst->f[0] = src0->f[0] / src1->f[0];
 761    }
 762    if (src1->f[1] != 0) {
 763       dst->f[1] = src0->f[1] / src1->f[1];
 764    }
 765    if (src1->f[2] != 0) {
 766       dst->f[2] = src0->f[2] / src1->f[2];
 767    }
 768    if (src1->f[3] != 0) {
 769       dst->f[3] = src0->f[3] / src1->f[3];
 770    }
 771 }
 772
 773 static void
 774 micro_float_clamp(union tgsi_exec_channel *dst,
 775                   const union tgsi_exec_channel *src)
 776 {
 777    uint i;
 778
 779    for (i = 0; i < 4; i++) {
 780       if (src->f[i] > 0.0f) {
 781          if (src->f[i] > 1.884467e+019f)
 782             dst->f[i] = 1.884467e+019f;
 783          else if (src->f[i] < 5.42101e-020f)
 784             dst->f[i] = 5.42101e-020f;
 785          else
 786             dst->f[i] = src->f[i];
 787       }
 788       else {
 789          if (src->f[i] < -1.884467e+019f)
 790             dst->f[i] = -1.884467e+019f;
 791          else if (src->f[i] > -5.42101e-020f)
 792             dst->f[i] = -5.42101e-020f;
 793          else
 794             dst->f[i] = src->f[i];
 795       }
 796    }
 797 }
 798
 799 static void
 800 micro_lt(
 801    union tgsi_exec_channel *dst,
 802    const union tgsi_exec_channel *src0,
 803    const union tgsi_exec_channel *src1,
 804    const union tgsi_exec_channel *src2,
 805    const union tgsi_exec_channel *src3 )
 806 {
 807    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
 808    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
 809    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
 810    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
 811 }
 812
 813 static void
 814 micro_max(
 815    union tgsi_exec_channel *dst,
 816    const union tgsi_exec_channel *src0,
 817    const union tgsi_exec_channel *src1 )
 818 {
 819    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
 820    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
 821    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
 822    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
 823 }
 824
 825 static void
 826 micro_min(
 827    union tgsi_exec_channel *dst,
 828    const union tgsi_exec_channel *src0,
 829    const union tgsi_exec_channel *src1 )
 830 {
 831    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
 832    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
 833    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
 834    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
 835 }
 836
 837 static void
 838 micro_mul(
 839    union tgsi_exec_channel *dst,
 840    const union tgsi_exec_channel *src0,
 841    const union tgsi_exec_channel *src1 )
 842 {
 843    dst->f[0] = src0->f[0] * src1->f[0];
 844    dst->f[1] = src0->f[1] * src1->f[1];
 845    dst->f[2] = src0->f[2] * src1->f[2];
 846    dst->f[3] = src0->f[3] * src1->f[3];
 847 }
 848
 849 #if 0
 850 static void
 851 micro_imul64(
 852    union tgsi_exec_channel *dst0,
 853    union tgsi_exec_channel *dst1,
 854    const union tgsi_exec_channel *src0,
 855    const union tgsi_exec_channel *src1 )
 856 {
 857    dst1->i[0] = src0->i[0] * src1->i[0];
 858    dst1->i[1] = src0->i[1] * src1->i[1];
 859    dst1->i[2] = src0->i[2] * src1->i[2];
 860    dst1->i[3] = src0->i[3] * src1->i[3];
 861    dst0->i[0] = 0;
 862    dst0->i[1] = 0;
 863    dst0->i[2] = 0;
 864    dst0->i[3] = 0;
 865 }
 866 #endif
 867
 868 #if 0
 869 static void
 870 micro_umul64(
 871    union tgsi_exec_channel *dst0,
 872    union tgsi_exec_channel *dst1,
 873    const union tgsi_exec_channel *src0,
 874    const union tgsi_exec_channel *src1 )
 875 {
 876    dst1->u[0] = src0->u[0] * src1->u[0];
 877    dst1->u[1] = src0->u[1] * src1->u[1];
 878    dst1->u[2] = src0->u[2] * src1->u[2];
 879    dst1->u[3] = src0->u[3] * src1->u[3];
 880    dst0->u[0] = 0;
 881    dst0->u[1] = 0;
 882    dst0->u[2] = 0;
 883    dst0->u[3] = 0;
 884 }
 885 #endif
 886
 887
 888 #if 0
 889 static void
 890 micro_movc(
 891    union tgsi_exec_channel *dst,
 892    const union tgsi_exec_channel *src0,
 893    const union tgsi_exec_channel *src1,
 894    const union tgsi_exec_channel *src2 )
 895 {
 896    dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
 897    dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
 898    dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
 899    dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
 900 }
 901 #endif
 902
 903 static void
 904 micro_neg(
 905    union tgsi_exec_channel *dst,
 906    const union tgsi_exec_channel *src )
 907 {
 908    dst->f[0] = -src->f[0];
 909    dst->f[1] = -src->f[1];
 910    dst->f[2] = -src->f[2];
 911    dst->f[3] = -src->f[3];
 912 }
 913
 914 static void
 915 micro_pow(
 916    union tgsi_exec_channel *dst,
 917    const union tgsi_exec_channel *src0,
 918    const union tgsi_exec_channel *src1 )
 919 {
 920 #if FAST_MATH
 921    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
 922    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
 923    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
 924    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
 925 #else
 926    dst->f[0] = powf( src0->f[0], src1->f[0] );
 927    dst->f[1] = powf( src0->f[1], src1->f[1] );
 928    dst->f[2] = powf( src0->f[2], src1->f[2] );
 929    dst->f[3] = powf( src0->f[3], src1->f[3] );
 930 #endif
 931 }
 932
 933 static void
 934 micro_sqrt( union tgsi_exec_channel *dst,
 935             const union tgsi_exec_channel *src )
 936 {
 937    dst->f[0] = sqrtf( src->f[0] );
 938    dst->f[1] = sqrtf( src->f[1] );
 939    dst->f[2] = sqrtf( src->f[2] );
 940    dst->f[3] = sqrtf( src->f[3] );
 941 }
 942
 943 static void
 944 micro_sub(
 945    union tgsi_exec_channel *dst,
 946    const union tgsi_exec_channel *src0,
 947    const union tgsi_exec_channel *src1 )
 948 {
 949    dst->f[0] = src0->f[0] - src1->f[0];
 950    dst->f[1] = src0->f[1] - src1->f[1];
 951    dst->f[2] = src0->f[2] - src1->f[2];
 952    dst->f[3] = src0->f[3] - src1->f[3];
 953 }
 954
 955 static void
 956 fetch_src_file_channel(
 957    const struct tgsi_exec_machine *mach,
 958    const uint file,
 959    const uint swizzle,
 960    const union tgsi_exec_channel *index,
 961    union tgsi_exec_channel *chan )
 962 {
 963    switch( swizzle ) {
 964    case TGSI_SWIZZLE_X:
 965    case TGSI_SWIZZLE_Y:
 966    case TGSI_SWIZZLE_Z:
 967    case TGSI_SWIZZLE_W:
 968       switch( file ) {
 969       case TGSI_FILE_CONSTANT:
 970          assert(mach->Consts);
 971          if (index->i[0] < 0)
 972             chan->f[0] = 0.0f;
 973          else
 974             chan->f[0] = mach->Consts[index->i[0]][swizzle];
 975          if (index->i[1] < 0)
 976             chan->f[1] = 0.0f;
 977          else
 978             chan->f[1] = mach->Consts[index->i[1]][swizzle];
 979          if (index->i[2] < 0)
 980             chan->f[2] = 0.0f;
 981          else
 982             chan->f[2] = mach->Consts[index->i[2]][swizzle];
 983          if (index->i[3] < 0)
 984             chan->f[3] = 0.0f;
 985          else
 986             chan->f[3] = mach->Consts[index->i[3]][swizzle];
 987          break;
 988
 989       case TGSI_FILE_INPUT:
 990       case TGSI_FILE_SYSTEM_VALUE:
 991          chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
 992          chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
 993          chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
 994          chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
 995          break;
 996
 997       case TGSI_FILE_TEMPORARY:
 998          assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
 999          chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1000          chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1001          chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1002          chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1003          break;
1004
1005       case TGSI_FILE_IMMEDIATE:
1006          assert( index->i[0] < (int) mach->ImmLimit );
1007          chan->f[0] = mach->Imms[index->i[0]][swizzle];
1008          assert( index->i[1] < (int) mach->ImmLimit );
1009          chan->f[1] = mach->Imms[index->i[1]][swizzle];
1010          assert( index->i[2] < (int) mach->ImmLimit );
1011          chan->f[2] = mach->Imms[index->i[2]][swizzle];
1012          assert( index->i[3] < (int) mach->ImmLimit );
1013          chan->f[3] = mach->Imms[index->i[3]][swizzle];
1014          break;
1015
1016       case TGSI_FILE_ADDRESS:
1017          chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1018          chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1019          chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1020          chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1021          break;
1022
1023       case TGSI_FILE_PREDICATE:
1024          assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
1025          assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
1026          assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
1027          assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
1028          chan->u[0] = mach->Predicates[0].xyzw[swizzle].u[0];
1029          chan->u[1] = mach->Predicates[0].xyzw[swizzle].u[1];
1030          chan->u[2] = mach->Predicates[0].xyzw[swizzle].u[2];
1031          chan->u[3] = mach->Predicates[0].xyzw[swizzle].u[3];
1032          break;
1033
1034       case TGSI_FILE_OUTPUT:
1035          /* vertex/fragment output vars can be read too */
1036          chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1037          chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1038          chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1039          chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1040          break;
1041
1042       default:
1043          assert( 0 );
1044       }
1045       break;
1046
1047    default:
1048       assert( 0 );
1049    }
1050 }
1051
1052 static void
1053 fetch_source(const struct tgsi_exec_machine *mach,
1054              union tgsi_exec_channel *chan,
1055              const struct tgsi_full_src_register *reg,
1056              const uint chan_index,
1057              enum tgsi_exec_datatype src_datatype)
1058 {
1059    union tgsi_exec_channel index;
1060    uint swizzle;
1061
1062    /* We start with a direct index into a register file.
1063     *
1064     *    file[1],
1065     *    where:
1066     *       file = Register.File
1067     *       [1] = Register.Index
1068     */
1069    index.i[0] =
1070    index.i[1] =
1071    index.i[2] =
1072    index.i[3] = reg->Register.Index;
1073
1074    /* There is an extra source register that indirectly subscripts
1075     * a register file. The direct index now becomes an offset
1076     * that is being added to the indirect register.
1077     *
1078     *    file[ind[2].x+1],
1079     *    where:
1080     *       ind = Indirect.File
1081     *       [2] = Indirect.Index
1082     *       .x = Indirect.SwizzleX
1083     */
1084    if (reg->Register.Indirect) {
1085       union tgsi_exec_channel index2;
1086       union tgsi_exec_channel indir_index;
1087       const uint execmask = mach->ExecMask;
1088       uint i;
1089
1090       /* which address register (always zero now) */
1091       index2.i[0] =
1092       index2.i[1] =
1093       index2.i[2] =
1094       index2.i[3] = reg->Indirect.Index;
1095
1096       /* get current value of address register[swizzle] */
1097       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1098       fetch_src_file_channel(
1099          mach,
1100          reg->Indirect.File,
1101          swizzle,
1102          &index2,
1103          &indir_index );
1104
1105       /* add value of address register to the offset */
1106       index.i[0] += indir_index.i[0];
1107       index.i[1] += indir_index.i[1];
1108       index.i[2] += indir_index.i[2];
1109       index.i[3] += indir_index.i[3];
1110
1111       /* for disabled execution channels, zero-out the index to
1112        * avoid using a potential garbage value.
1113        */
1114       for (i = 0; i < QUAD_SIZE; i++) {
1115          if ((execmask & (1 << i)) == 0)
1116             index.i[i] = 0;
1117       }
1118    }
1119
1120    /* There is an extra source register that is a second
1121     * subscript to a register file. Effectively it means that
1122     * the register file is actually a 2D array of registers.
1123     *
1124     *    file[1][3] == file[1*sizeof(file[1])+3],
1125     *    where:
1126     *       [3] = Dimension.Index
1127     */
1128    if (reg->Register.Dimension) {
1129       /* The size of the first-order array depends on the register file type.
1130        * We need to multiply the index to the first array to get an effective,
1131        * "flat" index that points to the beginning of the second-order array.
1132        */
1133       switch (reg->Register.File) {
1134       case TGSI_FILE_INPUT:
1135       case TGSI_FILE_SYSTEM_VALUE:
1136          index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1137          index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1138          index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1139          index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1140          break;
1141       case TGSI_FILE_CONSTANT:
1142          index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1143          index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1144          index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1145          index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1146          break;
1147       default:
1148          assert( 0 );
1149       }
1150
1151       index.i[0] += reg->Dimension.Index;
1152       index.i[1] += reg->Dimension.Index;
1153       index.i[2] += reg->Dimension.Index;
1154       index.i[3] += reg->Dimension.Index;
1155
1156       /* Again, the second subscript index can be addressed indirectly
1157        * identically to the first one.
1158        * Nothing stops us from indirectly addressing the indirect register,
1159        * but there is no need for that, so we won't exercise it.
1160        *
1161        *    file[1][ind[4].y+3],
1162        *    where:
1163        *       ind = DimIndirect.File
1164        *       [4] = DimIndirect.Index
1165        *       .y = DimIndirect.SwizzleX
1166        */
1167       if (reg->Dimension.Indirect) {
1168          union tgsi_exec_channel index2;
1169          union tgsi_exec_channel indir_index;
1170          const uint execmask = mach->ExecMask;
1171          uint i;
1172
1173          index2.i[0] =
1174          index2.i[1] =
1175          index2.i[2] =
1176          index2.i[3] = reg->DimIndirect.Index;
1177
1178          swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1179          fetch_src_file_channel(
1180             mach,
1181             reg->DimIndirect.File,
1182             swizzle,
1183             &index2,
1184             &indir_index );
1185
1186          index.i[0] += indir_index.i[0];
1187          index.i[1] += indir_index.i[1];
1188          index.i[2] += indir_index.i[2];
1189          index.i[3] += indir_index.i[3];
1190
1191          /* for disabled execution channels, zero-out the index to
1192           * avoid using a potential garbage value.
1193           */
1194          for (i = 0; i < QUAD_SIZE; i++) {
1195             if ((execmask & (1 << i)) == 0)
1196                index.i[i] = 0;
1197          }
1198       }
1199
1200       /* If by any chance there was a need for a 3D array of register
1201        * files, we would have to check whether Dimension is followed
1202        * by a dimension register and continue the saga.
1203        */
1204    }
1205
1206    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1207    fetch_src_file_channel(
1208       mach,
1209       reg->Register.File,
1210       swizzle,
1211       &index,
1212       chan );
1213
1214    if (reg->Register.Absolute) {
1215       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1216          micro_abs(chan, chan);
1217       } else {
1218          micro_iabs(chan, chan);
1219       }
1220    }
1221
1222    if (reg->Register.Negate) {
1223       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1224          micro_neg(chan, chan);
1225       } else {
1226          micro_ineg(chan, chan);
1227       }
1228    }
1229 }
1230
1231 static void
1232 store_dest(struct tgsi_exec_machine *mach,
1233            const union tgsi_exec_channel *chan,
1234            const struct tgsi_full_dst_register *reg,
1235            const struct tgsi_full_instruction *inst,
1236            uint chan_index,
1237            enum tgsi_exec_datatype dst_datatype)
1238 {
1239    uint i;
1240    union tgsi_exec_channel null;
1241    union tgsi_exec_channel *dst;
1242    uint execmask = mach->ExecMask;
1243    int offset = 0;  /* indirection offset */
1244    int index;
1245
1246    if (dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1247       CHECK_INF_OR_NAN(chan);
1248    }
1249
1250    /* There is an extra source register that indirectly subscripts
1251     * a register file. The direct index now becomes an offset
1252     * that is being added to the indirect register.
1253     *
1254     *    file[ind[2].x+1],
1255     *    where:
1256     *       ind = Indirect.File
1257     *       [2] = Indirect.Index
1258     *       .x = Indirect.SwizzleX
1259     */
1260    if (reg->Register.Indirect) {
1261       union tgsi_exec_channel index;
1262       union tgsi_exec_channel indir_index;
1263       uint swizzle;
1264
1265       /* which address register (always zero for now) */
1266       index.i[0] =
1267       index.i[1] =
1268       index.i[2] =
1269       index.i[3] = reg->Indirect.Index;
1270
1271       /* get current value of address register[swizzle] */
1272       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1273
1274       /* fetch values from the address/indirection register */
1275       fetch_src_file_channel(
1276          mach,
1277          reg->Indirect.File,
1278          swizzle,
1279          &index,
1280          &indir_index );
1281
1282       /* save indirection offset */
1283       offset = indir_index.i[0];
1284    }
1285
1286    switch (reg->Register.File) {
1287    case TGSI_FILE_NULL:
1288       dst = &null;
1289       break;
1290
1291    case TGSI_FILE_OUTPUT:
1292       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1293          + reg->Register.Index;
1294       dst = &mach->Outputs[offset + index].xyzw[chan_index];
1295 #if 0
1296       if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1297          fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1298          for (i = 0; i < QUAD_SIZE; i++)
1299             if (execmask & (1 << i))
1300                fprintf(stderr, "%f, ", chan->f[i]);
1301          fprintf(stderr, ")\n");
1302       }
1303 #endif
1304       break;
1305
1306    case TGSI_FILE_TEMPORARY:
1307       index = reg->Register.Index;
1308       assert( index < TGSI_EXEC_NUM_TEMPS );
1309       dst = &mach->Temps[offset + index].xyzw[chan_index];
1310       break;
1311
1312    case TGSI_FILE_ADDRESS:
1313       index = reg->Register.Index;
1314       dst = &mach->Addrs[index].xyzw[chan_index];
1315       break;
1316
1317    case TGSI_FILE_LOOP:
1318       assert(reg->Register.Index == 0);
1319       assert(mach->LoopCounterStackTop > 0);
1320       assert(chan_index == CHAN_X);
1321       dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1322       break;
1323
1324    case TGSI_FILE_PREDICATE:
1325       index = reg->Register.Index;
1326       assert(index < TGSI_EXEC_NUM_PREDS);
1327       dst = &mach->Predicates[index].xyzw[chan_index];
1328       break;
1329
1330    default:
1331       assert( 0 );
1332       return;
1333    }
1334
1335    if (inst->Instruction.Predicate) {
1336       uint swizzle;
1337       union tgsi_exec_channel *pred;
1338
1339       switch (chan_index) {
1340       case CHAN_X:
1341          swizzle = inst->Predicate.SwizzleX;
1342          break;
1343       case CHAN_Y:
1344          swizzle = inst->Predicate.SwizzleY;
1345          break;
1346       case CHAN_Z:
1347          swizzle = inst->Predicate.SwizzleZ;
1348          break;
1349       case CHAN_W:
1350          swizzle = inst->Predicate.SwizzleW;
1351          break;
1352       default:
1353          assert(0);
1354          return;
1355       }
1356
1357       assert(inst->Predicate.Index == 0);
1358
1359       pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1360
1361       if (inst->Predicate.Negate) {
1362          for (i = 0; i < QUAD_SIZE; i++) {
1363             if (pred->u[i]) {
1364                execmask &= ~(1 << i);
1365             }
1366          }
1367       } else {
1368          for (i = 0; i < QUAD_SIZE; i++) {
1369             if (!pred->u[i]) {
1370                execmask &= ~(1 << i);
1371             }
1372          }
1373       }
1374    }
1375
1376    switch (inst->Instruction.Saturate) {
1377    case TGSI_SAT_NONE:
1378       for (i = 0; i < QUAD_SIZE; i++)
1379          if (execmask & (1 << i))
1380             dst->i[i] = chan->i[i];
1381       break;
1382
1383    case TGSI_SAT_ZERO_ONE:
1384       for (i = 0; i < QUAD_SIZE; i++)
1385          if (execmask & (1 << i)) {
1386             if (chan->f[i] < 0.0f)
1387                dst->f[i] = 0.0f;
1388             else if (chan->f[i] > 1.0f)
1389                dst->f[i] = 1.0f;
1390             else
1391                dst->i[i] = chan->i[i];
1392          }
1393       break;
1394
1395    case TGSI_SAT_MINUS_PLUS_ONE:
1396       for (i = 0; i < QUAD_SIZE; i++)
1397          if (execmask & (1 << i)) {
1398             if (chan->f[i] < -1.0f)
1399                dst->f[i] = -1.0f;
1400             else if (chan->f[i] > 1.0f)
1401                dst->f[i] = 1.0f;
1402             else
1403                dst->i[i] = chan->i[i];
1404          }
1405       break;
1406
1407    default:
1408       assert( 0 );
1409    }
1410 }
1411
/* Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL, as float data.  Expands to a fetch_source() call and relies on
 * variables named `mach` and `inst` being in scope at the point of use.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/* Store VAL to channel CHAN of destination operand INDEX of the current
 * instruction, as float data.  Expands to a store_dest() call and relies on
 * variables named `mach` and `inst` being in scope at the point of use.
 */
#define STORE(VAL,INDEX,CHAN)\
   store_dest(mach, VAL, &inst->Dst[INDEX], inst, CHAN, TGSI_EXEC_DATA_FLOAT)
1417
1418
1419 /**
1420  * Execute ARB-style KIL which is predicated by a src register.
1421  * Kill fragment if any of the four values is less than zero.
1422  */
1423 static void
1424 exec_kil(struct tgsi_exec_machine *mach,
1425          const struct tgsi_full_instruction *inst)
1426 {
1427    uint uniquemask;
1428    uint chan_index;
1429    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1430    union tgsi_exec_channel r[1];
1431
1432    /* This mask stores component bits that were already tested. */
1433    uniquemask = 0;
1434
1435    for (chan_index = 0; chan_index < 4; chan_index++)
1436    {
1437       uint swizzle;
1438       uint i;
1439
1440       /* unswizzle channel */
1441       swizzle = tgsi_util_get_full_src_register_swizzle (
1442                         &inst->Src[0],
1443                         chan_index);
1444
1445       /* check if the component has not been already tested */
1446       if (uniquemask & (1 << swizzle))
1447          continue;
1448       uniquemask |= 1 << swizzle;
1449
1450       FETCH(&r[0], 0, chan_index);
1451       for (i = 0; i < 4; i++)
1452          if (r[0].f[i] < 0.0f)
1453             kilmask |= 1 << i;
1454    }
1455
1456    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1457 }
1458
1459 /**
1460  * Execute NVIDIA-style KIL which is predicated by a condition code.
1461  * Kill fragment if the condition code is TRUE.
1462  */
1463 static void
1464 exec_kilp(struct tgsi_exec_machine *mach,
1465           const struct tgsi_full_instruction *inst)
1466 {
1467    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1468
1469    /* "unconditional" kil */
1470    kilmask = mach->ExecMask;
1471    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1472 }
1473
1474 static void
1475 emit_vertex(struct tgsi_exec_machine *mach)
1476 {
1477    /* FIXME: check for exec mask correctly
1478    unsigned i;
1479    for (i = 0; i < QUAD_SIZE; ++i) {
1480          if ((mach->ExecMask & (1 << i)))
1481    */
1482    if (mach->ExecMask) {
1483       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1484       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1485    }
1486 }
1487
1488 static void
1489 emit_primitive(struct tgsi_exec_machine *mach)
1490 {
1491    unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1492    /* FIXME: check for exec mask correctly
1493    unsigned i;
1494    for (i = 0; i < QUAD_SIZE; ++i) {
1495          if ((mach->ExecMask & (1 << i)))
1496    */
1497    if (mach->ExecMask) {
1498       ++(*prim_count);
1499       debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1500       mach->Primitives[*prim_count] = 0;
1501    }
1502 }
1503
1504 /*
1505  * Fetch a four texture samples using STR texture coordinates.
1506  */
1507 static void
1508 fetch_texel( struct tgsi_sampler *sampler,
1509              const union tgsi_exec_channel *s,
1510              const union tgsi_exec_channel *t,
1511              const union tgsi_exec_channel *p,
1512              float lodbias,  /* XXX should be float[4] */
1513              union tgsi_exec_channel *r,
1514              union tgsi_exec_channel *g,
1515              union tgsi_exec_channel *b,
1516              union tgsi_exec_channel *a )
1517 {
1518    uint j;
1519    float rgba[NUM_CHANNELS][QUAD_SIZE];
1520
1521    sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1522
1523    for (j = 0; j < 4; j++) {
1524       r->f[j] = rgba[0][j];
1525       g->f[j] = rgba[1][j];
1526       b->f[j] = rgba[2][j];
1527       a->f[j] = rgba[3][j];
1528    }
1529 }
1530
1531
1532 static void
1533 exec_tex(struct tgsi_exec_machine *mach,
1534          const struct tgsi_full_instruction *inst,
1535          boolean biasLod,
1536          boolean projected)
1537 {
1538    const uint unit = inst->Src[1].Register.Index;
1539    union tgsi_exec_channel r[4];
1540    uint chan_index;
1541    float lodBias;
1542
1543    /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1544
1545    switch (inst->Texture.Texture) {
1546    case TGSI_TEXTURE_1D:
1547    case TGSI_TEXTURE_SHADOW1D:
1548
1549       FETCH(&r[0], 0, CHAN_X);
1550
1551       if (projected) {
1552          FETCH(&r[1], 0, CHAN_W);
1553          micro_div( &r[0], &r[0], &r[1] );
1554       }
1555
1556       if (biasLod) {
1557          FETCH(&r[1], 0, CHAN_W);
1558          lodBias = r[2].f[0];
1559       }
1560       else
1561          lodBias = 0.0;
1562
1563       fetch_texel(mach->Samplers[unit],
1564                   &r[0], &ZeroVec, &ZeroVec, lodBias,  /* S, T, P, BIAS */
1565                   &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1566       break;
1567
1568    case TGSI_TEXTURE_2D:
1569    case TGSI_TEXTURE_RECT:
1570    case TGSI_TEXTURE_SHADOW2D:
1571    case TGSI_TEXTURE_SHADOWRECT:
1572
1573       FETCH(&r[0], 0, CHAN_X);
1574       FETCH(&r[1], 0, CHAN_Y);
1575       FETCH(&r[2], 0, CHAN_Z);
1576
1577       if (projected) {
1578          FETCH(&r[3], 0, CHAN_W);
1579          micro_div( &r[0], &r[0], &r[3] );
1580          micro_div( &r[1], &r[1], &r[3] );
1581          micro_div( &r[2], &r[2], &r[3] );
1582       }
1583
1584       if (biasLod) {
1585          FETCH(&r[3], 0, CHAN_W);
1586          lodBias = r[3].f[0];
1587       }
1588       else
1589          lodBias = 0.0;
1590
1591       fetch_texel(mach->Samplers[unit],
1592                   &r[0], &r[1], &r[2], lodBias,  /* inputs */
1593                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1594       break;
1595
1596    case TGSI_TEXTURE_3D:
1597    case TGSI_TEXTURE_CUBE:
1598
1599       FETCH(&r[0], 0, CHAN_X);
1600       FETCH(&r[1], 0, CHAN_Y);
1601       FETCH(&r[2], 0, CHAN_Z);
1602
1603       if (projected) {
1604          FETCH(&r[3], 0, CHAN_W);
1605          micro_div( &r[0], &r[0], &r[3] );
1606          micro_div( &r[1], &r[1], &r[3] );
1607          micro_div( &r[2], &r[2], &r[3] );
1608       }
1609
1610       if (biasLod) {
1611          FETCH(&r[3], 0, CHAN_W);
1612          lodBias = r[3].f[0];
1613       }
1614       else
1615          lodBias = 0.0;
1616
1617       fetch_texel(mach->Samplers[unit],
1618                   &r[0], &r[1], &r[2], lodBias,
1619                   &r[0], &r[1], &r[2], &r[3]);
1620       break;
1621
1622    default:
1623       assert (0);
1624    }
1625
1626    FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1627       STORE( &r[chan_index], 0, chan_index );
1628    }
1629 }
1630
1631 static void
1632 exec_txd(struct tgsi_exec_machine *mach,
1633          const struct tgsi_full_instruction *inst)
1634 {
1635    const uint unit = inst->Src[3].Register.Index;
1636    union tgsi_exec_channel r[4];
1637    uint chan_index;
1638
1639    /*
1640     * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1641     */
1642
1643    switch (inst->Texture.Texture) {
1644    case TGSI_TEXTURE_1D:
1645    case TGSI_TEXTURE_SHADOW1D:
1646
1647       FETCH(&r[0], 0, CHAN_X);
1648
1649       fetch_texel(mach->Samplers[unit],
1650                   &r[0], &ZeroVec, &ZeroVec, 0.0f,  /* S, T, P, BIAS */
1651                   &r[0], &r[1], &r[2], &r[3]);      /* R, G, B, A */
1652       break;
1653
1654    case TGSI_TEXTURE_2D:
1655    case TGSI_TEXTURE_RECT:
1656    case TGSI_TEXTURE_SHADOW2D:
1657    case TGSI_TEXTURE_SHADOWRECT:
1658
1659       FETCH(&r[0], 0, CHAN_X);
1660       FETCH(&r[1], 0, CHAN_Y);
1661       FETCH(&r[2], 0, CHAN_Z);
1662
1663       fetch_texel(mach->Samplers[unit],
1664                   &r[0], &r[1], &r[2], 0.0f,    /* inputs */
1665                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1666       break;
1667
1668    case TGSI_TEXTURE_3D:
1669    case TGSI_TEXTURE_CUBE:
1670
1671       FETCH(&r[0], 0, CHAN_X);
1672       FETCH(&r[1], 0, CHAN_Y);
1673       FETCH(&r[2], 0, CHAN_Z);
1674
1675       fetch_texel(mach->Samplers[unit],
1676                   &r[0], &r[1], &r[2], 0.0f,
1677                   &r[0], &r[1], &r[2], &r[3]);
1678       break;
1679
1680    default:
1681       assert(0);
1682    }
1683
1684    FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1685       STORE(&r[chan_index], 0, chan_index);
1686    }
1687 }
1688
1689
1690 /**
1691  * Evaluate a constant-valued coefficient at the position of the
1692  * current quad.
1693  */
1694 static void
1695 eval_constant_coef(
1696    struct tgsi_exec_machine *mach,
1697    unsigned attrib,
1698    unsigned chan )
1699 {
1700    unsigned i;
1701
1702    for( i = 0; i < QUAD_SIZE; i++ ) {
1703       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1704    }
1705 }
1706
1707 /**
1708  * Evaluate a linear-valued coefficient at the position of the
1709  * current quad.
1710  */
1711 static void
1712 eval_linear_coef(
1713    struct tgsi_exec_machine *mach,
1714    unsigned attrib,
1715    unsigned chan )
1716 {
1717    const float x = mach->QuadPos.xyzw[0].f[0];
1718    const float y = mach->QuadPos.xyzw[1].f[0];
1719    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1720    const float dady = mach->InterpCoefs[attrib].dady[chan];
1721    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1722    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1723    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1724    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1725    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1726 }
1727
1728 /**
1729  * Evaluate a perspective-valued coefficient at the position of the
1730  * current quad.
1731  */
1732 static void
1733 eval_perspective_coef(
1734    struct tgsi_exec_machine *mach,
1735    unsigned attrib,
1736    unsigned chan )
1737 {
1738    const float x = mach->QuadPos.xyzw[0].f[0];
1739    const float y = mach->QuadPos.xyzw[1].f[0];
1740    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1741    const float dady = mach->InterpCoefs[attrib].dady[chan];
1742    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1743    const float *w = mach->QuadPos.xyzw[3].f;
1744    /* divide by W here */
1745    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1746    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1747    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1748    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1749 }
1750
1751
/** Signature shared by the eval_*_coef() input interpolation functions. */
typedef void (* eval_coef_func)(struct tgsi_exec_machine *mach,
                                unsigned attrib,
                                unsigned chan);
1756
1757 static void
1758 exec_declaration(struct tgsi_exec_machine *mach,
1759                  const struct tgsi_full_declaration *decl)
1760 {
1761    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1762       if (decl->Declaration.File == TGSI_FILE_INPUT ||
1763           decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1764          uint first, last, mask;
1765
1766          first = decl->Range.First;
1767          last = decl->Range.Last;
1768          mask = decl->Declaration.UsageMask;
1769
1770          if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
1771             assert(decl->Semantic.Index == 0);
1772             assert(first == last);
1773             assert(mask == TGSI_WRITEMASK_XYZW);
1774
1775             mach->Inputs[first] = mach->QuadPos;
1776          } else if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1777             uint i;
1778
1779             assert(decl->Semantic.Index == 0);
1780             assert(first == last);
1781
1782             for (i = 0; i < QUAD_SIZE; i++) {
1783                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1784             }
1785          } else {
1786             eval_coef_func eval;
1787             uint i, j;
1788
1789             switch (decl->Declaration.Interpolate) {
1790             case TGSI_INTERPOLATE_CONSTANT:
1791                eval = eval_constant_coef;
1792                break;
1793
1794             case TGSI_INTERPOLATE_LINEAR:
1795                eval = eval_linear_coef;
1796                break;
1797
1798             case TGSI_INTERPOLATE_PERSPECTIVE:
1799                eval = eval_perspective_coef;
1800                break;
1801
1802             default:
1803                assert(0);
1804                return;
1805             }
1806
1807             for (j = 0; j < NUM_CHANNELS; j++) {
1808                if (mask & (1 << j)) {
1809                   for (i = first; i <= last; i++) {
1810                      eval(mach, i, j);
1811                   }
1812                }
1813             }
1814          }
1815       }
1816    }
1817 }
1818
/**
 * Per-channel operation used by the exec_scalar_unary() and
 * exec_vector_*() helpers.  The unary helpers pass a single source
 * channel; the binary and trinary helpers pass a pointer to the first
 * element of an array of two or three source channels.
 */
typedef void (* micro_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src);
1821
1822 static void
1823 exec_scalar_unary(struct tgsi_exec_machine *mach,
1824                   const struct tgsi_full_instruction *inst,
1825                   micro_op op,
1826                   enum tgsi_exec_datatype dst_datatype,
1827                   enum tgsi_exec_datatype src_datatype)
1828 {
1829    unsigned int chan;
1830    union tgsi_exec_channel src;
1831    union tgsi_exec_channel dst;
1832
1833    fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
1834    op(&dst, &src);
1835    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1836       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1837          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
1838       }
1839    }
1840 }
1841
1842 static void
1843 exec_vector_unary(struct tgsi_exec_machine *mach,
1844                   const struct tgsi_full_instruction *inst,
1845                   micro_op op,
1846                   enum tgsi_exec_datatype dst_datatype,
1847                   enum tgsi_exec_datatype src_datatype)
1848 {
1849    unsigned int chan;
1850    struct tgsi_exec_vector dst;
1851
1852    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1853       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1854          union tgsi_exec_channel src;
1855
1856          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
1857          op(&dst.xyzw[chan], &src);
1858       }
1859    }
1860    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1861       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1862          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1863       }
1864    }
1865 }
1866
1867 static void
1868 exec_vector_binary(struct tgsi_exec_machine *mach,
1869                    const struct tgsi_full_instruction *inst,
1870                    micro_op op,
1871                    enum tgsi_exec_datatype dst_datatype,
1872                    enum tgsi_exec_datatype src_datatype)
1873 {
1874    unsigned int chan;
1875    struct tgsi_exec_vector dst;
1876
1877    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1878       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1879          union tgsi_exec_channel src[2];
1880
1881          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1882          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1883          op(&dst.xyzw[chan], src);
1884       }
1885    }
1886    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1887       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1888          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1889       }
1890    }
1891 }
1892
1893 static void
1894 exec_vector_trinary(struct tgsi_exec_machine *mach,
1895                     const struct tgsi_full_instruction *inst,
1896                     micro_op op,
1897                     enum tgsi_exec_datatype dst_datatype,
1898                     enum tgsi_exec_datatype src_datatype)
1899 {
1900    unsigned int chan;
1901    struct tgsi_exec_vector dst;
1902
1903    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1904       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1905          union tgsi_exec_channel src[3];
1906
1907          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1908          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1909          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
1910          op(&dst.xyzw[chan], src);
1911       }
1912    }
1913    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1914       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1915          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1916       }
1917    }
1918 }
1919
1920 static void
1921 exec_break(struct tgsi_exec_machine *mach)
1922 {
1923    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
1924       /* turn off loop channels for each enabled exec channel */
1925       mach->LoopMask &= ~mach->ExecMask;
1926       /* Todo: if mach->LoopMask == 0, jump to end of loop */
1927       UPDATE_EXEC_MASK(mach);
1928    } else {
1929       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
1930
1931       mach->Switch.mask = 0x0;
1932
1933       UPDATE_EXEC_MASK(mach);
1934    }
1935 }
1936
1937 static void
1938 exec_switch(struct tgsi_exec_machine *mach,
1939             const struct tgsi_full_instruction *inst)
1940 {
1941    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
1942    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
1943
1944    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
1945    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
1946    mach->Switch.mask = 0x0;
1947    mach->Switch.defaultMask = 0x0;
1948
1949    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
1950    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
1951
1952    UPDATE_EXEC_MASK(mach);
1953 }
1954
1955 static void
1956 exec_case(struct tgsi_exec_machine *mach,
1957           const struct tgsi_full_instruction *inst)
1958 {
1959    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
1960    union tgsi_exec_channel src;
1961    uint mask = 0;
1962
1963    fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
1964
1965    if (mach->Switch.selector.u[0] == src.u[0]) {
1966       mask |= 0x1;
1967    }
1968    if (mach->Switch.selector.u[1] == src.u[1]) {
1969       mask |= 0x2;
1970    }
1971    if (mach->Switch.selector.u[2] == src.u[2]) {
1972       mask |= 0x4;
1973    }
1974    if (mach->Switch.selector.u[3] == src.u[3]) {
1975       mask |= 0x8;
1976    }
1977
1978    mach->Switch.defaultMask |= mask;
1979
1980    mach->Switch.mask |= mask & prevMask;
1981
1982    UPDATE_EXEC_MASK(mach);
1983 }
1984
1985 static void
1986 exec_default(struct tgsi_exec_machine *mach)
1987 {
1988    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
1989
1990    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
1991
1992    UPDATE_EXEC_MASK(mach);
1993 }
1994
1995 static void
1996 exec_endswitch(struct tgsi_exec_machine *mach)
1997 {
1998    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
1999    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2000
2001    UPDATE_EXEC_MASK(mach);
2002 }
2003
2004 static void
2005 micro_i2f(union tgsi_exec_channel *dst,
2006           const union tgsi_exec_channel *src)
2007 {
2008    dst->f[0] = (float)src->i[0];
2009    dst->f[1] = (float)src->i[1];
2010    dst->f[2] = (float)src->i[2];
2011    dst->f[3] = (float)src->i[3];
2012 }
2013
2014 static void
2015 micro_not(union tgsi_exec_channel *dst,
2016           const union tgsi_exec_channel *src)
2017 {
2018    dst->u[0] = ~src->u[0];
2019    dst->u[1] = ~src->u[1];
2020    dst->u[2] = ~src->u[2];
2021    dst->u[3] = ~src->u[3];
2022 }
2023
2024 static void
2025 micro_shl(union tgsi_exec_channel *dst,
2026           const union tgsi_exec_channel *src)
2027 {
2028    dst->u[0] = src[0].u[0] << src[1].u[0];
2029    dst->u[1] = src[0].u[1] << src[1].u[1];
2030    dst->u[2] = src[0].u[2] << src[1].u[2];
2031    dst->u[3] = src[0].u[3] << src[1].u[3];
2032 }
2033
2034 static void
2035 micro_and(union tgsi_exec_channel *dst,
2036           const union tgsi_exec_channel *src)
2037 {
2038    dst->u[0] = src[0].u[0] & src[1].u[0];
2039    dst->u[1] = src[0].u[1] & src[1].u[1];
2040    dst->u[2] = src[0].u[2] & src[1].u[2];
2041    dst->u[3] = src[0].u[3] & src[1].u[3];
2042 }
2043
2044 static void
2045 micro_or(union tgsi_exec_channel *dst,
2046          const union tgsi_exec_channel *src)
2047 {
2048    dst->u[0] = src[0].u[0] | src[1].u[0];
2049    dst->u[1] = src[0].u[1] | src[1].u[1];
2050    dst->u[2] = src[0].u[2] | src[1].u[2];
2051    dst->u[3] = src[0].u[3] | src[1].u[3];
2052 }
2053
2054 static void
2055 micro_xor(union tgsi_exec_channel *dst,
2056           const union tgsi_exec_channel *src)
2057 {
2058    dst->u[0] = src[0].u[0] ^ src[1].u[0];
2059    dst->u[1] = src[0].u[1] ^ src[1].u[1];
2060    dst->u[2] = src[0].u[2] ^ src[1].u[2];
2061    dst->u[3] = src[0].u[3] ^ src[1].u[3];
2062 }
2063
2064 static void
2065 micro_f2i(union tgsi_exec_channel *dst,
2066           const union tgsi_exec_channel *src)
2067 {
2068    dst->i[0] = (int)src->f[0];
2069    dst->i[1] = (int)src->f[1];
2070    dst->i[2] = (int)src->f[2];
2071    dst->i[3] = (int)src->f[3];
2072 }
2073
2074 static void
2075 micro_idiv(union tgsi_exec_channel *dst,
2076            const union tgsi_exec_channel *src)
2077 {
2078    dst->i[0] = src[0].i[0] / src[1].i[0];
2079    dst->i[1] = src[0].i[1] / src[1].i[1];
2080    dst->i[2] = src[0].i[2] / src[1].i[2];
2081    dst->i[3] = src[0].i[3] / src[1].i[3];
2082 }
2083
2084 static void
2085 micro_imax(union tgsi_exec_channel *dst,
2086            const union tgsi_exec_channel *src)
2087 {
2088    dst->i[0] = src[0].i[0] > src[1].i[0] ? src[0].i[0] : src[1].i[0];
2089    dst->i[1] = src[0].i[1] > src[1].i[1] ? src[0].i[1] : src[1].i[1];
2090    dst->i[2] = src[0].i[2] > src[1].i[2] ? src[0].i[2] : src[1].i[2];
2091    dst->i[3] = src[0].i[3] > src[1].i[3] ? src[0].i[3] : src[1].i[3];
2092 }
2093
2094 static void
2095 micro_imin(union tgsi_exec_channel *dst,
2096            const union tgsi_exec_channel *src)
2097 {
2098    dst->i[0] = src[0].i[0] < src[1].i[0] ? src[0].i[0] : src[1].i[0];
2099    dst->i[1] = src[0].i[1] < src[1].i[1] ? src[0].i[1] : src[1].i[1];
2100    dst->i[2] = src[0].i[2] < src[1].i[2] ? src[0].i[2] : src[1].i[2];
2101    dst->i[3] = src[0].i[3] < src[1].i[3] ? src[0].i[3] : src[1].i[3];
2102 }
2103
2104 static void
2105 micro_isge(union tgsi_exec_channel *dst,
2106            const union tgsi_exec_channel *src)
2107 {
2108    dst->i[0] = src[0].i[0] >= src[1].i[0] ? -1 : 0;
2109    dst->i[1] = src[0].i[1] >= src[1].i[1] ? -1 : 0;
2110    dst->i[2] = src[0].i[2] >= src[1].i[2] ? -1 : 0;
2111    dst->i[3] = src[0].i[3] >= src[1].i[3] ? -1 : 0;
2112 }
2113
2114 static void
2115 micro_ishr(union tgsi_exec_channel *dst,
2116            const union tgsi_exec_channel *src)
2117 {
2118    dst->i[0] = src[0].i[0] >> src[1].i[0];
2119    dst->i[1] = src[0].i[1] >> src[1].i[1];
2120    dst->i[2] = src[0].i[2] >> src[1].i[2];
2121    dst->i[3] = src[0].i[3] >> src[1].i[3];
2122 }
2123
2124 static void
2125 micro_islt(union tgsi_exec_channel *dst,
2126            const union tgsi_exec_channel *src)
2127 {
2128    dst->i[0] = src[0].i[0] < src[1].i[0] ? -1 : 0;
2129    dst->i[1] = src[0].i[1] < src[1].i[1] ? -1 : 0;
2130    dst->i[2] = src[0].i[2] < src[1].i[2] ? -1 : 0;
2131    dst->i[3] = src[0].i[3] < src[1].i[3] ? -1 : 0;
2132 }
2133
2134 static void
2135 micro_f2u(union tgsi_exec_channel *dst,
2136           const union tgsi_exec_channel *src)
2137 {
2138    dst->u[0] = (uint)src->f[0];
2139    dst->u[1] = (uint)src->f[1];
2140    dst->u[2] = (uint)src->f[2];
2141    dst->u[3] = (uint)src->f[3];
2142 }
2143
2144 static void
2145 micro_u2f(union tgsi_exec_channel *dst,
2146           const union tgsi_exec_channel *src)
2147 {
2148    dst->f[0] = (float)src->u[0];
2149    dst->f[1] = (float)src->u[1];
2150    dst->f[2] = (float)src->u[2];
2151    dst->f[3] = (float)src->u[3];
2152 }
2153
2154 static void
2155 micro_uadd(union tgsi_exec_channel *dst,
2156            const union tgsi_exec_channel *src)
2157 {
2158    dst->u[0] = src[0].u[0] + src[1].u[0];
2159    dst->u[1] = src[0].u[1] + src[1].u[1];
2160    dst->u[2] = src[0].u[2] + src[1].u[2];
2161    dst->u[3] = src[0].u[3] + src[1].u[3];
2162 }
2163
2164 static void
2165 micro_udiv(union tgsi_exec_channel *dst,
2166            const union tgsi_exec_channel *src)
2167 {
2168    dst->u[0] = src[0].u[0] / src[1].u[0];
2169    dst->u[1] = src[0].u[1] / src[1].u[1];
2170    dst->u[2] = src[0].u[2] / src[1].u[2];
2171    dst->u[3] = src[0].u[3] / src[1].u[3];
2172 }
2173
2174 static void
2175 micro_umad(union tgsi_exec_channel *dst,
2176            const union tgsi_exec_channel *src)
2177 {
2178    dst->u[0] = src[0].u[0] * src[1].u[0] + src[2].u[0];
2179    dst->u[1] = src[0].u[1] * src[1].u[1] + src[2].u[1];
2180    dst->u[2] = src[0].u[2] * src[1].u[2] + src[2].u[2];
2181    dst->u[3] = src[0].u[3] * src[1].u[3] + src[2].u[3];
2182 }
2183
2184 static void
2185 micro_umax(union tgsi_exec_channel *dst,
2186            const union tgsi_exec_channel *src)
2187 {
2188    dst->u[0] = src[0].u[0] > src[1].u[0] ? src[0].u[0] : src[1].u[0];
2189    dst->u[1] = src[0].u[1] > src[1].u[1] ? src[0].u[1] : src[1].u[1];
2190    dst->u[2] = src[0].u[2] > src[1].u[2] ? src[0].u[2] : src[1].u[2];
2191    dst->u[3] = src[0].u[3] > src[1].u[3] ? src[0].u[3] : src[1].u[3];
2192 }
2193
2194 static void
2195 micro_umin(union tgsi_exec_channel *dst,
2196            const union tgsi_exec_channel *src)
2197 {
2198    dst->u[0] = src[0].u[0] < src[1].u[0] ? src[0].u[0] : src[1].u[0];
2199    dst->u[1] = src[0].u[1] < src[1].u[1] ? src[0].u[1] : src[1].u[1];
2200    dst->u[2] = src[0].u[2] < src[1].u[2] ? src[0].u[2] : src[1].u[2];
2201    dst->u[3] = src[0].u[3] < src[1].u[3] ? src[0].u[3] : src[1].u[3];
2202 }
2203
2204 static void
2205 micro_umod(union tgsi_exec_channel *dst,
2206            const union tgsi_exec_channel *src)
2207 {
2208    dst->u[0] = src[0].u[0] % src[1].u[0];
2209    dst->u[1] = src[0].u[1] % src[1].u[1];
2210    dst->u[2] = src[0].u[2] % src[1].u[2];
2211    dst->u[3] = src[0].u[3] % src[1].u[3];
2212 }
2213
2214 static void
2215 micro_umul(union tgsi_exec_channel *dst,
2216            const union tgsi_exec_channel *src)
2217 {
2218    dst->u[0] = src[0].u[0] * src[1].u[0];
2219    dst->u[1] = src[0].u[1] * src[1].u[1];
2220    dst->u[2] = src[0].u[2] * src[1].u[2];
2221    dst->u[3] = src[0].u[3] * src[1].u[3];
2222 }
2223
2224 static void
2225 micro_useq(union tgsi_exec_channel *dst,
2226            const union tgsi_exec_channel *src)
2227 {
2228    dst->u[0] = src[0].u[0] == src[1].u[0] ? ~0 : 0;
2229    dst->u[1] = src[0].u[1] == src[1].u[1] ? ~0 : 0;
2230    dst->u[2] = src[0].u[2] == src[1].u[2] ? ~0 : 0;
2231    dst->u[3] = src[0].u[3] == src[1].u[3] ? ~0 : 0;
2232 }
2233
2234 static void
2235 micro_usge(union tgsi_exec_channel *dst,
2236            const union tgsi_exec_channel *src)
2237 {
2238    dst->u[0] = src[0].u[0] >= src[1].u[0] ? ~0 : 0;
2239    dst->u[1] = src[0].u[1] >= src[1].u[1] ? ~0 : 0;
2240    dst->u[2] = src[0].u[2] >= src[1].u[2] ? ~0 : 0;
2241    dst->u[3] = src[0].u[3] >= src[1].u[3] ? ~0 : 0;
2242 }
2243
2244 static void
2245 micro_ushr(union tgsi_exec_channel *dst,
2246            const union tgsi_exec_channel *src)
2247 {
2248    dst->u[0] = src[0].u[0] >> src[1].u[0];
2249    dst->u[1] = src[0].u[1] >> src[1].u[1];
2250    dst->u[2] = src[0].u[2] >> src[1].u[2];
2251    dst->u[3] = src[0].u[3] >> src[1].u[3];
2252 }
2253
2254 static void
2255 micro_uslt(union tgsi_exec_channel *dst,
2256            const union tgsi_exec_channel *src)
2257 {
2258    dst->u[0] = src[0].u[0] < src[1].u[0] ? ~0 : 0;
2259    dst->u[1] = src[0].u[1] < src[1].u[1] ? ~0 : 0;
2260    dst->u[2] = src[0].u[2] < src[1].u[2] ? ~0 : 0;
2261    dst->u[3] = src[0].u[3] < src[1].u[3] ? ~0 : 0;
2262 }
2263
2264 static void
2265 micro_usne(union tgsi_exec_channel *dst,
2266            const union tgsi_exec_channel *src)
2267 {
2268    dst->u[0] = src[0].u[0] != src[1].u[0] ? ~0 : 0;
2269    dst->u[1] = src[0].u[1] != src[1].u[1] ? ~0 : 0;
2270    dst->u[2] = src[0].u[2] != src[1].u[2] ? ~0 : 0;
2271    dst->u[3] = src[0].u[3] != src[1].u[3] ? ~0 : 0;
2272 }
2273
2274 static void
2275 exec_instruction(
2276    struct tgsi_exec_machine *mach,
2277    const struct tgsi_full_instruction *inst,
2278    int *pc )
2279 {
2280    uint chan_index;
2281    union tgsi_exec_channel r[10];
2282    union tgsi_exec_channel d[8];
2283
2284    (*pc)++;
2285
2286    switch (inst->Instruction.Opcode) {
2287    case TGSI_OPCODE_ARL:
2288       exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2289       break;
2290
2291    case TGSI_OPCODE_MOV:
2292       exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
2293       break;
2294
2295    case TGSI_OPCODE_LIT:
2296       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2297          FETCH( &r[0], 0, CHAN_X );
2298          if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2299             micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2300          }
2301
2302          if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2303             FETCH( &r[1], 0, CHAN_Y );
2304             micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2305
2306             FETCH( &r[2], 0, CHAN_W );
2307             micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2308             micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2309             micro_pow( &r[1], &r[1], &r[2] );
2310             micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2311          }
2312
2313          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2314             STORE(&d[CHAN_Y], 0, CHAN_Y);
2315          }
2316          if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2317             STORE(&d[CHAN_Z], 0, CHAN_Z);
2318          }
2319       }
2320       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2321          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2322       }
2323       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2324          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2325       }
2326       break;
2327
2328    case TGSI_OPCODE_RCP:
2329       exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2330       break;
2331
2332    case TGSI_OPCODE_RSQ:
2333       exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2334       break;
2335
2336    case TGSI_OPCODE_EXP:
2337       FETCH( &r[0], 0, CHAN_X );
2338       micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2339       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2340          micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2341          STORE( &r[2], 0, CHAN_X );        /* store r2 */
2342       }
2343       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2344          micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2345          STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2346       }
2347       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2348          micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2349          STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2350       }
2351       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2352          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2353       }
2354       break;
2355
2356    case TGSI_OPCODE_LOG:
2357       FETCH( &r[0], 0, CHAN_X );
2358       micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2359       micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2360       micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2361       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2362          STORE( &r[0], 0, CHAN_X );
2363       }
2364       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2365          micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2366          micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2367          STORE( &r[0], 0, CHAN_Y );
2368       }
2369       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2370          STORE( &r[1], 0, CHAN_Z );
2371       }
2372       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2373          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2374       }
2375       break;
2376
2377    case TGSI_OPCODE_MUL:
2378       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2379          FETCH(&r[0], 0, chan_index);
2380          FETCH(&r[1], 1, chan_index);
2381          micro_mul(&d[chan_index], &r[0], &r[1]);
2382       }
2383       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2384          STORE(&d[chan_index], 0, chan_index);
2385       }
2386       break;
2387
2388    case TGSI_OPCODE_ADD:
2389       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2390          FETCH( &r[0], 0, chan_index );
2391          FETCH( &r[1], 1, chan_index );
2392          micro_add(&d[chan_index], &r[0], &r[1]);
2393       }
2394       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2395          STORE(&d[chan_index], 0, chan_index);
2396       }
2397       break;
2398
2399    case TGSI_OPCODE_DP3:
2400    /* TGSI_OPCODE_DOT3 */
2401       FETCH( &r[0], 0, CHAN_X );
2402       FETCH( &r[1], 1, CHAN_X );
2403       micro_mul( &r[0], &r[0], &r[1] );
2404
2405       FETCH( &r[1], 0, CHAN_Y );
2406       FETCH( &r[2], 1, CHAN_Y );
2407       micro_mul( &r[1], &r[1], &r[2] );
2408       micro_add( &r[0], &r[0], &r[1] );
2409
2410       FETCH( &r[1], 0, CHAN_Z );
2411       FETCH( &r[2], 1, CHAN_Z );
2412       micro_mul( &r[1], &r[1], &r[2] );
2413       micro_add( &r[0], &r[0], &r[1] );
2414
2415       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2416          STORE( &r[0], 0, chan_index );
2417       }
2418       break;
2419
2420     case TGSI_OPCODE_DP4:
2421     /* TGSI_OPCODE_DOT4 */
2422        FETCH(&r[0], 0, CHAN_X);
2423        FETCH(&r[1], 1, CHAN_X);
2424
2425        micro_mul( &r[0], &r[0], &r[1] );
2426
2427        FETCH(&r[1], 0, CHAN_Y);
2428        FETCH(&r[2], 1, CHAN_Y);
2429
2430        micro_mul( &r[1], &r[1], &r[2] );
2431        micro_add( &r[0], &r[0], &r[1] );
2432
2433        FETCH(&r[1], 0, CHAN_Z);
2434        FETCH(&r[2], 1, CHAN_Z);
2435
2436        micro_mul( &r[1], &r[1], &r[2] );
2437        micro_add( &r[0], &r[0], &r[1] );
2438
2439        FETCH(&r[1], 0, CHAN_W);
2440        FETCH(&r[2], 1, CHAN_W);
2441
2442        micro_mul( &r[1], &r[1], &r[2] );
2443        micro_add( &r[0], &r[0], &r[1] );
2444
2445       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2446          STORE( &r[0], 0, chan_index );
2447       }
2448       break;
2449
2450    case TGSI_OPCODE_DST:
2451       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2452          FETCH( &r[0], 0, CHAN_Y );
2453          FETCH( &r[1], 1, CHAN_Y);
2454          micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2455       }
2456       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2457          FETCH(&d[CHAN_Z], 0, CHAN_Z);
2458       }
2459       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2460          FETCH(&d[CHAN_W], 1, CHAN_W);
2461       }
2462
2463       if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2464          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2465       }
2466       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2467          STORE(&d[CHAN_Y], 0, CHAN_Y);
2468       }
2469       if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2470          STORE(&d[CHAN_Z], 0, CHAN_Z);
2471       }
2472       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2473          STORE(&d[CHAN_W], 0, CHAN_W);
2474       }
2475       break;
2476
2477    case TGSI_OPCODE_MIN:
2478       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2479          FETCH(&r[0], 0, chan_index);
2480          FETCH(&r[1], 1, chan_index);
2481
2482          /* XXX use micro_min()?? */
2483          micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2484       }
2485       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2486          STORE(&d[chan_index], 0, chan_index);
2487       }
2488       break;
2489
2490    case TGSI_OPCODE_MAX:
2491       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2492          FETCH(&r[0], 0, chan_index);
2493          FETCH(&r[1], 1, chan_index);
2494
2495          /* XXX use micro_max()?? */
2496          micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2497       }
2498       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2499          STORE(&d[chan_index], 0, chan_index);
2500       }
2501       break;
2502
2503    case TGSI_OPCODE_SLT:
2504       exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2505       break;
2506
2507    case TGSI_OPCODE_SGE:
2508       exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2509       break;
2510
2511    case TGSI_OPCODE_MAD:
2512       exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2513       break;
2514
2515    case TGSI_OPCODE_SUB:
2516       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2517          FETCH(&r[0], 0, chan_index);
2518          FETCH(&r[1], 1, chan_index);
2519          micro_sub(&d[chan_index], &r[0], &r[1]);
2520       }
2521       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2522          STORE(&d[chan_index], 0, chan_index);
2523       }
2524       break;
2525
2526    case TGSI_OPCODE_LRP:
2527       exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2528       break;
2529
2530    case TGSI_OPCODE_CND:
2531       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2532          FETCH(&r[0], 0, chan_index);
2533          FETCH(&r[1], 1, chan_index);
2534          FETCH(&r[2], 2, chan_index);
2535          micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2536       }
2537       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2538          STORE(&d[chan_index], 0, chan_index);
2539       }
2540       break;
2541
2542    case TGSI_OPCODE_DP2A:
2543       FETCH( &r[0], 0, CHAN_X );
2544       FETCH( &r[1], 1, CHAN_X );
2545       micro_mul( &r[0], &r[0], &r[1] );
2546
2547       FETCH( &r[1], 0, CHAN_Y );
2548       FETCH( &r[2], 1, CHAN_Y );
2549       micro_mul( &r[1], &r[1], &r[2] );
2550       micro_add( &r[0], &r[0], &r[1] );
2551
2552       FETCH( &r[2], 2, CHAN_X );
2553       micro_add( &r[0], &r[0], &r[2] );
2554
2555       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2556          STORE( &r[0], 0, chan_index );
2557       }
2558       break;
2559
2560    case TGSI_OPCODE_FRC:
2561       exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2562       break;
2563
2564    case TGSI_OPCODE_CLAMP:
2565       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2566          FETCH(&r[0], 0, chan_index);
2567          FETCH(&r[1], 1, chan_index);
2568          micro_max(&r[0], &r[0], &r[1]);
2569          FETCH(&r[1], 2, chan_index);
2570          micro_min(&d[chan_index], &r[0], &r[1]);
2571       }
2572       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2573          STORE(&d[chan_index], 0, chan_index);
2574       }
2575       break;
2576
2577    case TGSI_OPCODE_FLR:
2578       exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2579       break;
2580
2581    case TGSI_OPCODE_ROUND:
2582       exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2583       break;
2584
2585    case TGSI_OPCODE_EX2:
2586       exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2587       break;
2588
2589    case TGSI_OPCODE_LG2:
2590       exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2591       break;
2592
2593    case TGSI_OPCODE_POW:
2594       FETCH(&r[0], 0, CHAN_X);
2595       FETCH(&r[1], 1, CHAN_X);
2596
2597       micro_pow( &r[0], &r[0], &r[1] );
2598
2599       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2600          STORE( &r[0], 0, chan_index );
2601       }
2602       break;
2603
2604    case TGSI_OPCODE_XPD:
2605       FETCH(&r[0], 0, CHAN_Y);
2606       FETCH(&r[1], 1, CHAN_Z);
2607
2608       micro_mul( &r[2], &r[0], &r[1] );
2609
2610       FETCH(&r[3], 0, CHAN_Z);
2611       FETCH(&r[4], 1, CHAN_Y);
2612
2613       micro_mul( &r[5], &r[3], &r[4] );
2614       micro_sub(&d[CHAN_X], &r[2], &r[5]);
2615
2616       FETCH(&r[2], 1, CHAN_X);
2617
2618       micro_mul( &r[3], &r[3], &r[2] );
2619
2620       FETCH(&r[5], 0, CHAN_X);
2621
2622       micro_mul( &r[1], &r[1], &r[5] );
2623       micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2624
2625       micro_mul( &r[5], &r[5], &r[4] );
2626       micro_mul( &r[0], &r[0], &r[2] );
2627       micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2628
2629       if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2630          STORE(&d[CHAN_X], 0, CHAN_X);
2631       }
2632       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2633          STORE(&d[CHAN_Y], 0, CHAN_Y);
2634       }
2635       if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2636          STORE(&d[CHAN_Z], 0, CHAN_Z);
2637       }
2638       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2639          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2640       }
2641       break;
2642
2643    case TGSI_OPCODE_ABS:
2644       exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2645       break;
2646
2647    case TGSI_OPCODE_RCC:
2648       FETCH(&r[0], 0, CHAN_X);
2649       micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2650       micro_float_clamp(&r[0], &r[0]);
2651       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2652          STORE(&r[0], 0, chan_index);
2653       }
2654       break;
2655
2656    case TGSI_OPCODE_DPH:
2657       FETCH(&r[0], 0, CHAN_X);
2658       FETCH(&r[1], 1, CHAN_X);
2659
2660       micro_mul( &r[0], &r[0], &r[1] );
2661
2662       FETCH(&r[1], 0, CHAN_Y);
2663       FETCH(&r[2], 1, CHAN_Y);
2664
2665       micro_mul( &r[1], &r[1], &r[2] );
2666       micro_add( &r[0], &r[0], &r[1] );
2667
2668       FETCH(&r[1], 0, CHAN_Z);
2669       FETCH(&r[2], 1, CHAN_Z);
2670
2671       micro_mul( &r[1], &r[1], &r[2] );
2672       micro_add( &r[0], &r[0], &r[1] );
2673
2674       FETCH(&r[1], 1, CHAN_W);
2675
2676       micro_add( &r[0], &r[0], &r[1] );
2677
2678       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2679          STORE( &r[0], 0, chan_index );
2680       }
2681       break;
2682
2683    case TGSI_OPCODE_COS:
2684       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2685       break;
2686
2687    case TGSI_OPCODE_DDX:
2688       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2689       break;
2690
2691    case TGSI_OPCODE_DDY:
2692       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2693       break;
2694
2695    case TGSI_OPCODE_KILP:
2696       exec_kilp (mach, inst);
2697       break;
2698
2699    case TGSI_OPCODE_KIL:
2700       exec_kil (mach, inst);
2701       break;
2702
2703    case TGSI_OPCODE_PK2H:
2704       assert (0);
2705       break;
2706
2707    case TGSI_OPCODE_PK2US:
2708       assert (0);
2709       break;
2710
2711    case TGSI_OPCODE_PK4B:
2712       assert (0);
2713       break;
2714
2715    case TGSI_OPCODE_PK4UB:
2716       assert (0);
2717       break;
2718
2719    case TGSI_OPCODE_RFL:
2720       if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2721           IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2722           IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2723          /* r0 = dp3(src0, src0) */
2724          FETCH(&r[2], 0, CHAN_X);
2725          micro_mul(&r[0], &r[2], &r[2]);
2726          FETCH(&r[4], 0, CHAN_Y);
2727          micro_mul(&r[8], &r[4], &r[4]);
2728          micro_add(&r[0], &r[0], &r[8]);
2729          FETCH(&r[6], 0, CHAN_Z);
2730          micro_mul(&r[8], &r[6], &r[6]);
2731          micro_add(&r[0], &r[0], &r[8]);
2732
2733          /* r1 = dp3(src0, src1) */
2734          FETCH(&r[3], 1, CHAN_X);
2735          micro_mul(&r[1], &r[2], &r[3]);
2736          FETCH(&r[5], 1, CHAN_Y);
2737          micro_mul(&r[8], &r[4], &r[5]);
2738          micro_add(&r[1], &r[1], &r[8]);
2739          FETCH(&r[7], 1, CHAN_Z);
2740          micro_mul(&r[8], &r[6], &r[7]);
2741          micro_add(&r[1], &r[1], &r[8]);
2742
2743          /* r1 = 2 * r1 / r0 */
2744          micro_add(&r[1], &r[1], &r[1]);
2745          micro_div(&r[1], &r[1], &r[0]);
2746
2747          if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2748             micro_mul(&r[2], &r[2], &r[1]);
2749             micro_sub(&r[2], &r[2], &r[3]);
2750             STORE(&r[2], 0, CHAN_X);
2751          }
2752          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2753             micro_mul(&r[4], &r[4], &r[1]);
2754             micro_sub(&r[4], &r[4], &r[5]);
2755             STORE(&r[4], 0, CHAN_Y);
2756          }
2757          if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2758             micro_mul(&r[6], &r[6], &r[1]);
2759             micro_sub(&r[6], &r[6], &r[7]);
2760             STORE(&r[6], 0, CHAN_Z);
2761          }
2762       }
2763       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2764          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2765       }
2766       break;
2767
2768    case TGSI_OPCODE_SEQ:
2769       exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2770       break;
2771
2772    case TGSI_OPCODE_SFL:
2773       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2774          STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2775       }
2776       break;
2777
2778    case TGSI_OPCODE_SGT:
2779       exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2780       break;
2781
2782    case TGSI_OPCODE_SIN:
2783       exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2784       break;
2785
2786    case TGSI_OPCODE_SLE:
2787       exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2788       break;
2789
2790    case TGSI_OPCODE_SNE:
2791       exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2792       break;
2793
2794    case TGSI_OPCODE_STR:
2795       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2796          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2797       }
2798       break;
2799
2800    case TGSI_OPCODE_TEX:
2801       /* simple texture lookup */
2802       /* src[0] = texcoord */
2803       /* src[1] = sampler unit */
2804       exec_tex(mach, inst, FALSE, FALSE);
2805       break;
2806
2807    case TGSI_OPCODE_TXB:
2808       /* Texture lookup with lod bias */
2809       /* src[0] = texcoord (src[0].w = LOD bias) */
2810       /* src[1] = sampler unit */
2811       exec_tex(mach, inst, TRUE, FALSE);
2812       break;
2813
2814    case TGSI_OPCODE_TXD:
2815       /* Texture lookup with explict partial derivatives */
2816       /* src[0] = texcoord */
2817       /* src[1] = d[strq]/dx */
2818       /* src[2] = d[strq]/dy */
2819       /* src[3] = sampler unit */
2820       exec_txd(mach, inst);
2821       break;
2822
2823    case TGSI_OPCODE_TXL:
2824       /* Texture lookup with explit LOD */
2825       /* src[0] = texcoord (src[0].w = LOD) */
2826       /* src[1] = sampler unit */
2827       exec_tex(mach, inst, TRUE, FALSE);
2828       break;
2829
2830    case TGSI_OPCODE_TXP:
2831       /* Texture lookup with projection */
2832       /* src[0] = texcoord (src[0].w = projection) */
2833       /* src[1] = sampler unit */
2834       exec_tex(mach, inst, FALSE, TRUE);
2835       break;
2836
2837    case TGSI_OPCODE_UP2H:
2838       assert (0);
2839       break;
2840
2841    case TGSI_OPCODE_UP2US:
2842       assert (0);
2843       break;
2844
2845    case TGSI_OPCODE_UP4B:
2846       assert (0);
2847       break;
2848
2849    case TGSI_OPCODE_UP4UB:
2850       assert (0);
2851       break;
2852
2853    case TGSI_OPCODE_X2D:
2854       FETCH(&r[0], 1, CHAN_X);
2855       FETCH(&r[1], 1, CHAN_Y);
2856       if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2857           IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2858          FETCH(&r[2], 2, CHAN_X);
2859          micro_mul(&r[2], &r[2], &r[0]);
2860          FETCH(&r[3], 2, CHAN_Y);
2861          micro_mul(&r[3], &r[3], &r[1]);
2862          micro_add(&r[2], &r[2], &r[3]);
2863          FETCH(&r[3], 0, CHAN_X);
2864          micro_add(&d[CHAN_X], &r[2], &r[3]);
2865
2866       }
2867       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2868           IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2869          FETCH(&r[2], 2, CHAN_Z);
2870          micro_mul(&r[2], &r[2], &r[0]);
2871          FETCH(&r[3], 2, CHAN_W);
2872          micro_mul(&r[3], &r[3], &r[1]);
2873          micro_add(&r[2], &r[2], &r[3]);
2874          FETCH(&r[3], 0, CHAN_Y);
2875          micro_add(&d[CHAN_Y], &r[2], &r[3]);
2876
2877       }
2878       if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2879          STORE(&d[CHAN_X], 0, CHAN_X);
2880       }
2881       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2882          STORE(&d[CHAN_Y], 0, CHAN_Y);
2883       }
2884       if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2885          STORE(&d[CHAN_X], 0, CHAN_Z);
2886       }
2887       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2888          STORE(&d[CHAN_Y], 0, CHAN_W);
2889       }
2890       break;
2891
2892    case TGSI_OPCODE_ARA:
2893       assert (0);
2894       break;
2895
2896    case TGSI_OPCODE_ARR:
2897       exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2898       break;
2899
2900    case TGSI_OPCODE_BRA:
2901       assert (0);
2902       break;
2903
2904    case TGSI_OPCODE_CAL:
2905       /* skip the call if no execution channels are enabled */
2906       if (mach->ExecMask) {
2907          /* do the call */
2908
2909          /* First, record the depths of the execution stacks.
2910           * This is important for deeply nested/looped return statements.
2911           * We have to unwind the stacks by the correct amount.  For a
2912           * real code generator, we could determine the number of entries
2913           * to pop off each stack with simple static analysis and avoid
2914           * implementing this data structure at run time.
2915           */
2916          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2917          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2918          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2919          mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
2920          mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
2921          /* note that PC was already incremented above */
2922          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2923
2924          mach->CallStackTop++;
2925
2926          /* Second, push the Cond, Loop, Cont, Func stacks */
2927          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2928          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2929          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2930          assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2931          assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2932          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2933
2934          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2935          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2936          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2937          mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2938          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2939          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2940
2941          /* Finally, jump to the subroutine */
2942          *pc = inst->Label.Label;
2943       }
2944       break;
2945
2946    case TGSI_OPCODE_RET:
2947       mach->FuncMask &= ~mach->ExecMask;
2948       UPDATE_EXEC_MASK(mach);
2949
2950       if (mach->FuncMask == 0x0) {
2951          /* really return now (otherwise, keep executing */
2952
2953          if (mach->CallStackTop == 0) {
2954             /* returning from main() */
2955             *pc = -1;
2956             return;
2957          }
2958
2959          assert(mach->CallStackTop > 0);
2960          mach->CallStackTop--;
2961
2962          mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2963          mach->CondMask = mach->CondStack[mach->CondStackTop];
2964
2965          mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2966          mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2967
2968          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2969          mach->ContMask = mach->ContStack[mach->ContStackTop];
2970
2971          mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
2972          mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
2973
2974          mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
2975          mach->BreakType = mach->BreakStack[mach->BreakStackTop];
2976
2977          assert(mach->FuncStackTop > 0);
2978          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2979
2980          *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
2981
2982          UPDATE_EXEC_MASK(mach);
2983       }
2984       break;
2985
2986    case TGSI_OPCODE_SSG:
2987       exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2988       break;
2989
2990    case TGSI_OPCODE_CMP:
2991       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2992          FETCH(&r[0], 0, chan_index);
2993          FETCH(&r[1], 1, chan_index);
2994          FETCH(&r[2], 2, chan_index);
2995          micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
2996       }
2997       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2998          STORE(&d[chan_index], 0, chan_index);
2999       }
3000       break;
3001
3002    case TGSI_OPCODE_SCS:
3003       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
3004          FETCH( &r[0], 0, CHAN_X );
3005          if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3006             micro_cos(&r[1], &r[0]);
3007             STORE(&r[1], 0, CHAN_X);
3008          }
3009          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3010             micro_sin(&r[1], &r[0]);
3011             STORE(&r[1], 0, CHAN_Y);
3012          }
3013       }
3014       if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
3015          STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
3016       }
3017       if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
3018          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
3019       }
3020       break;
3021
3022    case TGSI_OPCODE_NRM:
3023       /* 3-component vector normalize */
3024       if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
3025          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
3026          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3027          /* r3 = sqrt(dp3(src0, src0)) */
3028          FETCH(&r[0], 0, CHAN_X);
3029          micro_mul(&r[3], &r[0], &r[0]);
3030          FETCH(&r[1], 0, CHAN_Y);
3031          micro_mul(&r[4], &r[1], &r[1]);
3032          micro_add(&r[3], &r[3], &r[4]);
3033          FETCH(&r[2], 0, CHAN_Z);
3034          micro_mul(&r[4], &r[2], &r[2]);
3035          micro_add(&r[3], &r[3], &r[4]);
3036          micro_sqrt(&r[3], &r[3]);
3037
3038          if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3039             micro_div(&r[0], &r[0], &r[3]);
3040             STORE(&r[0], 0, CHAN_X);
3041          }
3042          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3043             micro_div(&r[1], &r[1], &r[3]);
3044             STORE(&r[1], 0, CHAN_Y);
3045          }
3046          if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3047             micro_div(&r[2], &r[2], &r[3]);
3048             STORE(&r[2], 0, CHAN_Z);
3049          }
3050       }
3051       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
3052          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
3053       }
3054       break;
3055
3056    case TGSI_OPCODE_NRM4:
3057       /* 4-component vector normalize */
3058       {
3059          union tgsi_exec_channel tmp, dot;
3060
3061          /* tmp = dp4(src0, src0): */
3062          FETCH( &r[0], 0, CHAN_X );
3063          micro_mul( &tmp, &r[0], &r[0] );
3064
3065          FETCH( &r[1], 0, CHAN_Y );
3066          micro_mul( &dot, &r[1], &r[1] );
3067          micro_add( &tmp, &tmp, &dot );
3068
3069          FETCH( &r[2], 0, CHAN_Z );
3070          micro_mul( &dot, &r[2], &r[2] );
3071          micro_add( &tmp, &tmp, &dot );
3072
3073          FETCH( &r[3], 0, CHAN_W );
3074          micro_mul( &dot, &r[3], &r[3] );
3075          micro_add( &tmp, &tmp, &dot );
3076
3077          /* tmp = 1 / sqrt(tmp) */
3078          micro_sqrt( &tmp, &tmp );
3079          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
3080
3081          FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3082             /* chan = chan * tmp */
3083             micro_mul( &r[chan_index], &tmp, &r[chan_index] );
3084             STORE( &r[chan_index], 0, chan_index );
3085          }
3086       }
3087       break;
3088
3089    case TGSI_OPCODE_DIV:
3090       assert( 0 );
3091       break;
3092
3093    case TGSI_OPCODE_DP2:
3094       FETCH( &r[0], 0, CHAN_X );
3095       FETCH( &r[1], 1, CHAN_X );
3096       micro_mul( &r[0], &r[0], &r[1] );
3097
3098       FETCH( &r[1], 0, CHAN_Y );
3099       FETCH( &r[2], 1, CHAN_Y );
3100       micro_mul( &r[1], &r[1], &r[2] );
3101       micro_add( &r[0], &r[0], &r[1] );
3102
3103       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3104          STORE( &r[0], 0, chan_index );
3105       }
3106       break;
3107
3108    case TGSI_OPCODE_IF:
3109       /* push CondMask */
3110       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3111       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3112       FETCH( &r[0], 0, CHAN_X );
3113       /* update CondMask */
3114       if( ! r[0].u[0] ) {
3115          mach->CondMask &= ~0x1;
3116       }
3117       if( ! r[0].u[1] ) {
3118          mach->CondMask &= ~0x2;
3119       }
3120       if( ! r[0].u[2] ) {
3121          mach->CondMask &= ~0x4;
3122       }
3123       if( ! r[0].u[3] ) {
3124          mach->CondMask &= ~0x8;
3125       }
3126       UPDATE_EXEC_MASK(mach);
3127       /* Todo: If CondMask==0, jump to ELSE */
3128       break;
3129
3130    case TGSI_OPCODE_ELSE:
3131       /* invert CondMask wrt previous mask */
3132       {
3133          uint prevMask;
3134          assert(mach->CondStackTop > 0);
3135          prevMask = mach->CondStack[mach->CondStackTop - 1];
3136          mach->CondMask = ~mach->CondMask & prevMask;
3137          UPDATE_EXEC_MASK(mach);
3138          /* Todo: If CondMask==0, jump to ENDIF */
3139       }
3140       break;
3141
3142    case TGSI_OPCODE_ENDIF:
3143       /* pop CondMask */
3144       assert(mach->CondStackTop > 0);
3145       mach->CondMask = mach->CondStack[--mach->CondStackTop];
3146       UPDATE_EXEC_MASK(mach);
3147       break;
3148
3149    case TGSI_OPCODE_END:
3150       /* halt execution */
3151       *pc = -1;
3152       break;
3153
3154    case TGSI_OPCODE_REP:
3155       assert (0);
3156       break;
3157
3158    case TGSI_OPCODE_ENDREP:
3159        assert (0);
3160        break;
3161
3162    case TGSI_OPCODE_PUSHA:
3163       assert (0);
3164       break;
3165
3166    case TGSI_OPCODE_POPA:
3167       assert (0);
3168       break;
3169
3170    case TGSI_OPCODE_CEIL:
3171       exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3172       break;
3173
3174    case TGSI_OPCODE_I2F:
3175       exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3176       break;
3177
3178    case TGSI_OPCODE_NOT:
3179       exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3180       break;
3181
3182    case TGSI_OPCODE_TRUNC:
3183       exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3184       break;
3185
3186    case TGSI_OPCODE_SHL:
3187       exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3188       break;
3189
3190    case TGSI_OPCODE_AND:
3191       exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3192       break;
3193
3194    case TGSI_OPCODE_OR:
3195       exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3196       break;
3197
3198    case TGSI_OPCODE_MOD:
3199       assert (0);
3200       break;
3201
3202    case TGSI_OPCODE_XOR:
3203       exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3204       break;
3205
3206    case TGSI_OPCODE_SAD:
3207       assert (0);
3208       break;
3209
3210    case TGSI_OPCODE_TXF:
3211       assert (0);
3212       break;
3213
3214    case TGSI_OPCODE_TXQ:
3215       assert (0);
3216       break;
3217
3218    case TGSI_OPCODE_EMIT:
3219       emit_vertex(mach);
3220       break;
3221
3222    case TGSI_OPCODE_ENDPRIM:
3223       emit_primitive(mach);
3224       break;
3225
3226    case TGSI_OPCODE_BGNFOR:
3227       assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3228       for (chan_index = 0; chan_index < 3; chan_index++) {
3229          FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3230       }
3231       ++mach->LoopCounterStackTop;
3232       STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3233       /* update LoopMask */
3234       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3235          mach->LoopMask &= ~0x1;
3236       }
3237       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3238          mach->LoopMask &= ~0x2;
3239       }
3240       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3241          mach->LoopMask &= ~0x4;
3242       }
3243       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3244          mach->LoopMask &= ~0x8;
3245       }
3246       /* TODO: if mach->LoopMask == 0, jump to end of loop */
3247       UPDATE_EXEC_MASK(mach);
3248       /* fall-through (for now) */
3249    case TGSI_OPCODE_BGNLOOP:
3250       /* push LoopMask and ContMasks */
3251       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3252       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3253       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3254       assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3255
3256       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3257       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3258       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3259       mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3260       mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3261       break;
3262
3263    case TGSI_OPCODE_ENDFOR:
3264       assert(mach->LoopCounterStackTop > 0);
3265       micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3266                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3267                 &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3268       /* update LoopMask */
3269       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3270          mach->LoopMask &= ~0x1;
3271       }
3272       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3273          mach->LoopMask &= ~0x2;
3274       }
3275       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3276          mach->LoopMask &= ~0x4;
3277       }
3278       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3279          mach->LoopMask &= ~0x8;
3280       }
3281       micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3282                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3283                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3284       assert(mach->LoopLabelStackTop > 0);
3285       inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3286       STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3287       /* Restore ContMask, but don't pop */
3288       assert(mach->ContStackTop > 0);
3289       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3290       UPDATE_EXEC_MASK(mach);
3291       if (mach->ExecMask) {
3292          /* repeat loop: jump to instruction just past BGNLOOP */
3293          assert(mach->LoopLabelStackTop > 0);
3294          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3295       }
3296       else {
3297          /* exit loop: pop LoopMask */
3298          assert(mach->LoopStackTop > 0);
3299          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3300          /* pop ContMask */
3301          assert(mach->ContStackTop > 0);
3302          mach->ContMask = mach->ContStack[--mach->ContStackTop];
3303          assert(mach->LoopLabelStackTop > 0);
3304          --mach->LoopLabelStackTop;
3305          assert(mach->LoopCounterStackTop > 0);
3306          --mach->LoopCounterStackTop;
3307
3308          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3309       }
3310       UPDATE_EXEC_MASK(mach);
3311       break;
3312
3313    case TGSI_OPCODE_ENDLOOP:
3314       /* Restore ContMask, but don't pop */
3315       assert(mach->ContStackTop > 0);
3316       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3317       UPDATE_EXEC_MASK(mach);
3318       if (mach->ExecMask) {
3319          /* repeat loop: jump to instruction just past BGNLOOP */
3320          assert(mach->LoopLabelStackTop > 0);
3321          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3322       }
3323       else {
3324          /* exit loop: pop LoopMask */
3325          assert(mach->LoopStackTop > 0);
3326          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3327          /* pop ContMask */
3328          assert(mach->ContStackTop > 0);
3329          mach->ContMask = mach->ContStack[--mach->ContStackTop];
3330          assert(mach->LoopLabelStackTop > 0);
3331          --mach->LoopLabelStackTop;
3332
3333          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3334       }
3335       UPDATE_EXEC_MASK(mach);
3336       break;
3337
3338    case TGSI_OPCODE_BRK:
3339       exec_break(mach);
3340       break;
3341
3342    case TGSI_OPCODE_CONT:
3343       /* turn off cont channels for each enabled exec channel */
3344       mach->ContMask &= ~mach->ExecMask;
3345       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3346       UPDATE_EXEC_MASK(mach);
3347       break;
3348
3349    case TGSI_OPCODE_BGNSUB:
3350       /* no-op */
3351       break;
3352
3353    case TGSI_OPCODE_ENDSUB:
3354       /*
3355        * XXX: This really should be a no-op. We should never reach this opcode.
3356        */
3357
3358       assert(mach->CallStackTop > 0);
3359       mach->CallStackTop--;
3360
3361       mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3362       mach->CondMask = mach->CondStack[mach->CondStackTop];
3363
3364       mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3365       mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3366
3367       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3368       mach->ContMask = mach->ContStack[mach->ContStackTop];
3369
3370       mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3371       mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3372
3373       mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3374       mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3375
3376       assert(mach->FuncStackTop > 0);
3377       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3378
3379       *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3380
3381       UPDATE_EXEC_MASK(mach);
3382       break;
3383
3384    case TGSI_OPCODE_NOP:
3385       break;
3386
3387    case TGSI_OPCODE_BREAKC:
3388       FETCH(&r[0], 0, CHAN_X);
3389       /* update CondMask */
3390       if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3391          mach->LoopMask &= ~0x1;
3392       }
3393       if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3394          mach->LoopMask &= ~0x2;
3395       }
3396       if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3397          mach->LoopMask &= ~0x4;
3398       }
3399       if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3400          mach->LoopMask &= ~0x8;
3401       }
3402       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3403       UPDATE_EXEC_MASK(mach);
3404       break;
3405
3406    case TGSI_OPCODE_F2I:
3407       exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3408       break;
3409
3410    case TGSI_OPCODE_IDIV:
3411       exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3412       break;
3413
3414    case TGSI_OPCODE_IMAX:
3415       exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3416       break;
3417
3418    case TGSI_OPCODE_IMIN:
3419       exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3420       break;
3421
3422    case TGSI_OPCODE_INEG:
3423       exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3424       break;
3425
3426    case TGSI_OPCODE_ISGE:
3427       exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3428       break;
3429
3430    case TGSI_OPCODE_ISHR:
3431       exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3432       break;
3433
3434    case TGSI_OPCODE_ISLT:
3435       exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3436       break;
3437
3438    case TGSI_OPCODE_F2U:
3439       exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3440       break;
3441
3442    case TGSI_OPCODE_U2F:
3443       exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3444       break;
3445
3446    case TGSI_OPCODE_UADD:
3447       exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3448       break;
3449
3450    case TGSI_OPCODE_UDIV:
3451       exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3452       break;
3453
3454    case TGSI_OPCODE_UMAD:
3455       exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3456       break;
3457
3458    case TGSI_OPCODE_UMAX:
3459       exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3460       break;
3461
3462    case TGSI_OPCODE_UMIN:
3463       exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3464       break;
3465
3466    case TGSI_OPCODE_UMOD:
3467       exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3468       break;
3469
3470    case TGSI_OPCODE_UMUL:
3471       exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3472       break;
3473
3474    case TGSI_OPCODE_USEQ:
3475       exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3476       break;
3477
3478    case TGSI_OPCODE_USGE:
3479       exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3480       break;
3481
3482    case TGSI_OPCODE_USHR:
3483       exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3484       break;
3485
3486    case TGSI_OPCODE_USLT:
3487       exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3488       break;
3489
3490    case TGSI_OPCODE_USNE:
3491       exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3492       break;
3493
3494    case TGSI_OPCODE_SWITCH:
3495       exec_switch(mach, inst);
3496       break;
3497
3498    case TGSI_OPCODE_CASE:
3499       exec_case(mach, inst);
3500       break;
3501
3502    case TGSI_OPCODE_DEFAULT:
3503       exec_default(mach);
3504       break;
3505
3506    case TGSI_OPCODE_ENDSWITCH:
3507       exec_endswitch(mach);
3508       break;
3509
3510    default:
3511       assert( 0 );
3512    }
3513 }
3514
3515
3516 #define DEBUG_EXECUTION 0
3517
3518
3519 /**
3520  * Run TGSI interpreter.
3521  * \return bitmask of "alive" quad components
3522  */
3523 uint
3524 tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3525 {
3526    uint i;
3527    int pc = 0;
3528
3529    mach->CondMask = 0xf;
3530    mach->LoopMask = 0xf;
3531    mach->ContMask = 0xf;
3532    mach->FuncMask = 0xf;
3533    mach->ExecMask = 0xf;
3534
3535    mach->Switch.mask = 0xf;
3536
3537    assert(mach->CondStackTop == 0);
3538    assert(mach->LoopStackTop == 0);
3539    assert(mach->ContStackTop == 0);
3540    assert(mach->SwitchStackTop == 0);
3541    assert(mach->BreakStackTop == 0);
3542    assert(mach->CallStackTop == 0);
3543
3544    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3545    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3546
3547    if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3548       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3549       mach->Primitives[0] = 0;
3550    }
3551
3552    for (i = 0; i < QUAD_SIZE; i++) {
3553       mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3554          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3555          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3556          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3557          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3558    }
3559
3560    /* execute declarations (interpolants) */
3561    for (i = 0; i < mach->NumDeclarations; i++) {
3562       exec_declaration( mach, mach->Declarations+i );
3563    }
3564
3565    {
3566 #if DEBUG_EXECUTION
3567       struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3568       struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3569       uint inst = 1;
3570
3571       memcpy(temps, mach->Temps, sizeof(temps));
3572       memcpy(outputs, mach->Outputs, sizeof(outputs));
3573 #endif
3574
3575       /* execute instructions, until pc is set to -1 */
3576       while (pc != -1) {
3577
3578 #if DEBUG_EXECUTION
3579          uint i;
3580
3581          tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3582 #endif
3583
3584          assert(pc < (int) mach->NumInstructions);
3585          exec_instruction(mach, mach->Instructions + pc, &pc);
3586
3587 #if DEBUG_EXECUTION
3588          for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3589             if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3590                uint j;
3591
3592                memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3593                debug_printf("TEMP[%2u] = ", i);
3594                for (j = 0; j < 4; j++) {
3595                   if (j > 0) {
3596                      debug_printf("           ");
3597                   }
3598                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3599                                temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
3600                                temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
3601                                temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
3602                                temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
3603                }
3604             }
3605          }
3606          for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3607             if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3608                uint j;
3609
3610                memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3611                debug_printf("OUT[%2u] =  ", i);
3612                for (j = 0; j < 4; j++) {
3613                   if (j > 0) {
3614                      debug_printf("           ");
3615                   }
3616                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3617                                outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
3618                                outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
3619                                outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
3620                                outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
3621                }
3622             }
3623          }
3624 #endif
3625       }
3626    }
3627
3628 #if 0
3629    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3630    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3631       /*
3632        * Scale back depth component.
3633        */
3634       for (i = 0; i < 4; i++)
3635          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3636    }
3637 #endif
3638
3639    assert(mach->CondStackTop == 0);
3640    assert(mach->LoopStackTop == 0);
3641    assert(mach->ContStackTop == 0);
3642    assert(mach->SwitchStackTop == 0);
3643    assert(mach->BreakStackTop == 0);
3644    assert(mach->CallStackTop == 0);
3645
3646    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3647 }