tgsi: convert CHECK_INF_OR_NAN to inline function
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_exec.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc. All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 /**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel),
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care: a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers. This is the ExecMask.
42 * See store_dest().
43 *
44 * The ExecMask is computed from three other masks (CondMask, LoopMask and
45 * ContMask) which are controlled by the flow control instructions (namely:
46 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
47 *
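 * For example (an illustration of the mechanism above): if an IF condition
 * holds only for the first and third components of a quad, CondMask ends up
 * as binary 0101; while inside the IF body, ExecMask (see UPDATE_EXEC_MASK)
 * is 0101 as well, assuming the other masks are all-enabled, so store_dest()
 * updates only those two components of any destination register.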
48 *
49 * Authors:
50 * Michal Krol
51 * Brian Paul
52 */
53
54 #include "pipe/p_compiler.h"
55 #include "pipe/p_state.h"
56 #include "pipe/p_shader_tokens.h"
57 #include "tgsi/tgsi_dump.h"
58 #include "tgsi/tgsi_parse.h"
59 #include "tgsi/tgsi_util.h"
60 #include "tgsi_exec.h"
61 #include "util/u_memory.h"
62 #include "util/u_math.h"
63
64
65 #define FAST_MATH 1
66
67 #define TILE_TOP_LEFT 0
68 #define TILE_TOP_RIGHT 1
69 #define TILE_BOTTOM_LEFT 2
70 #define TILE_BOTTOM_RIGHT 3
71
72 static void
73 micro_abs(union tgsi_exec_channel *dst,
74 const union tgsi_exec_channel *src)
75 {
76 dst->f[0] = fabsf(src->f[0]);
77 dst->f[1] = fabsf(src->f[1]);
78 dst->f[2] = fabsf(src->f[2]);
79 dst->f[3] = fabsf(src->f[3]);
80 }
81
82 static void
83 micro_arl(union tgsi_exec_channel *dst,
84 const union tgsi_exec_channel *src)
85 {
86 dst->i[0] = (int)floorf(src->f[0]);
87 dst->i[1] = (int)floorf(src->f[1]);
88 dst->i[2] = (int)floorf(src->f[2]);
89 dst->i[3] = (int)floorf(src->f[3]);
90 }
91
92 static void
93 micro_arr(union tgsi_exec_channel *dst,
94 const union tgsi_exec_channel *src)
95 {
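   /* like micro_arl() above, but round to nearest by adding 0.5 before
    * taking floorf()
    */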
96 dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97 dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98 dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99 dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100 }
101
102 static void
103 micro_ceil(union tgsi_exec_channel *dst,
104 const union tgsi_exec_channel *src)
105 {
106 dst->f[0] = ceilf(src->f[0]);
107 dst->f[1] = ceilf(src->f[1]);
108 dst->f[2] = ceilf(src->f[2]);
109 dst->f[3] = ceilf(src->f[3]);
110 }
111
112 static void
113 micro_cos(union tgsi_exec_channel *dst,
114 const union tgsi_exec_channel *src)
115 {
116 dst->f[0] = cosf(src->f[0]);
117 dst->f[1] = cosf(src->f[1]);
118 dst->f[2] = cosf(src->f[2]);
119 dst->f[3] = cosf(src->f[3]);
120 }
121
122 static void
123 micro_ddx(union tgsi_exec_channel *dst,
124 const union tgsi_exec_channel *src)
125 {
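   /* forward difference in x across the quad (bottom-right minus bottom-left),
    * replicated to all four channels; micro_ddy() below takes the analogous
    * difference in y
    */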
126 dst->f[0] =
127 dst->f[1] =
128 dst->f[2] =
129 dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
130 }
131
132 static void
133 micro_ddy(union tgsi_exec_channel *dst,
134 const union tgsi_exec_channel *src)
135 {
136 dst->f[0] =
137 dst->f[1] =
138 dst->f[2] =
139 dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
140 }
141
142 static void
143 micro_exp2(union tgsi_exec_channel *dst,
144 const union tgsi_exec_channel *src)
145 {
146 #if FAST_MATH
147 dst->f[0] = util_fast_exp2(src->f[0]);
148 dst->f[1] = util_fast_exp2(src->f[1]);
149 dst->f[2] = util_fast_exp2(src->f[2]);
150 dst->f[3] = util_fast_exp2(src->f[3]);
151 #else
152 #if DEBUG
153 /* Inf is okay for this instruction, so clamp it to silence assertions. */
154 uint i;
155 union tgsi_exec_channel clamped;
156
157 for (i = 0; i < 4; i++) {
158 if (src->f[i] > 127.99999f) {
159 clamped.f[i] = 127.99999f;
160 } else if (src->f[i] < -126.99999f) {
161 clamped.f[i] = -126.99999f;
162 } else {
163 clamped.f[i] = src->f[i];
164 }
165 }
166 src = &clamped;
167 #endif /* DEBUG */
168
169 dst->f[0] = powf(2.0f, src->f[0]);
170 dst->f[1] = powf(2.0f, src->f[1]);
171 dst->f[2] = powf(2.0f, src->f[2]);
172 dst->f[3] = powf(2.0f, src->f[3]);
173 #endif /* FAST_MATH */
174 }
175
176 static void
177 micro_flr(union tgsi_exec_channel *dst,
178 const union tgsi_exec_channel *src)
179 {
180 dst->f[0] = floorf(src->f[0]);
181 dst->f[1] = floorf(src->f[1]);
182 dst->f[2] = floorf(src->f[2]);
183 dst->f[3] = floorf(src->f[3]);
184 }
185
186 static void
187 micro_frc(union tgsi_exec_channel *dst,
188 const union tgsi_exec_channel *src)
189 {
190 dst->f[0] = src->f[0] - floorf(src->f[0]);
191 dst->f[1] = src->f[1] - floorf(src->f[1]);
192 dst->f[2] = src->f[2] - floorf(src->f[2]);
193 dst->f[3] = src->f[3] - floorf(src->f[3]);
194 }
195
196 static void
197 micro_iabs(union tgsi_exec_channel *dst,
198 const union tgsi_exec_channel *src)
199 {
200 dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
201 dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
202 dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
203 dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
204 }
205
206 static void
207 micro_ineg(union tgsi_exec_channel *dst,
208 const union tgsi_exec_channel *src)
209 {
210 dst->i[0] = -src->i[0];
211 dst->i[1] = -src->i[1];
212 dst->i[2] = -src->i[2];
213 dst->i[3] = -src->i[3];
214 }
215
216 static void
217 micro_lg2(union tgsi_exec_channel *dst,
218 const union tgsi_exec_channel *src)
219 {
220 #if FAST_MATH
221 dst->f[0] = util_fast_log2(src->f[0]);
222 dst->f[1] = util_fast_log2(src->f[1]);
223 dst->f[2] = util_fast_log2(src->f[2]);
224 dst->f[3] = util_fast_log2(src->f[3]);
225 #else
226 dst->f[0] = logf(src->f[0]) * 1.442695f;
227 dst->f[1] = logf(src->f[1]) * 1.442695f;
228 dst->f[2] = logf(src->f[2]) * 1.442695f;
229 dst->f[3] = logf(src->f[3]) * 1.442695f;
230 #endif
231 }
232
233 static void
234 micro_lrp(union tgsi_exec_channel *dst,
235 const union tgsi_exec_channel *src)
236 {
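   /* linear interpolation: equivalent to src[0]*src[1] + (1 - src[0])*src[2] */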
237 dst->f[0] = src[0].f[0] * (src[1].f[0] - src[2].f[0]) + src[2].f[0];
238 dst->f[1] = src[0].f[1] * (src[1].f[1] - src[2].f[1]) + src[2].f[1];
239 dst->f[2] = src[0].f[2] * (src[1].f[2] - src[2].f[2]) + src[2].f[2];
240 dst->f[3] = src[0].f[3] * (src[1].f[3] - src[2].f[3]) + src[2].f[3];
241 }
242
243 static void
244 micro_mad(union tgsi_exec_channel *dst,
245 const union tgsi_exec_channel *src)
246 {
247 dst->f[0] = src[0].f[0] * src[1].f[0] + src[2].f[0];
248 dst->f[1] = src[0].f[1] * src[1].f[1] + src[2].f[1];
249 dst->f[2] = src[0].f[2] * src[1].f[2] + src[2].f[2];
250 dst->f[3] = src[0].f[3] * src[1].f[3] + src[2].f[3];
251 }
252
253 static void
254 micro_mov(union tgsi_exec_channel *dst,
255 const union tgsi_exec_channel *src)
256 {
257 dst->u[0] = src->u[0];
258 dst->u[1] = src->u[1];
259 dst->u[2] = src->u[2];
260 dst->u[3] = src->u[3];
261 }
262
263 static void
264 micro_rcp(union tgsi_exec_channel *dst,
265 const union tgsi_exec_channel *src)
266 {
267 dst->f[0] = 1.0f / src->f[0];
268 dst->f[1] = 1.0f / src->f[1];
269 dst->f[2] = 1.0f / src->f[2];
270 dst->f[3] = 1.0f / src->f[3];
271 }
272
273 static void
274 micro_rnd(union tgsi_exec_channel *dst,
275 const union tgsi_exec_channel *src)
276 {
277 dst->f[0] = floorf(src->f[0] + 0.5f);
278 dst->f[1] = floorf(src->f[1] + 0.5f);
279 dst->f[2] = floorf(src->f[2] + 0.5f);
280 dst->f[3] = floorf(src->f[3] + 0.5f);
281 }
282
283 static void
284 micro_rsq(union tgsi_exec_channel *dst,
285 const union tgsi_exec_channel *src)
286 {
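   /* reciprocal square root; fabsf() keeps negative inputs from producing NaN */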
287 dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
288 dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
289 dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
290 dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
291 }
292
293 static void
294 micro_seq(union tgsi_exec_channel *dst,
295 const union tgsi_exec_channel *src)
296 {
297 dst->f[0] = src[0].f[0] == src[1].f[0] ? 1.0f : 0.0f;
298 dst->f[1] = src[0].f[1] == src[1].f[1] ? 1.0f : 0.0f;
299 dst->f[2] = src[0].f[2] == src[1].f[2] ? 1.0f : 0.0f;
300 dst->f[3] = src[0].f[3] == src[1].f[3] ? 1.0f : 0.0f;
301 }
302
303 static void
304 micro_sge(union tgsi_exec_channel *dst,
305 const union tgsi_exec_channel *src)
306 {
307 dst->f[0] = src[0].f[0] >= src[1].f[0] ? 1.0f : 0.0f;
308 dst->f[1] = src[0].f[1] >= src[1].f[1] ? 1.0f : 0.0f;
309 dst->f[2] = src[0].f[2] >= src[1].f[2] ? 1.0f : 0.0f;
310 dst->f[3] = src[0].f[3] >= src[1].f[3] ? 1.0f : 0.0f;
311 }
312
313 static void
314 micro_sgn(union tgsi_exec_channel *dst,
315 const union tgsi_exec_channel *src)
316 {
317 dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
318 dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
319 dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
320 dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
321 }
322
323 static void
324 micro_sgt(union tgsi_exec_channel *dst,
325 const union tgsi_exec_channel *src)
326 {
327 dst->f[0] = src[0].f[0] > src[1].f[0] ? 1.0f : 0.0f;
328 dst->f[1] = src[0].f[1] > src[1].f[1] ? 1.0f : 0.0f;
329 dst->f[2] = src[0].f[2] > src[1].f[2] ? 1.0f : 0.0f;
330 dst->f[3] = src[0].f[3] > src[1].f[3] ? 1.0f : 0.0f;
331 }
332
333 static void
334 micro_sin(union tgsi_exec_channel *dst,
335 const union tgsi_exec_channel *src)
336 {
337 dst->f[0] = sinf(src->f[0]);
338 dst->f[1] = sinf(src->f[1]);
339 dst->f[2] = sinf(src->f[2]);
340 dst->f[3] = sinf(src->f[3]);
341 }
342
343 static void
344 micro_sle(union tgsi_exec_channel *dst,
345 const union tgsi_exec_channel *src)
346 {
347 dst->f[0] = src[0].f[0] <= src[1].f[0] ? 1.0f : 0.0f;
348 dst->f[1] = src[0].f[1] <= src[1].f[1] ? 1.0f : 0.0f;
349 dst->f[2] = src[0].f[2] <= src[1].f[2] ? 1.0f : 0.0f;
350 dst->f[3] = src[0].f[3] <= src[1].f[3] ? 1.0f : 0.0f;
351 }
352
353 static void
354 micro_slt(union tgsi_exec_channel *dst,
355 const union tgsi_exec_channel *src)
356 {
357 dst->f[0] = src[0].f[0] < src[1].f[0] ? 1.0f : 0.0f;
358 dst->f[1] = src[0].f[1] < src[1].f[1] ? 1.0f : 0.0f;
359 dst->f[2] = src[0].f[2] < src[1].f[2] ? 1.0f : 0.0f;
360 dst->f[3] = src[0].f[3] < src[1].f[3] ? 1.0f : 0.0f;
361 }
362
363 static void
364 micro_sne(union tgsi_exec_channel *dst,
365 const union tgsi_exec_channel *src)
366 {
367 dst->f[0] = src[0].f[0] != src[1].f[0] ? 1.0f : 0.0f;
368 dst->f[1] = src[0].f[1] != src[1].f[1] ? 1.0f : 0.0f;
369 dst->f[2] = src[0].f[2] != src[1].f[2] ? 1.0f : 0.0f;
370 dst->f[3] = src[0].f[3] != src[1].f[3] ? 1.0f : 0.0f;
371 }
372
373 static void
374 micro_trunc(union tgsi_exec_channel *dst,
375 const union tgsi_exec_channel *src)
376 {
377 dst->f[0] = (float)(int)src->f[0];
378 dst->f[1] = (float)(int)src->f[1];
379 dst->f[2] = (float)(int)src->f[2];
380 dst->f[3] = (float)(int)src->f[3];
381 }
382
383
384 #define CHAN_X 0
385 #define CHAN_Y 1
386 #define CHAN_Z 2
387 #define CHAN_W 3
388
389 enum tgsi_exec_datatype {
390 TGSI_EXEC_DATA_FLOAT,
391 TGSI_EXEC_DATA_INT,
392 TGSI_EXEC_DATA_UINT
393 };
394
395 /*
396 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
397 */
398 #define TEMP_0_I TGSI_EXEC_TEMP_00000000_I
399 #define TEMP_0_C TGSI_EXEC_TEMP_00000000_C
400 #define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I
401 #define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C
402 #define TEMP_80_I TGSI_EXEC_TEMP_80000000_I
403 #define TEMP_80_C TGSI_EXEC_TEMP_80000000_C
404 #define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I
405 #define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C
406 #define TEMP_1_I TGSI_EXEC_TEMP_ONE_I
407 #define TEMP_1_C TGSI_EXEC_TEMP_ONE_C
408 #define TEMP_2_I TGSI_EXEC_TEMP_TWO_I
409 #define TEMP_2_C TGSI_EXEC_TEMP_TWO_C
410 #define TEMP_128_I TGSI_EXEC_TEMP_128_I
411 #define TEMP_128_C TGSI_EXEC_TEMP_128_C
412 #define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I
413 #define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C
414 #define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I
415 #define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C
416 #define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I
417 #define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C
418 #define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
419 #define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
420 #define TEMP_CC_I TGSI_EXEC_TEMP_CC_I
421 #define TEMP_CC_C TGSI_EXEC_TEMP_CC_C
422 #define TEMP_3_I TGSI_EXEC_TEMP_THREE_I
423 #define TEMP_3_C TGSI_EXEC_TEMP_THREE_C
424 #define TEMP_HALF_I TGSI_EXEC_TEMP_HALF_I
425 #define TEMP_HALF_C TGSI_EXEC_TEMP_HALF_C
426 #define TEMP_R0 TGSI_EXEC_TEMP_R0
427 #define TEMP_P0 TGSI_EXEC_TEMP_P0
428
429 #define IS_CHANNEL_ENABLED(INST, CHAN)\
430 ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
431
432 #define IS_CHANNEL_ENABLED2(INST, CHAN)\
433 ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))
434
435 #define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
436 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
437 if (IS_CHANNEL_ENABLED( INST, CHAN ))
438
439 #define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
440 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
441 if (IS_CHANNEL_ENABLED2( INST, CHAN ))
442
443
444 /** The execution mask is the AND of the conditional, loop, continue, switch and function-call masks */
445 #define UPDATE_EXEC_MASK(MACH) \
446 MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
447
448
449 static const union tgsi_exec_channel ZeroVec =
450 { { 0.0, 0.0, 0.0, 0.0 } };
451
452
453 /**
454 * Assert that none of the float values in 'chan' are infinite or NaN.
455 * NaN and Inf may occur normally during program execution and should
456 * not lead to crashes, etc. But when debugging, it's helpful to catch
457 * them.
458 */
459 static INLINE void
460 check_inf_or_nan(const union tgsi_exec_channel *chan)
461 {
462 assert(!util_is_inf_or_nan((chan)->f[0]));
463 assert(!util_is_inf_or_nan((chan)->f[1]));
464 assert(!util_is_inf_or_nan((chan)->f[2]));
465 assert(!util_is_inf_or_nan((chan)->f[3]));
466 }
467
468
469 #ifdef DEBUG
470 static void
471 print_chan(const char *msg, const union tgsi_exec_channel *chan)
472 {
473 debug_printf("%s = {%f, %f, %f, %f}\n",
474 msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
475 }
476 #endif
477
478
479 #ifdef DEBUG
480 static void
481 print_temp(const struct tgsi_exec_machine *mach, uint index)
482 {
483 const struct tgsi_exec_vector *tmp = &mach->Temps[index];
484 int i;
485 debug_printf("Temp[%u] =\n", index);
486 for (i = 0; i < 4; i++) {
487 debug_printf(" %c: { %f, %f, %f, %f }\n",
488 "XYZW"[i],
489 tmp->xyzw[i].f[0],
490 tmp->xyzw[i].f[1],
491 tmp->xyzw[i].f[2],
492 tmp->xyzw[i].f[3]);
493 }
494 }
495 #endif
496
497
498 /**
499 * Check if there's a potential src/dst register data dependency when
500 * using SOA execution.
501 * Example:
502 * MOV T, T.yxwz;
503 * This would expand into:
504 * MOV t0, t1;
505 * MOV t1, t0;
506 * MOV t2, t3;
507 * MOV t3, t2;
508 * The second instruction will have the wrong value for t0 if executed as-is.
509 */
510 boolean
511 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
512 {
513 uint i, chan;
514
515 uint writemask = inst->Dst[0].Register.WriteMask;
516 if (writemask == TGSI_WRITEMASK_X ||
517 writemask == TGSI_WRITEMASK_Y ||
518 writemask == TGSI_WRITEMASK_Z ||
519 writemask == TGSI_WRITEMASK_W ||
520 writemask == TGSI_WRITEMASK_NONE) {
521 /* no chance of data dependency */
522 return FALSE;
523 }
524
525 /* loop over src regs */
526 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
527 if ((inst->Src[i].Register.File ==
528 inst->Dst[0].Register.File) &&
529 (inst->Src[i].Register.Index ==
530 inst->Dst[0].Register.Index)) {
531 /* loop over dest channels */
532 uint channelsWritten = 0x0;
533 FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
534 /* check if we're reading a channel that's been written */
535 uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
536 if (channelsWritten & (1 << swizzle)) {
537 return TRUE;
538 }
539
540 channelsWritten |= (1 << chan);
541 }
542 }
543 }
544 return FALSE;
545 }
546
547
548 /**
549 * Initialize machine state by expanding tokens to full instructions,
550 * allocating temporary storage, setting up constants, etc.
551 * After this, we can call tgsi_exec_machine_run() many times.
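 *
 * Typical usage, as an illustrative sketch (not verbatim code from this file):
 *
 *    mach = tgsi_exec_machine_create();
 *    tgsi_exec_machine_bind_shader(mach, tokens, numSamplers, samplers);
 *    ... set up mach->Consts, mach->Inputs, interpolation coefs ...
 *    tgsi_exec_machine_run(mach);
 *    tgsi_exec_machine_destroy(mach);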
552 */
553 void
554 tgsi_exec_machine_bind_shader(
555 struct tgsi_exec_machine *mach,
556 const struct tgsi_token *tokens,
557 uint numSamplers,
558 struct tgsi_sampler **samplers)
559 {
560 uint k;
561 struct tgsi_parse_context parse;
562 struct tgsi_exec_labels *labels = &mach->Labels;
563 struct tgsi_full_instruction *instructions;
564 struct tgsi_full_declaration *declarations;
565 uint maxInstructions = 10, numInstructions = 0;
566 uint maxDeclarations = 10, numDeclarations = 0;
567 uint instno = 0;
568
569 #if 0
570 tgsi_dump(tokens, 0);
571 #endif
572
573 util_init_math();
574
575 mach->Tokens = tokens;
576 mach->Samplers = samplers;
577
578 k = tgsi_parse_init (&parse, mach->Tokens);
579 if (k != TGSI_PARSE_OK) {
580 debug_printf( "Problem parsing!\n" );
581 return;
582 }
583
584 mach->Processor = parse.FullHeader.Processor.Processor;
585 mach->ImmLimit = 0;
586 labels->count = 0;
587
588 declarations = (struct tgsi_full_declaration *)
589 MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
590
591 if (!declarations) {
592 return;
593 }
594
595 instructions = (struct tgsi_full_instruction *)
596 MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
597
598 if (!instructions) {
599 FREE( declarations );
600 return;
601 }
602
603 while( !tgsi_parse_end_of_tokens( &parse ) ) {
604 uint pointer = parse.Position;
605 uint i;
606
607 tgsi_parse_token( &parse );
608 switch( parse.FullToken.Token.Type ) {
609 case TGSI_TOKEN_TYPE_DECLARATION:
610 /* save expanded declaration */
611 if (numDeclarations == maxDeclarations) {
612 declarations = REALLOC(declarations,
613 maxDeclarations
614 * sizeof(struct tgsi_full_declaration),
615 (maxDeclarations + 10)
616 * sizeof(struct tgsi_full_declaration));
617 maxDeclarations += 10;
618 }
619 if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
620 unsigned reg;
621 for (reg = parse.FullToken.FullDeclaration.Range.First;
622 reg <= parse.FullToken.FullDeclaration.Range.Last;
623 ++reg) {
624 ++mach->NumOutputs;
625 }
626 }
627 memcpy(declarations + numDeclarations,
628 &parse.FullToken.FullDeclaration,
629 sizeof(declarations[0]));
630 numDeclarations++;
631 break;
632
633 case TGSI_TOKEN_TYPE_IMMEDIATE:
634 {
635 uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
636 assert( size <= 4 );
637 assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
638
639 for( i = 0; i < size; i++ ) {
640 mach->Imms[mach->ImmLimit][i] =
641 parse.FullToken.FullImmediate.u[i].Float;
642 }
643 mach->ImmLimit += 1;
644 }
645 break;
646
647 case TGSI_TOKEN_TYPE_INSTRUCTION:
648 assert( labels->count < MAX_LABELS );
649
650 labels->labels[labels->count][0] = instno;
651 labels->labels[labels->count][1] = pointer;
652 labels->count++;
653
654 /* save expanded instruction */
655 if (numInstructions == maxInstructions) {
656 instructions = REALLOC(instructions,
657 maxInstructions
658 * sizeof(struct tgsi_full_instruction),
659 (maxInstructions + 10)
660 * sizeof(struct tgsi_full_instruction));
661 maxInstructions += 10;
662 }
663
664 memcpy(instructions + numInstructions,
665 &parse.FullToken.FullInstruction,
666 sizeof(instructions[0]));
667
668 numInstructions++;
669 break;
670
671 case TGSI_TOKEN_TYPE_PROPERTY:
672 break;
673
674 default:
675 assert( 0 );
676 }
677 }
678 tgsi_parse_free (&parse);
679
680 if (mach->Declarations) {
681 FREE( mach->Declarations );
682 }
683 mach->Declarations = declarations;
684 mach->NumDeclarations = numDeclarations;
685
686 if (mach->Instructions) {
687 FREE( mach->Instructions );
688 }
689 mach->Instructions = instructions;
690 mach->NumInstructions = numInstructions;
691 }
692
693
694 struct tgsi_exec_machine *
695 tgsi_exec_machine_create( void )
696 {
697 struct tgsi_exec_machine *mach;
698 uint i;
699
700 mach = align_malloc( sizeof *mach, 16 );
701 if (!mach)
702 goto fail;
703
704 memset(mach, 0, sizeof(*mach));
705
706 mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
707 mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
708 mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
709
710 /* Setup constants. */
711 for( i = 0; i < 4; i++ ) {
712 mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
713 mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
714 mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
715 mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
716 mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
717 mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
718 mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
719 mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
720 mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
721 mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
722 }
723
724 #ifdef DEBUG
725 /* silence warnings */
726 (void) print_chan;
727 (void) print_temp;
728 #endif
729
730 return mach;
731
732 fail:
733 align_free(mach);
734 return NULL;
735 }
736
737
738 void
739 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
740 {
741 if (mach) {
742 FREE(mach->Instructions);
743 FREE(mach->Declarations);
744 }
745
746 align_free(mach);
747 }
748
749 static void
750 micro_add(
751 union tgsi_exec_channel *dst,
752 const union tgsi_exec_channel *src0,
753 const union tgsi_exec_channel *src1 )
754 {
755 dst->f[0] = src0->f[0] + src1->f[0];
756 dst->f[1] = src0->f[1] + src1->f[1];
757 dst->f[2] = src0->f[2] + src1->f[2];
758 dst->f[3] = src0->f[3] + src1->f[3];
759 }
760
761 static void
762 micro_div(
763 union tgsi_exec_channel *dst,
764 const union tgsi_exec_channel *src0,
765 const union tgsi_exec_channel *src1 )
766 {
767 if (src1->f[0] != 0) {
768 dst->f[0] = src0->f[0] / src1->f[0];
769 }
770 if (src1->f[1] != 0) {
771 dst->f[1] = src0->f[1] / src1->f[1];
772 }
773 if (src1->f[2] != 0) {
774 dst->f[2] = src0->f[2] / src1->f[2];
775 }
776 if (src1->f[3] != 0) {
777 dst->f[3] = src0->f[3] / src1->f[3];
778 }
779 }
780
781 static void
782 micro_float_clamp(union tgsi_exec_channel *dst,
783 const union tgsi_exec_channel *src)
784 {
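   /* clamp each component's magnitude into [5.42101e-20, 1.884467e+19]
    * while preserving sign; note that a zero input comes out as -5.42101e-20
    */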
785 uint i;
786
787 for (i = 0; i < 4; i++) {
788 if (src->f[i] > 0.0f) {
789 if (src->f[i] > 1.884467e+019f)
790 dst->f[i] = 1.884467e+019f;
791 else if (src->f[i] < 5.42101e-020f)
792 dst->f[i] = 5.42101e-020f;
793 else
794 dst->f[i] = src->f[i];
795 }
796 else {
797 if (src->f[i] < -1.884467e+019f)
798 dst->f[i] = -1.884467e+019f;
799 else if (src->f[i] > -5.42101e-020f)
800 dst->f[i] = -5.42101e-020f;
801 else
802 dst->f[i] = src->f[i];
803 }
804 }
805 }
806
807 static void
808 micro_lt(
809 union tgsi_exec_channel *dst,
810 const union tgsi_exec_channel *src0,
811 const union tgsi_exec_channel *src1,
812 const union tgsi_exec_channel *src2,
813 const union tgsi_exec_channel *src3 )
814 {
815 dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
816 dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
817 dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
818 dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
819 }
820
821 static void
822 micro_max(
823 union tgsi_exec_channel *dst,
824 const union tgsi_exec_channel *src0,
825 const union tgsi_exec_channel *src1 )
826 {
827 dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
828 dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
829 dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
830 dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
831 }
832
833 static void
834 micro_min(
835 union tgsi_exec_channel *dst,
836 const union tgsi_exec_channel *src0,
837 const union tgsi_exec_channel *src1 )
838 {
839 dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
840 dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
841 dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
842 dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
843 }
844
845 static void
846 micro_mul(
847 union tgsi_exec_channel *dst,
848 const union tgsi_exec_channel *src0,
849 const union tgsi_exec_channel *src1 )
850 {
851 dst->f[0] = src0->f[0] * src1->f[0];
852 dst->f[1] = src0->f[1] * src1->f[1];
853 dst->f[2] = src0->f[2] * src1->f[2];
854 dst->f[3] = src0->f[3] * src1->f[3];
855 }
856
857 #if 0
858 static void
859 micro_imul64(
860 union tgsi_exec_channel *dst0,
861 union tgsi_exec_channel *dst1,
862 const union tgsi_exec_channel *src0,
863 const union tgsi_exec_channel *src1 )
864 {
865 dst1->i[0] = src0->i[0] * src1->i[0];
866 dst1->i[1] = src0->i[1] * src1->i[1];
867 dst1->i[2] = src0->i[2] * src1->i[2];
868 dst1->i[3] = src0->i[3] * src1->i[3];
869 dst0->i[0] = 0;
870 dst0->i[1] = 0;
871 dst0->i[2] = 0;
872 dst0->i[3] = 0;
873 }
874 #endif
875
876 #if 0
877 static void
878 micro_umul64(
879 union tgsi_exec_channel *dst0,
880 union tgsi_exec_channel *dst1,
881 const union tgsi_exec_channel *src0,
882 const union tgsi_exec_channel *src1 )
883 {
884 dst1->u[0] = src0->u[0] * src1->u[0];
885 dst1->u[1] = src0->u[1] * src1->u[1];
886 dst1->u[2] = src0->u[2] * src1->u[2];
887 dst1->u[3] = src0->u[3] * src1->u[3];
888 dst0->u[0] = 0;
889 dst0->u[1] = 0;
890 dst0->u[2] = 0;
891 dst0->u[3] = 0;
892 }
893 #endif
894
895
896 #if 0
897 static void
898 micro_movc(
899 union tgsi_exec_channel *dst,
900 const union tgsi_exec_channel *src0,
901 const union tgsi_exec_channel *src1,
902 const union tgsi_exec_channel *src2 )
903 {
904 dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
905 dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
906 dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
907 dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
908 }
909 #endif
910
911 static void
912 micro_neg(
913 union tgsi_exec_channel *dst,
914 const union tgsi_exec_channel *src )
915 {
916 dst->f[0] = -src->f[0];
917 dst->f[1] = -src->f[1];
918 dst->f[2] = -src->f[2];
919 dst->f[3] = -src->f[3];
920 }
921
922 static void
923 micro_pow(
924 union tgsi_exec_channel *dst,
925 const union tgsi_exec_channel *src0,
926 const union tgsi_exec_channel *src1 )
927 {
928 #if FAST_MATH
929 dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
930 dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
931 dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
932 dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
933 #else
934 dst->f[0] = powf( src0->f[0], src1->f[0] );
935 dst->f[1] = powf( src0->f[1], src1->f[1] );
936 dst->f[2] = powf( src0->f[2], src1->f[2] );
937 dst->f[3] = powf( src0->f[3], src1->f[3] );
938 #endif
939 }
940
941 static void
942 micro_sqrt( union tgsi_exec_channel *dst,
943 const union tgsi_exec_channel *src )
944 {
945 dst->f[0] = sqrtf( src->f[0] );
946 dst->f[1] = sqrtf( src->f[1] );
947 dst->f[2] = sqrtf( src->f[2] );
948 dst->f[3] = sqrtf( src->f[3] );
949 }
950
951 static void
952 micro_sub(
953 union tgsi_exec_channel *dst,
954 const union tgsi_exec_channel *src0,
955 const union tgsi_exec_channel *src1 )
956 {
957 dst->f[0] = src0->f[0] - src1->f[0];
958 dst->f[1] = src0->f[1] - src1->f[1];
959 dst->f[2] = src0->f[2] - src1->f[2];
960 dst->f[3] = src0->f[3] - src1->f[3];
961 }
962
963 static void
964 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
965 const uint file,
966 const uint swizzle,
967 const union tgsi_exec_channel *index,
968 const union tgsi_exec_channel *index2D,
969 union tgsi_exec_channel *chan)
970 {
971 uint i;
972
973 switch (file) {
974 case TGSI_FILE_CONSTANT:
975 for (i = 0; i < QUAD_SIZE; i++) {
976 assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
977 assert(mach->Consts[index2D->i[i]]);
978
979 if (index->i[i] < 0) {
980 chan->u[i] = 0;
981 } else {
982 const uint *p = (const uint *)mach->Consts[index2D->i[i]];
983
984 chan->u[i] = p[index->i[i] * 4 + swizzle];
985 }
986 }
987 break;
988
989 case TGSI_FILE_INPUT:
990 case TGSI_FILE_SYSTEM_VALUE:
991 for (i = 0; i < QUAD_SIZE; i++) {
992 /* XXX: 2D indexing */
993 chan->u[i] = mach->Inputs[index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i]].xyzw[swizzle].u[i];
994 }
995 break;
996
997 case TGSI_FILE_TEMPORARY:
998 for (i = 0; i < QUAD_SIZE; i++) {
999 assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1000 assert(index2D->i[i] == 0);
1001
1002 chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1003 }
1004 break;
1005
1006 case TGSI_FILE_IMMEDIATE:
1007 for (i = 0; i < QUAD_SIZE; i++) {
1008 assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1009 assert(index2D->i[i] == 0);
1010
1011 chan->f[i] = mach->Imms[index->i[i]][swizzle];
1012 }
1013 break;
1014
1015 case TGSI_FILE_ADDRESS:
1016 for (i = 0; i < QUAD_SIZE; i++) {
1017 assert(index->i[i] >= 0);
1018 assert(index2D->i[i] == 0);
1019
1020 chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1021 }
1022 break;
1023
1024 case TGSI_FILE_PREDICATE:
1025 for (i = 0; i < QUAD_SIZE; i++) {
1026 assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1027 assert(index2D->i[i] == 0);
1028
1029 chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1030 }
1031 break;
1032
1033 case TGSI_FILE_OUTPUT:
1034 /* vertex/fragment output vars can be read too */
1035 for (i = 0; i < QUAD_SIZE; i++) {
1036 assert(index->i[i] >= 0);
1037 assert(index2D->i[i] == 0);
1038
1039 chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1040 }
1041 break;
1042
1043 default:
1044 assert(0);
1045 for (i = 0; i < QUAD_SIZE; i++) {
1046 chan->u[i] = 0;
1047 }
1048 }
1049 }
1050
1051 static void
1052 fetch_source(const struct tgsi_exec_machine *mach,
1053 union tgsi_exec_channel *chan,
1054 const struct tgsi_full_src_register *reg,
1055 const uint chan_index,
1056 enum tgsi_exec_datatype src_datatype)
1057 {
1058 union tgsi_exec_channel index;
1059 union tgsi_exec_channel index2D;
1060 uint swizzle;
1061
1062 /* We start with a direct index into a register file.
1063 *
1064 * file[1],
1065 * where:
1066 * file = Register.File
1067 * [1] = Register.Index
1068 */
1069 index.i[0] =
1070 index.i[1] =
1071 index.i[2] =
1072 index.i[3] = reg->Register.Index;
1073
1074 /* There is an extra source register that indirectly subscripts
1075 * a register file. The direct index now becomes an offset
1076 * that is being added to the indirect register.
1077 *
1078 * file[ind[2].x+1],
1079 * where:
1080 * ind = Indirect.File
1081 * [2] = Indirect.Index
1082 * .x = Indirect.SwizzleX
1083 */
1084 if (reg->Register.Indirect) {
1085 union tgsi_exec_channel index2;
1086 union tgsi_exec_channel indir_index;
1087 const uint execmask = mach->ExecMask;
1088 uint i;
1089
1090 /* which address register (always zero now) */
1091 index2.i[0] =
1092 index2.i[1] =
1093 index2.i[2] =
1094 index2.i[3] = reg->Indirect.Index;
1095
1096 /* get current value of address register[swizzle] */
1097 swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1098 fetch_src_file_channel(mach,
1099 reg->Indirect.File,
1100 swizzle,
1101 &index2,
1102 &ZeroVec,
1103 &indir_index);
1104
1105 /* add value of address register to the offset */
1106 index.i[0] += indir_index.i[0];
1107 index.i[1] += indir_index.i[1];
1108 index.i[2] += indir_index.i[2];
1109 index.i[3] += indir_index.i[3];
1110
1111 /* for disabled execution channels, zero-out the index to
1112 * avoid using a potential garbage value.
1113 */
1114 for (i = 0; i < QUAD_SIZE; i++) {
1115 if ((execmask & (1 << i)) == 0)
1116 index.i[i] = 0;
1117 }
1118 }
1119
1120 /* There is an extra source register that is a second
1121 * subscript to a register file. Effectively it means that
1122 * the register file is actually a 2D array of registers.
1123 *
1124 * file[3][1],
1125 * where:
1126 * [3] = Dimension.Index
1127 */
1128 if (reg->Register.Dimension) {
1129 index2D.i[0] =
1130 index2D.i[1] =
1131 index2D.i[2] =
1132 index2D.i[3] = reg->Dimension.Index;
1133
1134 /* Again, the second subscript index can be addressed indirectly
1135 * identically to the first one.
1136 * Nothing stops us from indirectly addressing the indirect register,
1137 * but there is no need for that, so we won't exercise it.
1138 *
1139 * file[ind[4].y+3][1],
1140 * where:
1141 * ind = DimIndirect.File
1142 * [4] = DimIndirect.Index
1143 * .y = DimIndirect.SwizzleX
1144 */
1145 if (reg->Dimension.Indirect) {
1146 union tgsi_exec_channel index2;
1147 union tgsi_exec_channel indir_index;
1148 const uint execmask = mach->ExecMask;
1149 uint i;
1150
1151 index2.i[0] =
1152 index2.i[1] =
1153 index2.i[2] =
1154 index2.i[3] = reg->DimIndirect.Index;
1155
1156 swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1157 fetch_src_file_channel(mach,
1158 reg->DimIndirect.File,
1159 swizzle,
1160 &index2,
1161 &ZeroVec,
1162 &indir_index);
1163
1164 index2D.i[0] += indir_index.i[0];
1165 index2D.i[1] += indir_index.i[1];
1166 index2D.i[2] += indir_index.i[2];
1167 index2D.i[3] += indir_index.i[3];
1168
1169 /* for disabled execution channels, zero-out the index to
1170 * avoid using a potential garbage value.
1171 */
1172 for (i = 0; i < QUAD_SIZE; i++) {
1173 if ((execmask & (1 << i)) == 0) {
1174 index2D.i[i] = 0;
1175 }
1176 }
1177 }
1178
1179 /* If by any chance there was a need for a 3D array of register
1180 * files, we would have to check whether Dimension is followed
1181 * by a dimension register and continue the saga.
1182 */
1183 } else {
1184 index2D.i[0] =
1185 index2D.i[1] =
1186 index2D.i[2] =
1187 index2D.i[3] = 0;
1188 }
1189
1190 swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1191 fetch_src_file_channel(mach,
1192 reg->Register.File,
1193 swizzle,
1194 &index,
1195 &index2D,
1196 chan);
1197
1198 if (reg->Register.Absolute) {
1199 if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1200 micro_abs(chan, chan);
1201 } else {
1202 micro_iabs(chan, chan);
1203 }
1204 }
1205
1206 if (reg->Register.Negate) {
1207 if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1208 micro_neg(chan, chan);
1209 } else {
1210 micro_ineg(chan, chan);
1211 }
1212 }
1213 }
1214
1215 static void
1216 store_dest(struct tgsi_exec_machine *mach,
1217 const union tgsi_exec_channel *chan,
1218 const struct tgsi_full_dst_register *reg,
1219 const struct tgsi_full_instruction *inst,
1220 uint chan_index,
1221 enum tgsi_exec_datatype dst_datatype)
1222 {
1223 uint i;
1224 union tgsi_exec_channel null;
1225 union tgsi_exec_channel *dst;
1226 uint execmask = mach->ExecMask;
1227 int offset = 0; /* indirection offset */
1228 int index;
1229
1230 /* for debugging */
1231 if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1232 check_inf_or_nan(chan);
1233 }
1234
1235 /* There is an extra source register that indirectly subscripts
1236 * a register file. The direct index now becomes an offset
1237 * that is being added to the indirect register.
1238 *
1239 * file[ind[2].x+1],
1240 * where:
1241 * ind = Indirect.File
1242 * [2] = Indirect.Index
1243 * .x = Indirect.SwizzleX
1244 */
1245 if (reg->Register.Indirect) {
1246 union tgsi_exec_channel index;
1247 union tgsi_exec_channel indir_index;
1248 uint swizzle;
1249
1250 /* which address register (always zero for now) */
1251 index.i[0] =
1252 index.i[1] =
1253 index.i[2] =
1254 index.i[3] = reg->Indirect.Index;
1255
1256 /* get current value of address register[swizzle] */
1257 swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1258
1259 /* fetch values from the address/indirection register */
1260 fetch_src_file_channel(mach,
1261 reg->Indirect.File,
1262 swizzle,
1263 &index,
1264 &ZeroVec,
1265 &indir_index);
1266
1267 /* save indirection offset */
1268 offset = indir_index.i[0];
1269 }
1270
1271 switch (reg->Register.File) {
1272 case TGSI_FILE_NULL:
1273 dst = &null;
1274 break;
1275
1276 case TGSI_FILE_OUTPUT:
1277 index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1278 + reg->Register.Index;
1279 dst = &mach->Outputs[offset + index].xyzw[chan_index];
1280 #if 0
1281 if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1282 fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1283 for (i = 0; i < QUAD_SIZE; i++)
1284 if (execmask & (1 << i))
1285 fprintf(stderr, "%f, ", chan->f[i]);
1286 fprintf(stderr, ")\n");
1287 }
1288 #endif
1289 break;
1290
1291 case TGSI_FILE_TEMPORARY:
1292 index = reg->Register.Index;
1293 assert( index < TGSI_EXEC_NUM_TEMPS );
1294 dst = &mach->Temps[offset + index].xyzw[chan_index];
1295 break;
1296
1297 case TGSI_FILE_ADDRESS:
1298 index = reg->Register.Index;
1299 dst = &mach->Addrs[index].xyzw[chan_index];
1300 break;
1301
1302 case TGSI_FILE_LOOP:
1303 assert(reg->Register.Index == 0);
1304 assert(mach->LoopCounterStackTop > 0);
1305 assert(chan_index == CHAN_X);
1306 dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1307 break;
1308
1309 case TGSI_FILE_PREDICATE:
1310 index = reg->Register.Index;
1311 assert(index < TGSI_EXEC_NUM_PREDS);
1312 dst = &mach->Predicates[index].xyzw[chan_index];
1313 break;
1314
1315 default:
1316 assert( 0 );
1317 return;
1318 }
1319
1320 if (inst->Instruction.Predicate) {
1321 uint swizzle;
1322 union tgsi_exec_channel *pred;
1323
1324 switch (chan_index) {
1325 case CHAN_X:
1326 swizzle = inst->Predicate.SwizzleX;
1327 break;
1328 case CHAN_Y:
1329 swizzle = inst->Predicate.SwizzleY;
1330 break;
1331 case CHAN_Z:
1332 swizzle = inst->Predicate.SwizzleZ;
1333 break;
1334 case CHAN_W:
1335 swizzle = inst->Predicate.SwizzleW;
1336 break;
1337 default:
1338 assert(0);
1339 return;
1340 }
1341
1342 assert(inst->Predicate.Index == 0);
1343
1344 pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1345
1346 if (inst->Predicate.Negate) {
1347 for (i = 0; i < QUAD_SIZE; i++) {
1348 if (pred->u[i]) {
1349 execmask &= ~(1 << i);
1350 }
1351 }
1352 } else {
1353 for (i = 0; i < QUAD_SIZE; i++) {
1354 if (!pred->u[i]) {
1355 execmask &= ~(1 << i);
1356 }
1357 }
1358 }
1359 }
1360
1361 switch (inst->Instruction.Saturate) {
1362 case TGSI_SAT_NONE:
1363 for (i = 0; i < QUAD_SIZE; i++)
1364 if (execmask & (1 << i))
1365 dst->i[i] = chan->i[i];
1366 break;
1367
1368 case TGSI_SAT_ZERO_ONE:
1369 for (i = 0; i < QUAD_SIZE; i++)
1370 if (execmask & (1 << i)) {
1371 if (chan->f[i] < 0.0f)
1372 dst->f[i] = 0.0f;
1373 else if (chan->f[i] > 1.0f)
1374 dst->f[i] = 1.0f;
1375 else
1376 dst->i[i] = chan->i[i];
1377 }
1378 break;
1379
1380 case TGSI_SAT_MINUS_PLUS_ONE:
1381 for (i = 0; i < QUAD_SIZE; i++)
1382 if (execmask & (1 << i)) {
1383 if (chan->f[i] < -1.0f)
1384 dst->f[i] = -1.0f;
1385 else if (chan->f[i] > 1.0f)
1386 dst->f[i] = 1.0f;
1387 else
1388 dst->i[i] = chan->i[i];
1389 }
1390 break;
1391
1392 default:
1393 assert( 0 );
1394 }
1395 }
1396
1397 #define FETCH(VAL,INDEX,CHAN)\
1398 fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)
1399
1400 #define STORE(VAL,INDEX,CHAN)\
1401 store_dest(mach, VAL, &inst->Dst[INDEX], inst, CHAN, TGSI_EXEC_DATA_FLOAT)
1402
1403
1404 /**
1405 * Execute ARB-style KIL which is predicated by a src register.
1406 * Kill fragment if any of the four values is less than zero.
1407 */
1408 static void
1409 exec_kil(struct tgsi_exec_machine *mach,
1410 const struct tgsi_full_instruction *inst)
1411 {
1412 uint uniquemask;
1413 uint chan_index;
1414 uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1415 union tgsi_exec_channel r[1];
1416
1417 /* This mask stores component bits that were already tested. */
1418 uniquemask = 0;
1419
1420 for (chan_index = 0; chan_index < 4; chan_index++)
1421 {
1422 uint swizzle;
1423 uint i;
1424
1425 /* unswizzle channel */
1426 swizzle = tgsi_util_get_full_src_register_swizzle (
1427 &inst->Src[0],
1428 chan_index);
1429
1430 /* skip this component if it has already been tested */
1431 if (uniquemask & (1 << swizzle))
1432 continue;
1433 uniquemask |= 1 << swizzle;
1434
1435 FETCH(&r[0], 0, chan_index);
1436 for (i = 0; i < 4; i++)
1437 if (r[0].f[i] < 0.0f)
1438 kilmask |= 1 << i;
1439 }
1440
1441 mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1442 }
1443
1444 /**
1445 * Execute NVIDIA-style KIL which is predicated by a condition code.
1446 * Kill fragment if the condition code is TRUE.
1447 */
1448 static void
1449 exec_kilp(struct tgsi_exec_machine *mach,
1450 const struct tgsi_full_instruction *inst)
1451 {
1452 uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1453
1454 /* "unconditional" kil */
1455 kilmask = mach->ExecMask;
1456 mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1457 }
1458
1459 static void
1460 emit_vertex(struct tgsi_exec_machine *mach)
1461 {
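   /* if any channel is active, advance the output-register base (TEMP_OUTPUT)
    * by one vertex's worth of outputs and count the vertex against the
    * current primitive
    */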
1462 /* FIXME: check for exec mask correctly
1463 unsigned i;
1464 for (i = 0; i < QUAD_SIZE; ++i) {
1465 if ((mach->ExecMask & (1 << i)))
1466 */
1467 if (mach->ExecMask) {
1468 mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1469 mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1470 }
1471 }
1472
1473 static void
1474 emit_primitive(struct tgsi_exec_machine *mach)
1475 {
1476 unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1477 /* FIXME: check for exec mask correctly
1478 unsigned i;
1479 for (i = 0; i < QUAD_SIZE; ++i) {
1480 if ((mach->ExecMask & (1 << i)))
1481 */
1482 if (mach->ExecMask) {
1483 ++(*prim_count);
1484 debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1485 mach->Primitives[*prim_count] = 0;
1486 }
1487 }
1488
1489 /*
1490 * Fetch four texture samples using STR texture coordinates.
1491 */
1492 static void
1493 fetch_texel( struct tgsi_sampler *sampler,
1494 const union tgsi_exec_channel *s,
1495 const union tgsi_exec_channel *t,
1496 const union tgsi_exec_channel *p,
1497 const union tgsi_exec_channel *c0,
1498 enum tgsi_sampler_control control,
1499 union tgsi_exec_channel *r,
1500 union tgsi_exec_channel *g,
1501 union tgsi_exec_channel *b,
1502 union tgsi_exec_channel *a )
1503 {
1504 uint j;
1505 float rgba[NUM_CHANNELS][QUAD_SIZE];
1506
1507 sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1508
1509 for (j = 0; j < 4; j++) {
1510 r->f[j] = rgba[0][j];
1511 g->f[j] = rgba[1][j];
1512 b->f[j] = rgba[2][j];
1513 a->f[j] = rgba[3][j];
1514 }
1515 }
1516
1517
1518 #define TEX_MODIFIER_NONE 0
1519 #define TEX_MODIFIER_PROJECTED 1
1520 #define TEX_MODIFIER_LOD_BIAS 2
1521 #define TEX_MODIFIER_EXPLICIT_LOD 3
1522
1523
1524 static void
1525 exec_tex(struct tgsi_exec_machine *mach,
1526 const struct tgsi_full_instruction *inst,
1527 uint modifier)
1528 {
1529 const uint unit = inst->Src[1].Register.Index;
1530 union tgsi_exec_channel r[4];
1531 const union tgsi_exec_channel *lod = &ZeroVec;
1532 enum tgsi_sampler_control control;
1533 uint chan_index;
1534
1535 if (modifier != TEX_MODIFIER_NONE) {
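      /* the W coordinate carries either the projective divisor or the
       * LOD bias / explicit LOD
       */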
1536 FETCH(&r[3], 0, CHAN_W);
1537 if (modifier != TEX_MODIFIER_PROJECTED) {
1538 lod = &r[3];
1539 }
1540 }
1541
1542 if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1543 control = tgsi_sampler_lod_explicit;
1544 } else {
1545 control = tgsi_sampler_lod_bias;
1546 }
1547
1548 switch (inst->Texture.Texture) {
1549 case TGSI_TEXTURE_1D:
1550 case TGSI_TEXTURE_SHADOW1D:
1551 FETCH(&r[0], 0, CHAN_X);
1552
1553 if (modifier == TEX_MODIFIER_PROJECTED) {
1554 micro_div(&r[0], &r[0], &r[3]);
1555 }
1556
1557 fetch_texel(mach->Samplers[unit],
1558 &r[0], &ZeroVec, &ZeroVec, lod, /* S, T, P, LOD */
1559 control,
1560 &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1561 break;
1562
1563 case TGSI_TEXTURE_2D:
1564 case TGSI_TEXTURE_RECT:
1565 case TGSI_TEXTURE_SHADOW2D:
1566 case TGSI_TEXTURE_SHADOWRECT:
1567 FETCH(&r[0], 0, CHAN_X);
1568 FETCH(&r[1], 0, CHAN_Y);
1569 FETCH(&r[2], 0, CHAN_Z);
1570
1571 if (modifier == TEX_MODIFIER_PROJECTED) {
1572 micro_div(&r[0], &r[0], &r[3]);
1573 micro_div(&r[1], &r[1], &r[3]);
1574 micro_div(&r[2], &r[2], &r[3]);
1575 }
1576
1577 fetch_texel(mach->Samplers[unit],
1578 &r[0], &r[1], &r[2], lod, /* S, T, P, LOD */
1579 control,
1580 &r[0], &r[1], &r[2], &r[3]); /* outputs */
1581 break;
1582
1583 case TGSI_TEXTURE_3D:
1584 case TGSI_TEXTURE_CUBE:
1585 FETCH(&r[0], 0, CHAN_X);
1586 FETCH(&r[1], 0, CHAN_Y);
1587 FETCH(&r[2], 0, CHAN_Z);
1588
1589 if (modifier == TEX_MODIFIER_PROJECTED) {
1590 micro_div(&r[0], &r[0], &r[3]);
1591 micro_div(&r[1], &r[1], &r[3]);
1592 micro_div(&r[2], &r[2], &r[3]);
1593 }
1594
1595 fetch_texel(mach->Samplers[unit],
1596 &r[0], &r[1], &r[2], lod,
1597 control,
1598 &r[0], &r[1], &r[2], &r[3]);
1599 break;
1600
1601 default:
1602 assert(0);
1603 }
1604
1605 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1606 STORE(&r[chan_index], 0, chan_index);
1607 }
1608 }
1609
1610 static void
1611 exec_txd(struct tgsi_exec_machine *mach,
1612 const struct tgsi_full_instruction *inst)
1613 {
1614 const uint unit = inst->Src[3].Register.Index;
1615 union tgsi_exec_channel r[4];
1616 uint chan_index;
1617
1618 /*
1619 * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1620 */
1621
1622 switch (inst->Texture.Texture) {
1623 case TGSI_TEXTURE_1D:
1624 case TGSI_TEXTURE_SHADOW1D:
1625
1626 FETCH(&r[0], 0, CHAN_X);
1627
1628 fetch_texel(mach->Samplers[unit],
1629 &r[0], &ZeroVec, &ZeroVec, &ZeroVec, /* S, T, P, BIAS */
1630 tgsi_sampler_lod_bias,
1631 &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1632 break;
1633
1634 case TGSI_TEXTURE_2D:
1635 case TGSI_TEXTURE_RECT:
1636 case TGSI_TEXTURE_SHADOW2D:
1637 case TGSI_TEXTURE_SHADOWRECT:
1638
1639 FETCH(&r[0], 0, CHAN_X);
1640 FETCH(&r[1], 0, CHAN_Y);
1641 FETCH(&r[2], 0, CHAN_Z);
1642
1643 fetch_texel(mach->Samplers[unit],
1644 &r[0], &r[1], &r[2], &ZeroVec, /* inputs */
1645 tgsi_sampler_lod_bias,
1646 &r[0], &r[1], &r[2], &r[3]); /* outputs */
1647 break;
1648
1649 case TGSI_TEXTURE_3D:
1650 case TGSI_TEXTURE_CUBE:
1651
1652 FETCH(&r[0], 0, CHAN_X);
1653 FETCH(&r[1], 0, CHAN_Y);
1654 FETCH(&r[2], 0, CHAN_Z);
1655
1656 fetch_texel(mach->Samplers[unit],
1657 &r[0], &r[1], &r[2], &ZeroVec,
1658 tgsi_sampler_lod_bias,
1659 &r[0], &r[1], &r[2], &r[3]);
1660 break;
1661
1662 default:
1663 assert(0);
1664 }
1665
1666 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1667 STORE(&r[chan_index], 0, chan_index);
1668 }
1669 }
1670
1671
1672 /**
1673 * Evaluate a constant-valued coefficient at the position of the
1674 * current quad.
1675 */
1676 static void
1677 eval_constant_coef(
1678 struct tgsi_exec_machine *mach,
1679 unsigned attrib,
1680 unsigned chan )
1681 {
1682 unsigned i;
1683
1684 for( i = 0; i < QUAD_SIZE; i++ ) {
1685 mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1686 }
1687 }
1688
1689 /**
1690 * Evaluate a linear-valued coefficient at the position of the
1691 * current quad.
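 * The four channel slots correspond to the quad corners: [0] = (x, y),
 * [1] = (x+1, y), [2] = (x, y+1), [3] = (x+1, y+1), matching the TILE_*
 * ordering defined near the top of the file.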
1692 */
1693 static void
1694 eval_linear_coef(
1695 struct tgsi_exec_machine *mach,
1696 unsigned attrib,
1697 unsigned chan )
1698 {
1699 const float x = mach->QuadPos.xyzw[0].f[0];
1700 const float y = mach->QuadPos.xyzw[1].f[0];
1701 const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1702 const float dady = mach->InterpCoefs[attrib].dady[chan];
1703 const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1704 mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1705 mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1706 mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1707 mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1708 }
1709
1710 /**
1711 * Evaluate a perspective-valued coefficient at the position of the
1712 * current quad.
1713 */
1714 static void
1715 eval_perspective_coef(
1716 struct tgsi_exec_machine *mach,
1717 unsigned attrib,
1718 unsigned chan )
1719 {
1720 const float x = mach->QuadPos.xyzw[0].f[0];
1721 const float y = mach->QuadPos.xyzw[1].f[0];
1722 const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1723 const float dady = mach->InterpCoefs[attrib].dady[chan];
1724 const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1725 const float *w = mach->QuadPos.xyzw[3].f;
1726 /* divide by W here */
1727 mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1728 mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1729 mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1730 mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1731 }
1732
1733
1734 typedef void (* eval_coef_func)(
1735 struct tgsi_exec_machine *mach,
1736 unsigned attrib,
1737 unsigned chan );
1738
1739 static void
1740 exec_declaration(struct tgsi_exec_machine *mach,
1741 const struct tgsi_full_declaration *decl)
1742 {
1743 if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1744 if (decl->Declaration.File == TGSI_FILE_INPUT ||
1745 decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1746 uint first, last, mask;
1747
1748 first = decl->Range.First;
1749 last = decl->Range.Last;
1750 mask = decl->Declaration.UsageMask;
1751
1752 if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
1753 assert(decl->Semantic.Index == 0);
1754 assert(first == last);
1755 assert(mask == TGSI_WRITEMASK_XYZW);
1756
1757 mach->Inputs[first] = mach->QuadPos;
1758 } else if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1759 uint i;
1760
1761 assert(decl->Semantic.Index == 0);
1762 assert(first == last);
1763
1764 for (i = 0; i < QUAD_SIZE; i++) {
1765 mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1766 }
1767 } else {
1768 eval_coef_func eval;
1769 uint i, j;
1770
1771 switch (decl->Declaration.Interpolate) {
1772 case TGSI_INTERPOLATE_CONSTANT:
1773 eval = eval_constant_coef;
1774 break;
1775
1776 case TGSI_INTERPOLATE_LINEAR:
1777 eval = eval_linear_coef;
1778 break;
1779
1780 case TGSI_INTERPOLATE_PERSPECTIVE:
1781 eval = eval_perspective_coef;
1782 break;
1783
1784 default:
1785 assert(0);
1786 return;
1787 }
1788
1789 for (j = 0; j < NUM_CHANNELS; j++) {
1790 if (mask & (1 << j)) {
1791 for (i = first; i <= last; i++) {
1792 eval(mach, i, j);
1793 }
1794 }
1795 }
1796 }
1797 }
1798 }
1799 }
1800
1801 typedef void (* micro_op)(union tgsi_exec_channel *dst,
1802 const union tgsi_exec_channel *src);
1803
1804 static void
1805 exec_scalar_unary(struct tgsi_exec_machine *mach,
1806 const struct tgsi_full_instruction *inst,
1807 micro_op op,
1808 enum tgsi_exec_datatype dst_datatype,
1809 enum tgsi_exec_datatype src_datatype)
1810 {
1811 unsigned int chan;
1812 union tgsi_exec_channel src;
1813 union tgsi_exec_channel dst;
1814
1815 fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
1816 op(&dst, &src);
1817 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1818 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1819 store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
1820 }
1821 }
1822 }
1823
1824 static void
1825 exec_vector_unary(struct tgsi_exec_machine *mach,
1826 const struct tgsi_full_instruction *inst,
1827 micro_op op,
1828 enum tgsi_exec_datatype dst_datatype,
1829 enum tgsi_exec_datatype src_datatype)
1830 {
1831 unsigned int chan;
1832 struct tgsi_exec_vector dst;
1833
1834 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1835 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1836 union tgsi_exec_channel src;
1837
1838 fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
1839 op(&dst.xyzw[chan], &src);
1840 }
1841 }
1842 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1843 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1844 store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1845 }
1846 }
1847 }
1848
1849 static void
1850 exec_vector_binary(struct tgsi_exec_machine *mach,
1851 const struct tgsi_full_instruction *inst,
1852 micro_op op,
1853 enum tgsi_exec_datatype dst_datatype,
1854 enum tgsi_exec_datatype src_datatype)
1855 {
1856 unsigned int chan;
1857 struct tgsi_exec_vector dst;
1858
1859 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1860 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1861 union tgsi_exec_channel src[2];
1862
1863 fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1864 fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1865 op(&dst.xyzw[chan], src);
1866 }
1867 }
1868 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1869 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1870 store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1871 }
1872 }
1873 }
1874
1875 static void
1876 exec_vector_trinary(struct tgsi_exec_machine *mach,
1877 const struct tgsi_full_instruction *inst,
1878 micro_op op,
1879 enum tgsi_exec_datatype dst_datatype,
1880 enum tgsi_exec_datatype src_datatype)
1881 {
1882 unsigned int chan;
1883 struct tgsi_exec_vector dst;
1884
1885 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1886 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1887 union tgsi_exec_channel src[3];
1888
1889 fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1890 fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1891 fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
1892 op(&dst.xyzw[chan], src);
1893 }
1894 }
1895 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1896 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1897 store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1898 }
1899 }
1900 }
1901
1902 static void
1903 exec_dp3(struct tgsi_exec_machine *mach,
1904 const struct tgsi_full_instruction *inst)
1905 {
1906 unsigned int chan;
1907 union tgsi_exec_channel arg[3];
1908
1909 fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1910 fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1911 micro_mul(&arg[2], &arg[0], &arg[1]);
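   /* arg[2] accumulates the dot product: each micro_mad() below adds
    * arg[0]*arg[1] to it
    */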
1912
1913 for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
1914 fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1915 fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1916 micro_mad(&arg[2], arg);
1917 }
1918
1919 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1920 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1921 store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1922 }
1923 }
1924 }
1925
1926 static void
1927 exec_dp4(struct tgsi_exec_machine *mach,
1928 const struct tgsi_full_instruction *inst)
1929 {
1930 unsigned int chan;
1931 union tgsi_exec_channel arg[3];
1932
1933 fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1934 fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1935 micro_mul(&arg[2], &arg[0], &arg[1]);
1936
1937 for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
1938 fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1939 fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1940 micro_mad(&arg[2], arg);
1941 }
1942
1943 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1944 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1945 store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1946 }
1947 }
1948 }
1949
1950 static void
1951 exec_dp2a(struct tgsi_exec_machine *mach,
1952 const struct tgsi_full_instruction *inst)
1953 {
1954 unsigned int chan;
1955 union tgsi_exec_channel arg[3];
1956
1957 fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1958 fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1959 micro_mul(&arg[2], &arg[0], &arg[1]);
1960
1961 fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1962 fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1963 micro_mad(&arg[0], arg);
1964
1965 fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1966 micro_add(&arg[0], &arg[0], &arg[1]);
1967
1968 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1969 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1970 store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1971 }
1972 }
1973 }
1974
1975 static void
1976 exec_dph(struct tgsi_exec_machine *mach,
1977 const struct tgsi_full_instruction *inst)
1978 {
1979 unsigned int chan;
1980 union tgsi_exec_channel arg[3];
1981
1982 fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1983 fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1984 micro_mul(&arg[2], &arg[0], &arg[1]);
1985
1986 fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1987 fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1988 micro_mad(&arg[2], arg);
1989
1990 fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
1991 fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
1992 micro_mad(&arg[0], arg);
1993
1994 fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
1995 micro_add(&arg[0], &arg[0], &arg[1]);
1996
1997 for (chan = 0; chan < NUM_CHANNELS; chan++) {
1998 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1999 store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2000 }
2001 }
2002 }
2003
2004 static void
2005 exec_dp2(struct tgsi_exec_machine *mach,
2006 const struct tgsi_full_instruction *inst)
2007 {
2008 unsigned int chan;
2009 union tgsi_exec_channel arg[3];
2010
2011 fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2012 fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2013 micro_mul(&arg[2], &arg[0], &arg[1]);
2014
2015 fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2016 fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2017 micro_mad(&arg[2], arg);
2018
2019 for (chan = 0; chan < NUM_CHANNELS; chan++) {
2020 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2021 store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2022 }
2023 }
2024 }
2025
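/* BRK: depending on BreakType, either clear the LoopMask bits of all
 * currently active channels (break out of the enclosing loop) or zero
 * the switch mask so no channel executes until the matching ENDSWITCH.
 */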
2026 static void
2027 exec_break(struct tgsi_exec_machine *mach)
2028 {
2029 if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2030 /* turn off loop channels for each enabled exec channel */
2031 mach->LoopMask &= ~mach->ExecMask;
2032 /* Todo: if mach->LoopMask == 0, jump to end of loop */
2033 UPDATE_EXEC_MASK(mach);
2034 } else {
2035 assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2036
2037 mach->Switch.mask = 0x0;
2038
2039 UPDATE_EXEC_MASK(mach);
2040 }
2041 }
2042
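/* SWITCH: save the current switch state, fetch the per-channel selector
 * and start with an empty execution mask; channels only become active
 * again when a CASE value matches (or at DEFAULT).
 */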
2043 static void
2044 exec_switch(struct tgsi_exec_machine *mach,
2045 const struct tgsi_full_instruction *inst)
2046 {
2047 assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2048 assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2049
2050 mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2051 fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2052 mach->Switch.mask = 0x0;
2053 mach->Switch.defaultMask = 0x0;
2054
2055 mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2056 mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2057
2058 UPDATE_EXEC_MASK(mach);
2059 }
2060
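/* CASE: enable each channel whose selector matches this case value and
 * which was active when the switch was entered.  defaultMask accumulates
 * every channel matched by any CASE so that DEFAULT can enable the rest.
 */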
2061 static void
2062 exec_case(struct tgsi_exec_machine *mach,
2063 const struct tgsi_full_instruction *inst)
2064 {
2065 uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2066 union tgsi_exec_channel src;
2067 uint mask = 0;
2068
2069 fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2070
2071 if (mach->Switch.selector.u[0] == src.u[0]) {
2072 mask |= 0x1;
2073 }
2074 if (mach->Switch.selector.u[1] == src.u[1]) {
2075 mask |= 0x2;
2076 }
2077 if (mach->Switch.selector.u[2] == src.u[2]) {
2078 mask |= 0x4;
2079 }
2080 if (mach->Switch.selector.u[3] == src.u[3]) {
2081 mask |= 0x8;
2082 }
2083
2084 mach->Switch.defaultMask |= mask;
2085
2086 mach->Switch.mask |= mask & prevMask;
2087
2088 UPDATE_EXEC_MASK(mach);
2089 }
2090
2091 static void
2092 exec_default(struct tgsi_exec_machine *mach)
2093 {
2094 uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2095
2096 mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2097
2098 UPDATE_EXEC_MASK(mach);
2099 }
2100
2101 static void
2102 exec_endswitch(struct tgsi_exec_machine *mach)
2103 {
2104 mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
2105 mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2106
2107 UPDATE_EXEC_MASK(mach);
2108 }
2109
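/* The micro ops below implement TGSI's integer, unsigned, bitwise and
 * conversion opcodes; each operates component-wise on all four values of
 * a quad.
 */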
2110 static void
2111 micro_i2f(union tgsi_exec_channel *dst,
2112 const union tgsi_exec_channel *src)
2113 {
2114 dst->f[0] = (float)src->i[0];
2115 dst->f[1] = (float)src->i[1];
2116 dst->f[2] = (float)src->i[2];
2117 dst->f[3] = (float)src->i[3];
2118 }
2119
2120 static void
2121 micro_not(union tgsi_exec_channel *dst,
2122 const union tgsi_exec_channel *src)
2123 {
2124 dst->u[0] = ~src->u[0];
2125 dst->u[1] = ~src->u[1];
2126 dst->u[2] = ~src->u[2];
2127 dst->u[3] = ~src->u[3];
2128 }
2129
2130 static void
2131 micro_shl(union tgsi_exec_channel *dst,
2132 const union tgsi_exec_channel *src)
2133 {
2134 dst->u[0] = src[0].u[0] << src[1].u[0];
2135 dst->u[1] = src[0].u[1] << src[1].u[1];
2136 dst->u[2] = src[0].u[2] << src[1].u[2];
2137 dst->u[3] = src[0].u[3] << src[1].u[3];
2138 }
2139
2140 static void
2141 micro_and(union tgsi_exec_channel *dst,
2142 const union tgsi_exec_channel *src)
2143 {
2144 dst->u[0] = src[0].u[0] & src[1].u[0];
2145 dst->u[1] = src[0].u[1] & src[1].u[1];
2146 dst->u[2] = src[0].u[2] & src[1].u[2];
2147 dst->u[3] = src[0].u[3] & src[1].u[3];
2148 }
2149
2150 static void
2151 micro_or(union tgsi_exec_channel *dst,
2152 const union tgsi_exec_channel *src)
2153 {
2154 dst->u[0] = src[0].u[0] | src[1].u[0];
2155 dst->u[1] = src[0].u[1] | src[1].u[1];
2156 dst->u[2] = src[0].u[2] | src[1].u[2];
2157 dst->u[3] = src[0].u[3] | src[1].u[3];
2158 }
2159
2160 static void
2161 micro_xor(union tgsi_exec_channel *dst,
2162 const union tgsi_exec_channel *src)
2163 {
2164 dst->u[0] = src[0].u[0] ^ src[1].u[0];
2165 dst->u[1] = src[0].u[1] ^ src[1].u[1];
2166 dst->u[2] = src[0].u[2] ^ src[1].u[2];
2167 dst->u[3] = src[0].u[3] ^ src[1].u[3];
2168 }
2169
2170 static void
2171 micro_f2i(union tgsi_exec_channel *dst,
2172 const union tgsi_exec_channel *src)
2173 {
2174 dst->i[0] = (int)src->f[0];
2175 dst->i[1] = (int)src->f[1];
2176 dst->i[2] = (int)src->f[2];
2177 dst->i[3] = (int)src->f[3];
2178 }
2179
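/* Note: the integer divide/modulo micro ops below do not guard against a
 * zero divisor, so a shader dividing by zero hits undefined behavior here.
 */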
2180 static void
2181 micro_idiv(union tgsi_exec_channel *dst,
2182 const union tgsi_exec_channel *src)
2183 {
2184 dst->i[0] = src[0].i[0] / src[1].i[0];
2185 dst->i[1] = src[0].i[1] / src[1].i[1];
2186 dst->i[2] = src[0].i[2] / src[1].i[2];
2187 dst->i[3] = src[0].i[3] / src[1].i[3];
2188 }
2189
2190 static void
2191 micro_imax(union tgsi_exec_channel *dst,
2192 const union tgsi_exec_channel *src)
2193 {
2194 dst->i[0] = src[0].i[0] > src[1].i[0] ? src[0].i[0] : src[1].i[0];
2195 dst->i[1] = src[0].i[1] > src[1].i[1] ? src[0].i[1] : src[1].i[1];
2196 dst->i[2] = src[0].i[2] > src[1].i[2] ? src[0].i[2] : src[1].i[2];
2197 dst->i[3] = src[0].i[3] > src[1].i[3] ? src[0].i[3] : src[1].i[3];
2198 }
2199
2200 static void
2201 micro_imin(union tgsi_exec_channel *dst,
2202 const union tgsi_exec_channel *src)
2203 {
2204 dst->i[0] = src[0].i[0] < src[1].i[0] ? src[0].i[0] : src[1].i[0];
2205 dst->i[1] = src[0].i[1] < src[1].i[1] ? src[0].i[1] : src[1].i[1];
2206 dst->i[2] = src[0].i[2] < src[1].i[2] ? src[0].i[2] : src[1].i[2];
2207 dst->i[3] = src[0].i[3] < src[1].i[3] ? src[0].i[3] : src[1].i[3];
2208 }
2209
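/* Signed and unsigned comparisons follow TGSI set-on semantics: all bits
 * set (-1 / ~0) when the comparison is true, 0 when it is false.  The
 * unsigned micro_useq/usge/uslt/usne further below use the same convention.
 */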
2210 static void
2211 micro_isge(union tgsi_exec_channel *dst,
2212 const union tgsi_exec_channel *src)
2213 {
2214 dst->i[0] = src[0].i[0] >= src[1].i[0] ? -1 : 0;
2215 dst->i[1] = src[0].i[1] >= src[1].i[1] ? -1 : 0;
2216 dst->i[2] = src[0].i[2] >= src[1].i[2] ? -1 : 0;
2217 dst->i[3] = src[0].i[3] >= src[1].i[3] ? -1 : 0;
2218 }
2219
2220 static void
2221 micro_ishr(union tgsi_exec_channel *dst,
2222 const union tgsi_exec_channel *src)
2223 {
2224 dst->i[0] = src[0].i[0] >> src[1].i[0];
2225 dst->i[1] = src[0].i[1] >> src[1].i[1];
2226 dst->i[2] = src[0].i[2] >> src[1].i[2];
2227 dst->i[3] = src[0].i[3] >> src[1].i[3];
2228 }
2229
2230 static void
2231 micro_islt(union tgsi_exec_channel *dst,
2232 const union tgsi_exec_channel *src)
2233 {
2234 dst->i[0] = src[0].i[0] < src[1].i[0] ? -1 : 0;
2235 dst->i[1] = src[0].i[1] < src[1].i[1] ? -1 : 0;
2236 dst->i[2] = src[0].i[2] < src[1].i[2] ? -1 : 0;
2237 dst->i[3] = src[0].i[3] < src[1].i[3] ? -1 : 0;
2238 }
2239
2240 static void
2241 micro_f2u(union tgsi_exec_channel *dst,
2242 const union tgsi_exec_channel *src)
2243 {
2244 dst->u[0] = (uint)src->f[0];
2245 dst->u[1] = (uint)src->f[1];
2246 dst->u[2] = (uint)src->f[2];
2247 dst->u[3] = (uint)src->f[3];
2248 }
2249
2250 static void
2251 micro_u2f(union tgsi_exec_channel *dst,
2252 const union tgsi_exec_channel *src)
2253 {
2254 dst->f[0] = (float)src->u[0];
2255 dst->f[1] = (float)src->u[1];
2256 dst->f[2] = (float)src->u[2];
2257 dst->f[3] = (float)src->u[3];
2258 }
2259
2260 static void
2261 micro_uadd(union tgsi_exec_channel *dst,
2262 const union tgsi_exec_channel *src)
2263 {
2264 dst->u[0] = src[0].u[0] + src[1].u[0];
2265 dst->u[1] = src[0].u[1] + src[1].u[1];
2266 dst->u[2] = src[0].u[2] + src[1].u[2];
2267 dst->u[3] = src[0].u[3] + src[1].u[3];
2268 }
2269
2270 static void
2271 micro_udiv(union tgsi_exec_channel *dst,
2272 const union tgsi_exec_channel *src)
2273 {
2274 dst->u[0] = src[0].u[0] / src[1].u[0];
2275 dst->u[1] = src[0].u[1] / src[1].u[1];
2276 dst->u[2] = src[0].u[2] / src[1].u[2];
2277 dst->u[3] = src[0].u[3] / src[1].u[3];
2278 }
2279
2280 static void
2281 micro_umad(union tgsi_exec_channel *dst,
2282 const union tgsi_exec_channel *src)
2283 {
2284 dst->u[0] = src[0].u[0] * src[1].u[0] + src[2].u[0];
2285 dst->u[1] = src[0].u[1] * src[1].u[1] + src[2].u[1];
2286 dst->u[2] = src[0].u[2] * src[1].u[2] + src[2].u[2];
2287 dst->u[3] = src[0].u[3] * src[1].u[3] + src[2].u[3];
2288 }
2289
2290 static void
2291 micro_umax(union tgsi_exec_channel *dst,
2292 const union tgsi_exec_channel *src)
2293 {
2294 dst->u[0] = src[0].u[0] > src[1].u[0] ? src[0].u[0] : src[1].u[0];
2295 dst->u[1] = src[0].u[1] > src[1].u[1] ? src[0].u[1] : src[1].u[1];
2296 dst->u[2] = src[0].u[2] > src[1].u[2] ? src[0].u[2] : src[1].u[2];
2297 dst->u[3] = src[0].u[3] > src[1].u[3] ? src[0].u[3] : src[1].u[3];
2298 }
2299
2300 static void
2301 micro_umin(union tgsi_exec_channel *dst,
2302 const union tgsi_exec_channel *src)
2303 {
2304 dst->u[0] = src[0].u[0] < src[1].u[0] ? src[0].u[0] : src[1].u[0];
2305 dst->u[1] = src[0].u[1] < src[1].u[1] ? src[0].u[1] : src[1].u[1];
2306 dst->u[2] = src[0].u[2] < src[1].u[2] ? src[0].u[2] : src[1].u[2];
2307 dst->u[3] = src[0].u[3] < src[1].u[3] ? src[0].u[3] : src[1].u[3];
2308 }
2309
2310 static void
2311 micro_umod(union tgsi_exec_channel *dst,
2312 const union tgsi_exec_channel *src)
2313 {
2314 dst->u[0] = src[0].u[0] % src[1].u[0];
2315 dst->u[1] = src[0].u[1] % src[1].u[1];
2316 dst->u[2] = src[0].u[2] % src[1].u[2];
2317 dst->u[3] = src[0].u[3] % src[1].u[3];
2318 }
2319
2320 static void
2321 micro_umul(union tgsi_exec_channel *dst,
2322 const union tgsi_exec_channel *src)
2323 {
2324 dst->u[0] = src[0].u[0] * src[1].u[0];
2325 dst->u[1] = src[0].u[1] * src[1].u[1];
2326 dst->u[2] = src[0].u[2] * src[1].u[2];
2327 dst->u[3] = src[0].u[3] * src[1].u[3];
2328 }
2329
2330 static void
2331 micro_useq(union tgsi_exec_channel *dst,
2332 const union tgsi_exec_channel *src)
2333 {
2334 dst->u[0] = src[0].u[0] == src[1].u[0] ? ~0 : 0;
2335 dst->u[1] = src[0].u[1] == src[1].u[1] ? ~0 : 0;
2336 dst->u[2] = src[0].u[2] == src[1].u[2] ? ~0 : 0;
2337 dst->u[3] = src[0].u[3] == src[1].u[3] ? ~0 : 0;
2338 }
2339
2340 static void
2341 micro_usge(union tgsi_exec_channel *dst,
2342 const union tgsi_exec_channel *src)
2343 {
2344 dst->u[0] = src[0].u[0] >= src[1].u[0] ? ~0 : 0;
2345 dst->u[1] = src[0].u[1] >= src[1].u[1] ? ~0 : 0;
2346 dst->u[2] = src[0].u[2] >= src[1].u[2] ? ~0 : 0;
2347 dst->u[3] = src[0].u[3] >= src[1].u[3] ? ~0 : 0;
2348 }
2349
2350 static void
2351 micro_ushr(union tgsi_exec_channel *dst,
2352 const union tgsi_exec_channel *src)
2353 {
2354 dst->u[0] = src[0].u[0] >> src[1].u[0];
2355 dst->u[1] = src[0].u[1] >> src[1].u[1];
2356 dst->u[2] = src[0].u[2] >> src[1].u[2];
2357 dst->u[3] = src[0].u[3] >> src[1].u[3];
2358 }
2359
2360 static void
2361 micro_uslt(union tgsi_exec_channel *dst,
2362 const union tgsi_exec_channel *src)
2363 {
2364 dst->u[0] = src[0].u[0] < src[1].u[0] ? ~0 : 0;
2365 dst->u[1] = src[0].u[1] < src[1].u[1] ? ~0 : 0;
2366 dst->u[2] = src[0].u[2] < src[1].u[2] ? ~0 : 0;
2367 dst->u[3] = src[0].u[3] < src[1].u[3] ? ~0 : 0;
2368 }
2369
2370 static void
2371 micro_usne(union tgsi_exec_channel *dst,
2372 const union tgsi_exec_channel *src)
2373 {
2374 dst->u[0] = src[0].u[0] != src[1].u[0] ? ~0 : 0;
2375 dst->u[1] = src[0].u[1] != src[1].u[1] ? ~0 : 0;
2376 dst->u[2] = src[0].u[2] != src[1].u[2] ? ~0 : 0;
2377 dst->u[3] = src[0].u[3] != src[1].u[3] ? ~0 : 0;
2378 }
2379
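/* Execute a single TGSI instruction.
 * The program counter is incremented up front so that flow-control
 * opcodes (CAL, RET, loops, END) can simply overwrite *pc with a jump
 * target, or with -1 to halt execution.
 */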
2380 static void
2381 exec_instruction(
2382 struct tgsi_exec_machine *mach,
2383 const struct tgsi_full_instruction *inst,
2384 int *pc )
2385 {
2386 uint chan_index;
2387 union tgsi_exec_channel r[10];
2388 union tgsi_exec_channel d[8];
2389
2390 (*pc)++;
2391
2392 switch (inst->Instruction.Opcode) {
2393 case TGSI_OPCODE_ARL:
2394 exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2395 break;
2396
2397 case TGSI_OPCODE_MOV:
2398 exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
2399 break;
2400
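   /* LIT: dst = (1, max(src.x, 0),
    *             src.x > 0 ? max(src.y, 0)^clamp(src.w, -128, 128) : 0,
    *             1) */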
2401 case TGSI_OPCODE_LIT:
2402 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2403 FETCH( &r[0], 0, CHAN_X );
2404 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2405 micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2406 }
2407
2408 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2409 FETCH( &r[1], 0, CHAN_Y );
2410 micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2411
2412 FETCH( &r[2], 0, CHAN_W );
2413 micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2414 micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2415 micro_pow( &r[1], &r[1], &r[2] );
2416 micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2417 }
2418
2419 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2420 STORE(&d[CHAN_Y], 0, CHAN_Y);
2421 }
2422 if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2423 STORE(&d[CHAN_Z], 0, CHAN_Z);
2424 }
2425 }
2426 if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2427 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2428 }
2429 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2430 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2431 }
2432 break;
2433
2434 case TGSI_OPCODE_RCP:
2435 exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2436 break;
2437
2438 case TGSI_OPCODE_RSQ:
2439 exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2440 break;
2441
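   /* EXP: dst = (2^floor(src.x), src.x - floor(src.x), 2^src.x, 1) */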
2442 case TGSI_OPCODE_EXP:
2443 FETCH( &r[0], 0, CHAN_X );
2444 micro_flr( &r[1], &r[0] ); /* r1 = floor(r0) */
2445 if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2446 micro_exp2( &r[2], &r[1] ); /* r2 = 2 ^ r1 */
2447 STORE( &r[2], 0, CHAN_X ); /* store r2 */
2448 }
2449 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2450 micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2451 STORE( &r[2], 0, CHAN_Y ); /* store r2 */
2452 }
2453 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2454 micro_exp2( &r[2], &r[0] ); /* r2 = 2 ^ r0 */
2455 STORE( &r[2], 0, CHAN_Z ); /* store r2 */
2456 }
2457 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2458 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2459 }
2460 break;
2461
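   /* LOG: with a = |src.x|:
    * dst = (floor(log2(a)), a / 2^floor(log2(a)), log2(a), 1) */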
2462 case TGSI_OPCODE_LOG:
2463 FETCH( &r[0], 0, CHAN_X );
2464 micro_abs( &r[2], &r[0] ); /* r2 = abs(r0) */
2465 micro_lg2( &r[1], &r[2] ); /* r1 = lg2(r2) */
2466 micro_flr( &r[0], &r[1] ); /* r0 = floor(r1) */
2467 if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2468 STORE( &r[0], 0, CHAN_X );
2469 }
2470 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2471 micro_exp2( &r[0], &r[0] ); /* r0 = 2 ^ r0 */
2472 micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2473 STORE( &r[0], 0, CHAN_Y );
2474 }
2475 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2476 STORE( &r[1], 0, CHAN_Z );
2477 }
2478 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2479 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2480 }
2481 break;
2482
2483 case TGSI_OPCODE_MUL:
2484 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2485 FETCH(&r[0], 0, chan_index);
2486 FETCH(&r[1], 1, chan_index);
2487 micro_mul(&d[chan_index], &r[0], &r[1]);
2488 }
2489 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2490 STORE(&d[chan_index], 0, chan_index);
2491 }
2492 break;
2493
2494 case TGSI_OPCODE_ADD:
2495 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2496 FETCH( &r[0], 0, chan_index );
2497 FETCH( &r[1], 1, chan_index );
2498 micro_add(&d[chan_index], &r[0], &r[1]);
2499 }
2500 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2501 STORE(&d[chan_index], 0, chan_index);
2502 }
2503 break;
2504
2505 case TGSI_OPCODE_DP3:
2506 exec_dp3(mach, inst);
2507 break;
2508
2509 case TGSI_OPCODE_DP4:
2510 exec_dp4(mach, inst);
2511 break;
2512
2513 case TGSI_OPCODE_DST:
2514 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2515 FETCH( &r[0], 0, CHAN_Y );
2516 FETCH( &r[1], 1, CHAN_Y);
2517 micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2518 }
2519 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2520 FETCH(&d[CHAN_Z], 0, CHAN_Z);
2521 }
2522 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2523 FETCH(&d[CHAN_W], 1, CHAN_W);
2524 }
2525
2526 if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2527 STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2528 }
2529 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2530 STORE(&d[CHAN_Y], 0, CHAN_Y);
2531 }
2532 if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2533 STORE(&d[CHAN_Z], 0, CHAN_Z);
2534 }
2535 if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2536 STORE(&d[CHAN_W], 0, CHAN_W);
2537 }
2538 break;
2539
2540 case TGSI_OPCODE_MIN:
2541 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2542 FETCH(&r[0], 0, chan_index);
2543 FETCH(&r[1], 1, chan_index);
2544
2545 /* XXX use micro_min()?? */
2546 micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2547 }
2548 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2549 STORE(&d[chan_index], 0, chan_index);
2550 }
2551 break;
2552
2553 case TGSI_OPCODE_MAX:
2554 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2555 FETCH(&r[0], 0, chan_index);
2556 FETCH(&r[1], 1, chan_index);
2557
2558 /* XXX use micro_max()?? */
2559 micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2560 }
2561 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2562 STORE(&d[chan_index], 0, chan_index);
2563 }
2564 break;
2565
2566 case TGSI_OPCODE_SLT:
2567 exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2568 break;
2569
2570 case TGSI_OPCODE_SGE:
2571 exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2572 break;
2573
2574 case TGSI_OPCODE_MAD:
2575 exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2576 break;
2577
2578 case TGSI_OPCODE_SUB:
2579 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2580 FETCH(&r[0], 0, chan_index);
2581 FETCH(&r[1], 1, chan_index);
2582 micro_sub(&d[chan_index], &r[0], &r[1]);
2583 }
2584 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2585 STORE(&d[chan_index], 0, chan_index);
2586 }
2587 break;
2588
2589 case TGSI_OPCODE_LRP:
2590 exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2591 break;
2592
2593 case TGSI_OPCODE_CND:
2594 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2595 FETCH(&r[0], 0, chan_index);
2596 FETCH(&r[1], 1, chan_index);
2597 FETCH(&r[2], 2, chan_index);
2598 micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2599 }
2600 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2601 STORE(&d[chan_index], 0, chan_index);
2602 }
2603 break;
2604
2605 case TGSI_OPCODE_DP2A:
2606 exec_dp2a(mach, inst);
2607 break;
2608
2609 case TGSI_OPCODE_FRC:
2610 exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2611 break;
2612
2613 case TGSI_OPCODE_CLAMP:
2614 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2615 FETCH(&r[0], 0, chan_index);
2616 FETCH(&r[1], 1, chan_index);
2617 micro_max(&r[0], &r[0], &r[1]);
2618 FETCH(&r[1], 2, chan_index);
2619 micro_min(&d[chan_index], &r[0], &r[1]);
2620 }
2621 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2622 STORE(&d[chan_index], 0, chan_index);
2623 }
2624 break;
2625
2626 case TGSI_OPCODE_FLR:
2627 exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2628 break;
2629
2630 case TGSI_OPCODE_ROUND:
2631 exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2632 break;
2633
2634 case TGSI_OPCODE_EX2:
2635 exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2636 break;
2637
2638 case TGSI_OPCODE_LG2:
2639 exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2640 break;
2641
2642 case TGSI_OPCODE_POW:
2643 FETCH(&r[0], 0, CHAN_X);
2644 FETCH(&r[1], 1, CHAN_X);
2645
2646 micro_pow( &r[0], &r[0], &r[1] );
2647
2648 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2649 STORE( &r[0], 0, chan_index );
2650 }
2651 break;
2652
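   /* XPD: cross product of src0.xyz and src1.xyz:
    * dst.x = src0.y*src1.z - src0.z*src1.y
    * dst.y = src0.z*src1.x - src0.x*src1.z
    * dst.z = src0.x*src1.y - src0.y*src1.x
    * dst.w = 1 */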
2653 case TGSI_OPCODE_XPD:
2654 FETCH(&r[0], 0, CHAN_Y);
2655 FETCH(&r[1], 1, CHAN_Z);
2656
2657 micro_mul( &r[2], &r[0], &r[1] );
2658
2659 FETCH(&r[3], 0, CHAN_Z);
2660 FETCH(&r[4], 1, CHAN_Y);
2661
2662 micro_mul( &r[5], &r[3], &r[4] );
2663 micro_sub(&d[CHAN_X], &r[2], &r[5]);
2664
2665 FETCH(&r[2], 1, CHAN_X);
2666
2667 micro_mul( &r[3], &r[3], &r[2] );
2668
2669 FETCH(&r[5], 0, CHAN_X);
2670
2671 micro_mul( &r[1], &r[1], &r[5] );
2672 micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2673
2674 micro_mul( &r[5], &r[5], &r[4] );
2675 micro_mul( &r[0], &r[0], &r[2] );
2676 micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2677
2678 if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2679 STORE(&d[CHAN_X], 0, CHAN_X);
2680 }
2681 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2682 STORE(&d[CHAN_Y], 0, CHAN_Y);
2683 }
2684 if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2685 STORE(&d[CHAN_Z], 0, CHAN_Z);
2686 }
2687 if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2688 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2689 }
2690 break;
2691
2692 case TGSI_OPCODE_ABS:
2693 exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2694 break;
2695
2696 case TGSI_OPCODE_RCC:
2697 FETCH(&r[0], 0, CHAN_X);
2698 micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2699 micro_float_clamp(&r[0], &r[0]);
2700 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2701 STORE(&r[0], 0, chan_index);
2702 }
2703 break;
2704
2705 case TGSI_OPCODE_DPH:
2706 exec_dph(mach, inst);
2707 break;
2708
2709 case TGSI_OPCODE_COS:
2710 exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2711 break;
2712
2713 case TGSI_OPCODE_DDX:
2714 exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2715 break;
2716
2717 case TGSI_OPCODE_DDY:
2718 exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2719 break;
2720
2721 case TGSI_OPCODE_KILP:
2722 exec_kilp (mach, inst);
2723 break;
2724
2725 case TGSI_OPCODE_KIL:
2726 exec_kil (mach, inst);
2727 break;
2728
2729 case TGSI_OPCODE_PK2H:
2730 assert (0);
2731 break;
2732
2733 case TGSI_OPCODE_PK2US:
2734 assert (0);
2735 break;
2736
2737 case TGSI_OPCODE_PK4B:
2738 assert (0);
2739 break;
2740
2741 case TGSI_OPCODE_PK4UB:
2742 assert (0);
2743 break;
2744
2745 case TGSI_OPCODE_RFL:
2746 if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2747 IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2748 IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2749 /* r0 = dp3(src0, src0) */
2750 FETCH(&r[2], 0, CHAN_X);
2751 micro_mul(&r[0], &r[2], &r[2]);
2752 FETCH(&r[4], 0, CHAN_Y);
2753 micro_mul(&r[8], &r[4], &r[4]);
2754 micro_add(&r[0], &r[0], &r[8]);
2755 FETCH(&r[6], 0, CHAN_Z);
2756 micro_mul(&r[8], &r[6], &r[6]);
2757 micro_add(&r[0], &r[0], &r[8]);
2758
2759 /* r1 = dp3(src0, src1) */
2760 FETCH(&r[3], 1, CHAN_X);
2761 micro_mul(&r[1], &r[2], &r[3]);
2762 FETCH(&r[5], 1, CHAN_Y);
2763 micro_mul(&r[8], &r[4], &r[5]);
2764 micro_add(&r[1], &r[1], &r[8]);
2765 FETCH(&r[7], 1, CHAN_Z);
2766 micro_mul(&r[8], &r[6], &r[7]);
2767 micro_add(&r[1], &r[1], &r[8]);
2768
2769 /* r1 = 2 * r1 / r0 */
2770 micro_add(&r[1], &r[1], &r[1]);
2771 micro_div(&r[1], &r[1], &r[0]);
2772
2773 if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2774 micro_mul(&r[2], &r[2], &r[1]);
2775 micro_sub(&r[2], &r[2], &r[3]);
2776 STORE(&r[2], 0, CHAN_X);
2777 }
2778 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2779 micro_mul(&r[4], &r[4], &r[1]);
2780 micro_sub(&r[4], &r[4], &r[5]);
2781 STORE(&r[4], 0, CHAN_Y);
2782 }
2783 if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2784 micro_mul(&r[6], &r[6], &r[1]);
2785 micro_sub(&r[6], &r[6], &r[7]);
2786 STORE(&r[6], 0, CHAN_Z);
2787 }
2788 }
2789 if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2790 STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2791 }
2792 break;
2793
2794 case TGSI_OPCODE_SEQ:
2795 exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2796 break;
2797
2798 case TGSI_OPCODE_SFL:
2799 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2800 STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2801 }
2802 break;
2803
2804 case TGSI_OPCODE_SGT:
2805 exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2806 break;
2807
2808 case TGSI_OPCODE_SIN:
2809 exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2810 break;
2811
2812 case TGSI_OPCODE_SLE:
2813 exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2814 break;
2815
2816 case TGSI_OPCODE_SNE:
2817 exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2818 break;
2819
2820 case TGSI_OPCODE_STR:
2821 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2822 STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2823 }
2824 break;
2825
2826 case TGSI_OPCODE_TEX:
2827 /* simple texture lookup */
2828 /* src[0] = texcoord */
2829 /* src[1] = sampler unit */
2830 exec_tex(mach, inst, TEX_MODIFIER_NONE);
2831 break;
2832
2833 case TGSI_OPCODE_TXB:
2834 /* Texture lookup with lod bias */
2835 /* src[0] = texcoord (src[0].w = LOD bias) */
2836 /* src[1] = sampler unit */
2837 exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
2838 break;
2839
2840 case TGSI_OPCODE_TXD:
2841 /* Texture lookup with explicit partial derivatives */
2842 /* src[0] = texcoord */
2843 /* src[1] = d[strq]/dx */
2844 /* src[2] = d[strq]/dy */
2845 /* src[3] = sampler unit */
2846 exec_txd(mach, inst);
2847 break;
2848
2849 case TGSI_OPCODE_TXL:
2850 /* Texture lookup with explicit LOD */
2851 /* src[0] = texcoord (src[0].w = LOD) */
2852 /* src[1] = sampler unit */
2853 exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
2854 break;
2855
2856 case TGSI_OPCODE_TXP:
2857 /* Texture lookup with projection */
2858 /* src[0] = texcoord (src[0].w = projection) */
2859 /* src[1] = sampler unit */
2860 exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
2861 break;
2862
2863 case TGSI_OPCODE_UP2H:
2864 assert (0);
2865 break;
2866
2867 case TGSI_OPCODE_UP2US:
2868 assert (0);
2869 break;
2870
2871 case TGSI_OPCODE_UP4B:
2872 assert (0);
2873 break;
2874
2875 case TGSI_OPCODE_UP4UB:
2876 assert (0);
2877 break;
2878
2879 case TGSI_OPCODE_X2D:
2880 FETCH(&r[0], 1, CHAN_X);
2881 FETCH(&r[1], 1, CHAN_Y);
2882 if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2883 IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2884 FETCH(&r[2], 2, CHAN_X);
2885 micro_mul(&r[2], &r[2], &r[0]);
2886 FETCH(&r[3], 2, CHAN_Y);
2887 micro_mul(&r[3], &r[3], &r[1]);
2888 micro_add(&r[2], &r[2], &r[3]);
2889 FETCH(&r[3], 0, CHAN_X);
2890 micro_add(&d[CHAN_X], &r[2], &r[3]);
2891
2892 }
2893 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2894 IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2895 FETCH(&r[2], 2, CHAN_Z);
2896 micro_mul(&r[2], &r[2], &r[0]);
2897 FETCH(&r[3], 2, CHAN_W);
2898 micro_mul(&r[3], &r[3], &r[1]);
2899 micro_add(&r[2], &r[2], &r[3]);
2900 FETCH(&r[3], 0, CHAN_Y);
2901 micro_add(&d[CHAN_Y], &r[2], &r[3]);
2902
2903 }
2904 if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2905 STORE(&d[CHAN_X], 0, CHAN_X);
2906 }
2907 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2908 STORE(&d[CHAN_Y], 0, CHAN_Y);
2909 }
2910 if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2911 STORE(&d[CHAN_X], 0, CHAN_Z);
2912 }
2913 if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2914 STORE(&d[CHAN_Y], 0, CHAN_W);
2915 }
2916 break;
2917
2918 case TGSI_OPCODE_ARA:
2919 assert (0);
2920 break;
2921
2922 case TGSI_OPCODE_ARR:
2923 exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2924 break;
2925
2926 case TGSI_OPCODE_BRA:
2927 assert (0);
2928 break;
2929
2930 case TGSI_OPCODE_CAL:
2931 /* skip the call if no execution channels are enabled */
2932 if (mach->ExecMask) {
2933 /* do the call */
2934
2935 /* First, record the depths of the execution stacks.
2936 * This is important for deeply nested/looped return statements.
2937 * We have to unwind the stacks by the correct amount. For a
2938 * real code generator, we could determine the number of entries
2939 * to pop off each stack with simple static analysis and avoid
2940 * implementing this data structure at run time.
2941 */
2942 mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2943 mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2944 mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2945 mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
2946 mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
2947 /* note that PC was already incremented above */
2948 mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2949
2950 mach->CallStackTop++;
2951
2952 /* Second, push the Cond, Loop, Cont, Func stacks */
2953 assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2954 assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2955 assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2956 assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2957 assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2958 assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2959
2960 mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2961 mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2962 mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2963 mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2964 mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2965 mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2966
2967 /* Finally, jump to the subroutine */
2968 *pc = inst->Label.Label;
2969 }
2970 break;
2971
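   /* RET: disable the returning channels in FuncMask.  Only once every
    * channel has returned do we pop the call frame and restore the
    * Cond/Loop/Cont/Switch/Break stacks recorded by CAL. */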
2972 case TGSI_OPCODE_RET:
2973 mach->FuncMask &= ~mach->ExecMask;
2974 UPDATE_EXEC_MASK(mach);
2975
2976 if (mach->FuncMask == 0x0) {
2977 /* really return now (otherwise, keep executing) */
2978
2979 if (mach->CallStackTop == 0) {
2980 /* returning from main() */
2981 *pc = -1;
2982 return;
2983 }
2984
2985 assert(mach->CallStackTop > 0);
2986 mach->CallStackTop--;
2987
2988 mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2989 mach->CondMask = mach->CondStack[mach->CondStackTop];
2990
2991 mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2992 mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2993
2994 mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2995 mach->ContMask = mach->ContStack[mach->ContStackTop];
2996
2997 mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
2998 mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
2999
3000 mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3001 mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3002
3003 assert(mach->FuncStackTop > 0);
3004 mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3005
3006 *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3007
3008 UPDATE_EXEC_MASK(mach);
3009 }
3010 break;
3011
3012 case TGSI_OPCODE_SSG:
3013 exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3014 break;
3015
3016 case TGSI_OPCODE_CMP:
3017 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3018 FETCH(&r[0], 0, chan_index);
3019 FETCH(&r[1], 1, chan_index);
3020 FETCH(&r[2], 2, chan_index);
3021 micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
3022 }
3023 FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3024 STORE(&d[chan_index], 0, chan_index);
3025 }
3026 break;
3027
3028 case TGSI_OPCODE_SCS:
3029 if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
3030 FETCH( &r[0], 0, CHAN_X );
3031 if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3032 micro_cos(&r[1], &r[0]);
3033 STORE(&r[1], 0, CHAN_X);
3034 }
3035 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3036 micro_sin(&r[1], &r[0]);
3037 STORE(&r[1], 0, CHAN_Y);
3038 }
3039 }
3040 if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
3041 STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
3042 }
3043 if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
3044 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
3045 }
3046 break;
3047
3048 case TGSI_OPCODE_NRM:
3049 /* 3-component vector normalize */
3050 if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
3051 IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
3052 IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3053 /* r3 = sqrt(dp3(src0, src0)) */
3054 FETCH(&r[0], 0, CHAN_X);
3055 micro_mul(&r[3], &r[0], &r[0]);
3056 FETCH(&r[1], 0, CHAN_Y);
3057 micro_mul(&r[4], &r[1], &r[1]);
3058 micro_add(&r[3], &r[3], &r[4]);
3059 FETCH(&r[2], 0, CHAN_Z);
3060 micro_mul(&r[4], &r[2], &r[2]);
3061 micro_add(&r[3], &r[3], &r[4]);
3062 micro_sqrt(&r[3], &r[3]);
3063
3064 if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3065 micro_div(&r[0], &r[0], &r[3]);
3066 STORE(&r[0], 0, CHAN_X);
3067 }
3068 if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3069 micro_div(&r[1], &r[1], &r[3]);
3070 STORE(&r[1], 0, CHAN_Y);
3071 }
3072 if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3073 micro_div(&r[2], &r[2], &r[3]);
3074 STORE(&r[2], 0, CHAN_Z);
3075 }
3076 }
3077 if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
3078 STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
3079 }
3080 break;
3081
3082 case TGSI_OPCODE_NRM4:
3083 /* 4-component vector normalize */
3084 {
3085 union tgsi_exec_channel tmp, dot;
3086
3087 /* tmp = dp4(src0, src0): */
3088 FETCH( &r[0], 0, CHAN_X );
3089 micro_mul( &tmp, &r[0], &r[0] );
3090
3091 FETCH( &r[1], 0, CHAN_Y );
3092 micro_mul( &dot, &r[1], &r[1] );
3093 micro_add( &tmp, &tmp, &dot );
3094
3095 FETCH( &r[2], 0, CHAN_Z );
3096 micro_mul( &dot, &r[2], &r[2] );
3097 micro_add( &tmp, &tmp, &dot );
3098
3099 FETCH( &r[3], 0, CHAN_W );
3100 micro_mul( &dot, &r[3], &r[3] );
3101 micro_add( &tmp, &tmp, &dot );
3102
3103 /* tmp = 1 / sqrt(tmp) */
3104 micro_sqrt( &tmp, &tmp );
3105 micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
3106
3107 FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3108 /* chan = chan * tmp */
3109 micro_mul( &r[chan_index], &tmp, &r[chan_index] );
3110 STORE( &r[chan_index], 0, chan_index );
3111 }
3112 }
3113 break;
3114
3115 case TGSI_OPCODE_DIV:
3116 assert( 0 );
3117 break;
3118
3119 case TGSI_OPCODE_DP2:
3120 exec_dp2(mach, inst);
3121 break;
3122
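   /* IF/ELSE/ENDIF example: with CondMask = 0xf and the condition true
    * only in channels 0 and 2, IF leaves CondMask = 0x5, ELSE flips it to
    * 0xa within the saved mask, and ENDIF restores 0xf. */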
3123 case TGSI_OPCODE_IF:
3124 /* push CondMask */
3125 assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3126 mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3127 FETCH( &r[0], 0, CHAN_X );
3128 /* update CondMask */
3129 if( ! r[0].u[0] ) {
3130 mach->CondMask &= ~0x1;
3131 }
3132 if( ! r[0].u[1] ) {
3133 mach->CondMask &= ~0x2;
3134 }
3135 if( ! r[0].u[2] ) {
3136 mach->CondMask &= ~0x4;
3137 }
3138 if( ! r[0].u[3] ) {
3139 mach->CondMask &= ~0x8;
3140 }
3141 UPDATE_EXEC_MASK(mach);
3142 /* Todo: If CondMask==0, jump to ELSE */
3143 break;
3144
3145 case TGSI_OPCODE_ELSE:
3146 /* invert CondMask wrt previous mask */
3147 {
3148 uint prevMask;
3149 assert(mach->CondStackTop > 0);
3150 prevMask = mach->CondStack[mach->CondStackTop - 1];
3151 mach->CondMask = ~mach->CondMask & prevMask;
3152 UPDATE_EXEC_MASK(mach);
3153 /* Todo: If CondMask==0, jump to ENDIF */
3154 }
3155 break;
3156
3157 case TGSI_OPCODE_ENDIF:
3158 /* pop CondMask */
3159 assert(mach->CondStackTop > 0);
3160 mach->CondMask = mach->CondStack[--mach->CondStackTop];
3161 UPDATE_EXEC_MASK(mach);
3162 break;
3163
3164 case TGSI_OPCODE_END:
3165 /* halt execution */
3166 *pc = -1;
3167 break;
3168
3169 case TGSI_OPCODE_REP:
3170 assert (0);
3171 break;
3172
3173 case TGSI_OPCODE_ENDREP:
3174 assert (0);
3175 break;
3176
3177 case TGSI_OPCODE_PUSHA:
3178 assert (0);
3179 break;
3180
3181 case TGSI_OPCODE_POPA:
3182 assert (0);
3183 break;
3184
3185 case TGSI_OPCODE_CEIL:
3186 exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3187 break;
3188
3189 case TGSI_OPCODE_I2F:
3190 exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3191 break;
3192
3193 case TGSI_OPCODE_NOT:
3194 exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3195 break;
3196
3197 case TGSI_OPCODE_TRUNC:
3198 exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3199 break;
3200
3201 case TGSI_OPCODE_SHL:
3202 exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3203 break;
3204
3205 case TGSI_OPCODE_AND:
3206 exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3207 break;
3208
3209 case TGSI_OPCODE_OR:
3210 exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3211 break;
3212
3213 case TGSI_OPCODE_MOD:
3214 assert (0);
3215 break;
3216
3217 case TGSI_OPCODE_XOR:
3218 exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3219 break;
3220
3221 case TGSI_OPCODE_SAD:
3222 assert (0);
3223 break;
3224
3225 case TGSI_OPCODE_TXF:
3226 assert (0);
3227 break;
3228
3229 case TGSI_OPCODE_TXQ:
3230 assert (0);
3231 break;
3232
3233 case TGSI_OPCODE_EMIT:
3234 emit_vertex(mach);
3235 break;
3236
3237 case TGSI_OPCODE_ENDPRIM:
3238 emit_primitive(mach);
3239 break;
3240
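   /* BGNFOR, as implemented here: src0.x/y/z are loaded into a loop
    * counter where x is the loop variable written to dst.x, y is the
    * remaining iteration count (the loop exits when y <= 0) and z is the
    * per-iteration increment applied at ENDFOR; execution then falls
    * through to BGNLOOP to push the loop state. */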
3241 case TGSI_OPCODE_BGNFOR:
3242 assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3243 for (chan_index = 0; chan_index < 3; chan_index++) {
3244 FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3245 }
3246 ++mach->LoopCounterStackTop;
3247 STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3248 /* update LoopMask */
3249 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3250 mach->LoopMask &= ~0x1;
3251 }
3252 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3253 mach->LoopMask &= ~0x2;
3254 }
3255 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3256 mach->LoopMask &= ~0x4;
3257 }
3258 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3259 mach->LoopMask &= ~0x8;
3260 }
3261 /* TODO: if mach->LoopMask == 0, jump to end of loop */
3262 UPDATE_EXEC_MASK(mach);
3263 /* fall-through (for now) */
3264 case TGSI_OPCODE_BGNLOOP:
3265 /* push LoopMask and ContMask */
3266 assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3267 assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3268 assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3269 assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3270
3271 mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3272 mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3273 mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3274 mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3275 mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3276 break;
3277
3278 case TGSI_OPCODE_ENDFOR:
3279 assert(mach->LoopCounterStackTop > 0);
3280 micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3281 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3282 &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3283 /* update LoopMask */
3284 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3285 mach->LoopMask &= ~0x1;
3286 }
3287 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3288 mach->LoopMask &= ~0x2;
3289 }
3290 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3291 mach->LoopMask &= ~0x4;
3292 }
3293 if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3294 mach->LoopMask &= ~0x8;
3295 }
3296 micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3297 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3298 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3299 assert(mach->LoopLabelStackTop > 0);
3300 inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3301 STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X); /* write the updated loop variable to the BGNFOR instruction's dst */
3302 /* Restore ContMask, but don't pop */
3303 assert(mach->ContStackTop > 0);
3304 mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3305 UPDATE_EXEC_MASK(mach);
3306 if (mach->ExecMask) {
3307 /* repeat loop: jump to instruction just past BGNLOOP */
3308 assert(mach->LoopLabelStackTop > 0);
3309 *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3310 }
3311 else {
3312 /* exit loop: pop LoopMask */
3313 assert(mach->LoopStackTop > 0);
3314 mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3315 /* pop ContMask */
3316 assert(mach->ContStackTop > 0);
3317 mach->ContMask = mach->ContStack[--mach->ContStackTop];
3318 assert(mach->LoopLabelStackTop > 0);
3319 --mach->LoopLabelStackTop;
3320 assert(mach->LoopCounterStackTop > 0);
3321 --mach->LoopCounterStackTop;
3322
3323 mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3324 }
3325 UPDATE_EXEC_MASK(mach);
3326 break;
3327
3328 case TGSI_OPCODE_ENDLOOP:
3329 /* Restore ContMask, but don't pop */
3330 assert(mach->ContStackTop > 0);
3331 mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3332 UPDATE_EXEC_MASK(mach);
3333 if (mach->ExecMask) {
3334 /* repeat loop: jump to instruction just past BGNLOOP */
3335 assert(mach->LoopLabelStackTop > 0);
3336 *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3337 }
3338 else {
3339 /* exit loop: pop LoopMask */
3340 assert(mach->LoopStackTop > 0);
3341 mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3342 /* pop ContMask */
3343 assert(mach->ContStackTop > 0);
3344 mach->ContMask = mach->ContStack[--mach->ContStackTop];
3345 assert(mach->LoopLabelStackTop > 0);
3346 --mach->LoopLabelStackTop;
3347
3348 mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3349 }
3350 UPDATE_EXEC_MASK(mach);
3351 break;
3352
3353 case TGSI_OPCODE_BRK:
3354 exec_break(mach);
3355 break;
3356
3357 case TGSI_OPCODE_CONT:
3358 /* turn off cont channels for each enabled exec channel */
3359 mach->ContMask &= ~mach->ExecMask;
3360 /* Todo: if mach->ExecMask == 0, jump to end of loop */
3361 UPDATE_EXEC_MASK(mach);
3362 break;
3363
3364 case TGSI_OPCODE_BGNSUB:
3365 /* no-op */
3366 break;
3367
3368 case TGSI_OPCODE_ENDSUB:
3369 /*
3370 * XXX: This really should be a no-op. We should never reach this opcode.
3371 */
3372
3373 assert(mach->CallStackTop > 0);
3374 mach->CallStackTop--;
3375
3376 mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3377 mach->CondMask = mach->CondStack[mach->CondStackTop];
3378
3379 mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3380 mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3381
3382 mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3383 mach->ContMask = mach->ContStack[mach->ContStackTop];
3384
3385 mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3386 mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3387
3388 mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3389 mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3390
3391 assert(mach->FuncStackTop > 0);
3392 mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3393
3394 *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3395
3396 UPDATE_EXEC_MASK(mach);
3397 break;
3398
3399 case TGSI_OPCODE_NOP:
3400 break;
3401
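   /* BREAKC: conditional break; any active channel with a non-zero
    * src0.x has its LoopMask bit cleared. */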
3402 case TGSI_OPCODE_BREAKC:
3403 FETCH(&r[0], 0, CHAN_X);
3404 /* update LoopMask */
3405 if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3406 mach->LoopMask &= ~0x1;
3407 }
3408 if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3409 mach->LoopMask &= ~0x2;
3410 }
3411 if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3412 mach->LoopMask &= ~0x4;
3413 }
3414 if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3415 mach->LoopMask &= ~0x8;
3416 }
3417 /* Todo: if mach->LoopMask == 0, jump to end of loop */
3418 UPDATE_EXEC_MASK(mach);
3419 break;
3420
3421 case TGSI_OPCODE_F2I:
3422 exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3423 break;
3424
3425 case TGSI_OPCODE_IDIV:
3426 exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3427 break;
3428
3429 case TGSI_OPCODE_IMAX:
3430 exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3431 break;
3432
3433 case TGSI_OPCODE_IMIN:
3434 exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3435 break;
3436
3437 case TGSI_OPCODE_INEG:
3438 exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3439 break;
3440
3441 case TGSI_OPCODE_ISGE:
3442 exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3443 break;
3444
3445 case TGSI_OPCODE_ISHR:
3446 exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3447 break;
3448
3449 case TGSI_OPCODE_ISLT:
3450 exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3451 break;
3452
3453 case TGSI_OPCODE_F2U:
3454 exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3455 break;
3456
3457 case TGSI_OPCODE_U2F:
3458 exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3459 break;
3460
3461 case TGSI_OPCODE_UADD:
3462 exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3463 break;
3464
3465 case TGSI_OPCODE_UDIV:
3466 exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3467 break;
3468
3469 case TGSI_OPCODE_UMAD:
3470 exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3471 break;
3472
3473 case TGSI_OPCODE_UMAX:
3474 exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3475 break;
3476
3477 case TGSI_OPCODE_UMIN:
3478 exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3479 break;
3480
3481 case TGSI_OPCODE_UMOD:
3482 exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3483 break;
3484
3485 case TGSI_OPCODE_UMUL:
3486 exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3487 break;
3488
3489 case TGSI_OPCODE_USEQ:
3490 exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3491 break;
3492
3493 case TGSI_OPCODE_USGE:
3494 exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3495 break;
3496
3497 case TGSI_OPCODE_USHR:
3498 exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3499 break;
3500
3501 case TGSI_OPCODE_USLT:
3502 exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3503 break;
3504
3505 case TGSI_OPCODE_USNE:
3506 exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3507 break;
3508
3509 case TGSI_OPCODE_SWITCH:
3510 exec_switch(mach, inst);
3511 break;
3512
3513 case TGSI_OPCODE_CASE:
3514 exec_case(mach, inst);
3515 break;
3516
3517 case TGSI_OPCODE_DEFAULT:
3518 exec_default(mach);
3519 break;
3520
3521 case TGSI_OPCODE_ENDSWITCH:
3522 exec_endswitch(mach);
3523 break;
3524
3525 default:
3526 assert( 0 );
3527 }
3528 }
3529
3530
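/* Set DEBUG_EXECUTION to 1 to dump each instruction as it executes and
 * print any temporary or output registers that changed afterwards.
 */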
3531 #define DEBUG_EXECUTION 0
3532
3533
3534 /**
3535 * Run TGSI interpreter.
3536 * \return bitmask of "alive" quad components
3537 */
3538 uint
3539 tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3540 {
3541 uint i;
3542 int pc = 0;
3543
3544 mach->CondMask = 0xf;
3545 mach->LoopMask = 0xf;
3546 mach->ContMask = 0xf;
3547 mach->FuncMask = 0xf;
3548 mach->ExecMask = 0xf;
3549
3550 mach->Switch.mask = 0xf;
3551
3552 assert(mach->CondStackTop == 0);
3553 assert(mach->LoopStackTop == 0);
3554 assert(mach->ContStackTop == 0);
3555 assert(mach->SwitchStackTop == 0);
3556 assert(mach->BreakStackTop == 0);
3557 assert(mach->CallStackTop == 0);
3558
3559 mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3560 mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3561
3562 if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3563 mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3564 mach->Primitives[0] = 0;
3565 }
3566
3567 for (i = 0; i < QUAD_SIZE; i++) {
3568 mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3569 (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3570 (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3571 (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3572 (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3573 }
3574
3575 /* execute declarations (interpolants) */
3576 for (i = 0; i < mach->NumDeclarations; i++) {
3577 exec_declaration( mach, mach->Declarations+i );
3578 }
3579
3580 {
3581 #if DEBUG_EXECUTION
3582 struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3583 struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3584 uint inst = 1;
3585
3586 memcpy(temps, mach->Temps, sizeof(temps));
3587 memcpy(outputs, mach->Outputs, sizeof(outputs));
3588 #endif
3589
3590 /* execute instructions, until pc is set to -1 */
3591 while (pc != -1) {
3592
3593 #if DEBUG_EXECUTION
3594 uint i;
3595
3596 tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3597 #endif
3598
3599 assert(pc < (int) mach->NumInstructions);
3600 exec_instruction(mach, mach->Instructions + pc, &pc);
3601
3602 #if DEBUG_EXECUTION
3603 for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3604 if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3605 uint j;
3606
3607 memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3608 debug_printf("TEMP[%2u] = ", i);
3609 for (j = 0; j < 4; j++) {
3610 if (j > 0) {
3611 debug_printf(" ");
3612 }
3613 debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3614 temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
3615 temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
3616 temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
3617 temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
3618 }
3619 }
3620 }
3621 for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3622 if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3623 uint j;
3624
3625 memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3626 debug_printf("OUT[%2u] = ", i);
3627 for (j = 0; j < 4; j++) {
3628 if (j > 0) {
3629 debug_printf(" ");
3630 }
3631 debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3632 outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
3633 outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
3634 outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
3635 outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
3636 }
3637 }
3638 }
3639 #endif
3640 }
3641 }
3642
3643 #if 0
3644 /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3645 if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3646 /*
3647 * Scale back depth component.
3648 */
3649 for (i = 0; i < 4; i++)
3650 mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3651 }
3652 #endif
3653
3654 assert(mach->CondStackTop == 0);
3655 assert(mach->LoopStackTop == 0);
3656 assert(mach->ContStackTop == 0);
3657 assert(mach->SwitchStackTop == 0);
3658 assert(mach->BreakStackTop == 0);
3659 assert(mach->CallStackTop == 0);
3660
3661 return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3662 }