src/gallium/auxiliary/tgsi/tgsi_exec.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the
   9  * "Software"), to deal in the Software without restriction, including
  10  * without limitation the rights to use, copy, modify, merge, publish,
  11  * distribute, sub license, and/or sell copies of the Software, and to
  12  * permit persons to whom the Software is furnished to do so, subject to
  13  * the following conditions:
  14  *
  15  * The above copyright notice and this permission notice (including the
  16  * next paragraph) shall be included in all copies or substantial portions
  17  * of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  22  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  26  *
  27  **************************************************************************/
  28
  29 /**
  30  * TGSI interpreter/executor.
  31  *
  32  * Flow control information:
  33  *
  34  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
  35  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
  36  * care since a condition may be true for some quad components but false
  37  * for other components.
  38  *
  39  * We basically execute all statements (even if they're in the part of
  40  * an IF/ELSE clause that's "not taken") and use a special mask to
  41  * control writing to destination registers.  This is the ExecMask.
  42  * See store_dest().
  43  *
 * The ExecMask is computed from CondMask, LoopMask and ContMask, which are
 * controlled by the flow control instructions (namely IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT), together with Switch.mask and FuncMask
 * (see UPDATE_EXEC_MASK).
  47  *
  48  *
  49  * Authors:
  50  *   Michal Krol
  51  *   Brian Paul
  52  */
  53
  54 #include "pipe/p_compiler.h"
  55 #include "pipe/p_state.h"
  56 #include "pipe/p_shader_tokens.h"
  57 #include "tgsi/tgsi_dump.h"
  58 #include "tgsi/tgsi_parse.h"
  59 #include "tgsi/tgsi_util.h"
  60 #include "tgsi_exec.h"
  61 #include "util/u_memory.h"
  62 #include "util/u_math.h"
  63
  64
  65 #define FAST_MATH 1
  66
  67 #define TILE_TOP_LEFT     0
  68 #define TILE_TOP_RIGHT    1
  69 #define TILE_BOTTOM_LEFT  2
  70 #define TILE_BOTTOM_RIGHT 3
  71
  72 static void
  73 micro_abs(union tgsi_exec_channel *dst,
  74           const union tgsi_exec_channel *src)
  75 {
  76    dst->f[0] = fabsf(src->f[0]);
  77    dst->f[1] = fabsf(src->f[1]);
  78    dst->f[2] = fabsf(src->f[2]);
  79    dst->f[3] = fabsf(src->f[3]);
  80 }
  81
  82 static void
  83 micro_arl(union tgsi_exec_channel *dst,
  84           const union tgsi_exec_channel *src)
  85 {
  86    dst->i[0] = (int)floorf(src->f[0]);
  87    dst->i[1] = (int)floorf(src->f[1]);
  88    dst->i[2] = (int)floorf(src->f[2]);
  89    dst->i[3] = (int)floorf(src->f[3]);
  90 }
  91
  92 static void
  93 micro_arr(union tgsi_exec_channel *dst,
  94           const union tgsi_exec_channel *src)
  95 {
  96    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
  97    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
  98    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
  99    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
 100 }
 101
 102 static void
 103 micro_ceil(union tgsi_exec_channel *dst,
 104            const union tgsi_exec_channel *src)
 105 {
 106    dst->f[0] = ceilf(src->f[0]);
 107    dst->f[1] = ceilf(src->f[1]);
 108    dst->f[2] = ceilf(src->f[2]);
 109    dst->f[3] = ceilf(src->f[3]);
 110 }
 111
 112 static void
 113 micro_cos(union tgsi_exec_channel *dst,
 114           const union tgsi_exec_channel *src)
 115 {
 116    dst->f[0] = cosf(src->f[0]);
 117    dst->f[1] = cosf(src->f[1]);
 118    dst->f[2] = cosf(src->f[2]);
 119    dst->f[3] = cosf(src->f[3]);
 120 }
 121
 122 static void
 123 micro_ddx(union tgsi_exec_channel *dst,
 124           const union tgsi_exec_channel *src)
 125 {
 126    dst->f[0] =
 127    dst->f[1] =
 128    dst->f[2] =
 129    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 130 }
 131
 132 static void
 133 micro_ddy(union tgsi_exec_channel *dst,
 134           const union tgsi_exec_channel *src)
 135 {
 136    dst->f[0] =
 137    dst->f[1] =
 138    dst->f[2] =
 139    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
 140 }
 141
 142 static void
 143 micro_exp2(union tgsi_exec_channel *dst,
 144            const union tgsi_exec_channel *src)
 145 {
 146 #if FAST_MATH
 147    dst->f[0] = util_fast_exp2(src->f[0]);
 148    dst->f[1] = util_fast_exp2(src->f[1]);
 149    dst->f[2] = util_fast_exp2(src->f[2]);
 150    dst->f[3] = util_fast_exp2(src->f[3]);
 151 #else
 152 #if DEBUG
 153    /* Inf is okay for this instruction, so clamp it to silence assertions. */
 154    uint i;
 155    union tgsi_exec_channel clamped;
 156
 157    for (i = 0; i < 4; i++) {
 158       if (src->f[i] > 127.99999f) {
 159          clamped.f[i] = 127.99999f;
 160       } else if (src->f[i] < -126.99999f) {
 161          clamped.f[i] = -126.99999f;
 162       } else {
 163          clamped.f[i] = src->f[i];
 164       }
 165    }
 166    src = &clamped;
 167 #endif /* DEBUG */
 168
 169    dst->f[0] = powf(2.0f, src->f[0]);
 170    dst->f[1] = powf(2.0f, src->f[1]);
 171    dst->f[2] = powf(2.0f, src->f[2]);
 172    dst->f[3] = powf(2.0f, src->f[3]);
 173 #endif /* FAST_MATH */
 174 }
 175
 176 static void
 177 micro_flr(union tgsi_exec_channel *dst,
 178           const union tgsi_exec_channel *src)
 179 {
 180    dst->f[0] = floorf(src->f[0]);
 181    dst->f[1] = floorf(src->f[1]);
 182    dst->f[2] = floorf(src->f[2]);
 183    dst->f[3] = floorf(src->f[3]);
 184 }
 185
 186 static void
 187 micro_frc(union tgsi_exec_channel *dst,
 188           const union tgsi_exec_channel *src)
 189 {
 190    dst->f[0] = src->f[0] - floorf(src->f[0]);
 191    dst->f[1] = src->f[1] - floorf(src->f[1]);
 192    dst->f[2] = src->f[2] - floorf(src->f[2]);
 193    dst->f[3] = src->f[3] - floorf(src->f[3]);
 194 }
 195
 196 static void
 197 micro_iabs(union tgsi_exec_channel *dst,
 198            const union tgsi_exec_channel *src)
 199 {
 200    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
 201    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
 202    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
 203    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
 204 }
 205
 206 static void
 207 micro_ineg(union tgsi_exec_channel *dst,
 208            const union tgsi_exec_channel *src)
 209 {
 210    dst->i[0] = -src->i[0];
 211    dst->i[1] = -src->i[1];
 212    dst->i[2] = -src->i[2];
 213    dst->i[3] = -src->i[3];
 214 }
 215
 216 static void
 217 micro_lg2(union tgsi_exec_channel *dst,
 218           const union tgsi_exec_channel *src)
 219 {
 220 #if FAST_MATH
 221    dst->f[0] = util_fast_log2(src->f[0]);
 222    dst->f[1] = util_fast_log2(src->f[1]);
 223    dst->f[2] = util_fast_log2(src->f[2]);
 224    dst->f[3] = util_fast_log2(src->f[3]);
 225 #else
 226    dst->f[0] = logf(src->f[0]) * 1.442695f;
 227    dst->f[1] = logf(src->f[1]) * 1.442695f;
 228    dst->f[2] = logf(src->f[2]) * 1.442695f;
 229    dst->f[3] = logf(src->f[3]) * 1.442695f;
 230 #endif
 231 }
 232
 233 static void
 234 micro_lrp(union tgsi_exec_channel *dst,
 235           const union tgsi_exec_channel *src)
 236 {
 237    dst->f[0] = src[0].f[0] * (src[1].f[0] - src[2].f[0]) + src[2].f[0];
 238    dst->f[1] = src[0].f[1] * (src[1].f[1] - src[2].f[1]) + src[2].f[1];
 239    dst->f[2] = src[0].f[2] * (src[1].f[2] - src[2].f[2]) + src[2].f[2];
 240    dst->f[3] = src[0].f[3] * (src[1].f[3] - src[2].f[3]) + src[2].f[3];
 241 }
 242
 243 static void
 244 micro_mad(union tgsi_exec_channel *dst,
 245           const union tgsi_exec_channel *src)
 246 {
 247    dst->f[0] = src[0].f[0] * src[1].f[0] + src[2].f[0];
 248    dst->f[1] = src[0].f[1] * src[1].f[1] + src[2].f[1];
 249    dst->f[2] = src[0].f[2] * src[1].f[2] + src[2].f[2];
 250    dst->f[3] = src[0].f[3] * src[1].f[3] + src[2].f[3];
 251 }
 252
 253 static void
 254 micro_mov(union tgsi_exec_channel *dst,
 255           const union tgsi_exec_channel *src)
 256 {
 257    dst->u[0] = src->u[0];
 258    dst->u[1] = src->u[1];
 259    dst->u[2] = src->u[2];
 260    dst->u[3] = src->u[3];
 261 }
 262
 263 static void
 264 micro_rcp(union tgsi_exec_channel *dst,
 265           const union tgsi_exec_channel *src)
 266 {
 267 #if 0 /* for debugging */
 268    assert(src->f[0] != 0.0f);
 269    assert(src->f[1] != 0.0f);
 270    assert(src->f[2] != 0.0f);
 271    assert(src->f[3] != 0.0f);
 272 #endif
 273    dst->f[0] = 1.0f / src->f[0];
 274    dst->f[1] = 1.0f / src->f[1];
 275    dst->f[2] = 1.0f / src->f[2];
 276    dst->f[3] = 1.0f / src->f[3];
 277 }
 278
 279 static void
 280 micro_rnd(union tgsi_exec_channel *dst,
 281           const union tgsi_exec_channel *src)
 282 {
 283    dst->f[0] = floorf(src->f[0] + 0.5f);
 284    dst->f[1] = floorf(src->f[1] + 0.5f);
 285    dst->f[2] = floorf(src->f[2] + 0.5f);
 286    dst->f[3] = floorf(src->f[3] + 0.5f);
 287 }
 288
 289 static void
 290 micro_rsq(union tgsi_exec_channel *dst,
 291           const union tgsi_exec_channel *src)
 292 {
 293 #if 0 /* for debugging */
 294    assert(src->f[0] != 0.0f);
 295    assert(src->f[1] != 0.0f);
 296    assert(src->f[2] != 0.0f);
 297    assert(src->f[3] != 0.0f);
 298 #endif
 299    dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
 300    dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
 301    dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
 302    dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
 303 }
 304
 305 static void
 306 micro_seq(union tgsi_exec_channel *dst,
 307           const union tgsi_exec_channel *src)
 308 {
 309    dst->f[0] = src[0].f[0] == src[1].f[0] ? 1.0f : 0.0f;
 310    dst->f[1] = src[0].f[1] == src[1].f[1] ? 1.0f : 0.0f;
 311    dst->f[2] = src[0].f[2] == src[1].f[2] ? 1.0f : 0.0f;
 312    dst->f[3] = src[0].f[3] == src[1].f[3] ? 1.0f : 0.0f;
 313 }
 314
 315 static void
 316 micro_sge(union tgsi_exec_channel *dst,
 317           const union tgsi_exec_channel *src)
 318 {
 319    dst->f[0] = src[0].f[0] >= src[1].f[0] ? 1.0f : 0.0f;
 320    dst->f[1] = src[0].f[1] >= src[1].f[1] ? 1.0f : 0.0f;
 321    dst->f[2] = src[0].f[2] >= src[1].f[2] ? 1.0f : 0.0f;
 322    dst->f[3] = src[0].f[3] >= src[1].f[3] ? 1.0f : 0.0f;
 323 }
 324
 325 static void
 326 micro_sgn(union tgsi_exec_channel *dst,
 327           const union tgsi_exec_channel *src)
 328 {
 329    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
 330    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
 331    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
 332    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
 333 }
 334
 335 static void
 336 micro_sgt(union tgsi_exec_channel *dst,
 337           const union tgsi_exec_channel *src)
 338 {
 339    dst->f[0] = src[0].f[0] > src[1].f[0] ? 1.0f : 0.0f;
 340    dst->f[1] = src[0].f[1] > src[1].f[1] ? 1.0f : 0.0f;
 341    dst->f[2] = src[0].f[2] > src[1].f[2] ? 1.0f : 0.0f;
 342    dst->f[3] = src[0].f[3] > src[1].f[3] ? 1.0f : 0.0f;
 343 }
 344
 345 static void
 346 micro_sin(union tgsi_exec_channel *dst,
 347           const union tgsi_exec_channel *src)
 348 {
 349    dst->f[0] = sinf(src->f[0]);
 350    dst->f[1] = sinf(src->f[1]);
 351    dst->f[2] = sinf(src->f[2]);
 352    dst->f[3] = sinf(src->f[3]);
 353 }
 354
 355 static void
 356 micro_sle(union tgsi_exec_channel *dst,
 357           const union tgsi_exec_channel *src)
 358 {
 359    dst->f[0] = src[0].f[0] <= src[1].f[0] ? 1.0f : 0.0f;
 360    dst->f[1] = src[0].f[1] <= src[1].f[1] ? 1.0f : 0.0f;
 361    dst->f[2] = src[0].f[2] <= src[1].f[2] ? 1.0f : 0.0f;
 362    dst->f[3] = src[0].f[3] <= src[1].f[3] ? 1.0f : 0.0f;
 363 }
 364
 365 static void
 366 micro_slt(union tgsi_exec_channel *dst,
 367           const union tgsi_exec_channel *src)
 368 {
 369    dst->f[0] = src[0].f[0] < src[1].f[0] ? 1.0f : 0.0f;
 370    dst->f[1] = src[0].f[1] < src[1].f[1] ? 1.0f : 0.0f;
 371    dst->f[2] = src[0].f[2] < src[1].f[2] ? 1.0f : 0.0f;
 372    dst->f[3] = src[0].f[3] < src[1].f[3] ? 1.0f : 0.0f;
 373 }
 374
 375 static void
 376 micro_sne(union tgsi_exec_channel *dst,
 377           const union tgsi_exec_channel *src)
 378 {
 379    dst->f[0] = src[0].f[0] != src[1].f[0] ? 1.0f : 0.0f;
 380    dst->f[1] = src[0].f[1] != src[1].f[1] ? 1.0f : 0.0f;
 381    dst->f[2] = src[0].f[2] != src[1].f[2] ? 1.0f : 0.0f;
 382    dst->f[3] = src[0].f[3] != src[1].f[3] ? 1.0f : 0.0f;
 383 }
 384
 385 static void
 386 micro_trunc(union tgsi_exec_channel *dst,
 387             const union tgsi_exec_channel *src)
 388 {
 389    dst->f[0] = (float)(int)src->f[0];
 390    dst->f[1] = (float)(int)src->f[1];
 391    dst->f[2] = (float)(int)src->f[2];
 392    dst->f[3] = (float)(int)src->f[3];
 393 }
 394
 395
/* Indices of the four register channels, X through W. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
 400
 401 enum tgsi_exec_datatype {
 402    TGSI_EXEC_DATA_FLOAT,
 403    TGSI_EXEC_DATA_INT,
 404    TGSI_EXEC_DATA_UINT
 405 };
 406
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 *
 * Each name is a short alias for the corresponding TGSI_EXEC_TEMP_* macro.
 * An _I/_C pair is used together as mach->Temps[X_I].xyzw[X_C].
 * tgsi_exec_machine_create() fills the 0, 7F, 80, FF, 1, 2, 128, M128, 3 and
 * HALF locations with the constant their name suggests.
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_P0            TGSI_EXEC_TEMP_P0
 440
 441 #define IS_CHANNEL_ENABLED(INST, CHAN)\
 442    ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
 443
 444 #define IS_CHANNEL_ENABLED2(INST, CHAN)\
 445    ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))
 446
 447 #define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
 448    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
 449       if (IS_CHANNEL_ENABLED( INST, CHAN ))
 450
 451 #define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
 452    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
 453       if (IS_CHANNEL_ENABLED2( INST, CHAN ))
 454
 455
 456 /** The execution mask depends on the conditional mask and the loop mask */
 457 #define UPDATE_EXEC_MASK(MACH) \
 458       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
 459
 460
 461 static const union tgsi_exec_channel ZeroVec =
 462    { { 0.0, 0.0, 0.0, 0.0 } };
 463
 464
 465 /**
 466  * Assert that none of the float values in 'chan' are infinite or NaN.
 467  * NaN and Inf may occur normally during program execution and should
 468  * not lead to crashes, etc.  But when debugging, it's helpful to catch
 469  * them.
 470  */
 471 static INLINE void
 472 check_inf_or_nan(const union tgsi_exec_channel *chan)
 473 {
 474    assert(!util_is_inf_or_nan((chan)->f[0]));
 475    assert(!util_is_inf_or_nan((chan)->f[1]));
 476    assert(!util_is_inf_or_nan((chan)->f[2]));
 477    assert(!util_is_inf_or_nan((chan)->f[3]));
 478 }
 479
 480
 481 #ifdef DEBUG
 482 static void
 483 print_chan(const char *msg, const union tgsi_exec_channel *chan)
 484 {
 485    debug_printf("%s = {%f, %f, %f, %f}\n",
 486                 msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
 487 }
 488 #endif
 489
 490
 491 #ifdef DEBUG
 492 static void
 493 print_temp(const struct tgsi_exec_machine *mach, uint index)
 494 {
 495    const struct tgsi_exec_vector *tmp = &mach->Temps[index];
 496    int i;
 497    debug_printf("Temp[%u] =\n", index);
 498    for (i = 0; i < 4; i++) {
 499       debug_printf("  %c: { %f, %f, %f, %f }\n",
 500                    "XYZW"[i],
 501                    tmp->xyzw[i].f[0],
 502                    tmp->xyzw[i].f[1],
 503                    tmp->xyzw[i].f[2],
 504                    tmp->xyzw[i].f[3]);
 505    }
 506 }
 507 #endif
 508
 509
 510 /**
 511  * Check if there's a potential src/dst register data dependency when
 512  * using SOA execution.
 513  * Example:
 514  *   MOV T, T.yxwz;
 515  * This would expand into:
 516  *   MOV t0, t1;
 517  *   MOV t1, t0;
 518  *   MOV t2, t3;
 519  *   MOV t3, t2;
 520  * The second instruction will have the wrong value for t0 if executed as-is.
 521  */
 522 boolean
 523 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
 524 {
 525    uint i, chan;
 526
 527    uint writemask = inst->Dst[0].Register.WriteMask;
 528    if (writemask == TGSI_WRITEMASK_X ||
 529        writemask == TGSI_WRITEMASK_Y ||
 530        writemask == TGSI_WRITEMASK_Z ||
 531        writemask == TGSI_WRITEMASK_W ||
 532        writemask == TGSI_WRITEMASK_NONE) {
 533       /* no chance of data dependency */
 534       return FALSE;
 535    }
 536
 537    /* loop over src regs */
 538    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 539       if ((inst->Src[i].Register.File ==
 540            inst->Dst[0].Register.File) &&
 541           (inst->Src[i].Register.Index ==
 542            inst->Dst[0].Register.Index)) {
 543          /* loop over dest channels */
 544          uint channelsWritten = 0x0;
 545          FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
 546             /* check if we're reading a channel that's been written */
 547             uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
 548             if (channelsWritten & (1 << swizzle)) {
 549                return TRUE;
 550             }
 551
 552             channelsWritten |= (1 << chan);
 553          }
 554       }
 555    }
 556    return FALSE;
 557 }
 558
 559
 560 /**
 561  * Initialize machine state by expanding tokens to full instructions,
 562  * allocating temporary storage, setting up constants, etc.
 563  * After this, we can call tgsi_exec_machine_run() many times.
 564  */
 565 void
 566 tgsi_exec_machine_bind_shader(
 567    struct tgsi_exec_machine *mach,
 568    const struct tgsi_token *tokens,
 569    uint numSamplers,
 570    struct tgsi_sampler **samplers)
 571 {
 572    uint k;
 573    struct tgsi_parse_context parse;
 574    struct tgsi_exec_labels *labels = &mach->Labels;
 575    struct tgsi_full_instruction *instructions;
 576    struct tgsi_full_declaration *declarations;
 577    uint maxInstructions = 10, numInstructions = 0;
 578    uint maxDeclarations = 10, numDeclarations = 0;
 579    uint instno = 0;
 580
 581 #if 0
 582    tgsi_dump(tokens, 0);
 583 #endif
 584
 585    util_init_math();
 586
 587    mach->Tokens = tokens;
 588    mach->Samplers = samplers;
 589
 590    k = tgsi_parse_init (&parse, mach->Tokens);
 591    if (k != TGSI_PARSE_OK) {
 592       debug_printf( "Problem parsing!\n" );
 593       return;
 594    }
 595
 596    mach->Processor = parse.FullHeader.Processor.Processor;
 597    mach->ImmLimit = 0;
 598    labels->count = 0;
 599
 600    declarations = (struct tgsi_full_declaration *)
 601       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
 602
 603    if (!declarations) {
 604       return;
 605    }
 606
 607    instructions = (struct tgsi_full_instruction *)
 608       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
 609
 610    if (!instructions) {
 611       FREE( declarations );
 612       return;
 613    }
 614
 615    while( !tgsi_parse_end_of_tokens( &parse ) ) {
 616       uint pointer = parse.Position;
 617       uint i;
 618
 619       tgsi_parse_token( &parse );
 620       switch( parse.FullToken.Token.Type ) {
 621       case TGSI_TOKEN_TYPE_DECLARATION:
 622          /* save expanded declaration */
 623          if (numDeclarations == maxDeclarations) {
 624             declarations = REALLOC(declarations,
 625                                    maxDeclarations
 626                                    * sizeof(struct tgsi_full_declaration),
 627                                    (maxDeclarations + 10)
 628                                    * sizeof(struct tgsi_full_declaration));
 629             maxDeclarations += 10;
 630          }
 631          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
 632             unsigned reg;
 633             for (reg = parse.FullToken.FullDeclaration.Range.First;
 634                  reg <= parse.FullToken.FullDeclaration.Range.Last;
 635                  ++reg) {
 636                ++mach->NumOutputs;
 637             }
 638          }
 639          memcpy(declarations + numDeclarations,
 640                 &parse.FullToken.FullDeclaration,
 641                 sizeof(declarations[0]));
 642          numDeclarations++;
 643          break;
 644
 645       case TGSI_TOKEN_TYPE_IMMEDIATE:
 646          {
 647             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
 648             assert( size <= 4 );
 649             assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
 650
 651             for( i = 0; i < size; i++ ) {
 652                mach->Imms[mach->ImmLimit][i] =
 653                   parse.FullToken.FullImmediate.u[i].Float;
 654             }
 655             mach->ImmLimit += 1;
 656          }
 657          break;
 658
 659       case TGSI_TOKEN_TYPE_INSTRUCTION:
 660          assert( labels->count < MAX_LABELS );
 661
 662          labels->labels[labels->count][0] = instno;
 663          labels->labels[labels->count][1] = pointer;
 664          labels->count++;
 665
 666          /* save expanded instruction */
 667          if (numInstructions == maxInstructions) {
 668             instructions = REALLOC(instructions,
 669                                    maxInstructions
 670                                    * sizeof(struct tgsi_full_instruction),
 671                                    (maxInstructions + 10)
 672                                    * sizeof(struct tgsi_full_instruction));
 673             maxInstructions += 10;
 674          }
 675
 676          memcpy(instructions + numInstructions,
 677                 &parse.FullToken.FullInstruction,
 678                 sizeof(instructions[0]));
 679
 680          numInstructions++;
 681          break;
 682
 683       case TGSI_TOKEN_TYPE_PROPERTY:
 684          break;
 685
 686       default:
 687          assert( 0 );
 688       }
 689    }
 690    tgsi_parse_free (&parse);
 691
 692    if (mach->Declarations) {
 693       FREE( mach->Declarations );
 694    }
 695    mach->Declarations = declarations;
 696    mach->NumDeclarations = numDeclarations;
 697
 698    if (mach->Instructions) {
 699       FREE( mach->Instructions );
 700    }
 701    mach->Instructions = instructions;
 702    mach->NumInstructions = numInstructions;
 703 }
 704
 705
 706 struct tgsi_exec_machine *
 707 tgsi_exec_machine_create( void )
 708 {
 709    struct tgsi_exec_machine *mach;
 710    uint i;
 711
 712    mach = align_malloc( sizeof *mach, 16 );
 713    if (!mach)
 714       goto fail;
 715
 716    memset(mach, 0, sizeof(*mach));
 717
 718    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
 719    mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
 720    mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
 721
 722    /* Setup constants. */
 723    for( i = 0; i < 4; i++ ) {
 724       mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
 725       mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
 726       mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
 727       mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
 728       mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
 729       mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
 730       mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
 731       mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
 732       mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
 733       mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
 734    }
 735
 736 #ifdef DEBUG
 737    /* silence warnings */
 738    (void) print_chan;
 739    (void) print_temp;
 740 #endif
 741
 742    return mach;
 743
 744 fail:
 745    align_free(mach);
 746    return NULL;
 747 }
 748
 749
 750 void
 751 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
 752 {
 753    if (mach) {
 754       FREE(mach->Instructions);
 755       FREE(mach->Declarations);
 756    }
 757
 758    align_free(mach);
 759 }
 760
 761 static void
 762 micro_add(
 763    union tgsi_exec_channel *dst,
 764    const union tgsi_exec_channel *src0,
 765    const union tgsi_exec_channel *src1 )
 766 {
 767    dst->f[0] = src0->f[0] + src1->f[0];
 768    dst->f[1] = src0->f[1] + src1->f[1];
 769    dst->f[2] = src0->f[2] + src1->f[2];
 770    dst->f[3] = src0->f[3] + src1->f[3];
 771 }
 772
 773 static void
 774 micro_div(
 775    union tgsi_exec_channel *dst,
 776    const union tgsi_exec_channel *src0,
 777    const union tgsi_exec_channel *src1 )
 778 {
 779    if (src1->f[0] != 0) {
 780       dst->f[0] = src0->f[0] / src1->f[0];
 781    }
 782    if (src1->f[1] != 0) {
 783       dst->f[1] = src0->f[1] / src1->f[1];
 784    }
 785    if (src1->f[2] != 0) {
 786       dst->f[2] = src0->f[2] / src1->f[2];
 787    }
 788    if (src1->f[3] != 0) {
 789       dst->f[3] = src0->f[3] / src1->f[3];
 790    }
 791 }
 792
 793 static void
 794 micro_float_clamp(union tgsi_exec_channel *dst,
 795                   const union tgsi_exec_channel *src)
 796 {
 797    uint i;
 798
 799    for (i = 0; i < 4; i++) {
 800       if (src->f[i] > 0.0f) {
 801          if (src->f[i] > 1.884467e+019f)
 802             dst->f[i] = 1.884467e+019f;
 803          else if (src->f[i] < 5.42101e-020f)
 804             dst->f[i] = 5.42101e-020f;
 805          else
 806             dst->f[i] = src->f[i];
 807       }
 808       else {
 809          if (src->f[i] < -1.884467e+019f)
 810             dst->f[i] = -1.884467e+019f;
 811          else if (src->f[i] > -5.42101e-020f)
 812             dst->f[i] = -5.42101e-020f;
 813          else
 814             dst->f[i] = src->f[i];
 815       }
 816    }
 817 }
 818
 819 static void
 820 micro_lt(
 821    union tgsi_exec_channel *dst,
 822    const union tgsi_exec_channel *src0,
 823    const union tgsi_exec_channel *src1,
 824    const union tgsi_exec_channel *src2,
 825    const union tgsi_exec_channel *src3 )
 826 {
 827    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
 828    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
 829    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
 830    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
 831 }
 832
 833 static void
 834 micro_max(
 835    union tgsi_exec_channel *dst,
 836    const union tgsi_exec_channel *src0,
 837    const union tgsi_exec_channel *src1 )
 838 {
 839    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
 840    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
 841    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
 842    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
 843 }
 844
 845 static void
 846 micro_min(
 847    union tgsi_exec_channel *dst,
 848    const union tgsi_exec_channel *src0,
 849    const union tgsi_exec_channel *src1 )
 850 {
 851    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
 852    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
 853    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
 854    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
 855 }
 856
 857 static void
 858 micro_mul(
 859    union tgsi_exec_channel *dst,
 860    const union tgsi_exec_channel *src0,
 861    const union tgsi_exec_channel *src1 )
 862 {
 863    dst->f[0] = src0->f[0] * src1->f[0];
 864    dst->f[1] = src0->f[1] * src1->f[1];
 865    dst->f[2] = src0->f[2] * src1->f[2];
 866    dst->f[3] = src0->f[3] * src1->f[3];
 867 }
 868
 869 #if 0
 870 static void
 871 micro_imul64(
 872    union tgsi_exec_channel *dst0,
 873    union tgsi_exec_channel *dst1,
 874    const union tgsi_exec_channel *src0,
 875    const union tgsi_exec_channel *src1 )
 876 {
 877    dst1->i[0] = src0->i[0] * src1->i[0];
 878    dst1->i[1] = src0->i[1] * src1->i[1];
 879    dst1->i[2] = src0->i[2] * src1->i[2];
 880    dst1->i[3] = src0->i[3] * src1->i[3];
 881    dst0->i[0] = 0;
 882    dst0->i[1] = 0;
 883    dst0->i[2] = 0;
 884    dst0->i[3] = 0;
 885 }
 886 #endif
 887
 888 #if 0
 889 static void
 890 micro_umul64(
 891    union tgsi_exec_channel *dst0,
 892    union tgsi_exec_channel *dst1,
 893    const union tgsi_exec_channel *src0,
 894    const union tgsi_exec_channel *src1 )
 895 {
 896    dst1->u[0] = src0->u[0] * src1->u[0];
 897    dst1->u[1] = src0->u[1] * src1->u[1];
 898    dst1->u[2] = src0->u[2] * src1->u[2];
 899    dst1->u[3] = src0->u[3] * src1->u[3];
 900    dst0->u[0] = 0;
 901    dst0->u[1] = 0;
 902    dst0->u[2] = 0;
 903    dst0->u[3] = 0;
 904 }
 905 #endif
 906
 907
 908 #if 0
 909 static void
 910 micro_movc(
 911    union tgsi_exec_channel *dst,
 912    const union tgsi_exec_channel *src0,
 913    const union tgsi_exec_channel *src1,
 914    const union tgsi_exec_channel *src2 )
 915 {
 916    dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
 917    dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
 918    dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
 919    dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
 920 }
 921 #endif
 922
 923 static void
 924 micro_neg(
 925    union tgsi_exec_channel *dst,
 926    const union tgsi_exec_channel *src )
 927 {
 928    dst->f[0] = -src->f[0];
 929    dst->f[1] = -src->f[1];
 930    dst->f[2] = -src->f[2];
 931    dst->f[3] = -src->f[3];
 932 }
 933
 934 static void
 935 micro_pow(
 936    union tgsi_exec_channel *dst,
 937    const union tgsi_exec_channel *src0,
 938    const union tgsi_exec_channel *src1 )
 939 {
 940 #if FAST_MATH
 941    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
 942    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
 943    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
 944    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
 945 #else
 946    dst->f[0] = powf( src0->f[0], src1->f[0] );
 947    dst->f[1] = powf( src0->f[1], src1->f[1] );
 948    dst->f[2] = powf( src0->f[2], src1->f[2] );
 949    dst->f[3] = powf( src0->f[3], src1->f[3] );
 950 #endif
 951 }
 952
 953 static void
 954 micro_sqrt( union tgsi_exec_channel *dst,
 955             const union tgsi_exec_channel *src )
 956 {
 957    dst->f[0] = sqrtf( src->f[0] );
 958    dst->f[1] = sqrtf( src->f[1] );
 959    dst->f[2] = sqrtf( src->f[2] );
 960    dst->f[3] = sqrtf( src->f[3] );
 961 }
 962
 963 static void
 964 micro_sub(
 965    union tgsi_exec_channel *dst,
 966    const union tgsi_exec_channel *src0,
 967    const union tgsi_exec_channel *src1 )
 968 {
 969    dst->f[0] = src0->f[0] - src1->f[0];
 970    dst->f[1] = src0->f[1] - src1->f[1];
 971    dst->f[2] = src0->f[2] - src1->f[2];
 972    dst->f[3] = src0->f[3] - src1->f[3];
 973 }
 974
 975 static void
 976 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
 977                        const uint file,
 978                        const uint swizzle,
 979                        const union tgsi_exec_channel *index,
 980                        const union tgsi_exec_channel *index2D,
 981                        union tgsi_exec_channel *chan)
 982 {
 983    uint i;
 984
 985    switch (file) {
 986    case TGSI_FILE_CONSTANT:
 987       for (i = 0; i < QUAD_SIZE; i++) {
 988          assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
 989          assert(mach->Consts[index2D->i[i]]);
 990
 991          if (index->i[i] < 0) {
 992             chan->u[i] = 0;
 993          } else {
 994             const uint *p = (const uint *)mach->Consts[index2D->i[i]];
 995
 996             chan->u[i] = p[index->i[i] * 4 + swizzle];
 997          }
 998       }
 999       break;
1000
1001    case TGSI_FILE_INPUT:
1002    case TGSI_FILE_SYSTEM_VALUE:
1003       for (i = 0; i < QUAD_SIZE; i++) {
1004          /* XXX: 2D indexing */
1005          chan->u[i] = mach->Inputs[index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i]].xyzw[swizzle].u[i];
1006       }
1007       break;
1008
1009    case TGSI_FILE_TEMPORARY:
1010       for (i = 0; i < QUAD_SIZE; i++) {
1011          assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1012          assert(index2D->i[i] == 0);
1013
1014          chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1015       }
1016       break;
1017
1018    case TGSI_FILE_IMMEDIATE:
1019       for (i = 0; i < QUAD_SIZE; i++) {
1020          assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1021          assert(index2D->i[i] == 0);
1022
1023          chan->f[i] = mach->Imms[index->i[i]][swizzle];
1024       }
1025       break;
1026
1027    case TGSI_FILE_ADDRESS:
1028       for (i = 0; i < QUAD_SIZE; i++) {
1029          assert(index->i[i] >= 0);
1030          assert(index2D->i[i] == 0);
1031
1032          chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1033       }
1034       break;
1035
1036    case TGSI_FILE_PREDICATE:
1037       for (i = 0; i < QUAD_SIZE; i++) {
1038          assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1039          assert(index2D->i[i] == 0);
1040
1041          chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1042       }
1043       break;
1044
1045    case TGSI_FILE_OUTPUT:
1046       /* vertex/fragment output vars can be read too */
1047       for (i = 0; i < QUAD_SIZE; i++) {
1048          assert(index->i[i] >= 0);
1049          assert(index2D->i[i] == 0);
1050
1051          chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1052       }
1053       break;
1054
1055    default:
1056       assert(0);
1057       for (i = 0; i < QUAD_SIZE; i++) {
1058          chan->u[i] = 0;
1059       }
1060    }
1061 }
1062
1063 static void
1064 fetch_source(const struct tgsi_exec_machine *mach,
1065              union tgsi_exec_channel *chan,
1066              const struct tgsi_full_src_register *reg,
1067              const uint chan_index,
1068              enum tgsi_exec_datatype src_datatype)
1069 {
1070    union tgsi_exec_channel index;
1071    union tgsi_exec_channel index2D;
1072    uint swizzle;
1073
1074    /* We start with a direct index into a register file.
1075     *
1076     *    file[1],
1077     *    where:
1078     *       file = Register.File
1079     *       [1] = Register.Index
1080     */
1081    index.i[0] =
1082    index.i[1] =
1083    index.i[2] =
1084    index.i[3] = reg->Register.Index;
1085
1086    /* There is an extra source register that indirectly subscripts
1087     * a register file. The direct index now becomes an offset
1088     * that is being added to the indirect register.
1089     *
1090     *    file[ind[2].x+1],
1091     *    where:
1092     *       ind = Indirect.File
1093     *       [2] = Indirect.Index
1094     *       .x = Indirect.SwizzleX
1095     */
1096    if (reg->Register.Indirect) {
1097       union tgsi_exec_channel index2;
1098       union tgsi_exec_channel indir_index;
1099       const uint execmask = mach->ExecMask;
1100       uint i;
1101
1102       /* which address register (always zero now) */
1103       index2.i[0] =
1104       index2.i[1] =
1105       index2.i[2] =
1106       index2.i[3] = reg->Indirect.Index;
1107
1108       /* get current value of address register[swizzle] */
1109       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1110       fetch_src_file_channel(mach,
1111                              reg->Indirect.File,
1112                              swizzle,
1113                              &index2,
1114                              &ZeroVec,
1115                              &indir_index);
1116
1117       /* add value of address register to the offset */
1118       index.i[0] += indir_index.i[0];
1119       index.i[1] += indir_index.i[1];
1120       index.i[2] += indir_index.i[2];
1121       index.i[3] += indir_index.i[3];
1122
1123       /* for disabled execution channels, zero-out the index to
1124        * avoid using a potential garbage value.
1125        */
1126       for (i = 0; i < QUAD_SIZE; i++) {
1127          if ((execmask & (1 << i)) == 0)
1128             index.i[i] = 0;
1129       }
1130    }
1131
1132    /* There is an extra source register that is a second
1133     * subscript to a register file. Effectively it means that
1134     * the register file is actually a 2D array of registers.
1135     *
1136     *    file[3][1],
1137     *    where:
1138     *       [3] = Dimension.Index
1139     */
1140    if (reg->Register.Dimension) {
1141       index2D.i[0] =
1142       index2D.i[1] =
1143       index2D.i[2] =
1144       index2D.i[3] = reg->Dimension.Index;
1145
1146       /* Again, the second subscript index can be addressed indirectly
1147        * identically to the first one.
1148        * Nothing stops us from indirectly addressing the indirect register,
1149        * but there is no need for that, so we won't exercise it.
1150        *
1151        *    file[ind[4].y+3][1],
1152        *    where:
1153        *       ind = DimIndirect.File
1154        *       [4] = DimIndirect.Index
1155        *       .y = DimIndirect.SwizzleX
1156        */
1157       if (reg->Dimension.Indirect) {
1158          union tgsi_exec_channel index2;
1159          union tgsi_exec_channel indir_index;
1160          const uint execmask = mach->ExecMask;
1161          uint i;
1162
1163          index2.i[0] =
1164          index2.i[1] =
1165          index2.i[2] =
1166          index2.i[3] = reg->DimIndirect.Index;
1167
1168          swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1169          fetch_src_file_channel(mach,
1170                                 reg->DimIndirect.File,
1171                                 swizzle,
1172                                 &index2,
1173                                 &ZeroVec,
1174                                 &indir_index);
1175
1176          index2D.i[0] += indir_index.i[0];
1177          index2D.i[1] += indir_index.i[1];
1178          index2D.i[2] += indir_index.i[2];
1179          index2D.i[3] += indir_index.i[3];
1180
1181          /* for disabled execution channels, zero-out the index to
1182           * avoid using a potential garbage value.
1183           */
1184          for (i = 0; i < QUAD_SIZE; i++) {
1185             if ((execmask & (1 << i)) == 0) {
1186                index2D.i[i] = 0;
1187             }
1188          }
1189       }
1190
1191       /* If by any chance there was a need for a 3D array of register
1192        * files, we would have to check whether Dimension is followed
1193        * by a dimension register and continue the saga.
1194        */
1195    } else {
1196       index2D.i[0] =
1197       index2D.i[1] =
1198       index2D.i[2] =
1199       index2D.i[3] = 0;
1200    }
1201
1202    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1203    fetch_src_file_channel(mach,
1204                           reg->Register.File,
1205                           swizzle,
1206                           &index,
1207                           &index2D,
1208                           chan);
1209
1210    if (reg->Register.Absolute) {
1211       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1212          micro_abs(chan, chan);
1213       } else {
1214          micro_iabs(chan, chan);
1215       }
1216    }
1217
1218    if (reg->Register.Negate) {
1219       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1220          micro_neg(chan, chan);
1221       } else {
1222          micro_ineg(chan, chan);
1223       }
1224    }
1225 }
1226
1227 static void
1228 store_dest(struct tgsi_exec_machine *mach,
1229            const union tgsi_exec_channel *chan,
1230            const struct tgsi_full_dst_register *reg,
1231            const struct tgsi_full_instruction *inst,
1232            uint chan_index,
1233            enum tgsi_exec_datatype dst_datatype)
1234 {
1235    uint i;
1236    union tgsi_exec_channel null;
1237    union tgsi_exec_channel *dst;
1238    uint execmask = mach->ExecMask;
1239    int offset = 0;  /* indirection offset */
1240    int index;
1241
1242    /* for debugging */
1243    if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1244       check_inf_or_nan(chan);
1245    }
1246
1247    /* There is an extra source register that indirectly subscripts
1248     * a register file. The direct index now becomes an offset
1249     * that is being added to the indirect register.
1250     *
1251     *    file[ind[2].x+1],
1252     *    where:
1253     *       ind = Indirect.File
1254     *       [2] = Indirect.Index
1255     *       .x = Indirect.SwizzleX
1256     */
1257    if (reg->Register.Indirect) {
1258       union tgsi_exec_channel index;
1259       union tgsi_exec_channel indir_index;
1260       uint swizzle;
1261
1262       /* which address register (always zero for now) */
1263       index.i[0] =
1264       index.i[1] =
1265       index.i[2] =
1266       index.i[3] = reg->Indirect.Index;
1267
1268       /* get current value of address register[swizzle] */
1269       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1270
1271       /* fetch values from the address/indirection register */
1272       fetch_src_file_channel(mach,
1273                              reg->Indirect.File,
1274                              swizzle,
1275                              &index,
1276                              &ZeroVec,
1277                              &indir_index);
1278
1279       /* save indirection offset */
1280       offset = indir_index.i[0];
1281    }
1282
1283    switch (reg->Register.File) {
1284    case TGSI_FILE_NULL:
1285       dst = &null;
1286       break;
1287
1288    case TGSI_FILE_OUTPUT:
1289       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1290          + reg->Register.Index;
1291       dst = &mach->Outputs[offset + index].xyzw[chan_index];
1292 #if 0
1293       if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1294          fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1295          for (i = 0; i < QUAD_SIZE; i++)
1296             if (execmask & (1 << i))
1297                fprintf(stderr, "%f, ", chan->f[i]);
1298          fprintf(stderr, ")\n");
1299       }
1300 #endif
1301       break;
1302
1303    case TGSI_FILE_TEMPORARY:
1304       index = reg->Register.Index;
1305       assert( index < TGSI_EXEC_NUM_TEMPS );
1306       dst = &mach->Temps[offset + index].xyzw[chan_index];
1307       break;
1308
1309    case TGSI_FILE_ADDRESS:
1310       index = reg->Register.Index;
1311       dst = &mach->Addrs[index].xyzw[chan_index];
1312       break;
1313
1314    case TGSI_FILE_LOOP:
1315       assert(reg->Register.Index == 0);
1316       assert(mach->LoopCounterStackTop > 0);
1317       assert(chan_index == CHAN_X);
1318       dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1319       break;
1320
1321    case TGSI_FILE_PREDICATE:
1322       index = reg->Register.Index;
1323       assert(index < TGSI_EXEC_NUM_PREDS);
1324       dst = &mach->Predicates[index].xyzw[chan_index];
1325       break;
1326
1327    default:
1328       assert( 0 );
1329       return;
1330    }
1331
1332    if (inst->Instruction.Predicate) {
1333       uint swizzle;
1334       union tgsi_exec_channel *pred;
1335
1336       switch (chan_index) {
1337       case CHAN_X:
1338          swizzle = inst->Predicate.SwizzleX;
1339          break;
1340       case CHAN_Y:
1341          swizzle = inst->Predicate.SwizzleY;
1342          break;
1343       case CHAN_Z:
1344          swizzle = inst->Predicate.SwizzleZ;
1345          break;
1346       case CHAN_W:
1347          swizzle = inst->Predicate.SwizzleW;
1348          break;
1349       default:
1350          assert(0);
1351          return;
1352       }
1353
1354       assert(inst->Predicate.Index == 0);
1355
1356       pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1357
1358       if (inst->Predicate.Negate) {
1359          for (i = 0; i < QUAD_SIZE; i++) {
1360             if (pred->u[i]) {
1361                execmask &= ~(1 << i);
1362             }
1363          }
1364       } else {
1365          for (i = 0; i < QUAD_SIZE; i++) {
1366             if (!pred->u[i]) {
1367                execmask &= ~(1 << i);
1368             }
1369          }
1370       }
1371    }
1372
1373    switch (inst->Instruction.Saturate) {
1374    case TGSI_SAT_NONE:
1375       for (i = 0; i < QUAD_SIZE; i++)
1376          if (execmask & (1 << i))
1377             dst->i[i] = chan->i[i];
1378       break;
1379
1380    case TGSI_SAT_ZERO_ONE:
1381       for (i = 0; i < QUAD_SIZE; i++)
1382          if (execmask & (1 << i)) {
1383             if (chan->f[i] < 0.0f)
1384                dst->f[i] = 0.0f;
1385             else if (chan->f[i] > 1.0f)
1386                dst->f[i] = 1.0f;
1387             else
1388                dst->i[i] = chan->i[i];
1389          }
1390       break;
1391
1392    case TGSI_SAT_MINUS_PLUS_ONE:
1393       for (i = 0; i < QUAD_SIZE; i++)
1394          if (execmask & (1 << i)) {
1395             if (chan->f[i] < -1.0f)
1396                dst->f[i] = -1.0f;
1397             else if (chan->f[i] > 1.0f)
1398                dst->f[i] = 1.0f;
1399             else
1400                dst->i[i] = chan->i[i];
1401          }
1402       break;
1403
1404    default:
1405       assert( 0 );
1406    }
1407 }
1408
/* Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL as float data. Expands to a fetch_source() call and relies on
 * `mach` and `inst` being in scope at the point of use.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/* Store VAL to channel CHAN of destination operand INDEX of the current
 * instruction as float data. Expands to a store_dest() call and relies on
 * `mach` and `inst` being in scope at the point of use.
 */
#define STORE(VAL,INDEX,CHAN)\
   store_dest(mach, VAL, &inst->Dst[INDEX], inst, CHAN, TGSI_EXEC_DATA_FLOAT)
1414
1415
1416 /**
1417  * Execute ARB-style KIL which is predicated by a src register.
1418  * Kill fragment if any of the four values is less than zero.
1419  */
1420 static void
1421 exec_kil(struct tgsi_exec_machine *mach,
1422          const struct tgsi_full_instruction *inst)
1423 {
1424    uint uniquemask;
1425    uint chan_index;
1426    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1427    union tgsi_exec_channel r[1];
1428
1429    /* This mask stores component bits that were already tested. */
1430    uniquemask = 0;
1431
1432    for (chan_index = 0; chan_index < 4; chan_index++)
1433    {
1434       uint swizzle;
1435       uint i;
1436
1437       /* unswizzle channel */
1438       swizzle = tgsi_util_get_full_src_register_swizzle (
1439                         &inst->Src[0],
1440                         chan_index);
1441
1442       /* check if the component has not been already tested */
1443       if (uniquemask & (1 << swizzle))
1444          continue;
1445       uniquemask |= 1 << swizzle;
1446
1447       FETCH(&r[0], 0, chan_index);
1448       for (i = 0; i < 4; i++)
1449          if (r[0].f[i] < 0.0f)
1450             kilmask |= 1 << i;
1451    }
1452
1453    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1454 }
1455
1456 /**
1457  * Execute NVIDIA-style KIL which is predicated by a condition code.
1458  * Kill fragment if the condition code is TRUE.
1459  */
1460 static void
1461 exec_kilp(struct tgsi_exec_machine *mach,
1462           const struct tgsi_full_instruction *inst)
1463 {
1464    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1465
1466    /* "unconditional" kil */
1467    kilmask = mach->ExecMask;
1468    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1469 }
1470
1471 static void
1472 emit_vertex(struct tgsi_exec_machine *mach)
1473 {
1474    /* FIXME: check for exec mask correctly
1475    unsigned i;
1476    for (i = 0; i < QUAD_SIZE; ++i) {
1477          if ((mach->ExecMask & (1 << i)))
1478    */
1479    if (mach->ExecMask) {
1480       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1481       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1482    }
1483 }
1484
1485 static void
1486 emit_primitive(struct tgsi_exec_machine *mach)
1487 {
1488    unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1489    /* FIXME: check for exec mask correctly
1490    unsigned i;
1491    for (i = 0; i < QUAD_SIZE; ++i) {
1492          if ((mach->ExecMask & (1 << i)))
1493    */
1494    if (mach->ExecMask) {
1495       ++(*prim_count);
1496       debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1497       mach->Primitives[*prim_count] = 0;
1498    }
1499 }
1500
1501 /*
1502  * Fetch four texture samples using STR texture coordinates.
1503  */
1504 static void
1505 fetch_texel( struct tgsi_sampler *sampler,
1506              const union tgsi_exec_channel *s,
1507              const union tgsi_exec_channel *t,
1508              const union tgsi_exec_channel *p,
1509              const union tgsi_exec_channel *c0,
1510              enum tgsi_sampler_control control,
1511              union tgsi_exec_channel *r,
1512              union tgsi_exec_channel *g,
1513              union tgsi_exec_channel *b,
1514              union tgsi_exec_channel *a )
1515 {
1516    uint j;
1517    float rgba[NUM_CHANNELS][QUAD_SIZE];
1518
1519    sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1520
1521    for (j = 0; j < 4; j++) {
1522       r->f[j] = rgba[0][j];
1523       g->f[j] = rgba[1][j];
1524       b->f[j] = rgba[2][j];
1525       a->f[j] = rgba[3][j];
1526    }
1527 }
1528
1529
/* Values for the `modifier` argument of exec_tex():
 *   NONE          - the W coordinate is not fetched; LOD input is zero
 *   PROJECTED     - fetched coordinates are divided by W before sampling
 *   LOD_BIAS      - W is passed to the sampler with tgsi_sampler_lod_bias
 *   EXPLICIT_LOD  - W is passed to the sampler with tgsi_sampler_lod_explicit
 */
#define TEX_MODIFIER_NONE           0
#define TEX_MODIFIER_PROJECTED      1
#define TEX_MODIFIER_LOD_BIAS       2
#define TEX_MODIFIER_EXPLICIT_LOD   3
1534
1535
1536 static void
1537 exec_tex(struct tgsi_exec_machine *mach,
1538          const struct tgsi_full_instruction *inst,
1539          uint modifier)
1540 {
1541    const uint unit = inst->Src[1].Register.Index;
1542    union tgsi_exec_channel r[4];
1543    const union tgsi_exec_channel *lod = &ZeroVec;
1544    enum tgsi_sampler_control control;
1545    uint chan_index;
1546
1547    if (modifier != TEX_MODIFIER_NONE) {
1548       FETCH(&r[3], 0, CHAN_W);
1549       if (modifier != TEX_MODIFIER_PROJECTED) {
1550          lod = &r[3];
1551       }
1552    }
1553
1554    if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1555       control = tgsi_sampler_lod_explicit;
1556    } else {
1557       control = tgsi_sampler_lod_bias;
1558    }
1559
1560    switch (inst->Texture.Texture) {
1561    case TGSI_TEXTURE_1D:
1562    case TGSI_TEXTURE_SHADOW1D:
1563       FETCH(&r[0], 0, CHAN_X);
1564
1565       if (modifier == TEX_MODIFIER_PROJECTED) {
1566          micro_div(&r[0], &r[0], &r[3]);
1567       }
1568
1569       fetch_texel(mach->Samplers[unit],
1570                   &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1571                   control,
1572                   &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1573       break;
1574
1575    case TGSI_TEXTURE_2D:
1576    case TGSI_TEXTURE_RECT:
1577    case TGSI_TEXTURE_SHADOW2D:
1578    case TGSI_TEXTURE_SHADOWRECT:
1579       FETCH(&r[0], 0, CHAN_X);
1580       FETCH(&r[1], 0, CHAN_Y);
1581       FETCH(&r[2], 0, CHAN_Z);
1582
1583       if (modifier == TEX_MODIFIER_PROJECTED) {
1584          micro_div(&r[0], &r[0], &r[3]);
1585          micro_div(&r[1], &r[1], &r[3]);
1586          micro_div(&r[2], &r[2], &r[3]);
1587       }
1588
1589       fetch_texel(mach->Samplers[unit],
1590                   &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1591                   control,
1592                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1593       break;
1594
1595    case TGSI_TEXTURE_3D:
1596    case TGSI_TEXTURE_CUBE:
1597       FETCH(&r[0], 0, CHAN_X);
1598       FETCH(&r[1], 0, CHAN_Y);
1599       FETCH(&r[2], 0, CHAN_Z);
1600
1601       if (modifier == TEX_MODIFIER_PROJECTED) {
1602          micro_div(&r[0], &r[0], &r[3]);
1603          micro_div(&r[1], &r[1], &r[3]);
1604          micro_div(&r[2], &r[2], &r[3]);
1605       }
1606
1607       fetch_texel(mach->Samplers[unit],
1608                   &r[0], &r[1], &r[2], lod,
1609                   control,
1610                   &r[0], &r[1], &r[2], &r[3]);
1611       break;
1612
1613    default:
1614       assert(0);
1615    }
1616
1617    FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1618       STORE(&r[chan_index], 0, chan_index);
1619    }
1620 }
1621
1622 static void
1623 exec_txd(struct tgsi_exec_machine *mach,
1624          const struct tgsi_full_instruction *inst)
1625 {
1626    const uint unit = inst->Src[3].Register.Index;
1627    union tgsi_exec_channel r[4];
1628    uint chan_index;
1629
1630    /*
1631     * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1632     */
1633
1634    switch (inst->Texture.Texture) {
1635    case TGSI_TEXTURE_1D:
1636    case TGSI_TEXTURE_SHADOW1D:
1637
1638       FETCH(&r[0], 0, CHAN_X);
1639
1640       fetch_texel(mach->Samplers[unit],
1641                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1642                   tgsi_sampler_lod_bias,
1643                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1644       break;
1645
1646    case TGSI_TEXTURE_2D:
1647    case TGSI_TEXTURE_RECT:
1648    case TGSI_TEXTURE_SHADOW2D:
1649    case TGSI_TEXTURE_SHADOWRECT:
1650
1651       FETCH(&r[0], 0, CHAN_X);
1652       FETCH(&r[1], 0, CHAN_Y);
1653       FETCH(&r[2], 0, CHAN_Z);
1654
1655       fetch_texel(mach->Samplers[unit],
1656                   &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1657                   tgsi_sampler_lod_bias,
1658                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1659       break;
1660
1661    case TGSI_TEXTURE_3D:
1662    case TGSI_TEXTURE_CUBE:
1663
1664       FETCH(&r[0], 0, CHAN_X);
1665       FETCH(&r[1], 0, CHAN_Y);
1666       FETCH(&r[2], 0, CHAN_Z);
1667
1668       fetch_texel(mach->Samplers[unit],
1669                   &r[0], &r[1], &r[2], &ZeroVec,
1670                   tgsi_sampler_lod_bias,
1671                   &r[0], &r[1], &r[2], &r[3]);
1672       break;
1673
1674    default:
1675       assert(0);
1676    }
1677
1678    FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1679       STORE(&r[chan_index], 0, chan_index);
1680    }
1681 }
1682
1683
1684 /**
1685  * Evaluate a constant-valued coefficient at the position of the
1686  * current quad.
1687  */
1688 static void
1689 eval_constant_coef(
1690    struct tgsi_exec_machine *mach,
1691    unsigned attrib,
1692    unsigned chan )
1693 {
1694    unsigned i;
1695
1696    for( i = 0; i < QUAD_SIZE; i++ ) {
1697       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1698    }
1699 }
1700
1701 /**
1702  * Evaluate a linear-valued coefficient at the position of the
1703  * current quad.
1704  */
1705 static void
1706 eval_linear_coef(
1707    struct tgsi_exec_machine *mach,
1708    unsigned attrib,
1709    unsigned chan )
1710 {
1711    const float x = mach->QuadPos.xyzw[0].f[0];
1712    const float y = mach->QuadPos.xyzw[1].f[0];
1713    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1714    const float dady = mach->InterpCoefs[attrib].dady[chan];
1715    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1716    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1717    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1718    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1719    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1720 }
1721
1722 /**
1723  * Evaluate a perspective-valued coefficient at the position of the
1724  * current quad.
1725  */
1726 static void
1727 eval_perspective_coef(
1728    struct tgsi_exec_machine *mach,
1729    unsigned attrib,
1730    unsigned chan )
1731 {
1732    const float x = mach->QuadPos.xyzw[0].f[0];
1733    const float y = mach->QuadPos.xyzw[1].f[0];
1734    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1735    const float dady = mach->InterpCoefs[attrib].dady[chan];
1736    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1737    const float *w = mach->QuadPos.xyzw[3].f;
1738    /* divide by W here */
1739    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1740    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1741    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1742    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1743 }
1744
1745
/**
 * Signature shared by the eval_*_coef() interpolation helpers:
 * evaluate channel `chan` of input attribute `attrib` into mach->Inputs.
 */
typedef void (* eval_coef_func)(
   struct tgsi_exec_machine *mach,
   unsigned attrib,
   unsigned chan );
1750
1751 static void
1752 exec_declaration(struct tgsi_exec_machine *mach,
1753                  const struct tgsi_full_declaration *decl)
1754 {
1755    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1756       if (decl->Declaration.File == TGSI_FILE_INPUT ||
1757           decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1758          uint first, last, mask;
1759
1760          first = decl->Range.First;
1761          last = decl->Range.Last;
1762          mask = decl->Declaration.UsageMask;
1763
1764          if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1765             uint i;
1766
1767             assert(decl->Semantic.Index == 0);
1768             assert(first == last);
1769
1770             for (i = 0; i < QUAD_SIZE; i++) {
1771                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1772             }
1773          } else {
1774             eval_coef_func eval;
1775             uint i, j;
1776
1777             switch (decl->Declaration.Interpolate) {
1778             case TGSI_INTERPOLATE_CONSTANT:
1779                eval = eval_constant_coef;
1780                break;
1781
1782             case TGSI_INTERPOLATE_LINEAR:
1783                eval = eval_linear_coef;
1784                break;
1785
1786             case TGSI_INTERPOLATE_PERSPECTIVE:
1787                eval = eval_perspective_coef;
1788                break;
1789
1790             default:
1791                assert(0);
1792                return;
1793             }
1794
1795             for (j = 0; j < NUM_CHANNELS; j++) {
1796                if (mask & (1 << j)) {
1797                   for (i = first; i <= last; i++) {
1798                      eval(mach, i, j);
1799                   }
1800                }
1801             }
1802          }
1803       }
1804    }
1805 }
1806
/**
 * Signature of a per-channel micro operation: compute `dst` from `src`.
 * The exec_vector_binary/trinary helpers pass an array of two or three
 * source channels through the same `src` pointer.
 */
typedef void (* micro_op)(union tgsi_exec_channel *dst,
                          const union tgsi_exec_channel *src);
1809
1810 static void
1811 exec_scalar_unary(struct tgsi_exec_machine *mach,
1812                   const struct tgsi_full_instruction *inst,
1813                   micro_op op,
1814                   enum tgsi_exec_datatype dst_datatype,
1815                   enum tgsi_exec_datatype src_datatype)
1816 {
1817    unsigned int chan;
1818    union tgsi_exec_channel src;
1819    union tgsi_exec_channel dst;
1820
1821    fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
1822    op(&dst, &src);
1823    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1824       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1825          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
1826       }
1827    }
1828 }
1829
1830 static void
1831 exec_vector_unary(struct tgsi_exec_machine *mach,
1832                   const struct tgsi_full_instruction *inst,
1833                   micro_op op,
1834                   enum tgsi_exec_datatype dst_datatype,
1835                   enum tgsi_exec_datatype src_datatype)
1836 {
1837    unsigned int chan;
1838    struct tgsi_exec_vector dst;
1839
1840    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1841       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1842          union tgsi_exec_channel src;
1843
1844          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
1845          op(&dst.xyzw[chan], &src);
1846       }
1847    }
1848    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1849       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1850          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1851       }
1852    }
1853 }
1854
1855 static void
1856 exec_vector_binary(struct tgsi_exec_machine *mach,
1857                    const struct tgsi_full_instruction *inst,
1858                    micro_op op,
1859                    enum tgsi_exec_datatype dst_datatype,
1860                    enum tgsi_exec_datatype src_datatype)
1861 {
1862    unsigned int chan;
1863    struct tgsi_exec_vector dst;
1864
1865    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1866       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1867          union tgsi_exec_channel src[2];
1868
1869          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1870          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1871          op(&dst.xyzw[chan], src);
1872       }
1873    }
1874    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1875       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1876          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1877       }
1878    }
1879 }
1880
1881 static void
1882 exec_vector_trinary(struct tgsi_exec_machine *mach,
1883                     const struct tgsi_full_instruction *inst,
1884                     micro_op op,
1885                     enum tgsi_exec_datatype dst_datatype,
1886                     enum tgsi_exec_datatype src_datatype)
1887 {
1888    unsigned int chan;
1889    struct tgsi_exec_vector dst;
1890
1891    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1892       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1893          union tgsi_exec_channel src[3];
1894
1895          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1896          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1897          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
1898          op(&dst.xyzw[chan], src);
1899       }
1900    }
1901    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1902       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1903          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1904       }
1905    }
1906 }
1907
1908 static void
1909 exec_dp3(struct tgsi_exec_machine *mach,
1910          const struct tgsi_full_instruction *inst)
1911 {
1912    unsigned int chan;
1913    union tgsi_exec_channel arg[3];
1914
1915    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1916    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1917    micro_mul(&arg[2], &arg[0], &arg[1]);
1918
1919    for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
1920       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1921       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1922       micro_mad(&arg[2], arg);
1923    }
1924
1925    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1926       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1927          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1928       }
1929    }
1930 }
1931
1932 static void
1933 exec_dp4(struct tgsi_exec_machine *mach,
1934          const struct tgsi_full_instruction *inst)
1935 {
1936    unsigned int chan;
1937    union tgsi_exec_channel arg[3];
1938
1939    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1940    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1941    micro_mul(&arg[2], &arg[0], &arg[1]);
1942
1943    for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
1944       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1945       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1946       micro_mad(&arg[2], arg);
1947    }
1948
1949    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1950       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1951          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1952       }
1953    }
1954 }
1955
1956 static void
1957 exec_dp2a(struct tgsi_exec_machine *mach,
1958           const struct tgsi_full_instruction *inst)
1959 {
1960    unsigned int chan;
1961    union tgsi_exec_channel arg[3];
1962
1963    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1964    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1965    micro_mul(&arg[2], &arg[0], &arg[1]);
1966
1967    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1968    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1969    micro_mad(&arg[0], arg);
1970
1971    fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1972    micro_add(&arg[0], &arg[0], &arg[1]);
1973
1974    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1975       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1976          store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1977       }
1978    }
1979 }
1980
1981 static void
1982 exec_dph(struct tgsi_exec_machine *mach,
1983          const struct tgsi_full_instruction *inst)
1984 {
1985    unsigned int chan;
1986    union tgsi_exec_channel arg[3];
1987
1988    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1989    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1990    micro_mul(&arg[2], &arg[0], &arg[1]);
1991
1992    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1993    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1994    micro_mad(&arg[2], arg);
1995
1996    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
1997    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
1998    micro_mad(&arg[0], arg);
1999
2000    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2001    micro_add(&arg[0], &arg[0], &arg[1]);
2002
2003    for (chan = 0; chan < NUM_CHANNELS; chan++) {
2004       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2005          store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2006       }
2007    }
2008 }
2009
2010 static void
2011 exec_dp2(struct tgsi_exec_machine *mach,
2012          const struct tgsi_full_instruction *inst)
2013 {
2014    unsigned int chan;
2015    union tgsi_exec_channel arg[3];
2016
2017    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2018    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2019    micro_mul(&arg[2], &arg[0], &arg[1]);
2020
2021    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2022    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2023    micro_mad(&arg[2], arg);
2024
2025    for (chan = 0; chan < NUM_CHANNELS; chan++) {
2026       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2027          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2028       }
2029    }
2030 }
2031
2032 static void
2033 exec_break(struct tgsi_exec_machine *mach)
2034 {
2035    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2036       /* turn off loop channels for each enabled exec channel */
2037       mach->LoopMask &= ~mach->ExecMask;
2038       /* Todo: if mach->LoopMask == 0, jump to end of loop */
2039       UPDATE_EXEC_MASK(mach);
2040    } else {
2041       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2042
2043       mach->Switch.mask = 0x0;
2044
2045       UPDATE_EXEC_MASK(mach);
2046    }
2047 }
2048
2049 static void
2050 exec_switch(struct tgsi_exec_machine *mach,
2051             const struct tgsi_full_instruction *inst)
2052 {
2053    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2054    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2055
2056    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2057    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2058    mach->Switch.mask = 0x0;
2059    mach->Switch.defaultMask = 0x0;
2060
2061    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2062    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2063
2064    UPDATE_EXEC_MASK(mach);
2065 }
2066
2067 static void
2068 exec_case(struct tgsi_exec_machine *mach,
2069           const struct tgsi_full_instruction *inst)
2070 {
2071    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2072    union tgsi_exec_channel src;
2073    uint mask = 0;
2074
2075    fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2076
2077    if (mach->Switch.selector.u[0] == src.u[0]) {
2078       mask |= 0x1;
2079    }
2080    if (mach->Switch.selector.u[1] == src.u[1]) {
2081       mask |= 0x2;
2082    }
2083    if (mach->Switch.selector.u[2] == src.u[2]) {
2084       mask |= 0x4;
2085    }
2086    if (mach->Switch.selector.u[3] == src.u[3]) {
2087       mask |= 0x8;
2088    }
2089
2090    mach->Switch.defaultMask |= mask;
2091
2092    mach->Switch.mask |= mask & prevMask;
2093
2094    UPDATE_EXEC_MASK(mach);
2095 }
2096
2097 static void
2098 exec_default(struct tgsi_exec_machine *mach)
2099 {
2100    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2101
2102    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2103
2104    UPDATE_EXEC_MASK(mach);
2105 }
2106
2107 static void
2108 exec_endswitch(struct tgsi_exec_machine *mach)
2109 {
2110    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
2111    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2112
2113    UPDATE_EXEC_MASK(mach);
2114 }
2115
2116 static void
2117 micro_i2f(union tgsi_exec_channel *dst,
2118           const union tgsi_exec_channel *src)
2119 {
2120    dst->f[0] = (float)src->i[0];
2121    dst->f[1] = (float)src->i[1];
2122    dst->f[2] = (float)src->i[2];
2123    dst->f[3] = (float)src->i[3];
2124 }
2125
2126 static void
2127 micro_not(union tgsi_exec_channel *dst,
2128           const union tgsi_exec_channel *src)
2129 {
2130    dst->u[0] = ~src->u[0];
2131    dst->u[1] = ~src->u[1];
2132    dst->u[2] = ~src->u[2];
2133    dst->u[3] = ~src->u[3];
2134 }
2135
2136 static void
2137 micro_shl(union tgsi_exec_channel *dst,
2138           const union tgsi_exec_channel *src)
2139 {
2140    dst->u[0] = src[0].u[0] << src[1].u[0];
2141    dst->u[1] = src[0].u[1] << src[1].u[1];
2142    dst->u[2] = src[0].u[2] << src[1].u[2];
2143    dst->u[3] = src[0].u[3] << src[1].u[3];
2144 }
2145
2146 static void
2147 micro_and(union tgsi_exec_channel *dst,
2148           const union tgsi_exec_channel *src)
2149 {
2150    dst->u[0] = src[0].u[0] & src[1].u[0];
2151    dst->u[1] = src[0].u[1] & src[1].u[1];
2152    dst->u[2] = src[0].u[2] & src[1].u[2];
2153    dst->u[3] = src[0].u[3] & src[1].u[3];
2154 }
2155
2156 static void
2157 micro_or(union tgsi_exec_channel *dst,
2158          const union tgsi_exec_channel *src)
2159 {
2160    dst->u[0] = src[0].u[0] | src[1].u[0];
2161    dst->u[1] = src[0].u[1] | src[1].u[1];
2162    dst->u[2] = src[0].u[2] | src[1].u[2];
2163    dst->u[3] = src[0].u[3] | src[1].u[3];
2164 }
2165
2166 static void
2167 micro_xor(union tgsi_exec_channel *dst,
2168           const union tgsi_exec_channel *src)
2169 {
2170    dst->u[0] = src[0].u[0] ^ src[1].u[0];
2171    dst->u[1] = src[0].u[1] ^ src[1].u[1];
2172    dst->u[2] = src[0].u[2] ^ src[1].u[2];
2173    dst->u[3] = src[0].u[3] ^ src[1].u[3];
2174 }
2175
2176 static void
2177 micro_f2i(union tgsi_exec_channel *dst,
2178           const union tgsi_exec_channel *src)
2179 {
2180    dst->i[0] = (int)src->f[0];
2181    dst->i[1] = (int)src->f[1];
2182    dst->i[2] = (int)src->f[2];
2183    dst->i[3] = (int)src->f[3];
2184 }
2185
2186 static void
2187 micro_idiv(union tgsi_exec_channel *dst,
2188            const union tgsi_exec_channel *src)
2189 {
2190    dst->i[0] = src[0].i[0] / src[1].i[0];
2191    dst->i[1] = src[0].i[1] / src[1].i[1];
2192    dst->i[2] = src[0].i[2] / src[1].i[2];
2193    dst->i[3] = src[0].i[3] / src[1].i[3];
2194 }
2195
2196 static void
2197 micro_imax(union tgsi_exec_channel *dst,
2198            const union tgsi_exec_channel *src)
2199 {
2200    dst->i[0] = src[0].i[0] > src[1].i[0] ? src[0].i[0] : src[1].i[0];
2201    dst->i[1] = src[0].i[1] > src[1].i[1] ? src[0].i[1] : src[1].i[1];
2202    dst->i[2] = src[0].i[2] > src[1].i[2] ? src[0].i[2] : src[1].i[2];
2203    dst->i[3] = src[0].i[3] > src[1].i[3] ? src[0].i[3] : src[1].i[3];
2204 }
2205
2206 static void
2207 micro_imin(union tgsi_exec_channel *dst,
2208            const union tgsi_exec_channel *src)
2209 {
2210    dst->i[0] = src[0].i[0] < src[1].i[0] ? src[0].i[0] : src[1].i[0];
2211    dst->i[1] = src[0].i[1] < src[1].i[1] ? src[0].i[1] : src[1].i[1];
2212    dst->i[2] = src[0].i[2] < src[1].i[2] ? src[0].i[2] : src[1].i[2];
2213    dst->i[3] = src[0].i[3] < src[1].i[3] ? src[0].i[3] : src[1].i[3];
2214 }
2215
2216 static void
2217 micro_isge(union tgsi_exec_channel *dst,
2218            const union tgsi_exec_channel *src)
2219 {
2220    dst->i[0] = src[0].i[0] >= src[1].i[0] ? -1 : 0;
2221    dst->i[1] = src[0].i[1] >= src[1].i[1] ? -1 : 0;
2222    dst->i[2] = src[0].i[2] >= src[1].i[2] ? -1 : 0;
2223    dst->i[3] = src[0].i[3] >= src[1].i[3] ? -1 : 0;
2224 }
2225
2226 static void
2227 micro_ishr(union tgsi_exec_channel *dst,
2228            const union tgsi_exec_channel *src)
2229 {
2230    dst->i[0] = src[0].i[0] >> src[1].i[0];
2231    dst->i[1] = src[0].i[1] >> src[1].i[1];
2232    dst->i[2] = src[0].i[2] >> src[1].i[2];
2233    dst->i[3] = src[0].i[3] >> src[1].i[3];
2234 }
2235
2236 static void
2237 micro_islt(union tgsi_exec_channel *dst,
2238            const union tgsi_exec_channel *src)
2239 {
2240    dst->i[0] = src[0].i[0] < src[1].i[0] ? -1 : 0;
2241    dst->i[1] = src[0].i[1] < src[1].i[1] ? -1 : 0;
2242    dst->i[2] = src[0].i[2] < src[1].i[2] ? -1 : 0;
2243    dst->i[3] = src[0].i[3] < src[1].i[3] ? -1 : 0;
2244 }
2245
2246 static void
2247 micro_f2u(union tgsi_exec_channel *dst,
2248           const union tgsi_exec_channel *src)
2249 {
2250    dst->u[0] = (uint)src->f[0];
2251    dst->u[1] = (uint)src->f[1];
2252    dst->u[2] = (uint)src->f[2];
2253    dst->u[3] = (uint)src->f[3];
2254 }
2255
2256 static void
2257 micro_u2f(union tgsi_exec_channel *dst,
2258           const union tgsi_exec_channel *src)
2259 {
2260    dst->f[0] = (float)src->u[0];
2261    dst->f[1] = (float)src->u[1];
2262    dst->f[2] = (float)src->u[2];
2263    dst->f[3] = (float)src->u[3];
2264 }
2265
2266 static void
2267 micro_uadd(union tgsi_exec_channel *dst,
2268            const union tgsi_exec_channel *src)
2269 {
2270    dst->u[0] = src[0].u[0] + src[1].u[0];
2271    dst->u[1] = src[0].u[1] + src[1].u[1];
2272    dst->u[2] = src[0].u[2] + src[1].u[2];
2273    dst->u[3] = src[0].u[3] + src[1].u[3];
2274 }
2275
2276 static void
2277 micro_udiv(union tgsi_exec_channel *dst,
2278            const union tgsi_exec_channel *src)
2279 {
2280    dst->u[0] = src[0].u[0] / src[1].u[0];
2281    dst->u[1] = src[0].u[1] / src[1].u[1];
2282    dst->u[2] = src[0].u[2] / src[1].u[2];
2283    dst->u[3] = src[0].u[3] / src[1].u[3];
2284 }
2285
2286 static void
2287 micro_umad(union tgsi_exec_channel *dst,
2288            const union tgsi_exec_channel *src)
2289 {
2290    dst->u[0] = src[0].u[0] * src[1].u[0] + src[2].u[0];
2291    dst->u[1] = src[0].u[1] * src[1].u[1] + src[2].u[1];
2292    dst->u[2] = src[0].u[2] * src[1].u[2] + src[2].u[2];
2293    dst->u[3] = src[0].u[3] * src[1].u[3] + src[2].u[3];
2294 }
2295
2296 static void
2297 micro_umax(union tgsi_exec_channel *dst,
2298            const union tgsi_exec_channel *src)
2299 {
2300    dst->u[0] = src[0].u[0] > src[1].u[0] ? src[0].u[0] : src[1].u[0];
2301    dst->u[1] = src[0].u[1] > src[1].u[1] ? src[0].u[1] : src[1].u[1];
2302    dst->u[2] = src[0].u[2] > src[1].u[2] ? src[0].u[2] : src[1].u[2];
2303    dst->u[3] = src[0].u[3] > src[1].u[3] ? src[0].u[3] : src[1].u[3];
2304 }
2305
2306 static void
2307 micro_umin(union tgsi_exec_channel *dst,
2308            const union tgsi_exec_channel *src)
2309 {
2310    dst->u[0] = src[0].u[0] < src[1].u[0] ? src[0].u[0] : src[1].u[0];
2311    dst->u[1] = src[0].u[1] < src[1].u[1] ? src[0].u[1] : src[1].u[1];
2312    dst->u[2] = src[0].u[2] < src[1].u[2] ? src[0].u[2] : src[1].u[2];
2313    dst->u[3] = src[0].u[3] < src[1].u[3] ? src[0].u[3] : src[1].u[3];
2314 }
2315
2316 static void
2317 micro_umod(union tgsi_exec_channel *dst,
2318            const union tgsi_exec_channel *src)
2319 {
2320    dst->u[0] = src[0].u[0] % src[1].u[0];
2321    dst->u[1] = src[0].u[1] % src[1].u[1];
2322    dst->u[2] = src[0].u[2] % src[1].u[2];
2323    dst->u[3] = src[0].u[3] % src[1].u[3];
2324 }
2325
2326 static void
2327 micro_umul(union tgsi_exec_channel *dst,
2328            const union tgsi_exec_channel *src)
2329 {
2330    dst->u[0] = src[0].u[0] * src[1].u[0];
2331    dst->u[1] = src[0].u[1] * src[1].u[1];
2332    dst->u[2] = src[0].u[2] * src[1].u[2];
2333    dst->u[3] = src[0].u[3] * src[1].u[3];
2334 }
2335
2336 static void
2337 micro_useq(union tgsi_exec_channel *dst,
2338            const union tgsi_exec_channel *src)
2339 {
2340    dst->u[0] = src[0].u[0] == src[1].u[0] ? ~0 : 0;
2341    dst->u[1] = src[0].u[1] == src[1].u[1] ? ~0 : 0;
2342    dst->u[2] = src[0].u[2] == src[1].u[2] ? ~0 : 0;
2343    dst->u[3] = src[0].u[3] == src[1].u[3] ? ~0 : 0;
2344 }
2345
2346 static void
2347 micro_usge(union tgsi_exec_channel *dst,
2348            const union tgsi_exec_channel *src)
2349 {
2350    dst->u[0] = src[0].u[0] >= src[1].u[0] ? ~0 : 0;
2351    dst->u[1] = src[0].u[1] >= src[1].u[1] ? ~0 : 0;
2352    dst->u[2] = src[0].u[2] >= src[1].u[2] ? ~0 : 0;
2353    dst->u[3] = src[0].u[3] >= src[1].u[3] ? ~0 : 0;
2354 }
2355
2356 static void
2357 micro_ushr(union tgsi_exec_channel *dst,
2358            const union tgsi_exec_channel *src)
2359 {
2360    dst->u[0] = src[0].u[0] >> src[1].u[0];
2361    dst->u[1] = src[0].u[1] >> src[1].u[1];
2362    dst->u[2] = src[0].u[2] >> src[1].u[2];
2363    dst->u[3] = src[0].u[3] >> src[1].u[3];
2364 }
2365
2366 static void
2367 micro_uslt(union tgsi_exec_channel *dst,
2368            const union tgsi_exec_channel *src)
2369 {
2370    dst->u[0] = src[0].u[0] < src[1].u[0] ? ~0 : 0;
2371    dst->u[1] = src[0].u[1] < src[1].u[1] ? ~0 : 0;
2372    dst->u[2] = src[0].u[2] < src[1].u[2] ? ~0 : 0;
2373    dst->u[3] = src[0].u[3] < src[1].u[3] ? ~0 : 0;
2374 }
2375
2376 static void
2377 micro_usne(union tgsi_exec_channel *dst,
2378            const union tgsi_exec_channel *src)
2379 {
2380    dst->u[0] = src[0].u[0] != src[1].u[0] ? ~0 : 0;
2381    dst->u[1] = src[0].u[1] != src[1].u[1] ? ~0 : 0;
2382    dst->u[2] = src[0].u[2] != src[1].u[2] ? ~0 : 0;
2383    dst->u[3] = src[0].u[3] != src[1].u[3] ? ~0 : 0;
2384 }
2385
2386 static void
2387 exec_instruction(
2388    struct tgsi_exec_machine *mach,
2389    const struct tgsi_full_instruction *inst,
2390    int *pc )
2391 {
2392    uint chan_index;
2393    union tgsi_exec_channel r[10];
2394    union tgsi_exec_channel d[8];
2395
2396    (*pc)++;
2397
2398    switch (inst->Instruction.Opcode) {
2399    case TGSI_OPCODE_ARL:
2400       exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2401       break;
2402
2403    case TGSI_OPCODE_MOV:
2404       exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
2405       break;
2406
2407    case TGSI_OPCODE_LIT:
2408       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2409          FETCH( &r[0], 0, CHAN_X );
2410          if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2411             micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2412          }
2413
2414          if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2415             FETCH( &r[1], 0, CHAN_Y );
2416             micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2417
2418             FETCH( &r[2], 0, CHAN_W );
2419             micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2420             micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2421             micro_pow( &r[1], &r[1], &r[2] );
2422             micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2423          }
2424
2425          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2426             STORE(&d[CHAN_Y], 0, CHAN_Y);
2427          }
2428          if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2429             STORE(&d[CHAN_Z], 0, CHAN_Z);
2430          }
2431       }
2432       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2433          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2434       }
2435       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2436          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2437       }
2438       break;
2439
2440    case TGSI_OPCODE_RCP:
2441       exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2442       break;
2443
2444    case TGSI_OPCODE_RSQ:
2445       exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2446       break;
2447
2448    case TGSI_OPCODE_EXP:
2449       FETCH( &r[0], 0, CHAN_X );
2450       micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2451       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2452          micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2453          STORE( &r[2], 0, CHAN_X );        /* store r2 */
2454       }
2455       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2456          micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2457          STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2458       }
2459       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2460          micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2461          STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2462       }
2463       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2464          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2465       }
2466       break;
2467
2468    case TGSI_OPCODE_LOG:
2469       FETCH( &r[0], 0, CHAN_X );
2470       micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2471       micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2472       micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2473       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2474          STORE( &r[0], 0, CHAN_X );
2475       }
2476       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2477          micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2478          micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2479          STORE( &r[0], 0, CHAN_Y );
2480       }
2481       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2482          STORE( &r[1], 0, CHAN_Z );
2483       }
2484       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2485          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2486       }
2487       break;
2488
2489    case TGSI_OPCODE_MUL:
2490       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2491          FETCH(&r[0], 0, chan_index);
2492          FETCH(&r[1], 1, chan_index);
2493          micro_mul(&d[chan_index], &r[0], &r[1]);
2494       }
2495       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2496          STORE(&d[chan_index], 0, chan_index);
2497       }
2498       break;
2499
2500    case TGSI_OPCODE_ADD:
2501       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2502          FETCH( &r[0], 0, chan_index );
2503          FETCH( &r[1], 1, chan_index );
2504          micro_add(&d[chan_index], &r[0], &r[1]);
2505       }
2506       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2507          STORE(&d[chan_index], 0, chan_index);
2508       }
2509       break;
2510
2511    case TGSI_OPCODE_DP3:
2512       exec_dp3(mach, inst);
2513       break;
2514
2515    case TGSI_OPCODE_DP4:
2516       exec_dp4(mach, inst);
2517       break;
2518
2519    case TGSI_OPCODE_DST:
2520       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2521          FETCH( &r[0], 0, CHAN_Y );
2522          FETCH( &r[1], 1, CHAN_Y);
2523          micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2524       }
2525       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2526          FETCH(&d[CHAN_Z], 0, CHAN_Z);
2527       }
2528       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2529          FETCH(&d[CHAN_W], 1, CHAN_W);
2530       }
2531
2532       if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2533          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2534       }
2535       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2536          STORE(&d[CHAN_Y], 0, CHAN_Y);
2537       }
2538       if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2539          STORE(&d[CHAN_Z], 0, CHAN_Z);
2540       }
2541       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2542          STORE(&d[CHAN_W], 0, CHAN_W);
2543       }
2544       break;
2545
2546    case TGSI_OPCODE_MIN:
2547       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2548          FETCH(&r[0], 0, chan_index);
2549          FETCH(&r[1], 1, chan_index);
2550
2551          /* XXX use micro_min()?? */
2552          micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2553       }
2554       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2555          STORE(&d[chan_index], 0, chan_index);
2556       }
2557       break;
2558
2559    case TGSI_OPCODE_MAX:
2560       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2561          FETCH(&r[0], 0, chan_index);
2562          FETCH(&r[1], 1, chan_index);
2563
2564          /* XXX use micro_max()?? */
2565          micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2566       }
2567       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2568          STORE(&d[chan_index], 0, chan_index);
2569       }
2570       break;
2571
2572    case TGSI_OPCODE_SLT:
2573       exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2574       break;
2575
2576    case TGSI_OPCODE_SGE:
2577       exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2578       break;
2579
2580    case TGSI_OPCODE_MAD:
2581       exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2582       break;
2583
2584    case TGSI_OPCODE_SUB:
2585       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2586          FETCH(&r[0], 0, chan_index);
2587          FETCH(&r[1], 1, chan_index);
2588          micro_sub(&d[chan_index], &r[0], &r[1]);
2589       }
2590       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2591          STORE(&d[chan_index], 0, chan_index);
2592       }
2593       break;
2594
2595    case TGSI_OPCODE_LRP:
2596       exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2597       break;
2598
2599    case TGSI_OPCODE_CND:
2600       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2601          FETCH(&r[0], 0, chan_index);
2602          FETCH(&r[1], 1, chan_index);
2603          FETCH(&r[2], 2, chan_index);
2604          micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2605       }
2606       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2607          STORE(&d[chan_index], 0, chan_index);
2608       }
2609       break;
2610
2611    case TGSI_OPCODE_DP2A:
2612       exec_dp2a(mach, inst);
2613       break;
2614
2615    case TGSI_OPCODE_FRC:
2616       exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2617       break;
2618
2619    case TGSI_OPCODE_CLAMP:
2620       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2621          FETCH(&r[0], 0, chan_index);
2622          FETCH(&r[1], 1, chan_index);
2623          micro_max(&r[0], &r[0], &r[1]);
2624          FETCH(&r[1], 2, chan_index);
2625          micro_min(&d[chan_index], &r[0], &r[1]);
2626       }
2627       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2628          STORE(&d[chan_index], 0, chan_index);
2629       }
2630       break;
2631
2632    case TGSI_OPCODE_FLR:
2633       exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2634       break;
2635
2636    case TGSI_OPCODE_ROUND:
2637       exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2638       break;
2639
2640    case TGSI_OPCODE_EX2:
2641       exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2642       break;
2643
2644    case TGSI_OPCODE_LG2:
2645       exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2646       break;
2647
2648    case TGSI_OPCODE_POW:
2649       FETCH(&r[0], 0, CHAN_X);
2650       FETCH(&r[1], 1, CHAN_X);
2651
2652       micro_pow( &r[0], &r[0], &r[1] );
2653
2654       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2655          STORE( &r[0], 0, chan_index );
2656       }
2657       break;
2658
2659    case TGSI_OPCODE_XPD:
2660       FETCH(&r[0], 0, CHAN_Y);
2661       FETCH(&r[1], 1, CHAN_Z);
2662
2663       micro_mul( &r[2], &r[0], &r[1] );
2664
2665       FETCH(&r[3], 0, CHAN_Z);
2666       FETCH(&r[4], 1, CHAN_Y);
2667
2668       micro_mul( &r[5], &r[3], &r[4] );
2669       micro_sub(&d[CHAN_X], &r[2], &r[5]);
2670
2671       FETCH(&r[2], 1, CHAN_X);
2672
2673       micro_mul( &r[3], &r[3], &r[2] );
2674
2675       FETCH(&r[5], 0, CHAN_X);
2676
2677       micro_mul( &r[1], &r[1], &r[5] );
2678       micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2679
2680       micro_mul( &r[5], &r[5], &r[4] );
2681       micro_mul( &r[0], &r[0], &r[2] );
2682       micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2683
2684       if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2685          STORE(&d[CHAN_X], 0, CHAN_X);
2686       }
2687       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2688          STORE(&d[CHAN_Y], 0, CHAN_Y);
2689       }
2690       if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2691          STORE(&d[CHAN_Z], 0, CHAN_Z);
2692       }
2693       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2694          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2695       }
2696       break;
2697
2698    case TGSI_OPCODE_ABS:
2699       exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2700       break;
2701
2702    case TGSI_OPCODE_RCC:
2703       FETCH(&r[0], 0, CHAN_X);
2704       micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2705       micro_float_clamp(&r[0], &r[0]);
2706       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2707          STORE(&r[0], 0, chan_index);
2708       }
2709       break;
2710
2711    case TGSI_OPCODE_DPH:
2712       exec_dph(mach, inst);
2713       break;
2714
2715    case TGSI_OPCODE_COS:
2716       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2717       break;
2718
2719    case TGSI_OPCODE_DDX:
2720       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2721       break;
2722
2723    case TGSI_OPCODE_DDY:
2724       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2725       break;
2726
2727    case TGSI_OPCODE_KILP:
2728       exec_kilp (mach, inst);
2729       break;
2730
2731    case TGSI_OPCODE_KIL:
2732       exec_kil (mach, inst);
2733       break;
2734
2735    case TGSI_OPCODE_PK2H:
2736       assert (0);
2737       break;
2738
2739    case TGSI_OPCODE_PK2US:
2740       assert (0);
2741       break;
2742
2743    case TGSI_OPCODE_PK4B:
2744       assert (0);
2745       break;
2746
2747    case TGSI_OPCODE_PK4UB:
2748       assert (0);
2749       break;
2750
2751    case TGSI_OPCODE_RFL:
2752       if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2753           IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2754           IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2755          /* r0 = dp3(src0, src0) */
2756          FETCH(&r[2], 0, CHAN_X);
2757          micro_mul(&r[0], &r[2], &r[2]);
2758          FETCH(&r[4], 0, CHAN_Y);
2759          micro_mul(&r[8], &r[4], &r[4]);
2760          micro_add(&r[0], &r[0], &r[8]);
2761          FETCH(&r[6], 0, CHAN_Z);
2762          micro_mul(&r[8], &r[6], &r[6]);
2763          micro_add(&r[0], &r[0], &r[8]);
2764
2765          /* r1 = dp3(src0, src1) */
2766          FETCH(&r[3], 1, CHAN_X);
2767          micro_mul(&r[1], &r[2], &r[3]);
2768          FETCH(&r[5], 1, CHAN_Y);
2769          micro_mul(&r[8], &r[4], &r[5]);
2770          micro_add(&r[1], &r[1], &r[8]);
2771          FETCH(&r[7], 1, CHAN_Z);
2772          micro_mul(&r[8], &r[6], &r[7]);
2773          micro_add(&r[1], &r[1], &r[8]);
2774
2775          /* r1 = 2 * r1 / r0 */
2776          micro_add(&r[1], &r[1], &r[1]);
2777          micro_div(&r[1], &r[1], &r[0]);
2778
2779          if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2780             micro_mul(&r[2], &r[2], &r[1]);
2781             micro_sub(&r[2], &r[2], &r[3]);
2782             STORE(&r[2], 0, CHAN_X);
2783          }
2784          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2785             micro_mul(&r[4], &r[4], &r[1]);
2786             micro_sub(&r[4], &r[4], &r[5]);
2787             STORE(&r[4], 0, CHAN_Y);
2788          }
2789          if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2790             micro_mul(&r[6], &r[6], &r[1]);
2791             micro_sub(&r[6], &r[6], &r[7]);
2792             STORE(&r[6], 0, CHAN_Z);
2793          }
2794       }
2795       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2796          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2797       }
2798       break;
2799
2800    case TGSI_OPCODE_SEQ:
2801       exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2802       break;
2803
2804    case TGSI_OPCODE_SFL:
2805       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2806          STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2807       }
2808       break;
2809
2810    case TGSI_OPCODE_SGT:
2811       exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2812       break;
2813
2814    case TGSI_OPCODE_SIN:
2815       exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2816       break;
2817
2818    case TGSI_OPCODE_SLE:
2819       exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2820       break;
2821
2822    case TGSI_OPCODE_SNE:
2823       exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2824       break;
2825
2826    case TGSI_OPCODE_STR:
2827       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2828          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2829       }
2830       break;
2831
2832    case TGSI_OPCODE_TEX:
2833       /* simple texture lookup */
2834       /* src[0] = texcoord */
2835       /* src[1] = sampler unit */
2836       exec_tex(mach, inst, TEX_MODIFIER_NONE);
2837       break;
2838
2839    case TGSI_OPCODE_TXB:
2840       /* Texture lookup with lod bias */
2841       /* src[0] = texcoord (src[0].w = LOD bias) */
2842       /* src[1] = sampler unit */
2843       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
2844       break;
2845
2846    case TGSI_OPCODE_TXD:
2847       /* Texture lookup with explicit partial derivatives */
2848       /* src[0] = texcoord */
2849       /* src[1] = d[strq]/dx */
2850       /* src[2] = d[strq]/dy */
2851       /* src[3] = sampler unit */
2852       exec_txd(mach, inst);
2853       break;
2854
2855    case TGSI_OPCODE_TXL:
2856       /* Texture lookup with explicit LOD */
2857       /* src[0] = texcoord (src[0].w = LOD) */
2858       /* src[1] = sampler unit */
2859       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
2860       break;
2861
2862    case TGSI_OPCODE_TXP:
2863       /* Texture lookup with projection */
2864       /* src[0] = texcoord (src[0].w = projection) */
2865       /* src[1] = sampler unit */
2866       exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
2867       break;
2868
2869    case TGSI_OPCODE_UP2H:
2870       assert (0);
2871       break;
2872
2873    case TGSI_OPCODE_UP2US:
2874       assert (0);
2875       break;
2876
2877    case TGSI_OPCODE_UP4B:
2878       assert (0);
2879       break;
2880
2881    case TGSI_OPCODE_UP4UB:
2882       assert (0);
2883       break;
2884
2885    case TGSI_OPCODE_X2D:
2886       FETCH(&r[0], 1, CHAN_X);
2887       FETCH(&r[1], 1, CHAN_Y);
2888       if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2889           IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2890          FETCH(&r[2], 2, CHAN_X);
2891          micro_mul(&r[2], &r[2], &r[0]);
2892          FETCH(&r[3], 2, CHAN_Y);
2893          micro_mul(&r[3], &r[3], &r[1]);
2894          micro_add(&r[2], &r[2], &r[3]);
2895          FETCH(&r[3], 0, CHAN_X);
2896          micro_add(&d[CHAN_X], &r[2], &r[3]);
2897
2898       }
2899       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2900           IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2901          FETCH(&r[2], 2, CHAN_Z);
2902          micro_mul(&r[2], &r[2], &r[0]);
2903          FETCH(&r[3], 2, CHAN_W);
2904          micro_mul(&r[3], &r[3], &r[1]);
2905          micro_add(&r[2], &r[2], &r[3]);
2906          FETCH(&r[3], 0, CHAN_Y);
2907          micro_add(&d[CHAN_Y], &r[2], &r[3]);
2908
2909       }
2910       if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2911          STORE(&d[CHAN_X], 0, CHAN_X);
2912       }
2913       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2914          STORE(&d[CHAN_Y], 0, CHAN_Y);
2915       }
2916       if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2917          STORE(&d[CHAN_X], 0, CHAN_Z);
2918       }
2919       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2920          STORE(&d[CHAN_Y], 0, CHAN_W);
2921       }
2922       break;
2923
2924    case TGSI_OPCODE_ARA:
2925       assert (0);
2926       break;
2927
2928    case TGSI_OPCODE_ARR:
2929       exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2930       break;
2931
2932    case TGSI_OPCODE_BRA:
2933       assert (0);
2934       break;
2935
2936    case TGSI_OPCODE_CAL:
2937       /* skip the call if no execution channels are enabled */
2938       if (mach->ExecMask) {
2939          /* do the call */
2940
2941          /* First, record the depths of the execution stacks.
2942           * This is important for deeply nested/looped return statements.
2943           * We have to unwind the stacks by the correct amount.  For a
2944           * real code generator, we could determine the number of entries
2945           * to pop off each stack with simple static analysis and avoid
2946           * implementing this data structure at run time.
2947           */
2948          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2949          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2950          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2951          mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
2952          mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
2953          /* note that PC was already incremented above */
2954          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2955
2956          mach->CallStackTop++;
2957
2958          /* Second, push the Cond, Loop, Cont, Switch, Break, Func stacks */
2959          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2960          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2961          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2962          assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2963          assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2964          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2965
2966          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2967          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2968          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2969          mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2970          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2971          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2972
2973          /* Finally, jump to the subroutine */
2974          *pc = inst->Label.Label;
2975       }
2976       break;
2977
2978    case TGSI_OPCODE_RET:
2979       mach->FuncMask &= ~mach->ExecMask;
2980       UPDATE_EXEC_MASK(mach);
2981
2982       if (mach->FuncMask == 0x0) {
2983          /* really return now (otherwise, keep executing) */
2984
2985          if (mach->CallStackTop == 0) {
2986             /* returning from main() */
2987             *pc = -1;
2988             return;
2989          }
2990
2991          assert(mach->CallStackTop > 0);
2992          mach->CallStackTop--;
2993
2994          mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2995          mach->CondMask = mach->CondStack[mach->CondStackTop];
2996
2997          mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2998          mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2999
3000          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3001          mach->ContMask = mach->ContStack[mach->ContStackTop];
3002
3003          mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3004          mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3005
3006          mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3007          mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3008
3009          assert(mach->FuncStackTop > 0);
3010          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3011
3012          *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3013
3014          UPDATE_EXEC_MASK(mach);
3015       }
3016       break;
3017
3018    case TGSI_OPCODE_SSG:
3019       exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3020       break;
3021
3022    case TGSI_OPCODE_CMP:
3023       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3024          FETCH(&r[0], 0, chan_index);
3025          FETCH(&r[1], 1, chan_index);
3026          FETCH(&r[2], 2, chan_index);
3027          micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
3028       }
3029       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3030          STORE(&d[chan_index], 0, chan_index);
3031       }
3032       break;
3033
3034    case TGSI_OPCODE_SCS:
3035       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
3036          FETCH( &r[0], 0, CHAN_X );
3037          if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3038             micro_cos(&r[1], &r[0]);
3039             STORE(&r[1], 0, CHAN_X);
3040          }
3041          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3042             micro_sin(&r[1], &r[0]);
3043             STORE(&r[1], 0, CHAN_Y);
3044          }
3045       }
3046       if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
3047          STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
3048       }
3049       if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
3050          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
3051       }
3052       break;
3053
3054    case TGSI_OPCODE_NRM:
3055       /* 3-component vector normalize */
3056       if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
3057          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
3058          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3059          /* r3 = sqrt(dp3(src0, src0)) */
3060          FETCH(&r[0], 0, CHAN_X);
3061          micro_mul(&r[3], &r[0], &r[0]);
3062          FETCH(&r[1], 0, CHAN_Y);
3063          micro_mul(&r[4], &r[1], &r[1]);
3064          micro_add(&r[3], &r[3], &r[4]);
3065          FETCH(&r[2], 0, CHAN_Z);
3066          micro_mul(&r[4], &r[2], &r[2]);
3067          micro_add(&r[3], &r[3], &r[4]);
3068          micro_sqrt(&r[3], &r[3]);
3069
3070          if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3071             micro_div(&r[0], &r[0], &r[3]);
3072             STORE(&r[0], 0, CHAN_X);
3073          }
3074          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3075             micro_div(&r[1], &r[1], &r[3]);
3076             STORE(&r[1], 0, CHAN_Y);
3077          }
3078          if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3079             micro_div(&r[2], &r[2], &r[3]);
3080             STORE(&r[2], 0, CHAN_Z);
3081          }
3082       }
3083       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
3084          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
3085       }
3086       break;
3087
3088    case TGSI_OPCODE_NRM4:
3089       /* 4-component vector normalize */
3090       {
3091          union tgsi_exec_channel tmp, dot;
3092
3093          /* tmp = dp4(src0, src0): */
3094          FETCH( &r[0], 0, CHAN_X );
3095          micro_mul( &tmp, &r[0], &r[0] );
3096
3097          FETCH( &r[1], 0, CHAN_Y );
3098          micro_mul( &dot, &r[1], &r[1] );
3099          micro_add( &tmp, &tmp, &dot );
3100
3101          FETCH( &r[2], 0, CHAN_Z );
3102          micro_mul( &dot, &r[2], &r[2] );
3103          micro_add( &tmp, &tmp, &dot );
3104
3105          FETCH( &r[3], 0, CHAN_W );
3106          micro_mul( &dot, &r[3], &r[3] );
3107          micro_add( &tmp, &tmp, &dot );
3108
3109          /* tmp = 1 / sqrt(tmp) */
3110          micro_sqrt( &tmp, &tmp );
3111          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
3112
3113          FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3114             /* chan = chan * tmp */
3115             micro_mul( &r[chan_index], &tmp, &r[chan_index] );
3116             STORE( &r[chan_index], 0, chan_index );
3117          }
3118       }
3119       break;
3120
3121    case TGSI_OPCODE_DIV:
3122       assert( 0 );
3123       break;
3124
3125    case TGSI_OPCODE_DP2:
3126       exec_dp2(mach, inst);
3127       break;
3128
3129    case TGSI_OPCODE_IF:
3130       /* push CondMask */
3131       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3132       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3133       FETCH( &r[0], 0, CHAN_X );
3134       /* update CondMask */
3135       if( ! r[0].u[0] ) {
3136          mach->CondMask &= ~0x1;
3137       }
3138       if( ! r[0].u[1] ) {
3139          mach->CondMask &= ~0x2;
3140       }
3141       if( ! r[0].u[2] ) {
3142          mach->CondMask &= ~0x4;
3143       }
3144       if( ! r[0].u[3] ) {
3145          mach->CondMask &= ~0x8;
3146       }
3147       UPDATE_EXEC_MASK(mach);
3148       /* Todo: If CondMask==0, jump to ELSE */
3149       break;
3150
3151    case TGSI_OPCODE_ELSE:
3152       /* invert CondMask wrt previous mask */
3153       {
3154          uint prevMask;
3155          assert(mach->CondStackTop > 0);
3156          prevMask = mach->CondStack[mach->CondStackTop - 1];
3157          mach->CondMask = ~mach->CondMask & prevMask;
3158          UPDATE_EXEC_MASK(mach);
3159          /* Todo: If CondMask==0, jump to ENDIF */
3160       }
3161       break;
3162
3163    case TGSI_OPCODE_ENDIF:
3164       /* pop CondMask */
3165       assert(mach->CondStackTop > 0);
3166       mach->CondMask = mach->CondStack[--mach->CondStackTop];
3167       UPDATE_EXEC_MASK(mach);
3168       break;
3169
3170    case TGSI_OPCODE_END:
3171       /* halt execution */
3172       *pc = -1;
3173       break;
3174
3175    case TGSI_OPCODE_REP:
3176       assert (0);
3177       break;
3178
3179    case TGSI_OPCODE_ENDREP:
3180        assert (0);
3181        break;
3182
3183    case TGSI_OPCODE_PUSHA:
3184       assert (0);
3185       break;
3186
3187    case TGSI_OPCODE_POPA:
3188       assert (0);
3189       break;
3190
3191    case TGSI_OPCODE_CEIL:
3192       exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3193       break;
3194
3195    case TGSI_OPCODE_I2F:
3196       exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3197       break;
3198
3199    case TGSI_OPCODE_NOT:
3200       exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3201       break;
3202
3203    case TGSI_OPCODE_TRUNC:
3204       exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3205       break;
3206
3207    case TGSI_OPCODE_SHL:
3208       exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3209       break;
3210
3211    case TGSI_OPCODE_AND:
3212       exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3213       break;
3214
3215    case TGSI_OPCODE_OR:
3216       exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3217       break;
3218
3219    case TGSI_OPCODE_MOD:
3220       assert (0);
3221       break;
3222
3223    case TGSI_OPCODE_XOR:
3224       exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3225       break;
3226
3227    case TGSI_OPCODE_SAD:
3228       assert (0);
3229       break;
3230
3231    case TGSI_OPCODE_TXF:
3232       assert (0);
3233       break;
3234
3235    case TGSI_OPCODE_TXQ:
3236       assert (0);
3237       break;
3238
3239    case TGSI_OPCODE_EMIT:
3240       emit_vertex(mach);
3241       break;
3242
3243    case TGSI_OPCODE_ENDPRIM:
3244       emit_primitive(mach);
3245       break;
3246
3247    case TGSI_OPCODE_BGNFOR:
3248       assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3249       for (chan_index = 0; chan_index < 3; chan_index++) {
3250          FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3251       }
3252       ++mach->LoopCounterStackTop;
3253       STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3254       /* update LoopMask */
3255       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3256          mach->LoopMask &= ~0x1;
3257       }
3258       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3259          mach->LoopMask &= ~0x2;
3260       }
3261       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3262          mach->LoopMask &= ~0x4;
3263       }
3264       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3265          mach->LoopMask &= ~0x8;
3266       }
3267       /* TODO: if mach->LoopMask == 0, jump to end of loop */
3268       UPDATE_EXEC_MASK(mach);
3269       /* fall-through (for now) */
3270    case TGSI_OPCODE_BGNLOOP:
3271       /* push LoopMask and ContMasks */
3272       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3273       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3274       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3275       assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3276
3277       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3278       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3279       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3280       mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3281       mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3282       break;
3283
3284    case TGSI_OPCODE_ENDFOR:
3285       assert(mach->LoopCounterStackTop > 0);
3286       micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3287                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3288                 &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3289       /* update LoopMask */
3290       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3291          mach->LoopMask &= ~0x1;
3292       }
3293       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3294          mach->LoopMask &= ~0x2;
3295       }
3296       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3297          mach->LoopMask &= ~0x4;
3298       }
3299       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3300          mach->LoopMask &= ~0x8;
3301       }
3302       micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3303                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3304                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3305       assert(mach->LoopLabelStackTop > 0);
3306       inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3307       STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3308       /* Restore ContMask, but don't pop */
3309       assert(mach->ContStackTop > 0);
3310       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3311       UPDATE_EXEC_MASK(mach);
3312       if (mach->ExecMask) {
3313          /* repeat loop: jump to instruction just past BGNLOOP */
3314          assert(mach->LoopLabelStackTop > 0);
3315          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3316       }
3317       else {
3318          /* exit loop: pop LoopMask */
3319          assert(mach->LoopStackTop > 0);
3320          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3321          /* pop ContMask */
3322          assert(mach->ContStackTop > 0);
3323          mach->ContMask = mach->ContStack[--mach->ContStackTop];
3324          assert(mach->LoopLabelStackTop > 0);
3325          --mach->LoopLabelStackTop;
3326          assert(mach->LoopCounterStackTop > 0);
3327          --mach->LoopCounterStackTop;
3328
3329          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3330       }
3331       UPDATE_EXEC_MASK(mach);
3332       break;
3333
3334    case TGSI_OPCODE_ENDLOOP:
3335       /* Restore ContMask, but don't pop */
3336       assert(mach->ContStackTop > 0);
3337       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3338       UPDATE_EXEC_MASK(mach);
3339       if (mach->ExecMask) {
3340          /* repeat loop: jump to instruction just past BGNLOOP */
3341          assert(mach->LoopLabelStackTop > 0);
3342          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3343       }
3344       else {
3345          /* exit loop: pop LoopMask */
3346          assert(mach->LoopStackTop > 0);
3347          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3348          /* pop ContMask */
3349          assert(mach->ContStackTop > 0);
3350          mach->ContMask = mach->ContStack[--mach->ContStackTop];
3351          assert(mach->LoopLabelStackTop > 0);
3352          --mach->LoopLabelStackTop;
3353
3354          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3355       }
3356       UPDATE_EXEC_MASK(mach);
3357       break;
3358
3359    case TGSI_OPCODE_BRK:
3360       exec_break(mach);
3361       break;
3362
3363    case TGSI_OPCODE_CONT:
3364       /* turn off cont channels for each enabled exec channel */
3365       mach->ContMask &= ~mach->ExecMask;
3366       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3367       UPDATE_EXEC_MASK(mach);
3368       break;
3369
3370    case TGSI_OPCODE_BGNSUB:
3371       /* no-op */
3372       break;
3373
3374    case TGSI_OPCODE_ENDSUB:
3375       /*
3376        * XXX: This really should be a no-op. We should never reach this opcode.
3377        */
3378
3379       assert(mach->CallStackTop > 0);
3380       mach->CallStackTop--;
3381
3382       mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3383       mach->CondMask = mach->CondStack[mach->CondStackTop];
3384
3385       mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3386       mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3387
3388       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3389       mach->ContMask = mach->ContStack[mach->ContStackTop];
3390
3391       mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3392       mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3393
3394       mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3395       mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3396
3397       assert(mach->FuncStackTop > 0);
3398       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3399
3400       *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3401
3402       UPDATE_EXEC_MASK(mach);
3403       break;
3404
3405    case TGSI_OPCODE_NOP:
3406       break;
3407
3408    case TGSI_OPCODE_BREAKC:
3409       FETCH(&r[0], 0, CHAN_X);
3410       /* update LoopMask */
3411       if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3412          mach->LoopMask &= ~0x1;
3413       }
3414       if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3415          mach->LoopMask &= ~0x2;
3416       }
3417       if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3418          mach->LoopMask &= ~0x4;
3419       }
3420       if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3421          mach->LoopMask &= ~0x8;
3422       }
3423       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3424       UPDATE_EXEC_MASK(mach);
3425       break;
3426
3427    case TGSI_OPCODE_F2I:
3428       exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3429       break;
3430
3431    case TGSI_OPCODE_IDIV:
3432       exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3433       break;
3434
3435    case TGSI_OPCODE_IMAX:
3436       exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3437       break;
3438
3439    case TGSI_OPCODE_IMIN:
3440       exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3441       break;
3442
3443    case TGSI_OPCODE_INEG:
3444       exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3445       break;
3446
3447    case TGSI_OPCODE_ISGE:
3448       exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3449       break;
3450
3451    case TGSI_OPCODE_ISHR:
3452       exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3453       break;
3454
3455    case TGSI_OPCODE_ISLT:
3456       exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3457       break;
3458
3459    case TGSI_OPCODE_F2U:
3460       exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3461       break;
3462
3463    case TGSI_OPCODE_U2F:
3464       exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3465       break;
3466
3467    case TGSI_OPCODE_UADD:
3468       exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3469       break;
3470
3471    case TGSI_OPCODE_UDIV:
3472       exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3473       break;
3474
3475    case TGSI_OPCODE_UMAD:
3476       exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3477       break;
3478
3479    case TGSI_OPCODE_UMAX:
3480       exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3481       break;
3482
3483    case TGSI_OPCODE_UMIN:
3484       exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3485       break;
3486
3487    case TGSI_OPCODE_UMOD:
3488       exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3489       break;
3490
3491    case TGSI_OPCODE_UMUL:
3492       exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3493       break;
3494
3495    case TGSI_OPCODE_USEQ:
3496       exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3497       break;
3498
3499    case TGSI_OPCODE_USGE:
3500       exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3501       break;
3502
3503    case TGSI_OPCODE_USHR:
3504       exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3505       break;
3506
3507    case TGSI_OPCODE_USLT:
3508       exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3509       break;
3510
3511    case TGSI_OPCODE_USNE:
3512       exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3513       break;
3514
3515    case TGSI_OPCODE_SWITCH:
3516       exec_switch(mach, inst);
3517       break;
3518
3519    case TGSI_OPCODE_CASE:
3520       exec_case(mach, inst);
3521       break;
3522
3523    case TGSI_OPCODE_DEFAULT:
3524       exec_default(mach);
3525       break;
3526
3527    case TGSI_OPCODE_ENDSWITCH:
3528       exec_endswitch(mach);
3529       break;
3530
3531    default:
3532       assert( 0 );
3533    }
3534 }
3535
3536
3537 #define DEBUG_EXECUTION 0   /* non-zero: tgsi_exec_machine_run() dumps each instruction and prints the TEMP/OUT registers it changed */
3538
3539
3540 /**
3541  * Run TGSI interpreter: reset the execution masks, execute every declaration, then execute instructions from pc 0 until pc becomes -1.
3542  * \return bitmask of "alive" quad components (the complement of element 0 of the kill-mask temp)
3543  */
3544 uint
3545 tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3546 {
3547    uint i;
3548    int pc = 0;   /* index of the next instruction to execute; -1 ends the run loop */
3549
3550    mach->CondMask = 0xf;   /* every execution mask starts with all four low bits set */
3551    mach->LoopMask = 0xf;
3552    mach->ContMask = 0xf;
3553    mach->FuncMask = 0xf;
3554    mach->ExecMask = 0xf;
3555
3556    mach->Switch.mask = 0xf;
3557
3558    assert(mach->CondStackTop == 0);   /* these control-flow stacks must be empty on entry */
3559    assert(mach->LoopStackTop == 0);
3560    assert(mach->ContStackTop == 0);
3561    assert(mach->SwitchStackTop == 0);
3562    assert(mach->BreakStackTop == 0);
3563    assert(mach->CallStackTop == 0);
3564
3565    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;   /* clear the kill mask; its complement is the return value */
3566    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;   /* clear element 0 of the output temp */
3567
3568    if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {   /* geometry processor only: also clear the primitive temp and Primitives[0] */
3569       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3570       mach->Primitives[0] = 0;
3571    }
3572
3573    for (i = 0; i < QUAD_SIZE; i++) {   /* set the condition-code temp to EQ for X, Y, Z and W in each element */
3574       mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3575          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3576          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3577          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3578          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3579    }
3580
3581    /* execute declarations (interpolants) */
3582    for (i = 0; i < mach->NumDeclarations; i++) {
3583       exec_declaration( mach, mach->Declarations+i );
3584    }
3585
3586    {
3587 #if DEBUG_EXECUTION
3588       struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3589       struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3590       uint inst = 1;   /* running number passed to tgsi_dump_instruction() */
3591
3592       memcpy(temps, mach->Temps, sizeof(temps));   /* snapshots used below to detect which registers an instruction changed */
3593       memcpy(outputs, mach->Outputs, sizeof(outputs));
3594 #endif
3595
3596       /* execute instructions, until pc is set to -1 (exec_instruction updates pc through the pointer it is given) */
3597       while (pc != -1) {
3598
3599 #if DEBUG_EXECUTION
3600          uint i;   /* shadows the function-level i inside the loop body (debug builds only) */
3601
3602          tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3603 #endif
3604
3605          assert(pc < (int) mach->NumInstructions);
3606          exec_instruction(mach, mach->Instructions + pc, &pc);
3607
3608 #if DEBUG_EXECUTION
3609          for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {   /* print each temp that differs from its snapshot, then refresh the snapshot */
3610             if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3611                uint j;
3612
3613                memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3614                debug_printf("TEMP[%2u] = ", i);
3615                for (j = 0; j < 4; j++) {   /* one output row per element j, showing x/y/z/w as float and as uint */
3616                   if (j > 0) {
3617                      debug_printf("           ");
3618                   }
3619                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3620                                temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
3621                                temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
3622                                temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
3623                                temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
3624                }
3625             }
3626          }
3627          for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {   /* same change report for the output registers */
3628             if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3629                uint j;
3630
3631                memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3632                debug_printf("OUT[%2u] =  ", i);
3633                for (j = 0; j < 4; j++) {
3634                   if (j > 0) {
3635                      debug_printf("           ");
3636                   }
3637                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3638                                outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
3639                                outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
3640                                outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
3641                                outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
3642                }
3643             }
3644          }
3645 #endif
3646       }
3647    }
3648
3649 #if 0
3650    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c, so this block is compiled out */
3651    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3652       /*
3653        * Scale back depth component.  NOTE(review): ctx is not declared in this function.
3654        */
3655       for (i = 0; i < 4; i++)
3656          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3657    }
3658 #endif
3659
3660    assert(mach->CondStackTop == 0);   /* the same stacks must be empty again once execution has finished */
3661    assert(mach->LoopStackTop == 0);
3662    assert(mach->ContStackTop == 0);
3663    assert(mach->SwitchStackTop == 0);
3664    assert(mach->BreakStackTop == 0);
3665    assert(mach->CallStackTop == 0);
3666
3667    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];   /* complement of the kill mask */
3668 }