src/gallium/auxiliary/tgsi/tgsi_exec.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the
   9  * "Software"), to deal in the Software without restriction, including
  10  * without limitation the rights to use, copy, modify, merge, publish,
  11  * distribute, sub license, and/or sell copies of the Software, and to
  12  * permit persons to whom the Software is furnished to do so, subject to
  13  * the following conditions:
  14  *
  15  * The above copyright notice and this permission notice (including the
  16  * next paragraph) shall be included in all copies or substantial portions
  17  * of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  22  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  26  *
  27  **************************************************************************/
  28
  29 /**
  30  * TGSI interpreter/executor.
  31  *
  32  * Flow control information:
  33  *
  34  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
  35  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
  36  * care since a condition may be true for some quad components but false
  37  * for other components.
  38  *
  39  * We basically execute all statements (even if they're in the part of
  40  * an IF/ELSE clause that's "not taken") and use a special mask to
  41  * control writing to destination registers.  This is the ExecMask.
  42  * See store_dest().
  43  *
 * The ExecMask is computed by UPDATE_EXEC_MASK() from the CondMask,
 * LoopMask, ContMask, Switch.mask and FuncMask.  The first three are
 * controlled by the flow control instructions (namely: IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
  47  *
  48  *
  49  * Authors:
  50  *   Michal Krol
  51  *   Brian Paul
  52  */
  53
  54 #include "pipe/p_compiler.h"
  55 #include "pipe/p_state.h"
  56 #include "pipe/p_shader_tokens.h"
  57 #include "tgsi/tgsi_dump.h"
  58 #include "tgsi/tgsi_parse.h"
  59 #include "tgsi/tgsi_util.h"
  60 #include "tgsi_exec.h"
  61 #include "util/u_memory.h"
  62 #include "util/u_math.h"
  63
  64
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() use the
 * util_fast_*() helpers instead of the libm functions powf()/logf().
 */
#define FAST_MATH 1

/*
 * Lane indices within a channel, used by micro_ddx() and micro_ddy() to
 * pick the lanes whose difference forms the derivative.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
  71
  72 static void
  73 micro_abs(union tgsi_exec_channel *dst,
  74           const union tgsi_exec_channel *src)
  75 {
  76    dst->f[0] = fabsf(src->f[0]);
  77    dst->f[1] = fabsf(src->f[1]);
  78    dst->f[2] = fabsf(src->f[2]);
  79    dst->f[3] = fabsf(src->f[3]);
  80 }
  81
  82 static void
  83 micro_arl(union tgsi_exec_channel *dst,
  84           const union tgsi_exec_channel *src)
  85 {
  86    dst->i[0] = (int)floorf(src->f[0]);
  87    dst->i[1] = (int)floorf(src->f[1]);
  88    dst->i[2] = (int)floorf(src->f[2]);
  89    dst->i[3] = (int)floorf(src->f[3]);
  90 }
  91
  92 static void
  93 micro_arr(union tgsi_exec_channel *dst,
  94           const union tgsi_exec_channel *src)
  95 {
  96    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
  97    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
  98    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
  99    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
 100 }
 101
 102 static void
 103 micro_ceil(union tgsi_exec_channel *dst,
 104            const union tgsi_exec_channel *src)
 105 {
 106    dst->f[0] = ceilf(src->f[0]);
 107    dst->f[1] = ceilf(src->f[1]);
 108    dst->f[2] = ceilf(src->f[2]);
 109    dst->f[3] = ceilf(src->f[3]);
 110 }
 111
 112 static void
 113 micro_cos(union tgsi_exec_channel *dst,
 114           const union tgsi_exec_channel *src)
 115 {
 116    dst->f[0] = cosf(src->f[0]);
 117    dst->f[1] = cosf(src->f[1]);
 118    dst->f[2] = cosf(src->f[2]);
 119    dst->f[3] = cosf(src->f[3]);
 120 }
 121
 122 static void
 123 micro_ddx(union tgsi_exec_channel *dst,
 124           const union tgsi_exec_channel *src)
 125 {
 126    dst->f[0] =
 127    dst->f[1] =
 128    dst->f[2] =
 129    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 130 }
 131
 132 static void
 133 micro_ddy(union tgsi_exec_channel *dst,
 134           const union tgsi_exec_channel *src)
 135 {
 136    dst->f[0] =
 137    dst->f[1] =
 138    dst->f[2] =
 139    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
 140 }
 141
 142 static void
 143 micro_exp2(union tgsi_exec_channel *dst,
 144            const union tgsi_exec_channel *src)
 145 {
 146 #if FAST_MATH
 147    dst->f[0] = util_fast_exp2(src->f[0]);
 148    dst->f[1] = util_fast_exp2(src->f[1]);
 149    dst->f[2] = util_fast_exp2(src->f[2]);
 150    dst->f[3] = util_fast_exp2(src->f[3]);
 151 #else
 152 #if DEBUG
 153    /* Inf is okay for this instruction, so clamp it to silence assertions. */
 154    uint i;
 155    union tgsi_exec_channel clamped;
 156
 157    for (i = 0; i < 4; i++) {
 158       if (src->f[i] > 127.99999f) {
 159          clamped.f[i] = 127.99999f;
 160       } else if (src->f[i] < -126.99999f) {
 161          clamped.f[i] = -126.99999f;
 162       } else {
 163          clamped.f[i] = src->f[i];
 164       }
 165    }
 166    src = &clamped;
 167 #endif /* DEBUG */
 168
 169    dst->f[0] = powf(2.0f, src->f[0]);
 170    dst->f[1] = powf(2.0f, src->f[1]);
 171    dst->f[2] = powf(2.0f, src->f[2]);
 172    dst->f[3] = powf(2.0f, src->f[3]);
 173 #endif /* FAST_MATH */
 174 }
 175
 176 static void
 177 micro_flr(union tgsi_exec_channel *dst,
 178           const union tgsi_exec_channel *src)
 179 {
 180    dst->f[0] = floorf(src->f[0]);
 181    dst->f[1] = floorf(src->f[1]);
 182    dst->f[2] = floorf(src->f[2]);
 183    dst->f[3] = floorf(src->f[3]);
 184 }
 185
 186 static void
 187 micro_frc(union tgsi_exec_channel *dst,
 188           const union tgsi_exec_channel *src)
 189 {
 190    dst->f[0] = src->f[0] - floorf(src->f[0]);
 191    dst->f[1] = src->f[1] - floorf(src->f[1]);
 192    dst->f[2] = src->f[2] - floorf(src->f[2]);
 193    dst->f[3] = src->f[3] - floorf(src->f[3]);
 194 }
 195
 196 static void
 197 micro_iabs(union tgsi_exec_channel *dst,
 198            const union tgsi_exec_channel *src)
 199 {
 200    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
 201    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
 202    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
 203    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
 204 }
 205
 206 static void
 207 micro_ineg(union tgsi_exec_channel *dst,
 208            const union tgsi_exec_channel *src)
 209 {
 210    dst->i[0] = -src->i[0];
 211    dst->i[1] = -src->i[1];
 212    dst->i[2] = -src->i[2];
 213    dst->i[3] = -src->i[3];
 214 }
 215
 216 static void
 217 micro_lg2(union tgsi_exec_channel *dst,
 218           const union tgsi_exec_channel *src)
 219 {
 220 #if FAST_MATH
 221    dst->f[0] = util_fast_log2(src->f[0]);
 222    dst->f[1] = util_fast_log2(src->f[1]);
 223    dst->f[2] = util_fast_log2(src->f[2]);
 224    dst->f[3] = util_fast_log2(src->f[3]);
 225 #else
 226    dst->f[0] = logf(src->f[0]) * 1.442695f;
 227    dst->f[1] = logf(src->f[1]) * 1.442695f;
 228    dst->f[2] = logf(src->f[2]) * 1.442695f;
 229    dst->f[3] = logf(src->f[3]) * 1.442695f;
 230 #endif
 231 }
 232
 233 static void
 234 micro_lrp(union tgsi_exec_channel *dst,
 235           const union tgsi_exec_channel *src)
 236 {
 237    dst->f[0] = src[0].f[0] * (src[1].f[0] - src[2].f[0]) + src[2].f[0];
 238    dst->f[1] = src[0].f[1] * (src[1].f[1] - src[2].f[1]) + src[2].f[1];
 239    dst->f[2] = src[0].f[2] * (src[1].f[2] - src[2].f[2]) + src[2].f[2];
 240    dst->f[3] = src[0].f[3] * (src[1].f[3] - src[2].f[3]) + src[2].f[3];
 241 }
 242
 243 static void
 244 micro_mad(union tgsi_exec_channel *dst,
 245           const union tgsi_exec_channel *src)
 246 {
 247    dst->f[0] = src[0].f[0] * src[1].f[0] + src[2].f[0];
 248    dst->f[1] = src[0].f[1] * src[1].f[1] + src[2].f[1];
 249    dst->f[2] = src[0].f[2] * src[1].f[2] + src[2].f[2];
 250    dst->f[3] = src[0].f[3] * src[1].f[3] + src[2].f[3];
 251 }
 252
 253 static void
 254 micro_mov(union tgsi_exec_channel *dst,
 255           const union tgsi_exec_channel *src)
 256 {
 257    dst->u[0] = src->u[0];
 258    dst->u[1] = src->u[1];
 259    dst->u[2] = src->u[2];
 260    dst->u[3] = src->u[3];
 261 }
 262
 263 static void
 264 micro_rcp(union tgsi_exec_channel *dst,
 265           const union tgsi_exec_channel *src)
 266 {
 267    dst->f[0] = 1.0f / src->f[0];
 268    dst->f[1] = 1.0f / src->f[1];
 269    dst->f[2] = 1.0f / src->f[2];
 270    dst->f[3] = 1.0f / src->f[3];
 271 }
 272
 273 static void
 274 micro_rnd(union tgsi_exec_channel *dst,
 275           const union tgsi_exec_channel *src)
 276 {
 277    dst->f[0] = floorf(src->f[0] + 0.5f);
 278    dst->f[1] = floorf(src->f[1] + 0.5f);
 279    dst->f[2] = floorf(src->f[2] + 0.5f);
 280    dst->f[3] = floorf(src->f[3] + 0.5f);
 281 }
 282
 283 static void
 284 micro_rsq(union tgsi_exec_channel *dst,
 285           const union tgsi_exec_channel *src)
 286 {
 287    dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
 288    dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
 289    dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
 290    dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
 291 }
 292
 293 static void
 294 micro_seq(union tgsi_exec_channel *dst,
 295           const union tgsi_exec_channel *src)
 296 {
 297    dst->f[0] = src[0].f[0] == src[1].f[0] ? 1.0f : 0.0f;
 298    dst->f[1] = src[0].f[1] == src[1].f[1] ? 1.0f : 0.0f;
 299    dst->f[2] = src[0].f[2] == src[1].f[2] ? 1.0f : 0.0f;
 300    dst->f[3] = src[0].f[3] == src[1].f[3] ? 1.0f : 0.0f;
 301 }
 302
 303 static void
 304 micro_sge(union tgsi_exec_channel *dst,
 305           const union tgsi_exec_channel *src)
 306 {
 307    dst->f[0] = src[0].f[0] >= src[1].f[0] ? 1.0f : 0.0f;
 308    dst->f[1] = src[0].f[1] >= src[1].f[1] ? 1.0f : 0.0f;
 309    dst->f[2] = src[0].f[2] >= src[1].f[2] ? 1.0f : 0.0f;
 310    dst->f[3] = src[0].f[3] >= src[1].f[3] ? 1.0f : 0.0f;
 311 }
 312
 313 static void
 314 micro_sgn(union tgsi_exec_channel *dst,
 315           const union tgsi_exec_channel *src)
 316 {
 317    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
 318    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
 319    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
 320    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
 321 }
 322
 323 static void
 324 micro_sgt(union tgsi_exec_channel *dst,
 325           const union tgsi_exec_channel *src)
 326 {
 327    dst->f[0] = src[0].f[0] > src[1].f[0] ? 1.0f : 0.0f;
 328    dst->f[1] = src[0].f[1] > src[1].f[1] ? 1.0f : 0.0f;
 329    dst->f[2] = src[0].f[2] > src[1].f[2] ? 1.0f : 0.0f;
 330    dst->f[3] = src[0].f[3] > src[1].f[3] ? 1.0f : 0.0f;
 331 }
 332
 333 static void
 334 micro_sin(union tgsi_exec_channel *dst,
 335           const union tgsi_exec_channel *src)
 336 {
 337    dst->f[0] = sinf(src->f[0]);
 338    dst->f[1] = sinf(src->f[1]);
 339    dst->f[2] = sinf(src->f[2]);
 340    dst->f[3] = sinf(src->f[3]);
 341 }
 342
 343 static void
 344 micro_sle(union tgsi_exec_channel *dst,
 345           const union tgsi_exec_channel *src)
 346 {
 347    dst->f[0] = src[0].f[0] <= src[1].f[0] ? 1.0f : 0.0f;
 348    dst->f[1] = src[0].f[1] <= src[1].f[1] ? 1.0f : 0.0f;
 349    dst->f[2] = src[0].f[2] <= src[1].f[2] ? 1.0f : 0.0f;
 350    dst->f[3] = src[0].f[3] <= src[1].f[3] ? 1.0f : 0.0f;
 351 }
 352
 353 static void
 354 micro_slt(union tgsi_exec_channel *dst,
 355           const union tgsi_exec_channel *src)
 356 {
 357    dst->f[0] = src[0].f[0] < src[1].f[0] ? 1.0f : 0.0f;
 358    dst->f[1] = src[0].f[1] < src[1].f[1] ? 1.0f : 0.0f;
 359    dst->f[2] = src[0].f[2] < src[1].f[2] ? 1.0f : 0.0f;
 360    dst->f[3] = src[0].f[3] < src[1].f[3] ? 1.0f : 0.0f;
 361 }
 362
 363 static void
 364 micro_sne(union tgsi_exec_channel *dst,
 365           const union tgsi_exec_channel *src)
 366 {
 367    dst->f[0] = src[0].f[0] != src[1].f[0] ? 1.0f : 0.0f;
 368    dst->f[1] = src[0].f[1] != src[1].f[1] ? 1.0f : 0.0f;
 369    dst->f[2] = src[0].f[2] != src[1].f[2] ? 1.0f : 0.0f;
 370    dst->f[3] = src[0].f[3] != src[1].f[3] ? 1.0f : 0.0f;
 371 }
 372
 373 static void
 374 micro_trunc(union tgsi_exec_channel *dst,
 375             const union tgsi_exec_channel *src)
 376 {
 377    dst->f[0] = (float)(int)src->f[0];
 378    dst->f[1] = (float)(int)src->f[1];
 379    dst->f[2] = (float)(int)src->f[2];
 380    dst->f[3] = (float)(int)src->f[3];
 381 }
 382
 383
/* Numeric indices for the four register channels X, Y, Z and W. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
 388
/**
 * Tag naming how a channel's lanes are to be interpreted: as floats,
 * signed integers or unsigned integers.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};
 394
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 *
 * Each name is a plain alias for the corresponding TGSI_EXEC_TEMP_* macro.
 * The _I/_C pairs are used together as mach->Temps[_I].xyzw[_C], as in
 * tgsi_exec_machine_create() where the constant registers are filled in.
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_P0            TGSI_EXEC_TEMP_P0
 428
/**
 * Non-zero when bit CHAN is set in the write mask of the first destination
 * register (Dst[0]) of instruction INST.  INST is an instruction object,
 * not a pointer.
 */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/** Same test as IS_CHANNEL_ENABLED, but for the second destination (Dst[1]). */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))

/**
 * Loop header that runs the following statement once for every channel
 * enabled in Dst[0]'s write mask; CHAN must be an assignable variable.
 * The expansion ends in a bare "if", so do not follow the body with "else".
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/** Same as FOR_EACH_ENABLED_CHANNEL, but driven by Dst[1]'s write mask. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
 442
 443
 444 /** The execution mask depends on the conditional mask and the loop mask */
 445 #define UPDATE_EXEC_MASK(MACH) \
 446       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
 447
 448
/** Constant channel whose four lanes are initialized to 0.0. */
static const union tgsi_exec_channel ZeroVec =
   { { 0.0, 0.0, 0.0, 0.0 } };
 451
 452
/**
 * Assert that none of the four float lanes of the channel pointed to by
 * 'chan' is infinite or NaN.  Built from assert(), so it only checks
 * anything in builds where assertions are active.
 */
#define CHECK_INF_OR_NAN(chan) do {\
      assert(!util_is_inf_or_nan((chan)->f[0]));\
      assert(!util_is_inf_or_nan((chan)->f[1]));\
      assert(!util_is_inf_or_nan((chan)->f[2]));\
      assert(!util_is_inf_or_nan((chan)->f[3]));\
   } while (0)
 459
 460
#ifdef DEBUG
/**
 * Debug helper: print the four lanes of 'chan' as floats on one line,
 * prefixed by 'msg'.  Only compiled in DEBUG builds.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
}
#endif
 469
 470
 471 #ifdef DEBUG
 472 static void
 473 print_temp(const struct tgsi_exec_machine *mach, uint index)
 474 {
 475    const struct tgsi_exec_vector *tmp = &mach->Temps[index];
 476    int i;
 477    debug_printf("Temp[%u] =\n", index);
 478    for (i = 0; i < 4; i++) {
 479       debug_printf("  %c: { %f, %f, %f, %f }\n",
 480                    "XYZW"[i],
 481                    tmp->xyzw[i].f[0],
 482                    tmp->xyzw[i].f[1],
 483                    tmp->xyzw[i].f[2],
 484                    tmp->xyzw[i].f[3]);
 485    }
 486 }
 487 #endif
 488
 489
 490 /**
 491  * Check if there's a potential src/dst register data dependency when
 492  * using SOA execution.
 493  * Example:
 494  *   MOV T, T.yxwz;
 495  * This would expand into:
 496  *   MOV t0, t1;
 497  *   MOV t1, t0;
 498  *   MOV t2, t3;
 499  *   MOV t3, t2;
 500  * The second instruction will have the wrong value for t0 if executed as-is.
 501  */
 502 boolean
 503 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
 504 {
 505    uint i, chan;
 506
 507    uint writemask = inst->Dst[0].Register.WriteMask;
 508    if (writemask == TGSI_WRITEMASK_X ||
 509        writemask == TGSI_WRITEMASK_Y ||
 510        writemask == TGSI_WRITEMASK_Z ||
 511        writemask == TGSI_WRITEMASK_W ||
 512        writemask == TGSI_WRITEMASK_NONE) {
 513       /* no chance of data dependency */
 514       return FALSE;
 515    }
 516
 517    /* loop over src regs */
 518    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 519       if ((inst->Src[i].Register.File ==
 520            inst->Dst[0].Register.File) &&
 521           (inst->Src[i].Register.Index ==
 522            inst->Dst[0].Register.Index)) {
 523          /* loop over dest channels */
 524          uint channelsWritten = 0x0;
 525          FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
 526             /* check if we're reading a channel that's been written */
 527             uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
 528             if (channelsWritten & (1 << swizzle)) {
 529                return TRUE;
 530             }
 531
 532             channelsWritten |= (1 << chan);
 533          }
 534       }
 535    }
 536    return FALSE;
 537 }
 538
 539
 540 /**
 541  * Initialize machine state by expanding tokens to full instructions,
 542  * allocating temporary storage, setting up constants, etc.
 543  * After this, we can call tgsi_exec_machine_run() many times.
 544  */
 545 void
 546 tgsi_exec_machine_bind_shader(
 547    struct tgsi_exec_machine *mach,
 548    const struct tgsi_token *tokens,
 549    uint numSamplers,
 550    struct tgsi_sampler **samplers)
 551 {
 552    uint k;
 553    struct tgsi_parse_context parse;
 554    struct tgsi_exec_labels *labels = &mach->Labels;
 555    struct tgsi_full_instruction *instructions;
 556    struct tgsi_full_declaration *declarations;
 557    uint maxInstructions = 10, numInstructions = 0;
 558    uint maxDeclarations = 10, numDeclarations = 0;
 559    uint instno = 0;
 560
 561 #if 0
 562    tgsi_dump(tokens, 0);
 563 #endif
 564
 565    util_init_math();
 566
 567    mach->Tokens = tokens;
 568    mach->Samplers = samplers;
 569
 570    k = tgsi_parse_init (&parse, mach->Tokens);
 571    if (k != TGSI_PARSE_OK) {
 572       debug_printf( "Problem parsing!\n" );
 573       return;
 574    }
 575
 576    mach->Processor = parse.FullHeader.Processor.Processor;
 577    mach->ImmLimit = 0;
 578    labels->count = 0;
 579
 580    declarations = (struct tgsi_full_declaration *)
 581       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
 582
 583    if (!declarations) {
 584       return;
 585    }
 586
 587    instructions = (struct tgsi_full_instruction *)
 588       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
 589
 590    if (!instructions) {
 591       FREE( declarations );
 592       return;
 593    }
 594
 595    while( !tgsi_parse_end_of_tokens( &parse ) ) {
 596       uint pointer = parse.Position;
 597       uint i;
 598
 599       tgsi_parse_token( &parse );
 600       switch( parse.FullToken.Token.Type ) {
 601       case TGSI_TOKEN_TYPE_DECLARATION:
 602          /* save expanded declaration */
 603          if (numDeclarations == maxDeclarations) {
 604             declarations = REALLOC(declarations,
 605                                    maxDeclarations
 606                                    * sizeof(struct tgsi_full_declaration),
 607                                    (maxDeclarations + 10)
 608                                    * sizeof(struct tgsi_full_declaration));
 609             maxDeclarations += 10;
 610          }
 611          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
 612             unsigned reg;
 613             for (reg = parse.FullToken.FullDeclaration.Range.First;
 614                  reg <= parse.FullToken.FullDeclaration.Range.Last;
 615                  ++reg) {
 616                ++mach->NumOutputs;
 617             }
 618          }
 619          memcpy(declarations + numDeclarations,
 620                 &parse.FullToken.FullDeclaration,
 621                 sizeof(declarations[0]));
 622          numDeclarations++;
 623          break;
 624
 625       case TGSI_TOKEN_TYPE_IMMEDIATE:
 626          {
 627             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
 628             assert( size <= 4 );
 629             assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
 630
 631             for( i = 0; i < size; i++ ) {
 632                mach->Imms[mach->ImmLimit][i] =
 633                   parse.FullToken.FullImmediate.u[i].Float;
 634             }
 635             mach->ImmLimit += 1;
 636          }
 637          break;
 638
 639       case TGSI_TOKEN_TYPE_INSTRUCTION:
 640          assert( labels->count < MAX_LABELS );
 641
 642          labels->labels[labels->count][0] = instno;
 643          labels->labels[labels->count][1] = pointer;
 644          labels->count++;
 645
 646          /* save expanded instruction */
 647          if (numInstructions == maxInstructions) {
 648             instructions = REALLOC(instructions,
 649                                    maxInstructions
 650                                    * sizeof(struct tgsi_full_instruction),
 651                                    (maxInstructions + 10)
 652                                    * sizeof(struct tgsi_full_instruction));
 653             maxInstructions += 10;
 654          }
 655
 656          memcpy(instructions + numInstructions,
 657                 &parse.FullToken.FullInstruction,
 658                 sizeof(instructions[0]));
 659
 660          numInstructions++;
 661          break;
 662
 663       case TGSI_TOKEN_TYPE_PROPERTY:
 664          break;
 665
 666       default:
 667          assert( 0 );
 668       }
 669    }
 670    tgsi_parse_free (&parse);
 671
 672    if (mach->Declarations) {
 673       FREE( mach->Declarations );
 674    }
 675    mach->Declarations = declarations;
 676    mach->NumDeclarations = numDeclarations;
 677
 678    if (mach->Instructions) {
 679       FREE( mach->Instructions );
 680    }
 681    mach->Instructions = instructions;
 682    mach->NumInstructions = numInstructions;
 683 }
 684
 685
 686 struct tgsi_exec_machine *
 687 tgsi_exec_machine_create( void )
 688 {
 689    struct tgsi_exec_machine *mach;
 690    uint i;
 691
 692    mach = align_malloc( sizeof *mach, 16 );
 693    if (!mach)
 694       goto fail;
 695
 696    memset(mach, 0, sizeof(*mach));
 697
 698    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
 699    mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
 700    mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
 701
 702    /* Setup constants. */
 703    for( i = 0; i < 4; i++ ) {
 704       mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
 705       mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
 706       mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
 707       mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
 708       mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
 709       mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
 710       mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
 711       mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
 712       mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
 713       mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
 714    }
 715
 716 #ifdef DEBUG
 717    /* silence warnings */
 718    (void) print_chan;
 719    (void) print_temp;
 720 #endif
 721
 722    return mach;
 723
 724 fail:
 725    align_free(mach);
 726    return NULL;
 727 }
 728
 729
 730 void
 731 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
 732 {
 733    if (mach) {
 734       FREE(mach->Instructions);
 735       FREE(mach->Declarations);
 736    }
 737
 738    align_free(mach);
 739 }
 740
 741 static void
 742 micro_add(
 743    union tgsi_exec_channel *dst,
 744    const union tgsi_exec_channel *src0,
 745    const union tgsi_exec_channel *src1 )
 746 {
 747    dst->f[0] = src0->f[0] + src1->f[0];
 748    dst->f[1] = src0->f[1] + src1->f[1];
 749    dst->f[2] = src0->f[2] + src1->f[2];
 750    dst->f[3] = src0->f[3] + src1->f[3];
 751 }
 752
 753 static void
 754 micro_div(
 755    union tgsi_exec_channel *dst,
 756    const union tgsi_exec_channel *src0,
 757    const union tgsi_exec_channel *src1 )
 758 {
 759    if (src1->f[0] != 0) {
 760       dst->f[0] = src0->f[0] / src1->f[0];
 761    }
 762    if (src1->f[1] != 0) {
 763       dst->f[1] = src0->f[1] / src1->f[1];
 764    }
 765    if (src1->f[2] != 0) {
 766       dst->f[2] = src0->f[2] / src1->f[2];
 767    }
 768    if (src1->f[3] != 0) {
 769       dst->f[3] = src0->f[3] / src1->f[3];
 770    }
 771 }
 772
 773 static void
 774 micro_float_clamp(union tgsi_exec_channel *dst,
 775                   const union tgsi_exec_channel *src)
 776 {
 777    uint i;
 778
 779    for (i = 0; i < 4; i++) {
 780       if (src->f[i] > 0.0f) {
 781          if (src->f[i] > 1.884467e+019f)
 782             dst->f[i] = 1.884467e+019f;
 783          else if (src->f[i] < 5.42101e-020f)
 784             dst->f[i] = 5.42101e-020f;
 785          else
 786             dst->f[i] = src->f[i];
 787       }
 788       else {
 789          if (src->f[i] < -1.884467e+019f)
 790             dst->f[i] = -1.884467e+019f;
 791          else if (src->f[i] > -5.42101e-020f)
 792             dst->f[i] = -5.42101e-020f;
 793          else
 794             dst->f[i] = src->f[i];
 795       }
 796    }
 797 }
 798
 799 static void
 800 micro_lt(
 801    union tgsi_exec_channel *dst,
 802    const union tgsi_exec_channel *src0,
 803    const union tgsi_exec_channel *src1,
 804    const union tgsi_exec_channel *src2,
 805    const union tgsi_exec_channel *src3 )
 806 {
 807    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
 808    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
 809    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
 810    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
 811 }
 812
 813 static void
 814 micro_max(
 815    union tgsi_exec_channel *dst,
 816    const union tgsi_exec_channel *src0,
 817    const union tgsi_exec_channel *src1 )
 818 {
 819    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
 820    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
 821    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
 822    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
 823 }
 824
 825 static void
 826 micro_min(
 827    union tgsi_exec_channel *dst,
 828    const union tgsi_exec_channel *src0,
 829    const union tgsi_exec_channel *src1 )
 830 {
 831    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
 832    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
 833    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
 834    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
 835 }
 836
 837 static void
 838 micro_mul(
 839    union tgsi_exec_channel *dst,
 840    const union tgsi_exec_channel *src0,
 841    const union tgsi_exec_channel *src1 )
 842 {
 843    dst->f[0] = src0->f[0] * src1->f[0];
 844    dst->f[1] = src0->f[1] * src1->f[1];
 845    dst->f[2] = src0->f[2] * src1->f[2];
 846    dst->f[3] = src0->f[3] * src1->f[3];
 847 }
 848
#if 0
/**
 * Disabled (compiled out by "#if 0").
 *
 * Per-lane signed multiply with two destinations: the int product of
 * src0 and src1 is stored in dst1 and dst0 is set to zero.
 * NOTE(review): despite the "64" in the name, no upper half of a 64-bit
 * product is computed; dst0 is always 0.
 */
static void
micro_imul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   dst1->i[0] = src0->i[0] * src1->i[0];
   dst1->i[1] = src0->i[1] * src1->i[1];
   dst1->i[2] = src0->i[2] * src1->i[2];
   dst1->i[3] = src0->i[3] * src1->i[3];
   dst0->i[0] = 0;
   dst0->i[1] = 0;
   dst0->i[2] = 0;
   dst0->i[3] = 0;
}
#endif
 867
#if 0
/**
 * Disabled (compiled out by "#if 0").
 *
 * Per-lane unsigned multiply with two destinations: the unsigned product
 * of src0 and src1 is stored in dst1 and dst0 is set to zero.
 * NOTE(review): despite the "64" in the name, no upper half of a 64-bit
 * product is computed; dst0 is always 0.
 */
static void
micro_umul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   dst1->u[0] = src0->u[0] * src1->u[0];
   dst1->u[1] = src0->u[1] * src1->u[1];
   dst1->u[2] = src0->u[2] * src1->u[2];
   dst1->u[3] = src0->u[3] * src1->u[3];
   dst0->u[0] = 0;
   dst0->u[1] = 0;
   dst0->u[2] = 0;
   dst0->u[3] = 0;
}
#endif
 886
 887
#if 0
/**
 * Disabled (compiled out by "#if 0").
 *
 * Per-lane conditional move on the unsigned view:
 * dst.u[i] = src0.u[i] ? src1.u[i] : src2.u[i], i.e. src1 is selected
 * wherever the src0 lane has any bit set.
 */
static void
micro_movc(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2 )
{
   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
}
#endif
 902
 903 static void
 904 micro_neg(
 905    union tgsi_exec_channel *dst,
 906    const union tgsi_exec_channel *src )
 907 {
 908    dst->f[0] = -src->f[0];
 909    dst->f[1] = -src->f[1];
 910    dst->f[2] = -src->f[2];
 911    dst->f[3] = -src->f[3];
 912 }
 913
 914 static void
 915 micro_pow(
 916    union tgsi_exec_channel *dst,
 917    const union tgsi_exec_channel *src0,
 918    const union tgsi_exec_channel *src1 )
 919 {
 920 #if FAST_MATH
 921    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
 922    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
 923    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
 924    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
 925 #else
 926    dst->f[0] = powf( src0->f[0], src1->f[0] );
 927    dst->f[1] = powf( src0->f[1], src1->f[1] );
 928    dst->f[2] = powf( src0->f[2], src1->f[2] );
 929    dst->f[3] = powf( src0->f[3], src1->f[3] );
 930 #endif
 931 }
 932
 933 static void
 934 micro_sqrt( union tgsi_exec_channel *dst,
 935             const union tgsi_exec_channel *src )
 936 {
 937    dst->f[0] = sqrtf( src->f[0] );
 938    dst->f[1] = sqrtf( src->f[1] );
 939    dst->f[2] = sqrtf( src->f[2] );
 940    dst->f[3] = sqrtf( src->f[3] );
 941 }
 942
 943 static void
 944 micro_sub(
 945    union tgsi_exec_channel *dst,
 946    const union tgsi_exec_channel *src0,
 947    const union tgsi_exec_channel *src1 )
 948 {
 949    dst->f[0] = src0->f[0] - src1->f[0];
 950    dst->f[1] = src0->f[1] - src1->f[1];
 951    dst->f[2] = src0->f[2] - src1->f[2];
 952    dst->f[3] = src0->f[3] - src1->f[3];
 953 }
 954
 955 static void
 956 fetch_src_file_channel(
 957    const struct tgsi_exec_machine *mach,
 958    const uint file,
 959    const uint swizzle,
 960    const union tgsi_exec_channel *index,
 961    union tgsi_exec_channel *chan )
 962 {
 963    switch( swizzle ) {
 964    case TGSI_SWIZZLE_X:
 965    case TGSI_SWIZZLE_Y:
 966    case TGSI_SWIZZLE_Z:
 967    case TGSI_SWIZZLE_W:
 968       switch( file ) {
 969       case TGSI_FILE_CONSTANT:
 970          assert(mach->Consts);
 971          if (index->i[0] < 0)
 972             chan->f[0] = 0.0f;
 973          else
 974             chan->f[0] = mach->Consts[index->i[0]][swizzle];
 975          if (index->i[1] < 0)
 976             chan->f[1] = 0.0f;
 977          else
 978             chan->f[1] = mach->Consts[index->i[1]][swizzle];
 979          if (index->i[2] < 0)
 980             chan->f[2] = 0.0f;
 981          else
 982             chan->f[2] = mach->Consts[index->i[2]][swizzle];
 983          if (index->i[3] < 0)
 984             chan->f[3] = 0.0f;
 985          else
 986             chan->f[3] = mach->Consts[index->i[3]][swizzle];
 987          break;
 988
 989       case TGSI_FILE_INPUT:
 990       case TGSI_FILE_SYSTEM_VALUE:
 991          chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
 992          chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
 993          chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
 994          chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
 995          break;
 996
 997       case TGSI_FILE_TEMPORARY:
 998          assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
 999          chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1000          chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1001          chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1002          chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1003          break;
1004
1005       case TGSI_FILE_IMMEDIATE:
1006          assert( index->i[0] < (int) mach->ImmLimit );
1007          chan->f[0] = mach->Imms[index->i[0]][swizzle];
1008          assert( index->i[1] < (int) mach->ImmLimit );
1009          chan->f[1] = mach->Imms[index->i[1]][swizzle];
1010          assert( index->i[2] < (int) mach->ImmLimit );
1011          chan->f[2] = mach->Imms[index->i[2]][swizzle];
1012          assert( index->i[3] < (int) mach->ImmLimit );
1013          chan->f[3] = mach->Imms[index->i[3]][swizzle];
1014          break;
1015
1016       case TGSI_FILE_ADDRESS:
1017          chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1018          chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1019          chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1020          chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1021          break;
1022
1023       case TGSI_FILE_PREDICATE:
1024          assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
1025          assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
1026          assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
1027          assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
1028          chan->u[0] = mach->Predicates[0].xyzw[swizzle].u[0];
1029          chan->u[1] = mach->Predicates[0].xyzw[swizzle].u[1];
1030          chan->u[2] = mach->Predicates[0].xyzw[swizzle].u[2];
1031          chan->u[3] = mach->Predicates[0].xyzw[swizzle].u[3];
1032          break;
1033
1034       case TGSI_FILE_OUTPUT:
1035          /* vertex/fragment output vars can be read too */
1036          chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1037          chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1038          chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1039          chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1040          break;
1041
1042       default:
1043          assert( 0 );
1044          chan->u[0] = 0;
1045          chan->u[1] = 0;
1046          chan->u[2] = 0;
1047          chan->u[3] = 0;
1048       }
1049       break;
1050
1051    default:
1052       assert( 0 );
1053       chan->u[0] = 0;
1054       chan->u[1] = 0;
1055       chan->u[2] = 0;
1056       chan->u[3] = 0;
1057    }
1058 }
1059
1060 static void
1061 fetch_source(const struct tgsi_exec_machine *mach,
1062              union tgsi_exec_channel *chan,
1063              const struct tgsi_full_src_register *reg,
1064              const uint chan_index,
1065              enum tgsi_exec_datatype src_datatype)
1066 {
1067    union tgsi_exec_channel index;
1068    uint swizzle;
1069
1070    /* We start with a direct index into a register file.
1071     *
1072     *    file[1],
1073     *    where:
1074     *       file = Register.File
1075     *       [1] = Register.Index
1076     */
1077    index.i[0] =
1078    index.i[1] =
1079    index.i[2] =
1080    index.i[3] = reg->Register.Index;
1081
1082    /* There is an extra source register that indirectly subscripts
1083     * a register file. The direct index now becomes an offset
1084     * that is being added to the indirect register.
1085     *
1086     *    file[ind[2].x+1],
1087     *    where:
1088     *       ind = Indirect.File
1089     *       [2] = Indirect.Index
1090     *       .x = Indirect.SwizzleX
1091     */
1092    if (reg->Register.Indirect) {
1093       union tgsi_exec_channel index2;
1094       union tgsi_exec_channel indir_index;
1095       const uint execmask = mach->ExecMask;
1096       uint i;
1097
1098       /* which address register (always zero now) */
1099       index2.i[0] =
1100       index2.i[1] =
1101       index2.i[2] =
1102       index2.i[3] = reg->Indirect.Index;
1103
1104       /* get current value of address register[swizzle] */
1105       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1106       fetch_src_file_channel(
1107          mach,
1108          reg->Indirect.File,
1109          swizzle,
1110          &index2,
1111          &indir_index );
1112
1113       /* add value of address register to the offset */
1114       index.i[0] += indir_index.i[0];
1115       index.i[1] += indir_index.i[1];
1116       index.i[2] += indir_index.i[2];
1117       index.i[3] += indir_index.i[3];
1118
1119       /* for disabled execution channels, zero-out the index to
1120        * avoid using a potential garbage value.
1121        */
1122       for (i = 0; i < QUAD_SIZE; i++) {
1123          if ((execmask & (1 << i)) == 0)
1124             index.i[i] = 0;
1125       }
1126    }
1127
1128    /* There is an extra source register that is a second
1129     * subscript to a register file. Effectively it means that
1130     * the register file is actually a 2D array of registers.
1131     *
1132     *    file[1][3] == file[1*sizeof(file[1])+3],
1133     *    where:
1134     *       [3] = Dimension.Index
1135     */
1136    if (reg->Register.Dimension) {
1137       /* The size of the first-order array depends on the register file type.
1138        * We need to multiply the index to the first array to get an effective,
1139        * "flat" index that points to the beginning of the second-order array.
1140        */
1141       switch (reg->Register.File) {
1142       case TGSI_FILE_INPUT:
1143       case TGSI_FILE_SYSTEM_VALUE:
1144          index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1145          index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1146          index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1147          index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1148          break;
1149       case TGSI_FILE_CONSTANT:
1150          index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1151          index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1152          index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1153          index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1154          break;
1155       default:
1156          assert( 0 );
1157       }
1158
1159       index.i[0] += reg->Dimension.Index;
1160       index.i[1] += reg->Dimension.Index;
1161       index.i[2] += reg->Dimension.Index;
1162       index.i[3] += reg->Dimension.Index;
1163
1164       /* Again, the second subscript index can be addressed indirectly
1165        * identically to the first one.
1166        * Nothing stops us from indirectly addressing the indirect register,
1167        * but there is no need for that, so we won't exercise it.
1168        *
1169        *    file[1][ind[4].y+3],
1170        *    where:
1171        *       ind = DimIndirect.File
1172        *       [4] = DimIndirect.Index
1173        *       .y = DimIndirect.SwizzleX
1174        */
1175       if (reg->Dimension.Indirect) {
1176          union tgsi_exec_channel index2;
1177          union tgsi_exec_channel indir_index;
1178          const uint execmask = mach->ExecMask;
1179          uint i;
1180
1181          index2.i[0] =
1182          index2.i[1] =
1183          index2.i[2] =
1184          index2.i[3] = reg->DimIndirect.Index;
1185
1186          swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1187          fetch_src_file_channel(
1188             mach,
1189             reg->DimIndirect.File,
1190             swizzle,
1191             &index2,
1192             &indir_index );
1193
1194          index.i[0] += indir_index.i[0];
1195          index.i[1] += indir_index.i[1];
1196          index.i[2] += indir_index.i[2];
1197          index.i[3] += indir_index.i[3];
1198
1199          /* for disabled execution channels, zero-out the index to
1200           * avoid using a potential garbage value.
1201           */
1202          for (i = 0; i < QUAD_SIZE; i++) {
1203             if ((execmask & (1 << i)) == 0)
1204                index.i[i] = 0;
1205          }
1206       }
1207
1208       /* If by any chance there was a need for a 3D array of register
1209        * files, we would have to check whether Dimension is followed
1210        * by a dimension register and continue the saga.
1211        */
1212    }
1213
1214    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1215    fetch_src_file_channel(
1216       mach,
1217       reg->Register.File,
1218       swizzle,
1219       &index,
1220       chan );
1221
1222    if (reg->Register.Absolute) {
1223       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1224          micro_abs(chan, chan);
1225       } else {
1226          micro_iabs(chan, chan);
1227       }
1228    }
1229
1230    if (reg->Register.Negate) {
1231       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1232          micro_neg(chan, chan);
1233       } else {
1234          micro_ineg(chan, chan);
1235       }
1236    }
1237 }
1238
1239 static void
1240 store_dest(struct tgsi_exec_machine *mach,
1241            const union tgsi_exec_channel *chan,
1242            const struct tgsi_full_dst_register *reg,
1243            const struct tgsi_full_instruction *inst,
1244            uint chan_index,
1245            enum tgsi_exec_datatype dst_datatype)
1246 {
1247    uint i;
1248    union tgsi_exec_channel null;
1249    union tgsi_exec_channel *dst;
1250    uint execmask = mach->ExecMask;
1251    int offset = 0;  /* indirection offset */
1252    int index;
1253
1254    if (dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1255       CHECK_INF_OR_NAN(chan);
1256    }
1257
1258    /* There is an extra source register that indirectly subscripts
1259     * a register file. The direct index now becomes an offset
1260     * that is being added to the indirect register.
1261     *
1262     *    file[ind[2].x+1],
1263     *    where:
1264     *       ind = Indirect.File
1265     *       [2] = Indirect.Index
1266     *       .x = Indirect.SwizzleX
1267     */
1268    if (reg->Register.Indirect) {
1269       union tgsi_exec_channel index;
1270       union tgsi_exec_channel indir_index;
1271       uint swizzle;
1272
1273       /* which address register (always zero for now) */
1274       index.i[0] =
1275       index.i[1] =
1276       index.i[2] =
1277       index.i[3] = reg->Indirect.Index;
1278
1279       /* get current value of address register[swizzle] */
1280       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1281
1282       /* fetch values from the address/indirection register */
1283       fetch_src_file_channel(
1284          mach,
1285          reg->Indirect.File,
1286          swizzle,
1287          &index,
1288          &indir_index );
1289
1290       /* save indirection offset */
1291       offset = indir_index.i[0];
1292    }
1293
1294    switch (reg->Register.File) {
1295    case TGSI_FILE_NULL:
1296       dst = &null;
1297       break;
1298
1299    case TGSI_FILE_OUTPUT:
1300       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1301          + reg->Register.Index;
1302       dst = &mach->Outputs[offset + index].xyzw[chan_index];
1303 #if 0
1304       if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1305          fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1306          for (i = 0; i < QUAD_SIZE; i++)
1307             if (execmask & (1 << i))
1308                fprintf(stderr, "%f, ", chan->f[i]);
1309          fprintf(stderr, ")\n");
1310       }
1311 #endif
1312       break;
1313
1314    case TGSI_FILE_TEMPORARY:
1315       index = reg->Register.Index;
1316       assert( index < TGSI_EXEC_NUM_TEMPS );
1317       dst = &mach->Temps[offset + index].xyzw[chan_index];
1318       break;
1319
1320    case TGSI_FILE_ADDRESS:
1321       index = reg->Register.Index;
1322       dst = &mach->Addrs[index].xyzw[chan_index];
1323       break;
1324
1325    case TGSI_FILE_LOOP:
1326       assert(reg->Register.Index == 0);
1327       assert(mach->LoopCounterStackTop > 0);
1328       assert(chan_index == CHAN_X);
1329       dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1330       break;
1331
1332    case TGSI_FILE_PREDICATE:
1333       index = reg->Register.Index;
1334       assert(index < TGSI_EXEC_NUM_PREDS);
1335       dst = &mach->Predicates[index].xyzw[chan_index];
1336       break;
1337
1338    default:
1339       assert( 0 );
1340       return;
1341    }
1342
1343    if (inst->Instruction.Predicate) {
1344       uint swizzle;
1345       union tgsi_exec_channel *pred;
1346
1347       switch (chan_index) {
1348       case CHAN_X:
1349          swizzle = inst->Predicate.SwizzleX;
1350          break;
1351       case CHAN_Y:
1352          swizzle = inst->Predicate.SwizzleY;
1353          break;
1354       case CHAN_Z:
1355          swizzle = inst->Predicate.SwizzleZ;
1356          break;
1357       case CHAN_W:
1358          swizzle = inst->Predicate.SwizzleW;
1359          break;
1360       default:
1361          assert(0);
1362          return;
1363       }
1364
1365       assert(inst->Predicate.Index == 0);
1366
1367       pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1368
1369       if (inst->Predicate.Negate) {
1370          for (i = 0; i < QUAD_SIZE; i++) {
1371             if (pred->u[i]) {
1372                execmask &= ~(1 << i);
1373             }
1374          }
1375       } else {
1376          for (i = 0; i < QUAD_SIZE; i++) {
1377             if (!pred->u[i]) {
1378                execmask &= ~(1 << i);
1379             }
1380          }
1381       }
1382    }
1383
1384    switch (inst->Instruction.Saturate) {
1385    case TGSI_SAT_NONE:
1386       for (i = 0; i < QUAD_SIZE; i++)
1387          if (execmask & (1 << i))
1388             dst->i[i] = chan->i[i];
1389       break;
1390
1391    case TGSI_SAT_ZERO_ONE:
1392       for (i = 0; i < QUAD_SIZE; i++)
1393          if (execmask & (1 << i)) {
1394             if (chan->f[i] < 0.0f)
1395                dst->f[i] = 0.0f;
1396             else if (chan->f[i] > 1.0f)
1397                dst->f[i] = 1.0f;
1398             else
1399                dst->i[i] = chan->i[i];
1400          }
1401       break;
1402
1403    case TGSI_SAT_MINUS_PLUS_ONE:
1404       for (i = 0; i < QUAD_SIZE; i++)
1405          if (execmask & (1 << i)) {
1406             if (chan->f[i] < -1.0f)
1407                dst->f[i] = -1.0f;
1408             else if (chan->f[i] > 1.0f)
1409                dst->f[i] = 1.0f;
1410             else
1411                dst->i[i] = chan->i[i];
1412          }
1413       break;
1414
1415    default:
1416       assert( 0 );
1417    }
1418 }
1419
/*
 * Shorthands for float-typed operand access.  Both expect variables named
 * `mach` and `inst` to be in scope at the point of use.
 *
 * FETCH reads channel CHAN of source operand INDEX into VAL.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/* STORE writes VAL to channel CHAN of destination operand INDEX. */
#define STORE(VAL,INDEX,CHAN)\
   store_dest(mach, VAL, &inst->Dst[INDEX], inst, CHAN, TGSI_EXEC_DATA_FLOAT)
1425
1426
1427 /**
1428  * Execute ARB-style KIL which is predicated by a src register.
1429  * Kill fragment if any of the four values is less than zero.
1430  */
1431 static void
1432 exec_kil(struct tgsi_exec_machine *mach,
1433          const struct tgsi_full_instruction *inst)
1434 {
1435    uint uniquemask;
1436    uint chan_index;
1437    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1438    union tgsi_exec_channel r[1];
1439
1440    /* This mask stores component bits that were already tested. */
1441    uniquemask = 0;
1442
1443    for (chan_index = 0; chan_index < 4; chan_index++)
1444    {
1445       uint swizzle;
1446       uint i;
1447
1448       /* unswizzle channel */
1449       swizzle = tgsi_util_get_full_src_register_swizzle (
1450                         &inst->Src[0],
1451                         chan_index);
1452
1453       /* check if the component has not been already tested */
1454       if (uniquemask & (1 << swizzle))
1455          continue;
1456       uniquemask |= 1 << swizzle;
1457
1458       FETCH(&r[0], 0, chan_index);
1459       for (i = 0; i < 4; i++)
1460          if (r[0].f[i] < 0.0f)
1461             kilmask |= 1 << i;
1462    }
1463
1464    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1465 }
1466
1467 /**
1468  * Execute NVIDIA-style KIL which is predicated by a condition code.
1469  * Kill fragment if the condition code is TRUE.
1470  */
1471 static void
1472 exec_kilp(struct tgsi_exec_machine *mach,
1473           const struct tgsi_full_instruction *inst)
1474 {
1475    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1476
1477    /* "unconditional" kil */
1478    kilmask = mach->ExecMask;
1479    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1480 }
1481
1482 static void
1483 emit_vertex(struct tgsi_exec_machine *mach)
1484 {
1485    /* FIXME: check for exec mask correctly
1486    unsigned i;
1487    for (i = 0; i < QUAD_SIZE; ++i) {
1488          if ((mach->ExecMask & (1 << i)))
1489    */
1490    if (mach->ExecMask) {
1491       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1492       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1493    }
1494 }
1495
1496 static void
1497 emit_primitive(struct tgsi_exec_machine *mach)
1498 {
1499    unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1500    /* FIXME: check for exec mask correctly
1501    unsigned i;
1502    for (i = 0; i < QUAD_SIZE; ++i) {
1503          if ((mach->ExecMask & (1 << i)))
1504    */
1505    if (mach->ExecMask) {
1506       ++(*prim_count);
1507       debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1508       mach->Primitives[*prim_count] = 0;
1509    }
1510 }
1511
1512 /*
1513  * Fetch a four texture samples using STR texture coordinates.
1514  */
1515 static void
1516 fetch_texel( struct tgsi_sampler *sampler,
1517              const union tgsi_exec_channel *s,
1518              const union tgsi_exec_channel *t,
1519              const union tgsi_exec_channel *p,
1520              const union tgsi_exec_channel *c0,
1521              enum tgsi_sampler_control control,
1522              union tgsi_exec_channel *r,
1523              union tgsi_exec_channel *g,
1524              union tgsi_exec_channel *b,
1525              union tgsi_exec_channel *a )
1526 {
1527    uint j;
1528    float rgba[NUM_CHANNELS][QUAD_SIZE];
1529
1530    sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1531
1532    for (j = 0; j < 4; j++) {
1533       r->f[j] = rgba[0][j];
1534       g->f[j] = rgba[1][j];
1535       b->f[j] = rgba[2][j];
1536       a->f[j] = rgba[3][j];
1537    }
1538 }
1539
1540
/*
 * Values for the `modifier` argument of exec_tex().
 *
 * NONE:         the W coordinate is not fetched.
 * PROJECTED:    the fetched coordinates are divided by W.
 * LOD_BIAS:     W is passed to the sampler as the lod value.
 * EXPLICIT_LOD: W is passed as the lod value and the sampler is called
 *               with tgsi_sampler_lod_explicit instead of
 *               tgsi_sampler_lod_bias.
 */
#define TEX_MODIFIER_NONE           0
#define TEX_MODIFIER_PROJECTED      1
#define TEX_MODIFIER_LOD_BIAS       2
#define TEX_MODIFIER_EXPLICIT_LOD   3
1545
1546
1547 static void
1548 exec_tex(struct tgsi_exec_machine *mach,
1549          const struct tgsi_full_instruction *inst,
1550          uint modifier)
1551 {
1552    const uint unit = inst->Src[1].Register.Index;
1553    union tgsi_exec_channel r[4];
1554    const union tgsi_exec_channel *lod = &ZeroVec;
1555    enum tgsi_sampler_control control;
1556    uint chan_index;
1557
1558    if (modifier != TEX_MODIFIER_NONE) {
1559       FETCH(&r[3], 0, CHAN_W);
1560       if (modifier != TEX_MODIFIER_PROJECTED) {
1561          lod = &r[3];
1562       }
1563    }
1564
1565    if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1566       control = tgsi_sampler_lod_explicit;
1567    } else {
1568       control = tgsi_sampler_lod_bias;
1569    }
1570
1571    switch (inst->Texture.Texture) {
1572    case TGSI_TEXTURE_1D:
1573    case TGSI_TEXTURE_SHADOW1D:
1574       FETCH(&r[0], 0, CHAN_X);
1575
1576       if (modifier == TEX_MODIFIER_PROJECTED) {
1577          micro_div(&r[0], &r[0], &r[3]);
1578       }
1579
1580       fetch_texel(mach->Samplers[unit],
1581                   &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1582                   control,
1583                   &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1584       break;
1585
1586    case TGSI_TEXTURE_2D:
1587    case TGSI_TEXTURE_RECT:
1588    case TGSI_TEXTURE_SHADOW2D:
1589    case TGSI_TEXTURE_SHADOWRECT:
1590       FETCH(&r[0], 0, CHAN_X);
1591       FETCH(&r[1], 0, CHAN_Y);
1592       FETCH(&r[2], 0, CHAN_Z);
1593
1594       if (modifier == TEX_MODIFIER_PROJECTED) {
1595          micro_div(&r[0], &r[0], &r[3]);
1596          micro_div(&r[1], &r[1], &r[3]);
1597          micro_div(&r[2], &r[2], &r[3]);
1598       }
1599
1600       fetch_texel(mach->Samplers[unit],
1601                   &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1602                   control,
1603                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1604       break;
1605
1606    case TGSI_TEXTURE_3D:
1607    case TGSI_TEXTURE_CUBE:
1608       FETCH(&r[0], 0, CHAN_X);
1609       FETCH(&r[1], 0, CHAN_Y);
1610       FETCH(&r[2], 0, CHAN_Z);
1611
1612       if (modifier == TEX_MODIFIER_PROJECTED) {
1613          micro_div(&r[0], &r[0], &r[3]);
1614          micro_div(&r[1], &r[1], &r[3]);
1615          micro_div(&r[2], &r[2], &r[3]);
1616       }
1617
1618       fetch_texel(mach->Samplers[unit],
1619                   &r[0], &r[1], &r[2], lod,
1620                   control,
1621                   &r[0], &r[1], &r[2], &r[3]);
1622       break;
1623
1624    default:
1625       assert(0);
1626    }
1627
1628    FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1629       STORE(&r[chan_index], 0, chan_index);
1630    }
1631 }
1632
1633 static void
1634 exec_txd(struct tgsi_exec_machine *mach,
1635          const struct tgsi_full_instruction *inst)
1636 {
1637    const uint unit = inst->Src[3].Register.Index;
1638    union tgsi_exec_channel r[4];
1639    uint chan_index;
1640
1641    /*
1642     * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1643     */
1644
1645    switch (inst->Texture.Texture) {
1646    case TGSI_TEXTURE_1D:
1647    case TGSI_TEXTURE_SHADOW1D:
1648
1649       FETCH(&r[0], 0, CHAN_X);
1650
1651       fetch_texel(mach->Samplers[unit],
1652                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1653                   tgsi_sampler_lod_bias,
1654                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1655       break;
1656
1657    case TGSI_TEXTURE_2D:
1658    case TGSI_TEXTURE_RECT:
1659    case TGSI_TEXTURE_SHADOW2D:
1660    case TGSI_TEXTURE_SHADOWRECT:
1661
1662       FETCH(&r[0], 0, CHAN_X);
1663       FETCH(&r[1], 0, CHAN_Y);
1664       FETCH(&r[2], 0, CHAN_Z);
1665
1666       fetch_texel(mach->Samplers[unit],
1667                   &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1668                   tgsi_sampler_lod_bias,
1669                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1670       break;
1671
1672    case TGSI_TEXTURE_3D:
1673    case TGSI_TEXTURE_CUBE:
1674
1675       FETCH(&r[0], 0, CHAN_X);
1676       FETCH(&r[1], 0, CHAN_Y);
1677       FETCH(&r[2], 0, CHAN_Z);
1678
1679       fetch_texel(mach->Samplers[unit],
1680                   &r[0], &r[1], &r[2], &ZeroVec,
1681                   tgsi_sampler_lod_bias,
1682                   &r[0], &r[1], &r[2], &r[3]);
1683       break;
1684
1685    default:
1686       assert(0);
1687    }
1688
1689    FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1690       STORE(&r[chan_index], 0, chan_index);
1691    }
1692 }
1693
1694
1695 /**
1696  * Evaluate a constant-valued coefficient at the position of the
1697  * current quad.
1698  */
1699 static void
1700 eval_constant_coef(
1701    struct tgsi_exec_machine *mach,
1702    unsigned attrib,
1703    unsigned chan )
1704 {
1705    unsigned i;
1706
1707    for( i = 0; i < QUAD_SIZE; i++ ) {
1708       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1709    }
1710 }
1711
1712 /**
1713  * Evaluate a linear-valued coefficient at the position of the
1714  * current quad.
1715  */
1716 static void
1717 eval_linear_coef(
1718    struct tgsi_exec_machine *mach,
1719    unsigned attrib,
1720    unsigned chan )
1721 {
1722    const float x = mach->QuadPos.xyzw[0].f[0];
1723    const float y = mach->QuadPos.xyzw[1].f[0];
1724    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1725    const float dady = mach->InterpCoefs[attrib].dady[chan];
1726    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1727    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1728    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1729    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1730    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1731 }
1732
1733 /**
1734  * Evaluate a perspective-valued coefficient at the position of the
1735  * current quad.
1736  */
1737 static void
1738 eval_perspective_coef(
1739    struct tgsi_exec_machine *mach,
1740    unsigned attrib,
1741    unsigned chan )
1742 {
1743    const float x = mach->QuadPos.xyzw[0].f[0];
1744    const float y = mach->QuadPos.xyzw[1].f[0];
1745    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1746    const float dady = mach->InterpCoefs[attrib].dady[chan];
1747    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1748    const float *w = mach->QuadPos.xyzw[3].f;
1749    /* divide by W here */
1750    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1751    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1752    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1753    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1754 }
1755
1756
/**
 * Signature shared by the eval_*_coef() functions: evaluate channel `chan`
 * of input attribute `attrib` for the current quad.
 */
typedef void (* eval_coef_func)(
   struct tgsi_exec_machine *mach,
   unsigned attrib,
   unsigned chan );
1761
1762 static void
1763 exec_declaration(struct tgsi_exec_machine *mach,
1764                  const struct tgsi_full_declaration *decl)
1765 {
1766    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1767       if (decl->Declaration.File == TGSI_FILE_INPUT ||
1768           decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1769          uint first, last, mask;
1770
1771          first = decl->Range.First;
1772          last = decl->Range.Last;
1773          mask = decl->Declaration.UsageMask;
1774
1775          if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
1776             assert(decl->Semantic.Index == 0);
1777             assert(first == last);
1778             assert(mask == TGSI_WRITEMASK_XYZW);
1779
1780             mach->Inputs[first] = mach->QuadPos;
1781          } else if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1782             uint i;
1783
1784             assert(decl->Semantic.Index == 0);
1785             assert(first == last);
1786
1787             for (i = 0; i < QUAD_SIZE; i++) {
1788                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1789             }
1790          } else {
1791             eval_coef_func eval;
1792             uint i, j;
1793
1794             switch (decl->Declaration.Interpolate) {
1795             case TGSI_INTERPOLATE_CONSTANT:
1796                eval = eval_constant_coef;
1797                break;
1798
1799             case TGSI_INTERPOLATE_LINEAR:
1800                eval = eval_linear_coef;
1801                break;
1802
1803             case TGSI_INTERPOLATE_PERSPECTIVE:
1804                eval = eval_perspective_coef;
1805                break;
1806
1807             default:
1808                assert(0);
1809                return;
1810             }
1811
1812             for (j = 0; j < NUM_CHANNELS; j++) {
1813                if (mask & (1 << j)) {
1814                   for (i = first; i <= last; i++) {
1815                      eval(mach, i, j);
1816                   }
1817                }
1818             }
1819          }
1820       }
1821    }
1822 }
1823
/**
 * Per-channel operation used by the exec_scalar_unary() and
 * exec_vector_*() helpers.  The binary and trinary helpers pass an array
 * of two or three source channels as `src`.
 */
typedef void (* micro_op)(union tgsi_exec_channel *dst,
                          const union tgsi_exec_channel *src);
1826
1827 static void
1828 exec_scalar_unary(struct tgsi_exec_machine *mach,
1829                   const struct tgsi_full_instruction *inst,
1830                   micro_op op,
1831                   enum tgsi_exec_datatype dst_datatype,
1832                   enum tgsi_exec_datatype src_datatype)
1833 {
1834    unsigned int chan;
1835    union tgsi_exec_channel src;
1836    union tgsi_exec_channel dst;
1837
1838    fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
1839    op(&dst, &src);
1840    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1841       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1842          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
1843       }
1844    }
1845 }
1846
1847 static void
1848 exec_vector_unary(struct tgsi_exec_machine *mach,
1849                   const struct tgsi_full_instruction *inst,
1850                   micro_op op,
1851                   enum tgsi_exec_datatype dst_datatype,
1852                   enum tgsi_exec_datatype src_datatype)
1853 {
1854    unsigned int chan;
1855    struct tgsi_exec_vector dst;
1856
1857    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1858       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1859          union tgsi_exec_channel src;
1860
1861          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
1862          op(&dst.xyzw[chan], &src);
1863       }
1864    }
1865    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1866       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1867          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1868       }
1869    }
1870 }
1871
1872 static void
1873 exec_vector_binary(struct tgsi_exec_machine *mach,
1874                    const struct tgsi_full_instruction *inst,
1875                    micro_op op,
1876                    enum tgsi_exec_datatype dst_datatype,
1877                    enum tgsi_exec_datatype src_datatype)
1878 {
1879    unsigned int chan;
1880    struct tgsi_exec_vector dst;
1881
1882    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1883       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1884          union tgsi_exec_channel src[2];
1885
1886          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1887          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1888          op(&dst.xyzw[chan], src);
1889       }
1890    }
1891    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1892       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1893          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1894       }
1895    }
1896 }
1897
1898 static void
1899 exec_vector_trinary(struct tgsi_exec_machine *mach,
1900                     const struct tgsi_full_instruction *inst,
1901                     micro_op op,
1902                     enum tgsi_exec_datatype dst_datatype,
1903                     enum tgsi_exec_datatype src_datatype)
1904 {
1905    unsigned int chan;
1906    struct tgsi_exec_vector dst;
1907
1908    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1909       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1910          union tgsi_exec_channel src[3];
1911
1912          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1913          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1914          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
1915          op(&dst.xyzw[chan], src);
1916       }
1917    }
1918    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1919       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1920          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1921       }
1922    }
1923 }
1924
1925 static void
1926 exec_dp3(struct tgsi_exec_machine *mach,
1927          const struct tgsi_full_instruction *inst)
1928 {
1929    unsigned int chan;
1930    union tgsi_exec_channel arg[3];
1931
1932    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1933    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1934    micro_mul(&arg[2], &arg[0], &arg[1]);
1935
1936    for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
1937       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1938       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1939       micro_mad(&arg[2], arg);
1940    }
1941
1942    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1943       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1944          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1945       }
1946    }
1947 }
1948
1949 static void
1950 exec_dp4(struct tgsi_exec_machine *mach,
1951          const struct tgsi_full_instruction *inst)
1952 {
1953    unsigned int chan;
1954    union tgsi_exec_channel arg[3];
1955
1956    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1957    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1958    micro_mul(&arg[2], &arg[0], &arg[1]);
1959
1960    for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
1961       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1962       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1963       micro_mad(&arg[2], arg);
1964    }
1965
1966    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1967       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1968          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1969       }
1970    }
1971 }
1972
1973 static void
1974 exec_dp2a(struct tgsi_exec_machine *mach,
1975           const struct tgsi_full_instruction *inst)
1976 {
1977    unsigned int chan;
1978    union tgsi_exec_channel arg[3];
1979
1980    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1981    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1982    micro_mul(&arg[2], &arg[0], &arg[1]);
1983
1984    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1985    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1986    micro_mad(&arg[0], arg);
1987
1988    fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1989    micro_add(&arg[0], &arg[0], &arg[1]);
1990
1991    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1992       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1993          store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1994       }
1995    }
1996 }
1997
1998 static void
1999 exec_dph(struct tgsi_exec_machine *mach,
2000          const struct tgsi_full_instruction *inst)
2001 {
2002    unsigned int chan;
2003    union tgsi_exec_channel arg[3];
2004
2005    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2006    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2007    micro_mul(&arg[2], &arg[0], &arg[1]);
2008
2009    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2010    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2011    micro_mad(&arg[2], arg);
2012
2013    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2014    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2015    micro_mad(&arg[0], arg);
2016
2017    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2018    micro_add(&arg[0], &arg[0], &arg[1]);
2019
2020    for (chan = 0; chan < NUM_CHANNELS; chan++) {
2021       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2022          store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2023       }
2024    }
2025 }
2026
2027 static void
2028 exec_dp2(struct tgsi_exec_machine *mach,
2029          const struct tgsi_full_instruction *inst)
2030 {
2031    unsigned int chan;
2032    union tgsi_exec_channel arg[3];
2033
2034    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2035    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2036    micro_mul(&arg[2], &arg[0], &arg[1]);
2037
2038    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2039    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2040    micro_mad(&arg[2], arg);
2041
2042    for (chan = 0; chan < NUM_CHANNELS; chan++) {
2043       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2044          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2045       }
2046    }
2047 }
2048
2049 static void
2050 exec_break(struct tgsi_exec_machine *mach)
2051 {
2052    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2053       /* turn off loop channels for each enabled exec channel */
2054       mach->LoopMask &= ~mach->ExecMask;
2055       /* Todo: if mach->LoopMask == 0, jump to end of loop */
2056       UPDATE_EXEC_MASK(mach);
2057    } else {
2058       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2059
2060       mach->Switch.mask = 0x0;
2061
2062       UPDATE_EXEC_MASK(mach);
2063    }
2064 }
2065
2066 static void
2067 exec_switch(struct tgsi_exec_machine *mach,
2068             const struct tgsi_full_instruction *inst)
2069 {
2070    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2071    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2072
2073    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2074    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2075    mach->Switch.mask = 0x0;
2076    mach->Switch.defaultMask = 0x0;
2077
2078    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2079    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2080
2081    UPDATE_EXEC_MASK(mach);
2082 }
2083
2084 static void
2085 exec_case(struct tgsi_exec_machine *mach,
2086           const struct tgsi_full_instruction *inst)
2087 {
2088    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2089    union tgsi_exec_channel src;
2090    uint mask = 0;
2091
2092    fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2093
2094    if (mach->Switch.selector.u[0] == src.u[0]) {
2095       mask |= 0x1;
2096    }
2097    if (mach->Switch.selector.u[1] == src.u[1]) {
2098       mask |= 0x2;
2099    }
2100    if (mach->Switch.selector.u[2] == src.u[2]) {
2101       mask |= 0x4;
2102    }
2103    if (mach->Switch.selector.u[3] == src.u[3]) {
2104       mask |= 0x8;
2105    }
2106
2107    mach->Switch.defaultMask |= mask;
2108
2109    mach->Switch.mask |= mask & prevMask;
2110
2111    UPDATE_EXEC_MASK(mach);
2112 }
2113
2114 static void
2115 exec_default(struct tgsi_exec_machine *mach)
2116 {
2117    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2118
2119    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2120
2121    UPDATE_EXEC_MASK(mach);
2122 }
2123
2124 static void
2125 exec_endswitch(struct tgsi_exec_machine *mach)
2126 {
2127    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
2128    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2129
2130    UPDATE_EXEC_MASK(mach);
2131 }
2132
2133 static void
2134 micro_i2f(union tgsi_exec_channel *dst,
2135           const union tgsi_exec_channel *src)
2136 {
2137    dst->f[0] = (float)src->i[0];
2138    dst->f[1] = (float)src->i[1];
2139    dst->f[2] = (float)src->i[2];
2140    dst->f[3] = (float)src->i[3];
2141 }
2142
2143 static void
2144 micro_not(union tgsi_exec_channel *dst,
2145           const union tgsi_exec_channel *src)
2146 {
2147    dst->u[0] = ~src->u[0];
2148    dst->u[1] = ~src->u[1];
2149    dst->u[2] = ~src->u[2];
2150    dst->u[3] = ~src->u[3];
2151 }
2152
2153 static void
2154 micro_shl(union tgsi_exec_channel *dst,
2155           const union tgsi_exec_channel *src)
2156 {
2157    dst->u[0] = src[0].u[0] << src[1].u[0];
2158    dst->u[1] = src[0].u[1] << src[1].u[1];
2159    dst->u[2] = src[0].u[2] << src[1].u[2];
2160    dst->u[3] = src[0].u[3] << src[1].u[3];
2161 }
2162
2163 static void
2164 micro_and(union tgsi_exec_channel *dst,
2165           const union tgsi_exec_channel *src)
2166 {
2167    dst->u[0] = src[0].u[0] & src[1].u[0];
2168    dst->u[1] = src[0].u[1] & src[1].u[1];
2169    dst->u[2] = src[0].u[2] & src[1].u[2];
2170    dst->u[3] = src[0].u[3] & src[1].u[3];
2171 }
2172
2173 static void
2174 micro_or(union tgsi_exec_channel *dst,
2175          const union tgsi_exec_channel *src)
2176 {
2177    dst->u[0] = src[0].u[0] | src[1].u[0];
2178    dst->u[1] = src[0].u[1] | src[1].u[1];
2179    dst->u[2] = src[0].u[2] | src[1].u[2];
2180    dst->u[3] = src[0].u[3] | src[1].u[3];
2181 }
2182
2183 static void
2184 micro_xor(union tgsi_exec_channel *dst,
2185           const union tgsi_exec_channel *src)
2186 {
2187    dst->u[0] = src[0].u[0] ^ src[1].u[0];
2188    dst->u[1] = src[0].u[1] ^ src[1].u[1];
2189    dst->u[2] = src[0].u[2] ^ src[1].u[2];
2190    dst->u[3] = src[0].u[3] ^ src[1].u[3];
2191 }
2192
2193 static void
2194 micro_f2i(union tgsi_exec_channel *dst,
2195           const union tgsi_exec_channel *src)
2196 {
2197    dst->i[0] = (int)src->f[0];
2198    dst->i[1] = (int)src->f[1];
2199    dst->i[2] = (int)src->f[2];
2200    dst->i[3] = (int)src->f[3];
2201 }
2202
2203 static void
2204 micro_idiv(union tgsi_exec_channel *dst,
2205            const union tgsi_exec_channel *src)
2206 {
2207    dst->i[0] = src[0].i[0] / src[1].i[0];
2208    dst->i[1] = src[0].i[1] / src[1].i[1];
2209    dst->i[2] = src[0].i[2] / src[1].i[2];
2210    dst->i[3] = src[0].i[3] / src[1].i[3];
2211 }
2212
2213 static void
2214 micro_imax(union tgsi_exec_channel *dst,
2215            const union tgsi_exec_channel *src)
2216 {
2217    dst->i[0] = src[0].i[0] > src[1].i[0] ? src[0].i[0] : src[1].i[0];
2218    dst->i[1] = src[0].i[1] > src[1].i[1] ? src[0].i[1] : src[1].i[1];
2219    dst->i[2] = src[0].i[2] > src[1].i[2] ? src[0].i[2] : src[1].i[2];
2220    dst->i[3] = src[0].i[3] > src[1].i[3] ? src[0].i[3] : src[1].i[3];
2221 }
2222
2223 static void
2224 micro_imin(union tgsi_exec_channel *dst,
2225            const union tgsi_exec_channel *src)
2226 {
2227    dst->i[0] = src[0].i[0] < src[1].i[0] ? src[0].i[0] : src[1].i[0];
2228    dst->i[1] = src[0].i[1] < src[1].i[1] ? src[0].i[1] : src[1].i[1];
2229    dst->i[2] = src[0].i[2] < src[1].i[2] ? src[0].i[2] : src[1].i[2];
2230    dst->i[3] = src[0].i[3] < src[1].i[3] ? src[0].i[3] : src[1].i[3];
2231 }
2232
2233 static void
2234 micro_isge(union tgsi_exec_channel *dst,
2235            const union tgsi_exec_channel *src)
2236 {
2237    dst->i[0] = src[0].i[0] >= src[1].i[0] ? -1 : 0;
2238    dst->i[1] = src[0].i[1] >= src[1].i[1] ? -1 : 0;
2239    dst->i[2] = src[0].i[2] >= src[1].i[2] ? -1 : 0;
2240    dst->i[3] = src[0].i[3] >= src[1].i[3] ? -1 : 0;
2241 }
2242
2243 static void
2244 micro_ishr(union tgsi_exec_channel *dst,
2245            const union tgsi_exec_channel *src)
2246 {
2247    dst->i[0] = src[0].i[0] >> src[1].i[0];
2248    dst->i[1] = src[0].i[1] >> src[1].i[1];
2249    dst->i[2] = src[0].i[2] >> src[1].i[2];
2250    dst->i[3] = src[0].i[3] >> src[1].i[3];
2251 }
2252
2253 static void
2254 micro_islt(union tgsi_exec_channel *dst,
2255            const union tgsi_exec_channel *src)
2256 {
2257    dst->i[0] = src[0].i[0] < src[1].i[0] ? -1 : 0;
2258    dst->i[1] = src[0].i[1] < src[1].i[1] ? -1 : 0;
2259    dst->i[2] = src[0].i[2] < src[1].i[2] ? -1 : 0;
2260    dst->i[3] = src[0].i[3] < src[1].i[3] ? -1 : 0;
2261 }
2262
2263 static void
2264 micro_f2u(union tgsi_exec_channel *dst,
2265           const union tgsi_exec_channel *src)
2266 {
2267    dst->u[0] = (uint)src->f[0];
2268    dst->u[1] = (uint)src->f[1];
2269    dst->u[2] = (uint)src->f[2];
2270    dst->u[3] = (uint)src->f[3];
2271 }
2272
2273 static void
2274 micro_u2f(union tgsi_exec_channel *dst,
2275           const union tgsi_exec_channel *src)
2276 {
2277    dst->f[0] = (float)src->u[0];
2278    dst->f[1] = (float)src->u[1];
2279    dst->f[2] = (float)src->u[2];
2280    dst->f[3] = (float)src->u[3];
2281 }
2282
2283 static void
2284 micro_uadd(union tgsi_exec_channel *dst,
2285            const union tgsi_exec_channel *src)
2286 {
2287    dst->u[0] = src[0].u[0] + src[1].u[0];
2288    dst->u[1] = src[0].u[1] + src[1].u[1];
2289    dst->u[2] = src[0].u[2] + src[1].u[2];
2290    dst->u[3] = src[0].u[3] + src[1].u[3];
2291 }
2292
2293 static void
2294 micro_udiv(union tgsi_exec_channel *dst,
2295            const union tgsi_exec_channel *src)
2296 {
2297    dst->u[0] = src[0].u[0] / src[1].u[0];
2298    dst->u[1] = src[0].u[1] / src[1].u[1];
2299    dst->u[2] = src[0].u[2] / src[1].u[2];
2300    dst->u[3] = src[0].u[3] / src[1].u[3];
2301 }
2302
2303 static void
2304 micro_umad(union tgsi_exec_channel *dst,
2305            const union tgsi_exec_channel *src)
2306 {
2307    dst->u[0] = src[0].u[0] * src[1].u[0] + src[2].u[0];
2308    dst->u[1] = src[0].u[1] * src[1].u[1] + src[2].u[1];
2309    dst->u[2] = src[0].u[2] * src[1].u[2] + src[2].u[2];
2310    dst->u[3] = src[0].u[3] * src[1].u[3] + src[2].u[3];
2311 }
2312
2313 static void
2314 micro_umax(union tgsi_exec_channel *dst,
2315            const union tgsi_exec_channel *src)
2316 {
2317    dst->u[0] = src[0].u[0] > src[1].u[0] ? src[0].u[0] : src[1].u[0];
2318    dst->u[1] = src[0].u[1] > src[1].u[1] ? src[0].u[1] : src[1].u[1];
2319    dst->u[2] = src[0].u[2] > src[1].u[2] ? src[0].u[2] : src[1].u[2];
2320    dst->u[3] = src[0].u[3] > src[1].u[3] ? src[0].u[3] : src[1].u[3];
2321 }
2322
2323 static void
2324 micro_umin(union tgsi_exec_channel *dst,
2325            const union tgsi_exec_channel *src)
2326 {
2327    dst->u[0] = src[0].u[0] < src[1].u[0] ? src[0].u[0] : src[1].u[0];
2328    dst->u[1] = src[0].u[1] < src[1].u[1] ? src[0].u[1] : src[1].u[1];
2329    dst->u[2] = src[0].u[2] < src[1].u[2] ? src[0].u[2] : src[1].u[2];
2330    dst->u[3] = src[0].u[3] < src[1].u[3] ? src[0].u[3] : src[1].u[3];
2331 }
2332
2333 static void
2334 micro_umod(union tgsi_exec_channel *dst,
2335            const union tgsi_exec_channel *src)
2336 {
2337    dst->u[0] = src[0].u[0] % src[1].u[0];
2338    dst->u[1] = src[0].u[1] % src[1].u[1];
2339    dst->u[2] = src[0].u[2] % src[1].u[2];
2340    dst->u[3] = src[0].u[3] % src[1].u[3];
2341 }
2342
2343 static void
2344 micro_umul(union tgsi_exec_channel *dst,
2345            const union tgsi_exec_channel *src)
2346 {
2347    dst->u[0] = src[0].u[0] * src[1].u[0];
2348    dst->u[1] = src[0].u[1] * src[1].u[1];
2349    dst->u[2] = src[0].u[2] * src[1].u[2];
2350    dst->u[3] = src[0].u[3] * src[1].u[3];
2351 }
2352
2353 static void
2354 micro_useq(union tgsi_exec_channel *dst,
2355            const union tgsi_exec_channel *src)
2356 {
2357    dst->u[0] = src[0].u[0] == src[1].u[0] ? ~0 : 0;
2358    dst->u[1] = src[0].u[1] == src[1].u[1] ? ~0 : 0;
2359    dst->u[2] = src[0].u[2] == src[1].u[2] ? ~0 : 0;
2360    dst->u[3] = src[0].u[3] == src[1].u[3] ? ~0 : 0;
2361 }
2362
2363 static void
2364 micro_usge(union tgsi_exec_channel *dst,
2365            const union tgsi_exec_channel *src)
2366 {
2367    dst->u[0] = src[0].u[0] >= src[1].u[0] ? ~0 : 0;
2368    dst->u[1] = src[0].u[1] >= src[1].u[1] ? ~0 : 0;
2369    dst->u[2] = src[0].u[2] >= src[1].u[2] ? ~0 : 0;
2370    dst->u[3] = src[0].u[3] >= src[1].u[3] ? ~0 : 0;
2371 }
2372
2373 static void
2374 micro_ushr(union tgsi_exec_channel *dst,
2375            const union tgsi_exec_channel *src)
2376 {
2377    dst->u[0] = src[0].u[0] >> src[1].u[0];
2378    dst->u[1] = src[0].u[1] >> src[1].u[1];
2379    dst->u[2] = src[0].u[2] >> src[1].u[2];
2380    dst->u[3] = src[0].u[3] >> src[1].u[3];
2381 }
2382
2383 static void
2384 micro_uslt(union tgsi_exec_channel *dst,
2385            const union tgsi_exec_channel *src)
2386 {
2387    dst->u[0] = src[0].u[0] < src[1].u[0] ? ~0 : 0;
2388    dst->u[1] = src[0].u[1] < src[1].u[1] ? ~0 : 0;
2389    dst->u[2] = src[0].u[2] < src[1].u[2] ? ~0 : 0;
2390    dst->u[3] = src[0].u[3] < src[1].u[3] ? ~0 : 0;
2391 }
2392
2393 static void
2394 micro_usne(union tgsi_exec_channel *dst,
2395            const union tgsi_exec_channel *src)
2396 {
2397    dst->u[0] = src[0].u[0] != src[1].u[0] ? ~0 : 0;
2398    dst->u[1] = src[0].u[1] != src[1].u[1] ? ~0 : 0;
2399    dst->u[2] = src[0].u[2] != src[1].u[2] ? ~0 : 0;
2400    dst->u[3] = src[0].u[3] != src[1].u[3] ? ~0 : 0;
2401 }
2402
2403 static void
2404 exec_instruction(
2405    struct tgsi_exec_machine *mach,
2406    const struct tgsi_full_instruction *inst,
2407    int *pc )
2408 {
2409    uint chan_index;
2410    union tgsi_exec_channel r[10];
2411    union tgsi_exec_channel d[8];
2412
2413    (*pc)++;
2414
2415    switch (inst->Instruction.Opcode) {
2416    case TGSI_OPCODE_ARL:
2417       exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2418       break;
2419
2420    case TGSI_OPCODE_MOV:
2421       exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
2422       break;
2423
2424    case TGSI_OPCODE_LIT:
2425       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2426          FETCH( &r[0], 0, CHAN_X );
2427          if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2428             micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2429          }
2430
2431          if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2432             FETCH( &r[1], 0, CHAN_Y );
2433             micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2434
2435             FETCH( &r[2], 0, CHAN_W );
2436             micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2437             micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2438             micro_pow( &r[1], &r[1], &r[2] );
2439             micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2440          }
2441
2442          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2443             STORE(&d[CHAN_Y], 0, CHAN_Y);
2444          }
2445          if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2446             STORE(&d[CHAN_Z], 0, CHAN_Z);
2447          }
2448       }
2449       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2450          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2451       }
2452       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2453          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2454       }
2455       break;
2456
2457    case TGSI_OPCODE_RCP:
2458       exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2459       break;
2460
2461    case TGSI_OPCODE_RSQ:
2462       exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2463       break;
2464
2465    case TGSI_OPCODE_EXP:
2466       FETCH( &r[0], 0, CHAN_X );
2467       micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2468       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2469          micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2470          STORE( &r[2], 0, CHAN_X );        /* store r2 */
2471       }
2472       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2473          micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2474          STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2475       }
2476       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2477          micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2478          STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2479       }
2480       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2481          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2482       }
2483       break;
2484
2485    case TGSI_OPCODE_LOG:
2486       FETCH( &r[0], 0, CHAN_X );
2487       micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2488       micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2489       micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2490       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2491          STORE( &r[0], 0, CHAN_X );
2492       }
2493       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2494          micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2495          micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2496          STORE( &r[0], 0, CHAN_Y );
2497       }
2498       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2499          STORE( &r[1], 0, CHAN_Z );
2500       }
2501       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2502          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2503       }
2504       break;
2505
2506    case TGSI_OPCODE_MUL:
2507       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2508          FETCH(&r[0], 0, chan_index);
2509          FETCH(&r[1], 1, chan_index);
2510          micro_mul(&d[chan_index], &r[0], &r[1]);
2511       }
2512       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2513          STORE(&d[chan_index], 0, chan_index);
2514       }
2515       break;
2516
2517    case TGSI_OPCODE_ADD:
2518       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2519          FETCH( &r[0], 0, chan_index );
2520          FETCH( &r[1], 1, chan_index );
2521          micro_add(&d[chan_index], &r[0], &r[1]);
2522       }
2523       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2524          STORE(&d[chan_index], 0, chan_index);
2525       }
2526       break;
2527
2528    case TGSI_OPCODE_DP3:
2529       exec_dp3(mach, inst);
2530       break;
2531
2532    case TGSI_OPCODE_DP4:
2533       exec_dp4(mach, inst);
2534       break;
2535
2536    case TGSI_OPCODE_DST:
2537       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2538          FETCH( &r[0], 0, CHAN_Y );
2539          FETCH( &r[1], 1, CHAN_Y);
2540          micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2541       }
2542       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2543          FETCH(&d[CHAN_Z], 0, CHAN_Z);
2544       }
2545       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2546          FETCH(&d[CHAN_W], 1, CHAN_W);
2547       }
2548
2549       if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2550          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2551       }
2552       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2553          STORE(&d[CHAN_Y], 0, CHAN_Y);
2554       }
2555       if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2556          STORE(&d[CHAN_Z], 0, CHAN_Z);
2557       }
2558       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2559          STORE(&d[CHAN_W], 0, CHAN_W);
2560       }
2561       break;
2562
2563    case TGSI_OPCODE_MIN:
2564       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2565          FETCH(&r[0], 0, chan_index);
2566          FETCH(&r[1], 1, chan_index);
2567
2568          /* XXX use micro_min()?? */
2569          micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2570       }
2571       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2572          STORE(&d[chan_index], 0, chan_index);
2573       }
2574       break;
2575
2576    case TGSI_OPCODE_MAX:
2577       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2578          FETCH(&r[0], 0, chan_index);
2579          FETCH(&r[1], 1, chan_index);
2580
2581          /* XXX use micro_max()?? */
2582          micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2583       }
2584       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2585          STORE(&d[chan_index], 0, chan_index);
2586       }
2587       break;
2588
2589    case TGSI_OPCODE_SLT:
2590       exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2591       break;
2592
2593    case TGSI_OPCODE_SGE:
2594       exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2595       break;
2596
2597    case TGSI_OPCODE_MAD:
2598       exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2599       break;
2600
2601    case TGSI_OPCODE_SUB:
2602       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2603          FETCH(&r[0], 0, chan_index);
2604          FETCH(&r[1], 1, chan_index);
2605          micro_sub(&d[chan_index], &r[0], &r[1]);
2606       }
2607       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2608          STORE(&d[chan_index], 0, chan_index);
2609       }
2610       break;
2611
2612    case TGSI_OPCODE_LRP:
2613       exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2614       break;
2615
2616    case TGSI_OPCODE_CND:
2617       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2618          FETCH(&r[0], 0, chan_index);
2619          FETCH(&r[1], 1, chan_index);
2620          FETCH(&r[2], 2, chan_index);
2621          micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2622       }
2623       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2624          STORE(&d[chan_index], 0, chan_index);
2625       }
2626       break;
2627
2628    case TGSI_OPCODE_DP2A:
2629       exec_dp2a(mach, inst);
2630       break;
2631
2632    case TGSI_OPCODE_FRC:
2633       exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2634       break;
2635
2636    case TGSI_OPCODE_CLAMP:
2637       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2638          FETCH(&r[0], 0, chan_index);
2639          FETCH(&r[1], 1, chan_index);
2640          micro_max(&r[0], &r[0], &r[1]);
2641          FETCH(&r[1], 2, chan_index);
2642          micro_min(&d[chan_index], &r[0], &r[1]);
2643       }
2644       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2645          STORE(&d[chan_index], 0, chan_index);
2646       }
2647       break;
2648
2649    case TGSI_OPCODE_FLR:
2650       exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2651       break;
2652
2653    case TGSI_OPCODE_ROUND:
2654       exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2655       break;
2656
2657    case TGSI_OPCODE_EX2:
2658       exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2659       break;
2660
2661    case TGSI_OPCODE_LG2:
2662       exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2663       break;
2664
2665    case TGSI_OPCODE_POW:
2666       FETCH(&r[0], 0, CHAN_X);
2667       FETCH(&r[1], 1, CHAN_X);
2668
2669       micro_pow( &r[0], &r[0], &r[1] );
2670
2671       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2672          STORE( &r[0], 0, chan_index );
2673       }
2674       break;
2675
2676    case TGSI_OPCODE_XPD:
2677       FETCH(&r[0], 0, CHAN_Y);
2678       FETCH(&r[1], 1, CHAN_Z);
2679
2680       micro_mul( &r[2], &r[0], &r[1] );
2681
2682       FETCH(&r[3], 0, CHAN_Z);
2683       FETCH(&r[4], 1, CHAN_Y);
2684
2685       micro_mul( &r[5], &r[3], &r[4] );
2686       micro_sub(&d[CHAN_X], &r[2], &r[5]);
2687
2688       FETCH(&r[2], 1, CHAN_X);
2689
2690       micro_mul( &r[3], &r[3], &r[2] );
2691
2692       FETCH(&r[5], 0, CHAN_X);
2693
2694       micro_mul( &r[1], &r[1], &r[5] );
2695       micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2696
2697       micro_mul( &r[5], &r[5], &r[4] );
2698       micro_mul( &r[0], &r[0], &r[2] );
2699       micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2700
2701       if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2702          STORE(&d[CHAN_X], 0, CHAN_X);
2703       }
2704       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2705          STORE(&d[CHAN_Y], 0, CHAN_Y);
2706       }
2707       if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2708          STORE(&d[CHAN_Z], 0, CHAN_Z);
2709       }
2710       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2711          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2712       }
2713       break;
2714
2715    case TGSI_OPCODE_ABS:
2716       exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2717       break;
2718
2719    case TGSI_OPCODE_RCC:
2720       FETCH(&r[0], 0, CHAN_X);
2721       micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2722       micro_float_clamp(&r[0], &r[0]);
2723       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2724          STORE(&r[0], 0, chan_index);
2725       }
2726       break;
2727
2728    case TGSI_OPCODE_DPH:
2729       exec_dph(mach, inst);
2730       break;
2731
2732    case TGSI_OPCODE_COS:
2733       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2734       break;
2735
2736    case TGSI_OPCODE_DDX:
2737       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2738       break;
2739
2740    case TGSI_OPCODE_DDY:
2741       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2742       break;
2743
2744    case TGSI_OPCODE_KILP:
2745       exec_kilp (mach, inst);
2746       break;
2747
2748    case TGSI_OPCODE_KIL:
2749       exec_kil (mach, inst);
2750       break;
2751
2752    case TGSI_OPCODE_PK2H:
2753       assert (0);
2754       break;
2755
2756    case TGSI_OPCODE_PK2US:
2757       assert (0);
2758       break;
2759
2760    case TGSI_OPCODE_PK4B:
2761       assert (0);
2762       break;
2763
2764    case TGSI_OPCODE_PK4UB:
2765       assert (0);
2766       break;
2767
2768    case TGSI_OPCODE_RFL:
2769       if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2770           IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2771           IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2772          /* r0 = dp3(src0, src0) */
2773          FETCH(&r[2], 0, CHAN_X);
2774          micro_mul(&r[0], &r[2], &r[2]);
2775          FETCH(&r[4], 0, CHAN_Y);
2776          micro_mul(&r[8], &r[4], &r[4]);
2777          micro_add(&r[0], &r[0], &r[8]);
2778          FETCH(&r[6], 0, CHAN_Z);
2779          micro_mul(&r[8], &r[6], &r[6]);
2780          micro_add(&r[0], &r[0], &r[8]);
2781
2782          /* r1 = dp3(src0, src1) */
2783          FETCH(&r[3], 1, CHAN_X);
2784          micro_mul(&r[1], &r[2], &r[3]);
2785          FETCH(&r[5], 1, CHAN_Y);
2786          micro_mul(&r[8], &r[4], &r[5]);
2787          micro_add(&r[1], &r[1], &r[8]);
2788          FETCH(&r[7], 1, CHAN_Z);
2789          micro_mul(&r[8], &r[6], &r[7]);
2790          micro_add(&r[1], &r[1], &r[8]);
2791
2792          /* r1 = 2 * r1 / r0 */
2793          micro_add(&r[1], &r[1], &r[1]);
2794          micro_div(&r[1], &r[1], &r[0]);
2795
2796          if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2797             micro_mul(&r[2], &r[2], &r[1]);
2798             micro_sub(&r[2], &r[2], &r[3]);
2799             STORE(&r[2], 0, CHAN_X);
2800          }
2801          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2802             micro_mul(&r[4], &r[4], &r[1]);
2803             micro_sub(&r[4], &r[4], &r[5]);
2804             STORE(&r[4], 0, CHAN_Y);
2805          }
2806          if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2807             micro_mul(&r[6], &r[6], &r[1]);
2808             micro_sub(&r[6], &r[6], &r[7]);
2809             STORE(&r[6], 0, CHAN_Z);
2810          }
2811       }
2812       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2813          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2814       }
2815       break;
2816
2817    case TGSI_OPCODE_SEQ:
2818       exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2819       break;
2820
2821    case TGSI_OPCODE_SFL:
2822       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2823          STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2824       }
2825       break;
2826
2827    case TGSI_OPCODE_SGT:
2828       exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2829       break;
2830
2831    case TGSI_OPCODE_SIN:
2832       exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2833       break;
2834
2835    case TGSI_OPCODE_SLE:
2836       exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2837       break;
2838
2839    case TGSI_OPCODE_SNE:
2840       exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2841       break;
2842
2843    case TGSI_OPCODE_STR:
2844       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2845          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2846       }
2847       break;
2848
2849    case TGSI_OPCODE_TEX:
2850       /* simple texture lookup */
2851       /* src[0] = texcoord */
2852       /* src[1] = sampler unit */
2853       exec_tex(mach, inst, TEX_MODIFIER_NONE);
2854       break;
2855
2856    case TGSI_OPCODE_TXB:
2857       /* Texture lookup with lod bias */
2858       /* src[0] = texcoord (src[0].w = LOD bias) */
2859       /* src[1] = sampler unit */
2860       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
2861       break;
2862
2863    case TGSI_OPCODE_TXD:
2864       /* Texture lookup with explicit partial derivatives */
2865       /* src[0] = texcoord */
2866       /* src[1] = d[strq]/dx */
2867       /* src[2] = d[strq]/dy */
2868       /* src[3] = sampler unit */
2869       exec_txd(mach, inst);
2870       break;
2871
2872    case TGSI_OPCODE_TXL:
2873       /* Texture lookup with explicit LOD */
2874       /* src[0] = texcoord (src[0].w = LOD) */
2875       /* src[1] = sampler unit */
2876       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
2877       break;
2878
2879    case TGSI_OPCODE_TXP:
2880       /* Texture lookup with projection */
2881       /* src[0] = texcoord (src[0].w = projection) */
2882       /* src[1] = sampler unit */
2883       exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
2884       break;
2885
2886    case TGSI_OPCODE_UP2H:
2887       assert (0);
2888       break;
2889
2890    case TGSI_OPCODE_UP2US:
2891       assert (0);
2892       break;
2893
2894    case TGSI_OPCODE_UP4B:
2895       assert (0);
2896       break;
2897
2898    case TGSI_OPCODE_UP4UB:
2899       assert (0);
2900       break;
2901
2902    case TGSI_OPCODE_X2D:
2903       FETCH(&r[0], 1, CHAN_X);
2904       FETCH(&r[1], 1, CHAN_Y);
2905       if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2906           IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2907          FETCH(&r[2], 2, CHAN_X);
2908          micro_mul(&r[2], &r[2], &r[0]);
2909          FETCH(&r[3], 2, CHAN_Y);
2910          micro_mul(&r[3], &r[3], &r[1]);
2911          micro_add(&r[2], &r[2], &r[3]);
2912          FETCH(&r[3], 0, CHAN_X);
2913          micro_add(&d[CHAN_X], &r[2], &r[3]);
2914
2915       }
2916       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2917           IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2918          FETCH(&r[2], 2, CHAN_Z);
2919          micro_mul(&r[2], &r[2], &r[0]);
2920          FETCH(&r[3], 2, CHAN_W);
2921          micro_mul(&r[3], &r[3], &r[1]);
2922          micro_add(&r[2], &r[2], &r[3]);
2923          FETCH(&r[3], 0, CHAN_Y);
2924          micro_add(&d[CHAN_Y], &r[2], &r[3]);
2925
2926       }
2927       if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2928          STORE(&d[CHAN_X], 0, CHAN_X);
2929       }
2930       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2931          STORE(&d[CHAN_Y], 0, CHAN_Y);
2932       }
2933       if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2934          STORE(&d[CHAN_X], 0, CHAN_Z);
2935       }
2936       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2937          STORE(&d[CHAN_Y], 0, CHAN_W);
2938       }
2939       break;
2940
2941    case TGSI_OPCODE_ARA:
2942       assert (0);
2943       break;
2944
2945    case TGSI_OPCODE_ARR:
2946       exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2947       break;
2948
2949    case TGSI_OPCODE_BRA:
2950       assert (0);
2951       break;
2952
2953    case TGSI_OPCODE_CAL:
2954       /* skip the call if no execution channels are enabled */
2955       if (mach->ExecMask) {
2956          /* do the call */
2957
2958          /* First, record the depths of the execution stacks.
2959           * This is important for deeply nested/looped return statements.
2960           * We have to unwind the stacks by the correct amount.  For a
2961           * real code generator, we could determine the number of entries
2962           * to pop off each stack with simple static analysis and avoid
2963           * implementing this data structure at run time.
2964           */
2965          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2966          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2967          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2968          mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
2969          mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
2970          /* note that PC was already incremented above */
2971          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2972
2973          mach->CallStackTop++;
2974
2975          /* Second, push the Cond, Loop, Cont, Switch, Break and Func stacks */
2976          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2977          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2978          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2979          assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2980          assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2981          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2982
2983          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2984          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2985          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2986          mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2987          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2988          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2989
2990          /* Finally, jump to the subroutine */
2991          *pc = inst->Label.Label;
2992       }
2993       break;
2994
2995    case TGSI_OPCODE_RET:
2996       mach->FuncMask &= ~mach->ExecMask;
2997       UPDATE_EXEC_MASK(mach);
2998
2999       if (mach->FuncMask == 0x0) {
3000          /* really return now (otherwise, keep executing) */
3001
3002          if (mach->CallStackTop == 0) {
3003             /* returning from main() */
3004             *pc = -1;
3005             return;
3006          }
3007
3008          assert(mach->CallStackTop > 0);
3009          mach->CallStackTop--;
3010
3011          mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3012          mach->CondMask = mach->CondStack[mach->CondStackTop];
3013
3014          mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3015          mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3016
3017          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3018          mach->ContMask = mach->ContStack[mach->ContStackTop];
3019
3020          mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3021          mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3022
3023          mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3024          mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3025
3026          assert(mach->FuncStackTop > 0);
3027          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3028
3029          *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3030
3031          UPDATE_EXEC_MASK(mach);
3032       }
3033       break;
3034
3035    case TGSI_OPCODE_SSG:
3036       exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3037       break;
3038
3039    case TGSI_OPCODE_CMP:
3040       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3041          FETCH(&r[0], 0, chan_index);
3042          FETCH(&r[1], 1, chan_index);
3043          FETCH(&r[2], 2, chan_index);
3044          micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
3045       }
3046       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3047          STORE(&d[chan_index], 0, chan_index);
3048       }
3049       break;
3050
3051    case TGSI_OPCODE_SCS:
3052       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
3053          FETCH( &r[0], 0, CHAN_X );
3054          if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3055             micro_cos(&r[1], &r[0]);
3056             STORE(&r[1], 0, CHAN_X);
3057          }
3058          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3059             micro_sin(&r[1], &r[0]);
3060             STORE(&r[1], 0, CHAN_Y);
3061          }
3062       }
3063       if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
3064          STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
3065       }
3066       if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
3067          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
3068       }
3069       break;
3070
3071    case TGSI_OPCODE_NRM:
3072       /* 3-component vector normalize */
3073       if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
3074          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
3075          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3076          /* r3 = sqrt(dp3(src0, src0)) */
3077          FETCH(&r[0], 0, CHAN_X);
3078          micro_mul(&r[3], &r[0], &r[0]);
3079          FETCH(&r[1], 0, CHAN_Y);
3080          micro_mul(&r[4], &r[1], &r[1]);
3081          micro_add(&r[3], &r[3], &r[4]);
3082          FETCH(&r[2], 0, CHAN_Z);
3083          micro_mul(&r[4], &r[2], &r[2]);
3084          micro_add(&r[3], &r[3], &r[4]);
3085          micro_sqrt(&r[3], &r[3]);
3086
3087          if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3088             micro_div(&r[0], &r[0], &r[3]);
3089             STORE(&r[0], 0, CHAN_X);
3090          }
3091          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3092             micro_div(&r[1], &r[1], &r[3]);
3093             STORE(&r[1], 0, CHAN_Y);
3094          }
3095          if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3096             micro_div(&r[2], &r[2], &r[3]);
3097             STORE(&r[2], 0, CHAN_Z);
3098          }
3099       }
3100       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
3101          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
3102       }
3103       break;
3104
3105    case TGSI_OPCODE_NRM4:
3106       /* 4-component vector normalize */
3107       {
3108          union tgsi_exec_channel tmp, dot;
3109
3110          /* tmp = dp4(src0, src0): */
3111          FETCH( &r[0], 0, CHAN_X );
3112          micro_mul( &tmp, &r[0], &r[0] );
3113
3114          FETCH( &r[1], 0, CHAN_Y );
3115          micro_mul( &dot, &r[1], &r[1] );
3116          micro_add( &tmp, &tmp, &dot );
3117
3118          FETCH( &r[2], 0, CHAN_Z );
3119          micro_mul( &dot, &r[2], &r[2] );
3120          micro_add( &tmp, &tmp, &dot );
3121
3122          FETCH( &r[3], 0, CHAN_W );
3123          micro_mul( &dot, &r[3], &r[3] );
3124          micro_add( &tmp, &tmp, &dot );
3125
3126          /* tmp = 1 / sqrt(tmp) */
3127          micro_sqrt( &tmp, &tmp );
3128          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
3129
3130          FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3131             /* chan = chan * tmp */
3132             micro_mul( &r[chan_index], &tmp, &r[chan_index] );
3133             STORE( &r[chan_index], 0, chan_index );
3134          }
3135       }
3136       break;
3137
3138    case TGSI_OPCODE_DIV:
3139       assert( 0 );
3140       break;
3141
3142    case TGSI_OPCODE_DP2:
3143       exec_dp2(mach, inst);
3144       break;
3145
3146    case TGSI_OPCODE_IF:
3147       /* push CondMask */
3148       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3149       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3150       FETCH( &r[0], 0, CHAN_X );
3151       /* update CondMask */
3152       if( ! r[0].u[0] ) {
3153          mach->CondMask &= ~0x1;
3154       }
3155       if( ! r[0].u[1] ) {
3156          mach->CondMask &= ~0x2;
3157       }
3158       if( ! r[0].u[2] ) {
3159          mach->CondMask &= ~0x4;
3160       }
3161       if( ! r[0].u[3] ) {
3162          mach->CondMask &= ~0x8;
3163       }
3164       UPDATE_EXEC_MASK(mach);
3165       /* Todo: If CondMask==0, jump to ELSE */
3166       break;
3167
3168    case TGSI_OPCODE_ELSE:
3169       /* invert CondMask wrt previous mask */
3170       {
3171          uint prevMask;
3172          assert(mach->CondStackTop > 0);
3173          prevMask = mach->CondStack[mach->CondStackTop - 1];
3174          mach->CondMask = ~mach->CondMask & prevMask;
3175          UPDATE_EXEC_MASK(mach);
3176          /* Todo: If CondMask==0, jump to ENDIF */
3177       }
3178       break;
3179
3180    case TGSI_OPCODE_ENDIF:
3181       /* pop CondMask */
3182       assert(mach->CondStackTop > 0);
3183       mach->CondMask = mach->CondStack[--mach->CondStackTop];
3184       UPDATE_EXEC_MASK(mach);
3185       break;
3186
3187    case TGSI_OPCODE_END:
3188       /* halt execution */
3189       *pc = -1;
3190       break;
3191
3192    case TGSI_OPCODE_REP:
3193       assert (0);
3194       break;
3195
3196    case TGSI_OPCODE_ENDREP:
3197        assert (0);
3198        break;
3199
3200    case TGSI_OPCODE_PUSHA:
3201       assert (0);
3202       break;
3203
3204    case TGSI_OPCODE_POPA:
3205       assert (0);
3206       break;
3207
3208    case TGSI_OPCODE_CEIL:
3209       exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3210       break;
3211
3212    case TGSI_OPCODE_I2F:
3213       exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3214       break;
3215
3216    case TGSI_OPCODE_NOT:
3217       exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3218       break;
3219
3220    case TGSI_OPCODE_TRUNC:
3221       exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3222       break;
3223
3224    case TGSI_OPCODE_SHL:
3225       exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3226       break;
3227
3228    case TGSI_OPCODE_AND:
3229       exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3230       break;
3231
3232    case TGSI_OPCODE_OR:
3233       exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3234       break;
3235
3236    case TGSI_OPCODE_MOD:
3237       assert (0);
3238       break;
3239
3240    case TGSI_OPCODE_XOR:
3241       exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3242       break;
3243
3244    case TGSI_OPCODE_SAD:
3245       assert (0);
3246       break;
3247
3248    case TGSI_OPCODE_TXF:
3249       assert (0);
3250       break;
3251
3252    case TGSI_OPCODE_TXQ:
3253       assert (0);
3254       break;
3255
3256    case TGSI_OPCODE_EMIT:
3257       emit_vertex(mach);
3258       break;
3259
3260    case TGSI_OPCODE_ENDPRIM:
3261       emit_primitive(mach);
3262       break;
3263
3264    case TGSI_OPCODE_BGNFOR:
3265       assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3266       for (chan_index = 0; chan_index < 3; chan_index++) {
3267          FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3268       }
3269       ++mach->LoopCounterStackTop;
3270       STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3271       /* update LoopMask */
3272       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3273          mach->LoopMask &= ~0x1;
3274       }
3275       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3276          mach->LoopMask &= ~0x2;
3277       }
3278       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3279          mach->LoopMask &= ~0x4;
3280       }
3281       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3282          mach->LoopMask &= ~0x8;
3283       }
3284       /* TODO: if mach->LoopMask == 0, jump to end of loop */
3285       UPDATE_EXEC_MASK(mach);
3286       /* fall-through (for now) */
3287    case TGSI_OPCODE_BGNLOOP:
3288       /* push LoopMask and ContMasks */
3289       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3290       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3291       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3292       assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3293
3294       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3295       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3296       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3297       mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3298       mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3299       break;
3300
3301    case TGSI_OPCODE_ENDFOR:
3302       assert(mach->LoopCounterStackTop > 0);
3303       micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3304                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3305                 &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3306       /* update LoopMask */
3307       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3308          mach->LoopMask &= ~0x1;
3309       }
3310       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3311          mach->LoopMask &= ~0x2;
3312       }
3313       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3314          mach->LoopMask &= ~0x4;
3315       }
3316       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3317          mach->LoopMask &= ~0x8;
3318       }
3319       micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3320                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3321                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3322       assert(mach->LoopLabelStackTop > 0);
3323       inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3324       STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3325       /* Restore ContMask, but don't pop */
3326       assert(mach->ContStackTop > 0);
3327       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3328       UPDATE_EXEC_MASK(mach);
3329       if (mach->ExecMask) {
3330          /* repeat loop: jump to instruction just past BGNLOOP */
3331          assert(mach->LoopLabelStackTop > 0);
3332          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3333       }
3334       else {
3335          /* exit loop: pop LoopMask */
3336          assert(mach->LoopStackTop > 0);
3337          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3338          /* pop ContMask */
3339          assert(mach->ContStackTop > 0);
3340          mach->ContMask = mach->ContStack[--mach->ContStackTop];
3341          assert(mach->LoopLabelStackTop > 0);
3342          --mach->LoopLabelStackTop;
3343          assert(mach->LoopCounterStackTop > 0);
3344          --mach->LoopCounterStackTop;
3345
3346          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3347       }
3348       UPDATE_EXEC_MASK(mach);
3349       break;
3350
3351    case TGSI_OPCODE_ENDLOOP:
3352       /* Restore ContMask, but don't pop */
3353       assert(mach->ContStackTop > 0);
3354       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3355       UPDATE_EXEC_MASK(mach);
3356       if (mach->ExecMask) {
3357          /* repeat loop: jump to instruction just past BGNLOOP */
3358          assert(mach->LoopLabelStackTop > 0);
3359          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3360       }
3361       else {
3362          /* exit loop: pop LoopMask */
3363          assert(mach->LoopStackTop > 0);
3364          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3365          /* pop ContMask */
3366          assert(mach->ContStackTop > 0);
3367          mach->ContMask = mach->ContStack[--mach->ContStackTop];
3368          assert(mach->LoopLabelStackTop > 0);
3369          --mach->LoopLabelStackTop;
3370
3371          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3372       }
3373       UPDATE_EXEC_MASK(mach);
3374       break;
3375
3376    case TGSI_OPCODE_BRK:
3377       exec_break(mach);
3378       break;
3379
3380    case TGSI_OPCODE_CONT:
3381       /* turn off cont channels for each enabled exec channel */
3382       mach->ContMask &= ~mach->ExecMask;
3383       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3384       UPDATE_EXEC_MASK(mach);
3385       break;
3386
3387    case TGSI_OPCODE_BGNSUB:
3388       /* no-op */
3389       break;
3390
3391    case TGSI_OPCODE_ENDSUB:
3392       /*
3393        * XXX: This really should be a no-op. We should never reach this opcode.
3394        */
3395
3396       assert(mach->CallStackTop > 0);
3397       mach->CallStackTop--;
3398
3399       mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3400       mach->CondMask = mach->CondStack[mach->CondStackTop];
3401
3402       mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3403       mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3404
3405       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3406       mach->ContMask = mach->ContStack[mach->ContStackTop];
3407
3408       mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3409       mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3410
3411       mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3412       mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3413
3414       assert(mach->FuncStackTop > 0);
3415       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3416
3417       *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3418
3419       UPDATE_EXEC_MASK(mach);
3420       break;
3421
3422    case TGSI_OPCODE_NOP:
3423       break;
3424
3425    case TGSI_OPCODE_BREAKC:
3426       FETCH(&r[0], 0, CHAN_X);
3427       /* update LoopMask */
3428       if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3429          mach->LoopMask &= ~0x1;
3430       }
3431       if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3432          mach->LoopMask &= ~0x2;
3433       }
3434       if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3435          mach->LoopMask &= ~0x4;
3436       }
3437       if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3438          mach->LoopMask &= ~0x8;
3439       }
3440       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3441       UPDATE_EXEC_MASK(mach);
3442       break;
3443
3444    case TGSI_OPCODE_F2I:
3445       exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3446       break;
3447
3448    case TGSI_OPCODE_IDIV:
3449       exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3450       break;
3451
3452    case TGSI_OPCODE_IMAX:
3453       exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3454       break;
3455
3456    case TGSI_OPCODE_IMIN:
3457       exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3458       break;
3459
3460    case TGSI_OPCODE_INEG:
3461       exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3462       break;
3463
3464    case TGSI_OPCODE_ISGE:
3465       exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3466       break;
3467
3468    case TGSI_OPCODE_ISHR:
3469       exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3470       break;
3471
3472    case TGSI_OPCODE_ISLT:
3473       exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3474       break;
3475
3476    case TGSI_OPCODE_F2U:
3477       exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3478       break;
3479
3480    case TGSI_OPCODE_U2F:
3481       exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3482       break;
3483
3484    case TGSI_OPCODE_UADD:
3485       exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3486       break;
3487
3488    case TGSI_OPCODE_UDIV:
3489       exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3490       break;
3491
3492    case TGSI_OPCODE_UMAD:
3493       exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3494       break;
3495
3496    case TGSI_OPCODE_UMAX:
3497       exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3498       break;
3499
3500    case TGSI_OPCODE_UMIN:
3501       exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3502       break;
3503
3504    case TGSI_OPCODE_UMOD:
3505       exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3506       break;
3507
3508    case TGSI_OPCODE_UMUL:
3509       exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3510       break;
3511
3512    case TGSI_OPCODE_USEQ:
3513       exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3514       break;
3515
3516    case TGSI_OPCODE_USGE:
3517       exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3518       break;
3519
3520    case TGSI_OPCODE_USHR:
3521       exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3522       break;
3523
3524    case TGSI_OPCODE_USLT:
3525       exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3526       break;
3527
3528    case TGSI_OPCODE_USNE:
3529       exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3530       break;
3531
3532    case TGSI_OPCODE_SWITCH:
3533       exec_switch(mach, inst);
3534       break;
3535
3536    case TGSI_OPCODE_CASE:
3537       exec_case(mach, inst);
3538       break;
3539
3540    case TGSI_OPCODE_DEFAULT:
3541       exec_default(mach);
3542       break;
3543
3544    case TGSI_OPCODE_ENDSWITCH:
3545       exec_endswitch(mach);
3546       break;
3547
3548    default:
3549       assert( 0 );
3550    }
3551 }
3552
3553
3554 #define DEBUG_EXECUTION 0
3555
3556
3557 /**
3558  * Run TGSI interpreter.
3559  * \return bitmask of "alive" quad components
3560  */
3561 uint
3562 tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3563 {
3564    uint i;
3565    int pc = 0;
3566
3567    mach->CondMask = 0xf;
3568    mach->LoopMask = 0xf;
3569    mach->ContMask = 0xf;
3570    mach->FuncMask = 0xf;
3571    mach->ExecMask = 0xf;
3572
3573    mach->Switch.mask = 0xf;
3574
3575    assert(mach->CondStackTop == 0);
3576    assert(mach->LoopStackTop == 0);
3577    assert(mach->ContStackTop == 0);
3578    assert(mach->SwitchStackTop == 0);
3579    assert(mach->BreakStackTop == 0);
3580    assert(mach->CallStackTop == 0);
3581
3582    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3583    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3584
3585    if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3586       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3587       mach->Primitives[0] = 0;
3588    }
3589
3590    for (i = 0; i < QUAD_SIZE; i++) {
3591       mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3592          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3593          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3594          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3595          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3596    }
3597
3598    /* execute declarations (interpolants) */
3599    for (i = 0; i < mach->NumDeclarations; i++) {
3600       exec_declaration( mach, mach->Declarations+i );
3601    }
3602
3603    {
3604 #if DEBUG_EXECUTION
3605       struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3606       struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3607       uint inst = 1;
3608
3609       memcpy(temps, mach->Temps, sizeof(temps));
3610       memcpy(outputs, mach->Outputs, sizeof(outputs));
3611 #endif
3612
3613       /* execute instructions, until pc is set to -1 */
3614       while (pc != -1) {
3615
3616 #if DEBUG_EXECUTION
3617          uint i;
3618
3619          tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3620 #endif
3621
3622          assert(pc < (int) mach->NumInstructions);
3623          exec_instruction(mach, mach->Instructions + pc, &pc);
3624
3625 #if DEBUG_EXECUTION
3626          for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3627             if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3628                uint j;
3629
3630                memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3631                debug_printf("TEMP[%2u] = ", i);
3632                for (j = 0; j < 4; j++) {
3633                   if (j > 0) {
3634                      debug_printf("           ");
3635                   }
3636                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3637                                temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
3638                                temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
3639                                temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
3640                                temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
3641                }
3642             }
3643          }
3644          for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3645             if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3646                uint j;
3647
3648                memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3649                debug_printf("OUT[%2u] =  ", i);
3650                for (j = 0; j < 4; j++) {
3651                   if (j > 0) {
3652                      debug_printf("           ");
3653                   }
3654                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3655                                outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
3656                                outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
3657                                outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
3658                                outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
3659                }
3660             }
3661          }
3662 #endif
3663       }
3664    }
3665
3666 #if 0
3667    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3668    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3669       /*
3670        * Scale back depth component.
3671        */
3672       for (i = 0; i < 4; i++)
3673          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3674    }
3675 #endif
3676
3677    assert(mach->CondStackTop == 0);
3678    assert(mach->LoopStackTop == 0);
3679    assert(mach->ContStackTop == 0);
3680    assert(mach->SwitchStackTop == 0);
3681    assert(mach->BreakStackTop == 0);
3682    assert(mach->CallStackTop == 0);
3683
3684    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3685 }