src/gallium/auxiliary/tgsi/tgsi_exec.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the
   9  * "Software"), to deal in the Software without restriction, including
  10  * without limitation the rights to use, copy, modify, merge, publish,
  11  * distribute, sub license, and/or sell copies of the Software, and to
  12  * permit persons to whom the Software is furnished to do so, subject to
  13  * the following conditions:
  14  *
  15  * The above copyright notice and this permission notice (including the
  16  * next paragraph) shall be included in all copies or substantial portions
  17  * of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  22  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  26  *
  27  **************************************************************************/
  28
  29 /**
  30  * TGSI interpreter/executor.
  31  *
  32  * Flow control information:
  33  *
  34  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
  35  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
  36  * care since a condition may be true for some quad components but false
  37  * for other components.
  38  *
  39  * We basically execute all statements (even if they're in the part of
  40  * an IF/ELSE clause that's "not taken") and use a special mask to
  41  * control writing to destination registers.  This is the ExecMask.
  42  * See store_dest().
  43  *
 * The ExecMask is computed from several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK) which are
 * controlled by the flow control instructions (for example IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
  47  *
  48  *
  49  * Authors:
  50  *   Michal Krol
  51  *   Brian Paul
  52  */
  53
  54 #include "pipe/p_compiler.h"
  55 #include "pipe/p_state.h"
  56 #include "pipe/p_shader_tokens.h"
  57 #include "tgsi/tgsi_dump.h"
  58 #include "tgsi/tgsi_parse.h"
  59 #include "tgsi/tgsi_util.h"
  60 #include "tgsi_exec.h"
  61 #include "util/u_memory.h"
  62 #include "util/u_math.h"
  63
  64
  65 #define FAST_MATH 1
  66
  67 #define TILE_TOP_LEFT     0
  68 #define TILE_TOP_RIGHT    1
  69 #define TILE_BOTTOM_LEFT  2
  70 #define TILE_BOTTOM_RIGHT 3
  71
  72 static void
  73 micro_abs(union tgsi_exec_channel *dst,
  74           const union tgsi_exec_channel *src)
  75 {
  76    dst->f[0] = fabsf(src->f[0]);
  77    dst->f[1] = fabsf(src->f[1]);
  78    dst->f[2] = fabsf(src->f[2]);
  79    dst->f[3] = fabsf(src->f[3]);
  80 }
  81
  82 static void
  83 micro_arl(union tgsi_exec_channel *dst,
  84           const union tgsi_exec_channel *src)
  85 {
  86    dst->i[0] = (int)floorf(src->f[0]);
  87    dst->i[1] = (int)floorf(src->f[1]);
  88    dst->i[2] = (int)floorf(src->f[2]);
  89    dst->i[3] = (int)floorf(src->f[3]);
  90 }
  91
  92 static void
  93 micro_arr(union tgsi_exec_channel *dst,
  94           const union tgsi_exec_channel *src)
  95 {
  96    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
  97    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
  98    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
  99    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
 100 }
 101
 102 static void
 103 micro_ceil(union tgsi_exec_channel *dst,
 104            const union tgsi_exec_channel *src)
 105 {
 106    dst->f[0] = ceilf(src->f[0]);
 107    dst->f[1] = ceilf(src->f[1]);
 108    dst->f[2] = ceilf(src->f[2]);
 109    dst->f[3] = ceilf(src->f[3]);
 110 }
 111
 112 static void
 113 micro_cos(union tgsi_exec_channel *dst,
 114           const union tgsi_exec_channel *src)
 115 {
 116    dst->f[0] = cosf(src->f[0]);
 117    dst->f[1] = cosf(src->f[1]);
 118    dst->f[2] = cosf(src->f[2]);
 119    dst->f[3] = cosf(src->f[3]);
 120 }
 121
 122 static void
 123 micro_ddx(union tgsi_exec_channel *dst,
 124           const union tgsi_exec_channel *src)
 125 {
 126    dst->f[0] =
 127    dst->f[1] =
 128    dst->f[2] =
 129    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 130 }
 131
 132 static void
 133 micro_ddy(union tgsi_exec_channel *dst,
 134           const union tgsi_exec_channel *src)
 135 {
 136    dst->f[0] =
 137    dst->f[1] =
 138    dst->f[2] =
 139    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
 140 }
 141
 142 static void
 143 micro_exp2(union tgsi_exec_channel *dst,
 144            const union tgsi_exec_channel *src)
 145 {
 146 #if FAST_MATH
 147    dst->f[0] = util_fast_exp2(src->f[0]);
 148    dst->f[1] = util_fast_exp2(src->f[1]);
 149    dst->f[2] = util_fast_exp2(src->f[2]);
 150    dst->f[3] = util_fast_exp2(src->f[3]);
 151 #else
 152 #if DEBUG
 153    /* Inf is okay for this instruction, so clamp it to silence assertions. */
 154    uint i;
 155    union tgsi_exec_channel clamped;
 156
 157    for (i = 0; i < 4; i++) {
 158       if (src->f[i] > 127.99999f) {
 159          clamped.f[i] = 127.99999f;
 160       } else if (src->f[i] < -126.99999f) {
 161          clamped.f[i] = -126.99999f;
 162       } else {
 163          clamped.f[i] = src->f[i];
 164       }
 165    }
 166    src = &clamped;
 167 #endif /* DEBUG */
 168
 169    dst->f[0] = powf(2.0f, src->f[0]);
 170    dst->f[1] = powf(2.0f, src->f[1]);
 171    dst->f[2] = powf(2.0f, src->f[2]);
 172    dst->f[3] = powf(2.0f, src->f[3]);
 173 #endif /* FAST_MATH */
 174 }
 175
 176 static void
 177 micro_flr(union tgsi_exec_channel *dst,
 178           const union tgsi_exec_channel *src)
 179 {
 180    dst->f[0] = floorf(src->f[0]);
 181    dst->f[1] = floorf(src->f[1]);
 182    dst->f[2] = floorf(src->f[2]);
 183    dst->f[3] = floorf(src->f[3]);
 184 }
 185
 186 static void
 187 micro_frc(union tgsi_exec_channel *dst,
 188           const union tgsi_exec_channel *src)
 189 {
 190    dst->f[0] = src->f[0] - floorf(src->f[0]);
 191    dst->f[1] = src->f[1] - floorf(src->f[1]);
 192    dst->f[2] = src->f[2] - floorf(src->f[2]);
 193    dst->f[3] = src->f[3] - floorf(src->f[3]);
 194 }
 195
 196 static void
 197 micro_iabs(union tgsi_exec_channel *dst,
 198            const union tgsi_exec_channel *src)
 199 {
 200    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
 201    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
 202    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
 203    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
 204 }
 205
 206 static void
 207 micro_ineg(union tgsi_exec_channel *dst,
 208            const union tgsi_exec_channel *src)
 209 {
 210    dst->i[0] = -src->i[0];
 211    dst->i[1] = -src->i[1];
 212    dst->i[2] = -src->i[2];
 213    dst->i[3] = -src->i[3];
 214 }
 215
 216 static void
 217 micro_lg2(union tgsi_exec_channel *dst,
 218           const union tgsi_exec_channel *src)
 219 {
 220 #if FAST_MATH
 221    dst->f[0] = util_fast_log2(src->f[0]);
 222    dst->f[1] = util_fast_log2(src->f[1]);
 223    dst->f[2] = util_fast_log2(src->f[2]);
 224    dst->f[3] = util_fast_log2(src->f[3]);
 225 #else
 226    dst->f[0] = logf(src->f[0]) * 1.442695f;
 227    dst->f[1] = logf(src->f[1]) * 1.442695f;
 228    dst->f[2] = logf(src->f[2]) * 1.442695f;
 229    dst->f[3] = logf(src->f[3]) * 1.442695f;
 230 #endif
 231 }
 232
 233 static void
 234 micro_lrp(union tgsi_exec_channel *dst,
 235           const union tgsi_exec_channel *src)
 236 {
 237    dst->f[0] = src[0].f[0] * (src[1].f[0] - src[2].f[0]) + src[2].f[0];
 238    dst->f[1] = src[0].f[1] * (src[1].f[1] - src[2].f[1]) + src[2].f[1];
 239    dst->f[2] = src[0].f[2] * (src[1].f[2] - src[2].f[2]) + src[2].f[2];
 240    dst->f[3] = src[0].f[3] * (src[1].f[3] - src[2].f[3]) + src[2].f[3];
 241 }
 242
 243 static void
 244 micro_mad(union tgsi_exec_channel *dst,
 245           const union tgsi_exec_channel *src)
 246 {
 247    dst->f[0] = src[0].f[0] * src[1].f[0] + src[2].f[0];
 248    dst->f[1] = src[0].f[1] * src[1].f[1] + src[2].f[1];
 249    dst->f[2] = src[0].f[2] * src[1].f[2] + src[2].f[2];
 250    dst->f[3] = src[0].f[3] * src[1].f[3] + src[2].f[3];
 251 }
 252
 253 static void
 254 micro_mov(union tgsi_exec_channel *dst,
 255           const union tgsi_exec_channel *src)
 256 {
 257    dst->u[0] = src->u[0];
 258    dst->u[1] = src->u[1];
 259    dst->u[2] = src->u[2];
 260    dst->u[3] = src->u[3];
 261 }
 262
 263 static void
 264 micro_rcp(union tgsi_exec_channel *dst,
 265           const union tgsi_exec_channel *src)
 266 {
 267 #if 0 /* for debugging */
 268    assert(src->f[0] != 0.0f);
 269    assert(src->f[1] != 0.0f);
 270    assert(src->f[2] != 0.0f);
 271    assert(src->f[3] != 0.0f);
 272 #endif
 273    dst->f[0] = 1.0f / src->f[0];
 274    dst->f[1] = 1.0f / src->f[1];
 275    dst->f[2] = 1.0f / src->f[2];
 276    dst->f[3] = 1.0f / src->f[3];
 277 }
 278
 279 static void
 280 micro_rnd(union tgsi_exec_channel *dst,
 281           const union tgsi_exec_channel *src)
 282 {
 283    dst->f[0] = floorf(src->f[0] + 0.5f);
 284    dst->f[1] = floorf(src->f[1] + 0.5f);
 285    dst->f[2] = floorf(src->f[2] + 0.5f);
 286    dst->f[3] = floorf(src->f[3] + 0.5f);
 287 }
 288
 289 static void
 290 micro_rsq(union tgsi_exec_channel *dst,
 291           const union tgsi_exec_channel *src)
 292 {
 293 #if 0 /* for debugging */
 294    assert(src->f[0] != 0.0f);
 295    assert(src->f[1] != 0.0f);
 296    assert(src->f[2] != 0.0f);
 297    assert(src->f[3] != 0.0f);
 298 #endif
 299    dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
 300    dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
 301    dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
 302    dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
 303 }
 304
 305 static void
 306 micro_seq(union tgsi_exec_channel *dst,
 307           const union tgsi_exec_channel *src)
 308 {
 309    dst->f[0] = src[0].f[0] == src[1].f[0] ? 1.0f : 0.0f;
 310    dst->f[1] = src[0].f[1] == src[1].f[1] ? 1.0f : 0.0f;
 311    dst->f[2] = src[0].f[2] == src[1].f[2] ? 1.0f : 0.0f;
 312    dst->f[3] = src[0].f[3] == src[1].f[3] ? 1.0f : 0.0f;
 313 }
 314
 315 static void
 316 micro_sge(union tgsi_exec_channel *dst,
 317           const union tgsi_exec_channel *src)
 318 {
 319    dst->f[0] = src[0].f[0] >= src[1].f[0] ? 1.0f : 0.0f;
 320    dst->f[1] = src[0].f[1] >= src[1].f[1] ? 1.0f : 0.0f;
 321    dst->f[2] = src[0].f[2] >= src[1].f[2] ? 1.0f : 0.0f;
 322    dst->f[3] = src[0].f[3] >= src[1].f[3] ? 1.0f : 0.0f;
 323 }
 324
 325 static void
 326 micro_sgn(union tgsi_exec_channel *dst,
 327           const union tgsi_exec_channel *src)
 328 {
 329    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
 330    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
 331    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
 332    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
 333 }
 334
 335 static void
 336 micro_sgt(union tgsi_exec_channel *dst,
 337           const union tgsi_exec_channel *src)
 338 {
 339    dst->f[0] = src[0].f[0] > src[1].f[0] ? 1.0f : 0.0f;
 340    dst->f[1] = src[0].f[1] > src[1].f[1] ? 1.0f : 0.0f;
 341    dst->f[2] = src[0].f[2] > src[1].f[2] ? 1.0f : 0.0f;
 342    dst->f[3] = src[0].f[3] > src[1].f[3] ? 1.0f : 0.0f;
 343 }
 344
 345 static void
 346 micro_sin(union tgsi_exec_channel *dst,
 347           const union tgsi_exec_channel *src)
 348 {
 349    dst->f[0] = sinf(src->f[0]);
 350    dst->f[1] = sinf(src->f[1]);
 351    dst->f[2] = sinf(src->f[2]);
 352    dst->f[3] = sinf(src->f[3]);
 353 }
 354
 355 static void
 356 micro_sle(union tgsi_exec_channel *dst,
 357           const union tgsi_exec_channel *src)
 358 {
 359    dst->f[0] = src[0].f[0] <= src[1].f[0] ? 1.0f : 0.0f;
 360    dst->f[1] = src[0].f[1] <= src[1].f[1] ? 1.0f : 0.0f;
 361    dst->f[2] = src[0].f[2] <= src[1].f[2] ? 1.0f : 0.0f;
 362    dst->f[3] = src[0].f[3] <= src[1].f[3] ? 1.0f : 0.0f;
 363 }
 364
 365 static void
 366 micro_slt(union tgsi_exec_channel *dst,
 367           const union tgsi_exec_channel *src)
 368 {
 369    dst->f[0] = src[0].f[0] < src[1].f[0] ? 1.0f : 0.0f;
 370    dst->f[1] = src[0].f[1] < src[1].f[1] ? 1.0f : 0.0f;
 371    dst->f[2] = src[0].f[2] < src[1].f[2] ? 1.0f : 0.0f;
 372    dst->f[3] = src[0].f[3] < src[1].f[3] ? 1.0f : 0.0f;
 373 }
 374
 375 static void
 376 micro_sne(union tgsi_exec_channel *dst,
 377           const union tgsi_exec_channel *src)
 378 {
 379    dst->f[0] = src[0].f[0] != src[1].f[0] ? 1.0f : 0.0f;
 380    dst->f[1] = src[0].f[1] != src[1].f[1] ? 1.0f : 0.0f;
 381    dst->f[2] = src[0].f[2] != src[1].f[2] ? 1.0f : 0.0f;
 382    dst->f[3] = src[0].f[3] != src[1].f[3] ? 1.0f : 0.0f;
 383 }
 384
 385 static void
 386 micro_trunc(union tgsi_exec_channel *dst,
 387             const union tgsi_exec_channel *src)
 388 {
 389    dst->f[0] = (float)(int)src->f[0];
 390    dst->f[1] = (float)(int)src->f[1];
 391    dst->f[2] = (float)(int)src->f[2];
 392    dst->f[3] = (float)(int)src->f[3];
 393 }
 394
 395
/* Numeric indices (0..3) for the X, Y, Z and W components. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3

/*
 * Tags for float, signed-integer and unsigned-integer data.
 * NOTE(review): the users of this enum are outside this part of the file;
 * presumably it selects which view of a channel is read or written --
 * confirm against the fetch/store helpers.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};
 406
 407 /*
 408  * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 409  */
 410 #define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
 411 #define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
 412 #define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
 413 #define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
 414 #define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
 415 #define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
 416 #define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
 417 #define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
 418 #define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
 419 #define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
 420 #define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
 421 #define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
 422 #define TEMP_128_I         TGSI_EXEC_TEMP_128_I
 423 #define TEMP_128_C         TGSI_EXEC_TEMP_128_C
 424 #define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
 425 #define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
 426 #define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
 427 #define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
 428 #define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
 429 #define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
 430 #define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
 431 #define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
 432 #define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
 433 #define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
 434 #define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
 435 #define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
 436 #define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
 437 #define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
 438 #define TEMP_R0            TGSI_EXEC_TEMP_R0
 439 #define TEMP_P0            TGSI_EXEC_TEMP_P0
 440
/* Non-zero if bit CHAN is set in the write mask of INST's destination 0. */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/* Same test as IS_CHANNEL_ENABLED, but for INST's destination 1. */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))

/*
 * Loop header: runs the statement that follows once for every channel
 * CHAN in [0, NUM_CHANNELS) that is enabled in destination 0's write mask.
 * CHAN must be an assignable variable.  The expansion ends in a bare 'if',
 * so an 'else' written directly after the loop body would bind to it.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/* Same as FOR_EACH_ENABLED_CHANNEL, but tests destination 1's write mask. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
 454
 455
 456 /** The execution mask depends on the conditional mask and the loop mask */
 457 #define UPDATE_EXEC_MASK(MACH) \
 458       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
 459
 460
 461 static const union tgsi_exec_channel ZeroVec =
 462    { { 0.0, 0.0, 0.0, 0.0 } };
 463
 464 static const union tgsi_exec_channel OneVec = {
 465    {1.0f, 1.0f, 1.0f, 1.0f}
 466 };
 467
 468
 469 /**
 470  * Assert that none of the float values in 'chan' are infinite or NaN.
 471  * NaN and Inf may occur normally during program execution and should
 472  * not lead to crashes, etc.  But when debugging, it's helpful to catch
 473  * them.
 474  */
 475 static INLINE void
 476 check_inf_or_nan(const union tgsi_exec_channel *chan)
 477 {
 478    assert(!util_is_inf_or_nan((chan)->f[0]));
 479    assert(!util_is_inf_or_nan((chan)->f[1]));
 480    assert(!util_is_inf_or_nan((chan)->f[2]));
 481    assert(!util_is_inf_or_nan((chan)->f[3]));
 482 }
 483
 484
 485 #ifdef DEBUG
 486 static void
 487 print_chan(const char *msg, const union tgsi_exec_channel *chan)
 488 {
 489    debug_printf("%s = {%f, %f, %f, %f}\n",
 490                 msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
 491 }
 492 #endif
 493
 494
 495 #ifdef DEBUG
 496 static void
 497 print_temp(const struct tgsi_exec_machine *mach, uint index)
 498 {
 499    const struct tgsi_exec_vector *tmp = &mach->Temps[index];
 500    int i;
 501    debug_printf("Temp[%u] =\n", index);
 502    for (i = 0; i < 4; i++) {
 503       debug_printf("  %c: { %f, %f, %f, %f }\n",
 504                    "XYZW"[i],
 505                    tmp->xyzw[i].f[0],
 506                    tmp->xyzw[i].f[1],
 507                    tmp->xyzw[i].f[2],
 508                    tmp->xyzw[i].f[3]);
 509    }
 510 }
 511 #endif
 512
 513
 514 /**
 515  * Check if there's a potential src/dst register data dependency when
 516  * using SOA execution.
 517  * Example:
 518  *   MOV T, T.yxwz;
 519  * This would expand into:
 520  *   MOV t0, t1;
 521  *   MOV t1, t0;
 522  *   MOV t2, t3;
 523  *   MOV t3, t2;
 524  * The second instruction will have the wrong value for t0 if executed as-is.
 525  */
 526 boolean
 527 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
 528 {
 529    uint i, chan;
 530
 531    uint writemask = inst->Dst[0].Register.WriteMask;
 532    if (writemask == TGSI_WRITEMASK_X ||
 533        writemask == TGSI_WRITEMASK_Y ||
 534        writemask == TGSI_WRITEMASK_Z ||
 535        writemask == TGSI_WRITEMASK_W ||
 536        writemask == TGSI_WRITEMASK_NONE) {
 537       /* no chance of data dependency */
 538       return FALSE;
 539    }
 540
 541    /* loop over src regs */
 542    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 543       if ((inst->Src[i].Register.File ==
 544            inst->Dst[0].Register.File) &&
 545           (inst->Src[i].Register.Index ==
 546            inst->Dst[0].Register.Index)) {
 547          /* loop over dest channels */
 548          uint channelsWritten = 0x0;
 549          FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
 550             /* check if we're reading a channel that's been written */
 551             uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
 552             if (channelsWritten & (1 << swizzle)) {
 553                return TRUE;
 554             }
 555
 556             channelsWritten |= (1 << chan);
 557          }
 558       }
 559    }
 560    return FALSE;
 561 }
 562
 563
 564 /**
 565  * Initialize machine state by expanding tokens to full instructions,
 566  * allocating temporary storage, setting up constants, etc.
 567  * After this, we can call tgsi_exec_machine_run() many times.
 568  */
 569 void
 570 tgsi_exec_machine_bind_shader(
 571    struct tgsi_exec_machine *mach,
 572    const struct tgsi_token *tokens,
 573    uint numSamplers,
 574    struct tgsi_sampler **samplers)
 575 {
 576    uint k;
 577    struct tgsi_parse_context parse;
 578    struct tgsi_exec_labels *labels = &mach->Labels;
 579    struct tgsi_full_instruction *instructions;
 580    struct tgsi_full_declaration *declarations;
 581    uint maxInstructions = 10, numInstructions = 0;
 582    uint maxDeclarations = 10, numDeclarations = 0;
 583    uint instno = 0;
 584
 585 #if 0
 586    tgsi_dump(tokens, 0);
 587 #endif
 588
 589    util_init_math();
 590
 591    mach->Tokens = tokens;
 592    mach->Samplers = samplers;
 593
 594    k = tgsi_parse_init (&parse, mach->Tokens);
 595    if (k != TGSI_PARSE_OK) {
 596       debug_printf( "Problem parsing!\n" );
 597       return;
 598    }
 599
 600    mach->Processor = parse.FullHeader.Processor.Processor;
 601    mach->ImmLimit = 0;
 602    labels->count = 0;
 603
 604    declarations = (struct tgsi_full_declaration *)
 605       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
 606
 607    if (!declarations) {
 608       return;
 609    }
 610
 611    instructions = (struct tgsi_full_instruction *)
 612       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
 613
 614    if (!instructions) {
 615       FREE( declarations );
 616       return;
 617    }
 618
 619    while( !tgsi_parse_end_of_tokens( &parse ) ) {
 620       uint pointer = parse.Position;
 621       uint i;
 622
 623       tgsi_parse_token( &parse );
 624       switch( parse.FullToken.Token.Type ) {
 625       case TGSI_TOKEN_TYPE_DECLARATION:
 626          /* save expanded declaration */
 627          if (numDeclarations == maxDeclarations) {
 628             declarations = REALLOC(declarations,
 629                                    maxDeclarations
 630                                    * sizeof(struct tgsi_full_declaration),
 631                                    (maxDeclarations + 10)
 632                                    * sizeof(struct tgsi_full_declaration));
 633             maxDeclarations += 10;
 634          }
 635          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
 636             unsigned reg;
 637             for (reg = parse.FullToken.FullDeclaration.Range.First;
 638                  reg <= parse.FullToken.FullDeclaration.Range.Last;
 639                  ++reg) {
 640                ++mach->NumOutputs;
 641             }
 642          }
 643          memcpy(declarations + numDeclarations,
 644                 &parse.FullToken.FullDeclaration,
 645                 sizeof(declarations[0]));
 646          numDeclarations++;
 647          break;
 648
 649       case TGSI_TOKEN_TYPE_IMMEDIATE:
 650          {
 651             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
 652             assert( size <= 4 );
 653             assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
 654
 655             for( i = 0; i < size; i++ ) {
 656                mach->Imms[mach->ImmLimit][i] =
 657                   parse.FullToken.FullImmediate.u[i].Float;
 658             }
 659             mach->ImmLimit += 1;
 660          }
 661          break;
 662
 663       case TGSI_TOKEN_TYPE_INSTRUCTION:
 664          assert( labels->count < MAX_LABELS );
 665
 666          labels->labels[labels->count][0] = instno;
 667          labels->labels[labels->count][1] = pointer;
 668          labels->count++;
 669
 670          /* save expanded instruction */
 671          if (numInstructions == maxInstructions) {
 672             instructions = REALLOC(instructions,
 673                                    maxInstructions
 674                                    * sizeof(struct tgsi_full_instruction),
 675                                    (maxInstructions + 10)
 676                                    * sizeof(struct tgsi_full_instruction));
 677             maxInstructions += 10;
 678          }
 679
 680          memcpy(instructions + numInstructions,
 681                 &parse.FullToken.FullInstruction,
 682                 sizeof(instructions[0]));
 683
 684          numInstructions++;
 685          break;
 686
 687       case TGSI_TOKEN_TYPE_PROPERTY:
 688          break;
 689
 690       default:
 691          assert( 0 );
 692       }
 693    }
 694    tgsi_parse_free (&parse);
 695
 696    if (mach->Declarations) {
 697       FREE( mach->Declarations );
 698    }
 699    mach->Declarations = declarations;
 700    mach->NumDeclarations = numDeclarations;
 701
 702    if (mach->Instructions) {
 703       FREE( mach->Instructions );
 704    }
 705    mach->Instructions = instructions;
 706    mach->NumInstructions = numInstructions;
 707 }
 708
 709
 710 struct tgsi_exec_machine *
 711 tgsi_exec_machine_create( void )
 712 {
 713    struct tgsi_exec_machine *mach;
 714    uint i;
 715
 716    mach = align_malloc( sizeof *mach, 16 );
 717    if (!mach)
 718       goto fail;
 719
 720    memset(mach, 0, sizeof(*mach));
 721
 722    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
 723    mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
 724    mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
 725
 726    /* Setup constants. */
 727    for( i = 0; i < 4; i++ ) {
 728       mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
 729       mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
 730       mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
 731       mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
 732       mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
 733       mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
 734       mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
 735       mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
 736       mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
 737       mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
 738    }
 739
 740 #ifdef DEBUG
 741    /* silence warnings */
 742    (void) print_chan;
 743    (void) print_temp;
 744 #endif
 745
 746    return mach;
 747
 748 fail:
 749    align_free(mach);
 750    return NULL;
 751 }
 752
 753
 754 void
 755 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
 756 {
 757    if (mach) {
 758       FREE(mach->Instructions);
 759       FREE(mach->Declarations);
 760    }
 761
 762    align_free(mach);
 763 }
 764
 765 static void
 766 micro_add(
 767    union tgsi_exec_channel *dst,
 768    const union tgsi_exec_channel *src0,
 769    const union tgsi_exec_channel *src1 )
 770 {
 771    dst->f[0] = src0->f[0] + src1->f[0];
 772    dst->f[1] = src0->f[1] + src1->f[1];
 773    dst->f[2] = src0->f[2] + src1->f[2];
 774    dst->f[3] = src0->f[3] + src1->f[3];
 775 }
 776
 777 static void
 778 micro_div(
 779    union tgsi_exec_channel *dst,
 780    const union tgsi_exec_channel *src0,
 781    const union tgsi_exec_channel *src1 )
 782 {
 783    if (src1->f[0] != 0) {
 784       dst->f[0] = src0->f[0] / src1->f[0];
 785    }
 786    if (src1->f[1] != 0) {
 787       dst->f[1] = src0->f[1] / src1->f[1];
 788    }
 789    if (src1->f[2] != 0) {
 790       dst->f[2] = src0->f[2] / src1->f[2];
 791    }
 792    if (src1->f[3] != 0) {
 793       dst->f[3] = src0->f[3] / src1->f[3];
 794    }
 795 }
 796
 797 static void
 798 micro_float_clamp(union tgsi_exec_channel *dst,
 799                   const union tgsi_exec_channel *src)
 800 {
 801    uint i;
 802
 803    for (i = 0; i < 4; i++) {
 804       if (src->f[i] > 0.0f) {
 805          if (src->f[i] > 1.884467e+019f)
 806             dst->f[i] = 1.884467e+019f;
 807          else if (src->f[i] < 5.42101e-020f)
 808             dst->f[i] = 5.42101e-020f;
 809          else
 810             dst->f[i] = src->f[i];
 811       }
 812       else {
 813          if (src->f[i] < -1.884467e+019f)
 814             dst->f[i] = -1.884467e+019f;
 815          else if (src->f[i] > -5.42101e-020f)
 816             dst->f[i] = -5.42101e-020f;
 817          else
 818             dst->f[i] = src->f[i];
 819       }
 820    }
 821 }
 822
 823 static void
 824 micro_lt(
 825    union tgsi_exec_channel *dst,
 826    const union tgsi_exec_channel *src0,
 827    const union tgsi_exec_channel *src1,
 828    const union tgsi_exec_channel *src2,
 829    const union tgsi_exec_channel *src3 )
 830 {
 831    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
 832    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
 833    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
 834    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
 835 }
 836
 837 static void
 838 micro_max(
 839    union tgsi_exec_channel *dst,
 840    const union tgsi_exec_channel *src0,
 841    const union tgsi_exec_channel *src1 )
 842 {
 843    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
 844    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
 845    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
 846    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
 847 }
 848
 849 static void
 850 micro_min(
 851    union tgsi_exec_channel *dst,
 852    const union tgsi_exec_channel *src0,
 853    const union tgsi_exec_channel *src1 )
 854 {
 855    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
 856    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
 857    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
 858    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
 859 }
 860
 861 static void
 862 micro_mul(
 863    union tgsi_exec_channel *dst,
 864    const union tgsi_exec_channel *src0,
 865    const union tgsi_exec_channel *src1 )
 866 {
 867    dst->f[0] = src0->f[0] * src1->f[0];
 868    dst->f[1] = src0->f[1] * src1->f[1];
 869    dst->f[2] = src0->f[2] * src1->f[2];
 870    dst->f[3] = src0->f[3] * src1->f[3];
 871 }
 872
 873 #if 0
 874 static void
 875 micro_imul64(
 876    union tgsi_exec_channel *dst0,
 877    union tgsi_exec_channel *dst1,
 878    const union tgsi_exec_channel *src0,
 879    const union tgsi_exec_channel *src1 )
 880 {
 881    dst1->i[0] = src0->i[0] * src1->i[0];
 882    dst1->i[1] = src0->i[1] * src1->i[1];
 883    dst1->i[2] = src0->i[2] * src1->i[2];
 884    dst1->i[3] = src0->i[3] * src1->i[3];
 885    dst0->i[0] = 0;
 886    dst0->i[1] = 0;
 887    dst0->i[2] = 0;
 888    dst0->i[3] = 0;
 889 }
 890 #endif
 891
 892 #if 0
 893 static void
 894 micro_umul64(
 895    union tgsi_exec_channel *dst0,
 896    union tgsi_exec_channel *dst1,
 897    const union tgsi_exec_channel *src0,
 898    const union tgsi_exec_channel *src1 )
 899 {
 900    dst1->u[0] = src0->u[0] * src1->u[0];
 901    dst1->u[1] = src0->u[1] * src1->u[1];
 902    dst1->u[2] = src0->u[2] * src1->u[2];
 903    dst1->u[3] = src0->u[3] * src1->u[3];
 904    dst0->u[0] = 0;
 905    dst0->u[1] = 0;
 906    dst0->u[2] = 0;
 907    dst0->u[3] = 0;
 908 }
 909 #endif
 910
 911
 912 #if 0
 913 static void
 914 micro_movc(
 915    union tgsi_exec_channel *dst,
 916    const union tgsi_exec_channel *src0,
 917    const union tgsi_exec_channel *src1,
 918    const union tgsi_exec_channel *src2 )
 919 {
 920    dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
 921    dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
 922    dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
 923    dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
 924 }
 925 #endif
 926
 927 static void
 928 micro_neg(
 929    union tgsi_exec_channel *dst,
 930    const union tgsi_exec_channel *src )
 931 {
 932    dst->f[0] = -src->f[0];
 933    dst->f[1] = -src->f[1];
 934    dst->f[2] = -src->f[2];
 935    dst->f[3] = -src->f[3];
 936 }
 937
 938 static void
 939 micro_pow(
 940    union tgsi_exec_channel *dst,
 941    const union tgsi_exec_channel *src0,
 942    const union tgsi_exec_channel *src1 )
 943 {
 944 #if FAST_MATH
 945    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
 946    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
 947    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
 948    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
 949 #else
 950    dst->f[0] = powf( src0->f[0], src1->f[0] );
 951    dst->f[1] = powf( src0->f[1], src1->f[1] );
 952    dst->f[2] = powf( src0->f[2], src1->f[2] );
 953    dst->f[3] = powf( src0->f[3], src1->f[3] );
 954 #endif
 955 }
 956
 957 static void
 958 micro_sqrt( union tgsi_exec_channel *dst,
 959             const union tgsi_exec_channel *src )
 960 {
 961    dst->f[0] = sqrtf( src->f[0] );
 962    dst->f[1] = sqrtf( src->f[1] );
 963    dst->f[2] = sqrtf( src->f[2] );
 964    dst->f[3] = sqrtf( src->f[3] );
 965 }
 966
 967 static void
 968 micro_sub(
 969    union tgsi_exec_channel *dst,
 970    const union tgsi_exec_channel *src0,
 971    const union tgsi_exec_channel *src1 )
 972 {
 973    dst->f[0] = src0->f[0] - src1->f[0];
 974    dst->f[1] = src0->f[1] - src1->f[1];
 975    dst->f[2] = src0->f[2] - src1->f[2];
 976    dst->f[3] = src0->f[3] - src1->f[3];
 977 }
 978
 979 static void
 980 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
 981                        const uint file,
 982                        const uint swizzle,
 983                        const union tgsi_exec_channel *index,
 984                        const union tgsi_exec_channel *index2D,
 985                        union tgsi_exec_channel *chan)
 986 {
 987    uint i;
 988
 989    switch (file) {
 990    case TGSI_FILE_CONSTANT:
 991       for (i = 0; i < QUAD_SIZE; i++) {
 992          assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
 993          assert(mach->Consts[index2D->i[i]]);
 994
 995          if (index->i[i] < 0) {
 996             chan->u[i] = 0;
 997          } else {
 998             const uint *p = (const uint *)mach->Consts[index2D->i[i]];
 999
1000             chan->u[i] = p[index->i[i] * 4 + swizzle];
1001          }
1002       }
1003       break;
1004
1005    case TGSI_FILE_INPUT:
1006    case TGSI_FILE_SYSTEM_VALUE:
1007       for (i = 0; i < QUAD_SIZE; i++) {
1008          /* XXX: 2D indexing */
1009          chan->u[i] = mach->Inputs[index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i]].xyzw[swizzle].u[i];
1010       }
1011       break;
1012
1013    case TGSI_FILE_TEMPORARY:
1014       for (i = 0; i < QUAD_SIZE; i++) {
1015          assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1016          assert(index2D->i[i] == 0);
1017
1018          chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1019       }
1020       break;
1021
1022    case TGSI_FILE_IMMEDIATE:
1023       for (i = 0; i < QUAD_SIZE; i++) {
1024          assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1025          assert(index2D->i[i] == 0);
1026
1027          chan->f[i] = mach->Imms[index->i[i]][swizzle];
1028       }
1029       break;
1030
1031    case TGSI_FILE_ADDRESS:
1032       for (i = 0; i < QUAD_SIZE; i++) {
1033          assert(index->i[i] >= 0);
1034          assert(index2D->i[i] == 0);
1035
1036          chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1037       }
1038       break;
1039
1040    case TGSI_FILE_PREDICATE:
1041       for (i = 0; i < QUAD_SIZE; i++) {
1042          assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1043          assert(index2D->i[i] == 0);
1044
1045          chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1046       }
1047       break;
1048
1049    case TGSI_FILE_OUTPUT:
1050       /* vertex/fragment output vars can be read too */
1051       for (i = 0; i < QUAD_SIZE; i++) {
1052          assert(index->i[i] >= 0);
1053          assert(index2D->i[i] == 0);
1054
1055          chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1056       }
1057       break;
1058
1059    default:
1060       assert(0);
1061       for (i = 0; i < QUAD_SIZE; i++) {
1062          chan->u[i] = 0;
1063       }
1064    }
1065 }
1066
1067 static void
1068 fetch_source(const struct tgsi_exec_machine *mach,
1069              union tgsi_exec_channel *chan,
1070              const struct tgsi_full_src_register *reg,
1071              const uint chan_index,
1072              enum tgsi_exec_datatype src_datatype)
1073 {
1074    union tgsi_exec_channel index;
1075    union tgsi_exec_channel index2D;
1076    uint swizzle;
1077
1078    /* We start with a direct index into a register file.
1079     *
1080     *    file[1],
1081     *    where:
1082     *       file = Register.File
1083     *       [1] = Register.Index
1084     */
1085    index.i[0] =
1086    index.i[1] =
1087    index.i[2] =
1088    index.i[3] = reg->Register.Index;
1089
1090    /* There is an extra source register that indirectly subscripts
1091     * a register file. The direct index now becomes an offset
1092     * that is being added to the indirect register.
1093     *
1094     *    file[ind[2].x+1],
1095     *    where:
1096     *       ind = Indirect.File
1097     *       [2] = Indirect.Index
1098     *       .x = Indirect.SwizzleX
1099     */
1100    if (reg->Register.Indirect) {
1101       union tgsi_exec_channel index2;
1102       union tgsi_exec_channel indir_index;
1103       const uint execmask = mach->ExecMask;
1104       uint i;
1105
1106       /* which address register (always zero now) */
1107       index2.i[0] =
1108       index2.i[1] =
1109       index2.i[2] =
1110       index2.i[3] = reg->Indirect.Index;
1111
1112       /* get current value of address register[swizzle] */
1113       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1114       fetch_src_file_channel(mach,
1115                              reg->Indirect.File,
1116                              swizzle,
1117                              &index2,
1118                              &ZeroVec,
1119                              &indir_index);
1120
1121       /* add value of address register to the offset */
1122       index.i[0] += indir_index.i[0];
1123       index.i[1] += indir_index.i[1];
1124       index.i[2] += indir_index.i[2];
1125       index.i[3] += indir_index.i[3];
1126
1127       /* for disabled execution channels, zero-out the index to
1128        * avoid using a potential garbage value.
1129        */
1130       for (i = 0; i < QUAD_SIZE; i++) {
1131          if ((execmask & (1 << i)) == 0)
1132             index.i[i] = 0;
1133       }
1134    }
1135
1136    /* There is an extra source register that is a second
1137     * subscript to a register file. Effectively it means that
1138     * the register file is actually a 2D array of registers.
1139     *
1140     *    file[3][1],
1141     *    where:
1142     *       [3] = Dimension.Index
1143     */
1144    if (reg->Register.Dimension) {
1145       index2D.i[0] =
1146       index2D.i[1] =
1147       index2D.i[2] =
1148       index2D.i[3] = reg->Dimension.Index;
1149
1150       /* Again, the second subscript index can be addressed indirectly
1151        * identically to the first one.
1152        * Nothing stops us from indirectly addressing the indirect register,
1153        * but there is no need for that, so we won't exercise it.
1154        *
1155        *    file[ind[4].y+3][1],
1156        *    where:
1157        *       ind = DimIndirect.File
1158        *       [4] = DimIndirect.Index
1159        *       .y = DimIndirect.SwizzleX
1160        */
1161       if (reg->Dimension.Indirect) {
1162          union tgsi_exec_channel index2;
1163          union tgsi_exec_channel indir_index;
1164          const uint execmask = mach->ExecMask;
1165          uint i;
1166
1167          index2.i[0] =
1168          index2.i[1] =
1169          index2.i[2] =
1170          index2.i[3] = reg->DimIndirect.Index;
1171
1172          swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1173          fetch_src_file_channel(mach,
1174                                 reg->DimIndirect.File,
1175                                 swizzle,
1176                                 &index2,
1177                                 &ZeroVec,
1178                                 &indir_index);
1179
1180          index2D.i[0] += indir_index.i[0];
1181          index2D.i[1] += indir_index.i[1];
1182          index2D.i[2] += indir_index.i[2];
1183          index2D.i[3] += indir_index.i[3];
1184
1185          /* for disabled execution channels, zero-out the index to
1186           * avoid using a potential garbage value.
1187           */
1188          for (i = 0; i < QUAD_SIZE; i++) {
1189             if ((execmask & (1 << i)) == 0) {
1190                index2D.i[i] = 0;
1191             }
1192          }
1193       }
1194
1195       /* If by any chance there was a need for a 3D array of register
1196        * files, we would have to check whether Dimension is followed
1197        * by a dimension register and continue the saga.
1198        */
1199    } else {
1200       index2D.i[0] =
1201       index2D.i[1] =
1202       index2D.i[2] =
1203       index2D.i[3] = 0;
1204    }
1205
1206    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1207    fetch_src_file_channel(mach,
1208                           reg->Register.File,
1209                           swizzle,
1210                           &index,
1211                           &index2D,
1212                           chan);
1213
1214    if (reg->Register.Absolute) {
1215       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1216          micro_abs(chan, chan);
1217       } else {
1218          micro_iabs(chan, chan);
1219       }
1220    }
1221
1222    if (reg->Register.Negate) {
1223       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1224          micro_neg(chan, chan);
1225       } else {
1226          micro_ineg(chan, chan);
1227       }
1228    }
1229 }
1230
1231 static void
1232 store_dest(struct tgsi_exec_machine *mach,
1233            const union tgsi_exec_channel *chan,
1234            const struct tgsi_full_dst_register *reg,
1235            const struct tgsi_full_instruction *inst,
1236            uint chan_index,
1237            enum tgsi_exec_datatype dst_datatype)
1238 {
1239    uint i;
1240    union tgsi_exec_channel null;
1241    union tgsi_exec_channel *dst;
1242    uint execmask = mach->ExecMask;
1243    int offset = 0;  /* indirection offset */
1244    int index;
1245
1246    /* for debugging */
1247    if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1248       check_inf_or_nan(chan);
1249    }
1250
1251    /* There is an extra source register that indirectly subscripts
1252     * a register file. The direct index now becomes an offset
1253     * that is being added to the indirect register.
1254     *
1255     *    file[ind[2].x+1],
1256     *    where:
1257     *       ind = Indirect.File
1258     *       [2] = Indirect.Index
1259     *       .x = Indirect.SwizzleX
1260     */
1261    if (reg->Register.Indirect) {
1262       union tgsi_exec_channel index;
1263       union tgsi_exec_channel indir_index;
1264       uint swizzle;
1265
1266       /* which address register (always zero for now) */
1267       index.i[0] =
1268       index.i[1] =
1269       index.i[2] =
1270       index.i[3] = reg->Indirect.Index;
1271
1272       /* get current value of address register[swizzle] */
1273       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1274
1275       /* fetch values from the address/indirection register */
1276       fetch_src_file_channel(mach,
1277                              reg->Indirect.File,
1278                              swizzle,
1279                              &index,
1280                              &ZeroVec,
1281                              &indir_index);
1282
1283       /* save indirection offset */
1284       offset = indir_index.i[0];
1285    }
1286
1287    switch (reg->Register.File) {
1288    case TGSI_FILE_NULL:
1289       dst = &null;
1290       break;
1291
1292    case TGSI_FILE_OUTPUT:
1293       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1294          + reg->Register.Index;
1295       dst = &mach->Outputs[offset + index].xyzw[chan_index];
1296 #if 0
1297       if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1298          fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1299          for (i = 0; i < QUAD_SIZE; i++)
1300             if (execmask & (1 << i))
1301                fprintf(stderr, "%f, ", chan->f[i]);
1302          fprintf(stderr, ")\n");
1303       }
1304 #endif
1305       break;
1306
1307    case TGSI_FILE_TEMPORARY:
1308       index = reg->Register.Index;
1309       assert( index < TGSI_EXEC_NUM_TEMPS );
1310       dst = &mach->Temps[offset + index].xyzw[chan_index];
1311       break;
1312
1313    case TGSI_FILE_ADDRESS:
1314       index = reg->Register.Index;
1315       dst = &mach->Addrs[index].xyzw[chan_index];
1316       break;
1317
1318    case TGSI_FILE_LOOP:
1319       assert(reg->Register.Index == 0);
1320       assert(mach->LoopCounterStackTop > 0);
1321       assert(chan_index == CHAN_X);
1322       dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1323       break;
1324
1325    case TGSI_FILE_PREDICATE:
1326       index = reg->Register.Index;
1327       assert(index < TGSI_EXEC_NUM_PREDS);
1328       dst = &mach->Predicates[index].xyzw[chan_index];
1329       break;
1330
1331    default:
1332       assert( 0 );
1333       return;
1334    }
1335
1336    if (inst->Instruction.Predicate) {
1337       uint swizzle;
1338       union tgsi_exec_channel *pred;
1339
1340       switch (chan_index) {
1341       case CHAN_X:
1342          swizzle = inst->Predicate.SwizzleX;
1343          break;
1344       case CHAN_Y:
1345          swizzle = inst->Predicate.SwizzleY;
1346          break;
1347       case CHAN_Z:
1348          swizzle = inst->Predicate.SwizzleZ;
1349          break;
1350       case CHAN_W:
1351          swizzle = inst->Predicate.SwizzleW;
1352          break;
1353       default:
1354          assert(0);
1355          return;
1356       }
1357
1358       assert(inst->Predicate.Index == 0);
1359
1360       pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1361
1362       if (inst->Predicate.Negate) {
1363          for (i = 0; i < QUAD_SIZE; i++) {
1364             if (pred->u[i]) {
1365                execmask &= ~(1 << i);
1366             }
1367          }
1368       } else {
1369          for (i = 0; i < QUAD_SIZE; i++) {
1370             if (!pred->u[i]) {
1371                execmask &= ~(1 << i);
1372             }
1373          }
1374       }
1375    }
1376
1377    switch (inst->Instruction.Saturate) {
1378    case TGSI_SAT_NONE:
1379       for (i = 0; i < QUAD_SIZE; i++)
1380          if (execmask & (1 << i))
1381             dst->i[i] = chan->i[i];
1382       break;
1383
1384    case TGSI_SAT_ZERO_ONE:
1385       for (i = 0; i < QUAD_SIZE; i++)
1386          if (execmask & (1 << i)) {
1387             if (chan->f[i] < 0.0f)
1388                dst->f[i] = 0.0f;
1389             else if (chan->f[i] > 1.0f)
1390                dst->f[i] = 1.0f;
1391             else
1392                dst->i[i] = chan->i[i];
1393          }
1394       break;
1395
1396    case TGSI_SAT_MINUS_PLUS_ONE:
1397       for (i = 0; i < QUAD_SIZE; i++)
1398          if (execmask & (1 << i)) {
1399             if (chan->f[i] < -1.0f)
1400                dst->f[i] = -1.0f;
1401             else if (chan->f[i] > 1.0f)
1402                dst->f[i] = 1.0f;
1403             else
1404                dst->i[i] = chan->i[i];
1405          }
1406       break;
1407
1408    default:
1409       assert( 0 );
1410    }
1411 }
1412
/*
 * Shorthand for the float-typed fetch/store calls.  Both macros expand to
 * code that refers to variables named `mach` and `inst`, which must be in
 * scope where the macro is used.
 *
 * FETCH: read channel CHAN of source operand INDEX into *VAL.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/* STORE: write *VAL to channel CHAN of destination operand INDEX. */
#define STORE(VAL,INDEX,CHAN)\
   store_dest(mach, VAL, &inst->Dst[INDEX], inst, CHAN, TGSI_EXEC_DATA_FLOAT)
1418
1419
1420 /**
1421  * Execute ARB-style KIL which is predicated by a src register.
1422  * Kill fragment if any of the four values is less than zero.
1423  */
1424 static void
1425 exec_kil(struct tgsi_exec_machine *mach,
1426          const struct tgsi_full_instruction *inst)
1427 {
1428    uint uniquemask;
1429    uint chan_index;
1430    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1431    union tgsi_exec_channel r[1];
1432
1433    /* This mask stores component bits that were already tested. */
1434    uniquemask = 0;
1435
1436    for (chan_index = 0; chan_index < 4; chan_index++)
1437    {
1438       uint swizzle;
1439       uint i;
1440
1441       /* unswizzle channel */
1442       swizzle = tgsi_util_get_full_src_register_swizzle (
1443                         &inst->Src[0],
1444                         chan_index);
1445
1446       /* check if the component has not been already tested */
1447       if (uniquemask & (1 << swizzle))
1448          continue;
1449       uniquemask |= 1 << swizzle;
1450
1451       FETCH(&r[0], 0, chan_index);
1452       for (i = 0; i < 4; i++)
1453          if (r[0].f[i] < 0.0f)
1454             kilmask |= 1 << i;
1455    }
1456
1457    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1458 }
1459
1460 /**
1461  * Execute NVIDIA-style KIL which is predicated by a condition code.
1462  * Kill fragment if the condition code is TRUE.
1463  */
1464 static void
1465 exec_kilp(struct tgsi_exec_machine *mach,
1466           const struct tgsi_full_instruction *inst)
1467 {
1468    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1469
1470    /* "unconditional" kil */
1471    kilmask = mach->ExecMask;
1472    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1473 }
1474
1475 static void
1476 emit_vertex(struct tgsi_exec_machine *mach)
1477 {
1478    /* FIXME: check for exec mask correctly
1479    unsigned i;
1480    for (i = 0; i < QUAD_SIZE; ++i) {
1481          if ((mach->ExecMask & (1 << i)))
1482    */
1483    if (mach->ExecMask) {
1484       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1485       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1486    }
1487 }
1488
1489 static void
1490 emit_primitive(struct tgsi_exec_machine *mach)
1491 {
1492    unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1493    /* FIXME: check for exec mask correctly
1494    unsigned i;
1495    for (i = 0; i < QUAD_SIZE; ++i) {
1496          if ((mach->ExecMask & (1 << i)))
1497    */
1498    if (mach->ExecMask) {
1499       ++(*prim_count);
1500       debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1501       mach->Primitives[*prim_count] = 0;
1502    }
1503 }
1504
1505 /*
1506  * Fetch four texture samples using STR texture coordinates.
1507  */
1508 static void
1509 fetch_texel( struct tgsi_sampler *sampler,
1510              const union tgsi_exec_channel *s,
1511              const union tgsi_exec_channel *t,
1512              const union tgsi_exec_channel *p,
1513              const union tgsi_exec_channel *c0,
1514              enum tgsi_sampler_control control,
1515              union tgsi_exec_channel *r,
1516              union tgsi_exec_channel *g,
1517              union tgsi_exec_channel *b,
1518              union tgsi_exec_channel *a )
1519 {
1520    uint j;
1521    float rgba[NUM_CHANNELS][QUAD_SIZE];
1522
1523    sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1524
1525    for (j = 0; j < 4; j++) {
1526       r->f[j] = rgba[0][j];
1527       g->f[j] = rgba[1][j];
1528       b->f[j] = rgba[2][j];
1529       a->f[j] = rgba[3][j];
1530    }
1531 }
1532
1533
/*
 * Values for the `modifier` argument of exec_tex().  They select how the
 * W component of the texture coordinate operand is used.
 */
#define TEX_MODIFIER_NONE           0   /* W is not fetched; LOD is zero */
#define TEX_MODIFIER_PROJECTED      1   /* coordinates are divided by W */
#define TEX_MODIFIER_LOD_BIAS       2   /* W is passed as the LOD bias */
#define TEX_MODIFIER_EXPLICIT_LOD   3   /* W is passed as an explicit LOD */
1538
1539
1540 static void
1541 exec_tex(struct tgsi_exec_machine *mach,
1542          const struct tgsi_full_instruction *inst,
1543          uint modifier)
1544 {
1545    const uint unit = inst->Src[1].Register.Index;
1546    union tgsi_exec_channel r[4];
1547    const union tgsi_exec_channel *lod = &ZeroVec;
1548    enum tgsi_sampler_control control;
1549    uint chan_index;
1550
1551    if (modifier != TEX_MODIFIER_NONE) {
1552       FETCH(&r[3], 0, CHAN_W);
1553       if (modifier != TEX_MODIFIER_PROJECTED) {
1554          lod = &r[3];
1555       }
1556    }
1557
1558    if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1559       control = tgsi_sampler_lod_explicit;
1560    } else {
1561       control = tgsi_sampler_lod_bias;
1562    }
1563
1564    switch (inst->Texture.Texture) {
1565    case TGSI_TEXTURE_1D:
1566    case TGSI_TEXTURE_SHADOW1D:
1567       FETCH(&r[0], 0, CHAN_X);
1568
1569       if (modifier == TEX_MODIFIER_PROJECTED) {
1570          micro_div(&r[0], &r[0], &r[3]);
1571       }
1572
1573       fetch_texel(mach->Samplers[unit],
1574                   &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1575                   control,
1576                   &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1577       break;
1578
1579    case TGSI_TEXTURE_2D:
1580    case TGSI_TEXTURE_RECT:
1581    case TGSI_TEXTURE_SHADOW2D:
1582    case TGSI_TEXTURE_SHADOWRECT:
1583       FETCH(&r[0], 0, CHAN_X);
1584       FETCH(&r[1], 0, CHAN_Y);
1585       FETCH(&r[2], 0, CHAN_Z);
1586
1587       if (modifier == TEX_MODIFIER_PROJECTED) {
1588          micro_div(&r[0], &r[0], &r[3]);
1589          micro_div(&r[1], &r[1], &r[3]);
1590          micro_div(&r[2], &r[2], &r[3]);
1591       }
1592
1593       fetch_texel(mach->Samplers[unit],
1594                   &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1595                   control,
1596                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1597       break;
1598
1599    case TGSI_TEXTURE_3D:
1600    case TGSI_TEXTURE_CUBE:
1601       FETCH(&r[0], 0, CHAN_X);
1602       FETCH(&r[1], 0, CHAN_Y);
1603       FETCH(&r[2], 0, CHAN_Z);
1604
1605       if (modifier == TEX_MODIFIER_PROJECTED) {
1606          micro_div(&r[0], &r[0], &r[3]);
1607          micro_div(&r[1], &r[1], &r[3]);
1608          micro_div(&r[2], &r[2], &r[3]);
1609       }
1610
1611       fetch_texel(mach->Samplers[unit],
1612                   &r[0], &r[1], &r[2], lod,
1613                   control,
1614                   &r[0], &r[1], &r[2], &r[3]);
1615       break;
1616
1617    default:
1618       assert(0);
1619    }
1620
1621    FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1622       STORE(&r[chan_index], 0, chan_index);
1623    }
1624 }
1625
1626 static void
1627 exec_txd(struct tgsi_exec_machine *mach,
1628          const struct tgsi_full_instruction *inst)
1629 {
1630    const uint unit = inst->Src[3].Register.Index;
1631    union tgsi_exec_channel r[4];
1632    uint chan_index;
1633
1634    /*
1635     * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1636     */
1637
1638    switch (inst->Texture.Texture) {
1639    case TGSI_TEXTURE_1D:
1640    case TGSI_TEXTURE_SHADOW1D:
1641
1642       FETCH(&r[0], 0, CHAN_X);
1643
1644       fetch_texel(mach->Samplers[unit],
1645                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1646                   tgsi_sampler_lod_bias,
1647                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1648       break;
1649
1650    case TGSI_TEXTURE_2D:
1651    case TGSI_TEXTURE_RECT:
1652    case TGSI_TEXTURE_SHADOW2D:
1653    case TGSI_TEXTURE_SHADOWRECT:
1654
1655       FETCH(&r[0], 0, CHAN_X);
1656       FETCH(&r[1], 0, CHAN_Y);
1657       FETCH(&r[2], 0, CHAN_Z);
1658
1659       fetch_texel(mach->Samplers[unit],
1660                   &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1661                   tgsi_sampler_lod_bias,
1662                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1663       break;
1664
1665    case TGSI_TEXTURE_3D:
1666    case TGSI_TEXTURE_CUBE:
1667
1668       FETCH(&r[0], 0, CHAN_X);
1669       FETCH(&r[1], 0, CHAN_Y);
1670       FETCH(&r[2], 0, CHAN_Z);
1671
1672       fetch_texel(mach->Samplers[unit],
1673                   &r[0], &r[1], &r[2], &ZeroVec,
1674                   tgsi_sampler_lod_bias,
1675                   &r[0], &r[1], &r[2], &r[3]);
1676       break;
1677
1678    default:
1679       assert(0);
1680    }
1681
1682    FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1683       STORE(&r[chan_index], 0, chan_index);
1684    }
1685 }
1686
1687
1688 /**
1689  * Evaluate a constant-valued coefficient at the position of the
1690  * current quad.
1691  */
1692 static void
1693 eval_constant_coef(
1694    struct tgsi_exec_machine *mach,
1695    unsigned attrib,
1696    unsigned chan )
1697 {
1698    unsigned i;
1699
1700    for( i = 0; i < QUAD_SIZE; i++ ) {
1701       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1702    }
1703 }
1704
1705 /**
1706  * Evaluate a linear-valued coefficient at the position of the
1707  * current quad.
1708  */
1709 static void
1710 eval_linear_coef(
1711    struct tgsi_exec_machine *mach,
1712    unsigned attrib,
1713    unsigned chan )
1714 {
1715    const float x = mach->QuadPos.xyzw[0].f[0];
1716    const float y = mach->QuadPos.xyzw[1].f[0];
1717    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1718    const float dady = mach->InterpCoefs[attrib].dady[chan];
1719    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1720    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1721    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1722    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1723    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1724 }
1725
1726 /**
1727  * Evaluate a perspective-valued coefficient at the position of the
1728  * current quad.
1729  */
1730 static void
1731 eval_perspective_coef(
1732    struct tgsi_exec_machine *mach,
1733    unsigned attrib,
1734    unsigned chan )
1735 {
1736    const float x = mach->QuadPos.xyzw[0].f[0];
1737    const float y = mach->QuadPos.xyzw[1].f[0];
1738    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1739    const float dady = mach->InterpCoefs[attrib].dady[chan];
1740    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1741    const float *w = mach->QuadPos.xyzw[3].f;
1742    /* divide by W here */
1743    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1744    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1745    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1746    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1747 }
1748
1749
/**
 * Signature shared by the eval_*_coef() functions: evaluate channel
 * `chan` of input attribute `attrib` for the current quad.
 * exec_declaration() picks one of them according to the declared
 * interpolation mode.
 */
typedef void (* eval_coef_func)(
   struct tgsi_exec_machine *mach,
   unsigned attrib,
   unsigned chan );
1754
1755 static void
1756 exec_declaration(struct tgsi_exec_machine *mach,
1757                  const struct tgsi_full_declaration *decl)
1758 {
1759    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1760       if (decl->Declaration.File == TGSI_FILE_INPUT ||
1761           decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1762          uint first, last, mask;
1763
1764          first = decl->Range.First;
1765          last = decl->Range.Last;
1766          mask = decl->Declaration.UsageMask;
1767
1768          if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1769             uint i;
1770
1771             assert(decl->Semantic.Index == 0);
1772             assert(first == last);
1773
1774             for (i = 0; i < QUAD_SIZE; i++) {
1775                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1776             }
1777          } else {
1778             eval_coef_func eval;
1779             uint i, j;
1780
1781             switch (decl->Declaration.Interpolate) {
1782             case TGSI_INTERPOLATE_CONSTANT:
1783                eval = eval_constant_coef;
1784                break;
1785
1786             case TGSI_INTERPOLATE_LINEAR:
1787                eval = eval_linear_coef;
1788                break;
1789
1790             case TGSI_INTERPOLATE_PERSPECTIVE:
1791                eval = eval_perspective_coef;
1792                break;
1793
1794             default:
1795                assert(0);
1796                return;
1797             }
1798
1799             for (j = 0; j < NUM_CHANNELS; j++) {
1800                if (mask & (1 << j)) {
1801                   for (i = first; i <= last; i++) {
1802                      eval(mach, i, j);
1803                   }
1804                }
1805             }
1806          }
1807       }
1808    }
1809 }
1810
/**
 * A per-channel operation applied by the exec_scalar_unary() and
 * exec_vector_*() helpers.  For the binary and trinary helpers `src`
 * points to the first element of an array holding two or three operands.
 */
typedef void (* micro_op)(union tgsi_exec_channel *dst,
                          const union tgsi_exec_channel *src);
1813
1814 static void
1815 exec_scalar_unary(struct tgsi_exec_machine *mach,
1816                   const struct tgsi_full_instruction *inst,
1817                   micro_op op,
1818                   enum tgsi_exec_datatype dst_datatype,
1819                   enum tgsi_exec_datatype src_datatype)
1820 {
1821    unsigned int chan;
1822    union tgsi_exec_channel src;
1823    union tgsi_exec_channel dst;
1824
1825    fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
1826    op(&dst, &src);
1827    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1828       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1829          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
1830       }
1831    }
1832 }
1833
1834 static void
1835 exec_vector_unary(struct tgsi_exec_machine *mach,
1836                   const struct tgsi_full_instruction *inst,
1837                   micro_op op,
1838                   enum tgsi_exec_datatype dst_datatype,
1839                   enum tgsi_exec_datatype src_datatype)
1840 {
1841    unsigned int chan;
1842    struct tgsi_exec_vector dst;
1843
1844    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1845       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1846          union tgsi_exec_channel src;
1847
1848          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
1849          op(&dst.xyzw[chan], &src);
1850       }
1851    }
1852    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1853       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1854          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1855       }
1856    }
1857 }
1858
1859 static void
1860 exec_vector_binary(struct tgsi_exec_machine *mach,
1861                    const struct tgsi_full_instruction *inst,
1862                    micro_op op,
1863                    enum tgsi_exec_datatype dst_datatype,
1864                    enum tgsi_exec_datatype src_datatype)
1865 {
1866    unsigned int chan;
1867    struct tgsi_exec_vector dst;
1868
1869    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1870       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1871          union tgsi_exec_channel src[2];
1872
1873          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1874          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1875          op(&dst.xyzw[chan], src);
1876       }
1877    }
1878    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1879       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1880          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1881       }
1882    }
1883 }
1884
1885 static void
1886 exec_vector_trinary(struct tgsi_exec_machine *mach,
1887                     const struct tgsi_full_instruction *inst,
1888                     micro_op op,
1889                     enum tgsi_exec_datatype dst_datatype,
1890                     enum tgsi_exec_datatype src_datatype)
1891 {
1892    unsigned int chan;
1893    struct tgsi_exec_vector dst;
1894
1895    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1896       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1897          union tgsi_exec_channel src[3];
1898
1899          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1900          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1901          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
1902          op(&dst.xyzw[chan], src);
1903       }
1904    }
1905    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1906       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1907          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1908       }
1909    }
1910 }
1911
1912 static void
1913 exec_dp3(struct tgsi_exec_machine *mach,
1914          const struct tgsi_full_instruction *inst)
1915 {
1916    unsigned int chan;
1917    union tgsi_exec_channel arg[3];
1918
1919    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1920    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1921    micro_mul(&arg[2], &arg[0], &arg[1]);
1922
1923    for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
1924       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1925       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1926       micro_mad(&arg[2], arg);
1927    }
1928
1929    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1930       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1931          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1932       }
1933    }
1934 }
1935
1936 static void
1937 exec_dp4(struct tgsi_exec_machine *mach,
1938          const struct tgsi_full_instruction *inst)
1939 {
1940    unsigned int chan;
1941    union tgsi_exec_channel arg[3];
1942
1943    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1944    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1945    micro_mul(&arg[2], &arg[0], &arg[1]);
1946
1947    for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
1948       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1949       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1950       micro_mad(&arg[2], arg);
1951    }
1952
1953    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1954       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1955          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1956       }
1957    }
1958 }
1959
1960 static void
1961 exec_dp2a(struct tgsi_exec_machine *mach,
1962           const struct tgsi_full_instruction *inst)
1963 {
1964    unsigned int chan;
1965    union tgsi_exec_channel arg[3];
1966
1967    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1968    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1969    micro_mul(&arg[2], &arg[0], &arg[1]);
1970
1971    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1972    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1973    micro_mad(&arg[0], arg);
1974
1975    fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1976    micro_add(&arg[0], &arg[0], &arg[1]);
1977
1978    for (chan = 0; chan < NUM_CHANNELS; chan++) {
1979       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1980          store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1981       }
1982    }
1983 }
1984
1985 static void
1986 exec_dph(struct tgsi_exec_machine *mach,
1987          const struct tgsi_full_instruction *inst)
1988 {
1989    unsigned int chan;
1990    union tgsi_exec_channel arg[3];
1991
1992    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1993    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1994    micro_mul(&arg[2], &arg[0], &arg[1]);
1995
1996    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1997    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1998    micro_mad(&arg[2], arg);
1999
2000    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2001    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2002    micro_mad(&arg[0], arg);
2003
2004    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2005    micro_add(&arg[0], &arg[0], &arg[1]);
2006
2007    for (chan = 0; chan < NUM_CHANNELS; chan++) {
2008       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2009          store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2010       }
2011    }
2012 }
2013
2014 static void
2015 exec_dp2(struct tgsi_exec_machine *mach,
2016          const struct tgsi_full_instruction *inst)
2017 {
2018    unsigned int chan;
2019    union tgsi_exec_channel arg[3];
2020
2021    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2022    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2023    micro_mul(&arg[2], &arg[0], &arg[1]);
2024
2025    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2026    fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2027    micro_mad(&arg[2], arg);
2028
2029    for (chan = 0; chan < NUM_CHANNELS; chan++) {
2030       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2031          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2032       }
2033    }
2034 }
2035
2036 static void
2037 exec_nrm4(struct tgsi_exec_machine *mach,
2038           const struct tgsi_full_instruction *inst)
2039 {
2040    unsigned int chan;
2041    union tgsi_exec_channel arg[4];
2042    union tgsi_exec_channel scale;
2043
2044    fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2045    micro_mul(&scale, &arg[0], &arg[0]);
2046
2047    for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2048       union tgsi_exec_channel product;
2049
2050       fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2051       micro_mul(&product, &arg[chan], &arg[chan]);
2052       micro_add(&scale, &scale, &product);
2053    }
2054
2055    micro_rsq(&scale, &scale);
2056
2057    for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2058       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2059          micro_mul(&arg[chan], &arg[chan], &scale);
2060          store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2061       }
2062    }
2063 }
2064
2065 static void
2066 exec_nrm3(struct tgsi_exec_machine *mach,
2067           const struct tgsi_full_instruction *inst)
2068 {
2069    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2070       unsigned int chan;
2071       union tgsi_exec_channel arg[3];
2072       union tgsi_exec_channel scale;
2073
2074       fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2075       micro_mul(&scale, &arg[0], &arg[0]);
2076
2077       for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2078          union tgsi_exec_channel product;
2079
2080          fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2081          micro_mul(&product, &arg[chan], &arg[chan]);
2082          micro_add(&scale, &scale, &product);
2083       }
2084
2085       micro_rsq(&scale, &scale);
2086
2087       for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2088          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2089             micro_mul(&arg[chan], &arg[chan], &scale);
2090             store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2091          }
2092       }
2093    }
2094
2095    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2096       store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2097    }
2098 }
2099
2100 static void
2101 exec_break(struct tgsi_exec_machine *mach)
2102 {
2103    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2104       /* turn off loop channels for each enabled exec channel */
2105       mach->LoopMask &= ~mach->ExecMask;
2106       /* Todo: if mach->LoopMask == 0, jump to end of loop */
2107       UPDATE_EXEC_MASK(mach);
2108    } else {
2109       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2110
2111       mach->Switch.mask = 0x0;
2112
2113       UPDATE_EXEC_MASK(mach);
2114    }
2115 }
2116
2117 static void
2118 exec_switch(struct tgsi_exec_machine *mach,
2119             const struct tgsi_full_instruction *inst)
2120 {
2121    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2122    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2123
2124    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2125    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2126    mach->Switch.mask = 0x0;
2127    mach->Switch.defaultMask = 0x0;
2128
2129    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2130    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2131
2132    UPDATE_EXEC_MASK(mach);
2133 }
2134
2135 static void
2136 exec_case(struct tgsi_exec_machine *mach,
2137           const struct tgsi_full_instruction *inst)
2138 {
2139    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2140    union tgsi_exec_channel src;
2141    uint mask = 0;
2142
2143    fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2144
2145    if (mach->Switch.selector.u[0] == src.u[0]) {
2146       mask |= 0x1;
2147    }
2148    if (mach->Switch.selector.u[1] == src.u[1]) {
2149       mask |= 0x2;
2150    }
2151    if (mach->Switch.selector.u[2] == src.u[2]) {
2152       mask |= 0x4;
2153    }
2154    if (mach->Switch.selector.u[3] == src.u[3]) {
2155       mask |= 0x8;
2156    }
2157
2158    mach->Switch.defaultMask |= mask;
2159
2160    mach->Switch.mask |= mask & prevMask;
2161
2162    UPDATE_EXEC_MASK(mach);
2163 }
2164
2165 static void
2166 exec_default(struct tgsi_exec_machine *mach)
2167 {
2168    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2169
2170    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2171
2172    UPDATE_EXEC_MASK(mach);
2173 }
2174
2175 static void
2176 exec_endswitch(struct tgsi_exec_machine *mach)
2177 {
2178    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
2179    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2180
2181    UPDATE_EXEC_MASK(mach);
2182 }
2183
2184 static void
2185 micro_i2f(union tgsi_exec_channel *dst,
2186           const union tgsi_exec_channel *src)
2187 {
2188    dst->f[0] = (float)src->i[0];
2189    dst->f[1] = (float)src->i[1];
2190    dst->f[2] = (float)src->i[2];
2191    dst->f[3] = (float)src->i[3];
2192 }
2193
2194 static void
2195 micro_not(union tgsi_exec_channel *dst,
2196           const union tgsi_exec_channel *src)
2197 {
2198    dst->u[0] = ~src->u[0];
2199    dst->u[1] = ~src->u[1];
2200    dst->u[2] = ~src->u[2];
2201    dst->u[3] = ~src->u[3];
2202 }
2203
2204 static void
2205 micro_shl(union tgsi_exec_channel *dst,
2206           const union tgsi_exec_channel *src)
2207 {
2208    dst->u[0] = src[0].u[0] << src[1].u[0];
2209    dst->u[1] = src[0].u[1] << src[1].u[1];
2210    dst->u[2] = src[0].u[2] << src[1].u[2];
2211    dst->u[3] = src[0].u[3] << src[1].u[3];
2212 }
2213
2214 static void
2215 micro_and(union tgsi_exec_channel *dst,
2216           const union tgsi_exec_channel *src)
2217 {
2218    dst->u[0] = src[0].u[0] & src[1].u[0];
2219    dst->u[1] = src[0].u[1] & src[1].u[1];
2220    dst->u[2] = src[0].u[2] & src[1].u[2];
2221    dst->u[3] = src[0].u[3] & src[1].u[3];
2222 }
2223
2224 static void
2225 micro_or(union tgsi_exec_channel *dst,
2226          const union tgsi_exec_channel *src)
2227 {
2228    dst->u[0] = src[0].u[0] | src[1].u[0];
2229    dst->u[1] = src[0].u[1] | src[1].u[1];
2230    dst->u[2] = src[0].u[2] | src[1].u[2];
2231    dst->u[3] = src[0].u[3] | src[1].u[3];
2232 }
2233
2234 static void
2235 micro_xor(union tgsi_exec_channel *dst,
2236           const union tgsi_exec_channel *src)
2237 {
2238    dst->u[0] = src[0].u[0] ^ src[1].u[0];
2239    dst->u[1] = src[0].u[1] ^ src[1].u[1];
2240    dst->u[2] = src[0].u[2] ^ src[1].u[2];
2241    dst->u[3] = src[0].u[3] ^ src[1].u[3];
2242 }
2243
2244 static void
2245 micro_f2i(union tgsi_exec_channel *dst,
2246           const union tgsi_exec_channel *src)
2247 {
2248    dst->i[0] = (int)src->f[0];
2249    dst->i[1] = (int)src->f[1];
2250    dst->i[2] = (int)src->f[2];
2251    dst->i[3] = (int)src->f[3];
2252 }
2253
2254 static void
2255 micro_idiv(union tgsi_exec_channel *dst,
2256            const union tgsi_exec_channel *src)
2257 {
2258    dst->i[0] = src[0].i[0] / src[1].i[0];
2259    dst->i[1] = src[0].i[1] / src[1].i[1];
2260    dst->i[2] = src[0].i[2] / src[1].i[2];
2261    dst->i[3] = src[0].i[3] / src[1].i[3];
2262 }
2263
2264 static void
2265 micro_imax(union tgsi_exec_channel *dst,
2266            const union tgsi_exec_channel *src)
2267 {
2268    dst->i[0] = src[0].i[0] > src[1].i[0] ? src[0].i[0] : src[1].i[0];
2269    dst->i[1] = src[0].i[1] > src[1].i[1] ? src[0].i[1] : src[1].i[1];
2270    dst->i[2] = src[0].i[2] > src[1].i[2] ? src[0].i[2] : src[1].i[2];
2271    dst->i[3] = src[0].i[3] > src[1].i[3] ? src[0].i[3] : src[1].i[3];
2272 }
2273
2274 static void
2275 micro_imin(union tgsi_exec_channel *dst,
2276            const union tgsi_exec_channel *src)
2277 {
2278    dst->i[0] = src[0].i[0] < src[1].i[0] ? src[0].i[0] : src[1].i[0];
2279    dst->i[1] = src[0].i[1] < src[1].i[1] ? src[0].i[1] : src[1].i[1];
2280    dst->i[2] = src[0].i[2] < src[1].i[2] ? src[0].i[2] : src[1].i[2];
2281    dst->i[3] = src[0].i[3] < src[1].i[3] ? src[0].i[3] : src[1].i[3];
2282 }
2283
2284 static void
2285 micro_isge(union tgsi_exec_channel *dst,
2286            const union tgsi_exec_channel *src)
2287 {
2288    dst->i[0] = src[0].i[0] >= src[1].i[0] ? -1 : 0;
2289    dst->i[1] = src[0].i[1] >= src[1].i[1] ? -1 : 0;
2290    dst->i[2] = src[0].i[2] >= src[1].i[2] ? -1 : 0;
2291    dst->i[3] = src[0].i[3] >= src[1].i[3] ? -1 : 0;
2292 }
2293
2294 static void
2295 micro_ishr(union tgsi_exec_channel *dst,
2296            const union tgsi_exec_channel *src)
2297 {
2298    dst->i[0] = src[0].i[0] >> src[1].i[0];
2299    dst->i[1] = src[0].i[1] >> src[1].i[1];
2300    dst->i[2] = src[0].i[2] >> src[1].i[2];
2301    dst->i[3] = src[0].i[3] >> src[1].i[3];
2302 }
2303
2304 static void
2305 micro_islt(union tgsi_exec_channel *dst,
2306            const union tgsi_exec_channel *src)
2307 {
2308    dst->i[0] = src[0].i[0] < src[1].i[0] ? -1 : 0;
2309    dst->i[1] = src[0].i[1] < src[1].i[1] ? -1 : 0;
2310    dst->i[2] = src[0].i[2] < src[1].i[2] ? -1 : 0;
2311    dst->i[3] = src[0].i[3] < src[1].i[3] ? -1 : 0;
2312 }
2313
2314 static void
2315 micro_f2u(union tgsi_exec_channel *dst,
2316           const union tgsi_exec_channel *src)
2317 {
2318    dst->u[0] = (uint)src->f[0];
2319    dst->u[1] = (uint)src->f[1];
2320    dst->u[2] = (uint)src->f[2];
2321    dst->u[3] = (uint)src->f[3];
2322 }
2323
2324 static void
2325 micro_u2f(union tgsi_exec_channel *dst,
2326           const union tgsi_exec_channel *src)
2327 {
2328    dst->f[0] = (float)src->u[0];
2329    dst->f[1] = (float)src->u[1];
2330    dst->f[2] = (float)src->u[2];
2331    dst->f[3] = (float)src->u[3];
2332 }
2333
2334 static void
2335 micro_uadd(union tgsi_exec_channel *dst,
2336            const union tgsi_exec_channel *src)
2337 {
2338    dst->u[0] = src[0].u[0] + src[1].u[0];
2339    dst->u[1] = src[0].u[1] + src[1].u[1];
2340    dst->u[2] = src[0].u[2] + src[1].u[2];
2341    dst->u[3] = src[0].u[3] + src[1].u[3];
2342 }
2343
2344 static void
2345 micro_udiv(union tgsi_exec_channel *dst,
2346            const union tgsi_exec_channel *src)
2347 {
2348    dst->u[0] = src[0].u[0] / src[1].u[0];
2349    dst->u[1] = src[0].u[1] / src[1].u[1];
2350    dst->u[2] = src[0].u[2] / src[1].u[2];
2351    dst->u[3] = src[0].u[3] / src[1].u[3];
2352 }
2353
2354 static void
2355 micro_umad(union tgsi_exec_channel *dst,
2356            const union tgsi_exec_channel *src)
2357 {
2358    dst->u[0] = src[0].u[0] * src[1].u[0] + src[2].u[0];
2359    dst->u[1] = src[0].u[1] * src[1].u[1] + src[2].u[1];
2360    dst->u[2] = src[0].u[2] * src[1].u[2] + src[2].u[2];
2361    dst->u[3] = src[0].u[3] * src[1].u[3] + src[2].u[3];
2362 }
2363
2364 static void
2365 micro_umax(union tgsi_exec_channel *dst,
2366            const union tgsi_exec_channel *src)
2367 {
2368    dst->u[0] = src[0].u[0] > src[1].u[0] ? src[0].u[0] : src[1].u[0];
2369    dst->u[1] = src[0].u[1] > src[1].u[1] ? src[0].u[1] : src[1].u[1];
2370    dst->u[2] = src[0].u[2] > src[1].u[2] ? src[0].u[2] : src[1].u[2];
2371    dst->u[3] = src[0].u[3] > src[1].u[3] ? src[0].u[3] : src[1].u[3];
2372 }
2373
2374 static void
2375 micro_umin(union tgsi_exec_channel *dst,
2376            const union tgsi_exec_channel *src)
2377 {
2378    dst->u[0] = src[0].u[0] < src[1].u[0] ? src[0].u[0] : src[1].u[0];
2379    dst->u[1] = src[0].u[1] < src[1].u[1] ? src[0].u[1] : src[1].u[1];
2380    dst->u[2] = src[0].u[2] < src[1].u[2] ? src[0].u[2] : src[1].u[2];
2381    dst->u[3] = src[0].u[3] < src[1].u[3] ? src[0].u[3] : src[1].u[3];
2382 }
2383
2384 static void
2385 micro_umod(union tgsi_exec_channel *dst,
2386            const union tgsi_exec_channel *src)
2387 {
2388    dst->u[0] = src[0].u[0] % src[1].u[0];
2389    dst->u[1] = src[0].u[1] % src[1].u[1];
2390    dst->u[2] = src[0].u[2] % src[1].u[2];
2391    dst->u[3] = src[0].u[3] % src[1].u[3];
2392 }
2393
2394 static void
2395 micro_umul(union tgsi_exec_channel *dst,
2396            const union tgsi_exec_channel *src)
2397 {
2398    dst->u[0] = src[0].u[0] * src[1].u[0];
2399    dst->u[1] = src[0].u[1] * src[1].u[1];
2400    dst->u[2] = src[0].u[2] * src[1].u[2];
2401    dst->u[3] = src[0].u[3] * src[1].u[3];
2402 }
2403
2404 static void
2405 micro_useq(union tgsi_exec_channel *dst,
2406            const union tgsi_exec_channel *src)
2407 {
2408    dst->u[0] = src[0].u[0] == src[1].u[0] ? ~0 : 0;
2409    dst->u[1] = src[0].u[1] == src[1].u[1] ? ~0 : 0;
2410    dst->u[2] = src[0].u[2] == src[1].u[2] ? ~0 : 0;
2411    dst->u[3] = src[0].u[3] == src[1].u[3] ? ~0 : 0;
2412 }
2413
2414 static void
2415 micro_usge(union tgsi_exec_channel *dst,
2416            const union tgsi_exec_channel *src)
2417 {
2418    dst->u[0] = src[0].u[0] >= src[1].u[0] ? ~0 : 0;
2419    dst->u[1] = src[0].u[1] >= src[1].u[1] ? ~0 : 0;
2420    dst->u[2] = src[0].u[2] >= src[1].u[2] ? ~0 : 0;
2421    dst->u[3] = src[0].u[3] >= src[1].u[3] ? ~0 : 0;
2422 }
2423
2424 static void
2425 micro_ushr(union tgsi_exec_channel *dst,
2426            const union tgsi_exec_channel *src)
2427 {
2428    dst->u[0] = src[0].u[0] >> src[1].u[0];
2429    dst->u[1] = src[0].u[1] >> src[1].u[1];
2430    dst->u[2] = src[0].u[2] >> src[1].u[2];
2431    dst->u[3] = src[0].u[3] >> src[1].u[3];
2432 }
2433
2434 static void
2435 micro_uslt(union tgsi_exec_channel *dst,
2436            const union tgsi_exec_channel *src)
2437 {
2438    dst->u[0] = src[0].u[0] < src[1].u[0] ? ~0 : 0;
2439    dst->u[1] = src[0].u[1] < src[1].u[1] ? ~0 : 0;
2440    dst->u[2] = src[0].u[2] < src[1].u[2] ? ~0 : 0;
2441    dst->u[3] = src[0].u[3] < src[1].u[3] ? ~0 : 0;
2442 }
2443
2444 static void
2445 micro_usne(union tgsi_exec_channel *dst,
2446            const union tgsi_exec_channel *src)
2447 {
2448    dst->u[0] = src[0].u[0] != src[1].u[0] ? ~0 : 0;
2449    dst->u[1] = src[0].u[1] != src[1].u[1] ? ~0 : 0;
2450    dst->u[2] = src[0].u[2] != src[1].u[2] ? ~0 : 0;
2451    dst->u[3] = src[0].u[3] != src[1].u[3] ? ~0 : 0;
2452 }
2453
2454 static void
2455 exec_instruction(
2456    struct tgsi_exec_machine *mach,
2457    const struct tgsi_full_instruction *inst,
2458    int *pc )
2459 {
2460    uint chan_index;
2461    union tgsi_exec_channel r[10];
2462    union tgsi_exec_channel d[8];
2463
2464    (*pc)++;
2465
2466    switch (inst->Instruction.Opcode) {
2467    case TGSI_OPCODE_ARL:
2468       exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2469       break;
2470
2471    case TGSI_OPCODE_MOV:
2472       exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
2473       break;
2474
2475    case TGSI_OPCODE_LIT:
2476       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2477          FETCH( &r[0], 0, CHAN_X );
2478          if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2479             micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2480          }
2481
2482          if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2483             FETCH( &r[1], 0, CHAN_Y );
2484             micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2485
2486             FETCH( &r[2], 0, CHAN_W );
2487             micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2488             micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2489             micro_pow( &r[1], &r[1], &r[2] );
2490             micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2491          }
2492
2493          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2494             STORE(&d[CHAN_Y], 0, CHAN_Y);
2495          }
2496          if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2497             STORE(&d[CHAN_Z], 0, CHAN_Z);
2498          }
2499       }
2500       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2501          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2502       }
2503       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2504          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2505       }
2506       break;
2507
2508    case TGSI_OPCODE_RCP:
2509       exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2510       break;
2511
2512    case TGSI_OPCODE_RSQ:
2513       exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2514       break;
2515
2516    case TGSI_OPCODE_EXP:
2517       FETCH( &r[0], 0, CHAN_X );
2518       micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2519       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2520          micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2521          STORE( &r[2], 0, CHAN_X );        /* store r2 */
2522       }
2523       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2524          micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2525          STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2526       }
2527       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2528          micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2529          STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2530       }
2531       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2532          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2533       }
2534       break;
2535
2536    case TGSI_OPCODE_LOG:
2537       FETCH( &r[0], 0, CHAN_X );
2538       micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2539       micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2540       micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2541       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2542          STORE( &r[0], 0, CHAN_X );
2543       }
2544       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2545          micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2546          micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2547          STORE( &r[0], 0, CHAN_Y );
2548       }
2549       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2550          STORE( &r[1], 0, CHAN_Z );
2551       }
2552       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2553          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2554       }
2555       break;
2556
2557    case TGSI_OPCODE_MUL:
2558       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2559          FETCH(&r[0], 0, chan_index);
2560          FETCH(&r[1], 1, chan_index);
2561          micro_mul(&d[chan_index], &r[0], &r[1]);
2562       }
2563       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2564          STORE(&d[chan_index], 0, chan_index);
2565       }
2566       break;
2567
2568    case TGSI_OPCODE_ADD:
2569       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2570          FETCH( &r[0], 0, chan_index );
2571          FETCH( &r[1], 1, chan_index );
2572          micro_add(&d[chan_index], &r[0], &r[1]);
2573       }
2574       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2575          STORE(&d[chan_index], 0, chan_index);
2576       }
2577       break;
2578
2579    case TGSI_OPCODE_DP3:
2580       exec_dp3(mach, inst);
2581       break;
2582
2583    case TGSI_OPCODE_DP4:
2584       exec_dp4(mach, inst);
2585       break;
2586
2587    case TGSI_OPCODE_DST:
2588       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2589          FETCH( &r[0], 0, CHAN_Y );
2590          FETCH( &r[1], 1, CHAN_Y);
2591          micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2592       }
2593       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2594          FETCH(&d[CHAN_Z], 0, CHAN_Z);
2595       }
2596       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2597          FETCH(&d[CHAN_W], 1, CHAN_W);
2598       }
2599
2600       if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2601          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2602       }
2603       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2604          STORE(&d[CHAN_Y], 0, CHAN_Y);
2605       }
2606       if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2607          STORE(&d[CHAN_Z], 0, CHAN_Z);
2608       }
2609       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2610          STORE(&d[CHAN_W], 0, CHAN_W);
2611       }
2612       break;
2613
2614    case TGSI_OPCODE_MIN:
2615       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2616          FETCH(&r[0], 0, chan_index);
2617          FETCH(&r[1], 1, chan_index);
2618
2619          /* XXX use micro_min()?? */
2620          micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2621       }
2622       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2623          STORE(&d[chan_index], 0, chan_index);
2624       }
2625       break;
2626
2627    case TGSI_OPCODE_MAX:
2628       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2629          FETCH(&r[0], 0, chan_index);
2630          FETCH(&r[1], 1, chan_index);
2631
2632          /* XXX use micro_max()?? */
2633          micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2634       }
2635       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2636          STORE(&d[chan_index], 0, chan_index);
2637       }
2638       break;
2639
2640    case TGSI_OPCODE_SLT:
2641       exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2642       break;
2643
2644    case TGSI_OPCODE_SGE:
2645       exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2646       break;
2647
2648    case TGSI_OPCODE_MAD:
2649       exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2650       break;
2651
2652    case TGSI_OPCODE_SUB:
2653       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2654          FETCH(&r[0], 0, chan_index);
2655          FETCH(&r[1], 1, chan_index);
2656          micro_sub(&d[chan_index], &r[0], &r[1]);
2657       }
2658       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2659          STORE(&d[chan_index], 0, chan_index);
2660       }
2661       break;
2662
2663    case TGSI_OPCODE_LRP:
2664       exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2665       break;
2666
2667    case TGSI_OPCODE_CND:
2668       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2669          FETCH(&r[0], 0, chan_index);
2670          FETCH(&r[1], 1, chan_index);
2671          FETCH(&r[2], 2, chan_index);
2672          micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2673       }
2674       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2675          STORE(&d[chan_index], 0, chan_index);
2676       }
2677       break;
2678
2679    case TGSI_OPCODE_DP2A:
2680       exec_dp2a(mach, inst);
2681       break;
2682
2683    case TGSI_OPCODE_FRC:
2684       exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2685       break;
2686
2687    case TGSI_OPCODE_CLAMP:
2688       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2689          FETCH(&r[0], 0, chan_index);
2690          FETCH(&r[1], 1, chan_index);
2691          micro_max(&r[0], &r[0], &r[1]);
2692          FETCH(&r[1], 2, chan_index);
2693          micro_min(&d[chan_index], &r[0], &r[1]);
2694       }
2695       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2696          STORE(&d[chan_index], 0, chan_index);
2697       }
2698       break;
2699
2700    case TGSI_OPCODE_FLR:
2701       exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2702       break;
2703
2704    case TGSI_OPCODE_ROUND:
2705       exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2706       break;
2707
2708    case TGSI_OPCODE_EX2:
2709       exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2710       break;
2711
2712    case TGSI_OPCODE_LG2:
2713       exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2714       break;
2715
2716    case TGSI_OPCODE_POW:
2717       FETCH(&r[0], 0, CHAN_X);
2718       FETCH(&r[1], 1, CHAN_X);
2719
2720       micro_pow( &r[0], &r[0], &r[1] );
2721
2722       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2723          STORE( &r[0], 0, chan_index );
2724       }
2725       break;
2726
2727    case TGSI_OPCODE_XPD:
2728       FETCH(&r[0], 0, CHAN_Y);
2729       FETCH(&r[1], 1, CHAN_Z);
2730
2731       micro_mul( &r[2], &r[0], &r[1] );
2732
2733       FETCH(&r[3], 0, CHAN_Z);
2734       FETCH(&r[4], 1, CHAN_Y);
2735
2736       micro_mul( &r[5], &r[3], &r[4] );
2737       micro_sub(&d[CHAN_X], &r[2], &r[5]);
2738
2739       FETCH(&r[2], 1, CHAN_X);
2740
2741       micro_mul( &r[3], &r[3], &r[2] );
2742
2743       FETCH(&r[5], 0, CHAN_X);
2744
2745       micro_mul( &r[1], &r[1], &r[5] );
2746       micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2747
2748       micro_mul( &r[5], &r[5], &r[4] );
2749       micro_mul( &r[0], &r[0], &r[2] );
2750       micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2751
2752       if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2753          STORE(&d[CHAN_X], 0, CHAN_X);
2754       }
2755       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2756          STORE(&d[CHAN_Y], 0, CHAN_Y);
2757       }
2758       if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2759          STORE(&d[CHAN_Z], 0, CHAN_Z);
2760       }
2761       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2762          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2763       }
2764       break;
2765
2766    case TGSI_OPCODE_ABS:
2767       exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2768       break;
2769
2770    case TGSI_OPCODE_RCC:
2771       FETCH(&r[0], 0, CHAN_X);
2772       micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2773       micro_float_clamp(&r[0], &r[0]);
2774       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2775          STORE(&r[0], 0, chan_index);
2776       }
2777       break;
2778
2779    case TGSI_OPCODE_DPH:
2780       exec_dph(mach, inst);
2781       break;
2782
2783    case TGSI_OPCODE_COS:
2784       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2785       break;
2786
2787    case TGSI_OPCODE_DDX:
2788       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2789       break;
2790
2791    case TGSI_OPCODE_DDY:
2792       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2793       break;
2794
2795    case TGSI_OPCODE_KILP:
2796       exec_kilp (mach, inst);
2797       break;
2798
2799    case TGSI_OPCODE_KIL:
2800       exec_kil (mach, inst);
2801       break;
2802
2803    case TGSI_OPCODE_PK2H:
2804       assert (0);
2805       break;
2806
2807    case TGSI_OPCODE_PK2US:
2808       assert (0);
2809       break;
2810
2811    case TGSI_OPCODE_PK4B:
2812       assert (0);
2813       break;
2814
2815    case TGSI_OPCODE_PK4UB:
2816       assert (0);
2817       break;
2818
2819    case TGSI_OPCODE_RFL:
2820       if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2821           IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2822           IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2823          /* r0 = dp3(src0, src0) */
2824          FETCH(&r[2], 0, CHAN_X);
2825          micro_mul(&r[0], &r[2], &r[2]);
2826          FETCH(&r[4], 0, CHAN_Y);
2827          micro_mul(&r[8], &r[4], &r[4]);
2828          micro_add(&r[0], &r[0], &r[8]);
2829          FETCH(&r[6], 0, CHAN_Z);
2830          micro_mul(&r[8], &r[6], &r[6]);
2831          micro_add(&r[0], &r[0], &r[8]);
2832
2833          /* r1 = dp3(src0, src1) */
2834          FETCH(&r[3], 1, CHAN_X);
2835          micro_mul(&r[1], &r[2], &r[3]);
2836          FETCH(&r[5], 1, CHAN_Y);
2837          micro_mul(&r[8], &r[4], &r[5]);
2838          micro_add(&r[1], &r[1], &r[8]);
2839          FETCH(&r[7], 1, CHAN_Z);
2840          micro_mul(&r[8], &r[6], &r[7]);
2841          micro_add(&r[1], &r[1], &r[8]);
2842
2843          /* r1 = 2 * r1 / r0 */
2844          micro_add(&r[1], &r[1], &r[1]);
2845          micro_div(&r[1], &r[1], &r[0]);
2846
2847          if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2848             micro_mul(&r[2], &r[2], &r[1]);
2849             micro_sub(&r[2], &r[2], &r[3]);
2850             STORE(&r[2], 0, CHAN_X);
2851          }
2852          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2853             micro_mul(&r[4], &r[4], &r[1]);
2854             micro_sub(&r[4], &r[4], &r[5]);
2855             STORE(&r[4], 0, CHAN_Y);
2856          }
2857          if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2858             micro_mul(&r[6], &r[6], &r[1]);
2859             micro_sub(&r[6], &r[6], &r[7]);
2860             STORE(&r[6], 0, CHAN_Z);
2861          }
2862       }
2863       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2864          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2865       }
2866       break;
2867
2868    case TGSI_OPCODE_SEQ:
2869       exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2870       break;
2871
2872    case TGSI_OPCODE_SFL:
2873       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2874          STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2875       }
2876       break;
2877
2878    case TGSI_OPCODE_SGT:
2879       exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2880       break;
2881
2882    case TGSI_OPCODE_SIN:
2883       exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2884       break;
2885
2886    case TGSI_OPCODE_SLE:
2887       exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2888       break;
2889
2890    case TGSI_OPCODE_SNE:
2891       exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2892       break;
2893
2894    case TGSI_OPCODE_STR:
2895       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2896          STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2897       }
2898       break;
2899
2900    case TGSI_OPCODE_TEX:
2901       /* simple texture lookup */
2902       /* src[0] = texcoord */
2903       /* src[1] = sampler unit */
2904       exec_tex(mach, inst, TEX_MODIFIER_NONE);
2905       break;
2906
2907    case TGSI_OPCODE_TXB:
2908       /* Texture lookup with lod bias */
2909       /* src[0] = texcoord (src[0].w = LOD bias) */
2910       /* src[1] = sampler unit */
2911       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
2912       break;
2913
2914    case TGSI_OPCODE_TXD:
2915       /* Texture lookup with explict partial derivatives */
2916       /* src[0] = texcoord */
2917       /* src[1] = d[strq]/dx */
2918       /* src[2] = d[strq]/dy */
2919       /* src[3] = sampler unit */
2920       exec_txd(mach, inst);
2921       break;
2922
2923    case TGSI_OPCODE_TXL:
2924       /* Texture lookup with explit LOD */
2925       /* src[0] = texcoord (src[0].w = LOD) */
2926       /* src[1] = sampler unit */
2927       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
2928       break;
2929
2930    case TGSI_OPCODE_TXP:
2931       /* Texture lookup with projection */
2932       /* src[0] = texcoord (src[0].w = projection) */
2933       /* src[1] = sampler unit */
2934       exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
2935       break;
2936
2937    case TGSI_OPCODE_UP2H:
2938       assert (0);
2939       break;
2940
2941    case TGSI_OPCODE_UP2US:
2942       assert (0);
2943       break;
2944
2945    case TGSI_OPCODE_UP4B:
2946       assert (0);
2947       break;
2948
2949    case TGSI_OPCODE_UP4UB:
2950       assert (0);
2951       break;
2952
2953    case TGSI_OPCODE_X2D:
2954       FETCH(&r[0], 1, CHAN_X);
2955       FETCH(&r[1], 1, CHAN_Y);
2956       if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2957           IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2958          FETCH(&r[2], 2, CHAN_X);
2959          micro_mul(&r[2], &r[2], &r[0]);
2960          FETCH(&r[3], 2, CHAN_Y);
2961          micro_mul(&r[3], &r[3], &r[1]);
2962          micro_add(&r[2], &r[2], &r[3]);
2963          FETCH(&r[3], 0, CHAN_X);
2964          micro_add(&d[CHAN_X], &r[2], &r[3]);
2965
2966       }
2967       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2968           IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2969          FETCH(&r[2], 2, CHAN_Z);
2970          micro_mul(&r[2], &r[2], &r[0]);
2971          FETCH(&r[3], 2, CHAN_W);
2972          micro_mul(&r[3], &r[3], &r[1]);
2973          micro_add(&r[2], &r[2], &r[3]);
2974          FETCH(&r[3], 0, CHAN_Y);
2975          micro_add(&d[CHAN_Y], &r[2], &r[3]);
2976
2977       }
2978       if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2979          STORE(&d[CHAN_X], 0, CHAN_X);
2980       }
2981       if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2982          STORE(&d[CHAN_Y], 0, CHAN_Y);
2983       }
2984       if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2985          STORE(&d[CHAN_X], 0, CHAN_Z);
2986       }
2987       if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2988          STORE(&d[CHAN_Y], 0, CHAN_W);
2989       }
2990       break;
2991
2992    case TGSI_OPCODE_ARA:
2993       assert (0);
2994       break;
2995
2996    case TGSI_OPCODE_ARR:
2997       exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2998       break;
2999
3000    case TGSI_OPCODE_BRA:
3001       assert (0);
3002       break;
3003
3004    case TGSI_OPCODE_CAL:
3005       /* skip the call if no execution channels are enabled */
3006       if (mach->ExecMask) {
3007          /* do the call */
3008
3009          /* First, record the depths of the execution stacks.
3010           * This is important for deeply nested/looped return statements.
3011           * We have to unwind the stacks by the correct amount.  For a
3012           * real code generator, we could determine the number of entries
3013           * to pop off each stack with simple static analysis and avoid
3014           * implementing this data structure at run time.
3015           */
3016          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3017          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3018          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3019          mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3020          mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3021          /* note that PC was already incremented above */
3022          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3023
3024          mach->CallStackTop++;
3025
3026          /* Second, push the Cond, Loop, Cont, Func stacks */
3027          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3028          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3029          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3030          assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3031          assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3032          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3033
3034          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3035          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3036          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3037          mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3038          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3039          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3040
3041          /* Finally, jump to the subroutine */
3042          *pc = inst->Label.Label;
3043       }
3044       break;
3045
3046    case TGSI_OPCODE_RET:
3047       mach->FuncMask &= ~mach->ExecMask;
3048       UPDATE_EXEC_MASK(mach);
3049
3050       if (mach->FuncMask == 0x0) {
3051          /* really return now (otherwise, keep executing */
3052
3053          if (mach->CallStackTop == 0) {
3054             /* returning from main() */
3055             *pc = -1;
3056             return;
3057          }
3058
3059          assert(mach->CallStackTop > 0);
3060          mach->CallStackTop--;
3061
3062          mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3063          mach->CondMask = mach->CondStack[mach->CondStackTop];
3064
3065          mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3066          mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3067
3068          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3069          mach->ContMask = mach->ContStack[mach->ContStackTop];
3070
3071          mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3072          mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3073
3074          mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3075          mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3076
3077          assert(mach->FuncStackTop > 0);
3078          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3079
3080          *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3081
3082          UPDATE_EXEC_MASK(mach);
3083       }
3084       break;
3085
3086    case TGSI_OPCODE_SSG:
3087       exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3088       break;
3089
3090    case TGSI_OPCODE_CMP:
3091       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3092          FETCH(&r[0], 0, chan_index);
3093          FETCH(&r[1], 1, chan_index);
3094          FETCH(&r[2], 2, chan_index);
3095          micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
3096       }
3097       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3098          STORE(&d[chan_index], 0, chan_index);
3099       }
3100       break;
3101
3102    case TGSI_OPCODE_SCS:
3103       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
3104          FETCH( &r[0], 0, CHAN_X );
3105          if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3106             micro_cos(&r[1], &r[0]);
3107             STORE(&r[1], 0, CHAN_X);
3108          }
3109          if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3110             micro_sin(&r[1], &r[0]);
3111             STORE(&r[1], 0, CHAN_Y);
3112          }
3113       }
3114       if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
3115          STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
3116       }
3117       if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
3118          STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
3119       }
3120       break;
3121
3122    case TGSI_OPCODE_NRM:
3123       exec_nrm3(mach, inst);
3124       break;
3125
3126    case TGSI_OPCODE_NRM4:
3127       exec_nrm4(mach, inst);
3128       break;
3129
3130    case TGSI_OPCODE_DIV:
3131       assert( 0 );
3132       break;
3133
3134    case TGSI_OPCODE_DP2:
3135       exec_dp2(mach, inst);
3136       break;
3137
3138    case TGSI_OPCODE_IF:
3139       /* push CondMask */
3140       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3141       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3142       FETCH( &r[0], 0, CHAN_X );
3143       /* update CondMask */
3144       if( ! r[0].u[0] ) {
3145          mach->CondMask &= ~0x1;
3146       }
3147       if( ! r[0].u[1] ) {
3148          mach->CondMask &= ~0x2;
3149       }
3150       if( ! r[0].u[2] ) {
3151          mach->CondMask &= ~0x4;
3152       }
3153       if( ! r[0].u[3] ) {
3154          mach->CondMask &= ~0x8;
3155       }
3156       UPDATE_EXEC_MASK(mach);
3157       /* Todo: If CondMask==0, jump to ELSE */
3158       break;
3159
3160    case TGSI_OPCODE_ELSE:
3161       /* invert CondMask wrt previous mask */
3162       {
3163          uint prevMask;
3164          assert(mach->CondStackTop > 0);
3165          prevMask = mach->CondStack[mach->CondStackTop - 1];
3166          mach->CondMask = ~mach->CondMask & prevMask;
3167          UPDATE_EXEC_MASK(mach);
3168          /* Todo: If CondMask==0, jump to ENDIF */
3169       }
3170       break;
3171
3172    case TGSI_OPCODE_ENDIF:
3173       /* pop CondMask */
3174       assert(mach->CondStackTop > 0);
3175       mach->CondMask = mach->CondStack[--mach->CondStackTop];
3176       UPDATE_EXEC_MASK(mach);
3177       break;
3178
3179    case TGSI_OPCODE_END:
3180       /* halt execution */
3181       *pc = -1;
3182       break;
3183
3184    case TGSI_OPCODE_REP:
3185       assert (0);
3186       break;
3187
3188    case TGSI_OPCODE_ENDREP:
3189        assert (0);
3190        break;
3191
3192    case TGSI_OPCODE_PUSHA:
3193       assert (0);
3194       break;
3195
3196    case TGSI_OPCODE_POPA:
3197       assert (0);
3198       break;
3199
3200    case TGSI_OPCODE_CEIL:
3201       exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3202       break;
3203
3204    case TGSI_OPCODE_I2F:
3205       exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3206       break;
3207
3208    case TGSI_OPCODE_NOT:
3209       exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3210       break;
3211
3212    case TGSI_OPCODE_TRUNC:
3213       exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3214       break;
3215
3216    case TGSI_OPCODE_SHL:
3217       exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3218       break;
3219
3220    case TGSI_OPCODE_AND:
3221       exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3222       break;
3223
3224    case TGSI_OPCODE_OR:
3225       exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3226       break;
3227
3228    case TGSI_OPCODE_MOD:
3229       assert (0);
3230       break;
3231
3232    case TGSI_OPCODE_XOR:
3233       exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3234       break;
3235
3236    case TGSI_OPCODE_SAD:
3237       assert (0);
3238       break;
3239
3240    case TGSI_OPCODE_TXF:
3241       assert (0);
3242       break;
3243
3244    case TGSI_OPCODE_TXQ:
3245       assert (0);
3246       break;
3247
3248    case TGSI_OPCODE_EMIT:
3249       emit_vertex(mach);
3250       break;
3251
3252    case TGSI_OPCODE_ENDPRIM:
3253       emit_primitive(mach);
3254       break;
3255
3256    case TGSI_OPCODE_BGNFOR:
3257       assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3258       for (chan_index = 0; chan_index < 3; chan_index++) {
3259          FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3260       }
3261       ++mach->LoopCounterStackTop;
3262       STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3263       /* update LoopMask */
3264       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3265          mach->LoopMask &= ~0x1;
3266       }
3267       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3268          mach->LoopMask &= ~0x2;
3269       }
3270       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3271          mach->LoopMask &= ~0x4;
3272       }
3273       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3274          mach->LoopMask &= ~0x8;
3275       }
3276       /* TODO: if mach->LoopMask == 0, jump to end of loop */
3277       UPDATE_EXEC_MASK(mach);
3278       /* fall-through (for now) */
3279    case TGSI_OPCODE_BGNLOOP:
3280       /* push LoopMask and ContMasks */
3281       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3282       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3283       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3284       assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3285
3286       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3287       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3288       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3289       mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3290       mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3291       break;
3292
3293    case TGSI_OPCODE_ENDFOR:
3294       assert(mach->LoopCounterStackTop > 0);
3295       micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3296                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3297                 &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3298       /* update LoopMask */
3299       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3300          mach->LoopMask &= ~0x1;
3301       }
3302       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3303          mach->LoopMask &= ~0x2;
3304       }
3305       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3306          mach->LoopMask &= ~0x4;
3307       }
3308       if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3309          mach->LoopMask &= ~0x8;
3310       }
3311       micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3312                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3313                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3314       assert(mach->LoopLabelStackTop > 0);
3315       inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3316       STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3317       /* Restore ContMask, but don't pop */
3318       assert(mach->ContStackTop > 0);
3319       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3320       UPDATE_EXEC_MASK(mach);
3321       if (mach->ExecMask) {
3322          /* repeat loop: jump to instruction just past BGNLOOP */
3323          assert(mach->LoopLabelStackTop > 0);
3324          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3325       }
3326       else {
3327          /* exit loop: pop LoopMask */
3328          assert(mach->LoopStackTop > 0);
3329          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3330          /* pop ContMask */
3331          assert(mach->ContStackTop > 0);
3332          mach->ContMask = mach->ContStack[--mach->ContStackTop];
3333          assert(mach->LoopLabelStackTop > 0);
3334          --mach->LoopLabelStackTop;
3335          assert(mach->LoopCounterStackTop > 0);
3336          --mach->LoopCounterStackTop;
3337
3338          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3339       }
3340       UPDATE_EXEC_MASK(mach);
3341       break;
3342
3343    case TGSI_OPCODE_ENDLOOP:
3344       /* Restore ContMask, but don't pop */
3345       assert(mach->ContStackTop > 0);
3346       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3347       UPDATE_EXEC_MASK(mach);
3348       if (mach->ExecMask) {
3349          /* repeat loop: jump to instruction just past BGNLOOP */
3350          assert(mach->LoopLabelStackTop > 0);
3351          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3352       }
3353       else {
3354          /* exit loop: pop LoopMask */
3355          assert(mach->LoopStackTop > 0);
3356          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3357          /* pop ContMask */
3358          assert(mach->ContStackTop > 0);
3359          mach->ContMask = mach->ContStack[--mach->ContStackTop];
3360          assert(mach->LoopLabelStackTop > 0);
3361          --mach->LoopLabelStackTop;
3362
3363          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3364       }
3365       UPDATE_EXEC_MASK(mach);
3366       break;
3367
3368    case TGSI_OPCODE_BRK:
3369       exec_break(mach);
3370       break;
3371
3372    case TGSI_OPCODE_CONT:
3373       /* turn off cont channels for each enabled exec channel */
3374       mach->ContMask &= ~mach->ExecMask;
3375       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3376       UPDATE_EXEC_MASK(mach);
3377       break;
3378
3379    case TGSI_OPCODE_BGNSUB:
3380       /* no-op */
3381       break;
3382
3383    case TGSI_OPCODE_ENDSUB:
3384       /*
3385        * XXX: This really should be a no-op. We should never reach this opcode.
3386        */
3387
3388       assert(mach->CallStackTop > 0);
3389       mach->CallStackTop--;
3390
3391       mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3392       mach->CondMask = mach->CondStack[mach->CondStackTop];
3393
3394       mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3395       mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3396
3397       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3398       mach->ContMask = mach->ContStack[mach->ContStackTop];
3399
3400       mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3401       mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3402
3403       mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3404       mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3405
3406       assert(mach->FuncStackTop > 0);
3407       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3408
3409       *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3410
3411       UPDATE_EXEC_MASK(mach);
3412       break;
3413
3414    case TGSI_OPCODE_NOP:
3415       break;
3416
3417    case TGSI_OPCODE_BREAKC:
3418       FETCH(&r[0], 0, CHAN_X);
3419       /* update CondMask */
3420       if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3421          mach->LoopMask &= ~0x1;
3422       }
3423       if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3424          mach->LoopMask &= ~0x2;
3425       }
3426       if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3427          mach->LoopMask &= ~0x4;
3428       }
3429       if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3430          mach->LoopMask &= ~0x8;
3431       }
3432       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3433       UPDATE_EXEC_MASK(mach);
3434       break;
3435
3436    case TGSI_OPCODE_F2I:
3437       exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3438       break;
3439
3440    case TGSI_OPCODE_IDIV:
3441       exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3442       break;
3443
3444    case TGSI_OPCODE_IMAX:
3445       exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3446       break;
3447
3448    case TGSI_OPCODE_IMIN:
3449       exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3450       break;
3451
3452    case TGSI_OPCODE_INEG:
3453       exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3454       break;
3455
3456    case TGSI_OPCODE_ISGE:
3457       exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3458       break;
3459
3460    case TGSI_OPCODE_ISHR:
3461       exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3462       break;
3463
3464    case TGSI_OPCODE_ISLT:
3465       exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3466       break;
3467
3468    case TGSI_OPCODE_F2U:
3469       exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3470       break;
3471
3472    case TGSI_OPCODE_U2F:
3473       exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3474       break;
3475
3476    case TGSI_OPCODE_UADD:
3477       exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3478       break;
3479
3480    case TGSI_OPCODE_UDIV:
3481       exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3482       break;
3483
3484    case TGSI_OPCODE_UMAD:
3485       exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3486       break;
3487
3488    case TGSI_OPCODE_UMAX:
3489       exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3490       break;
3491
3492    case TGSI_OPCODE_UMIN:
3493       exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3494       break;
3495
3496    case TGSI_OPCODE_UMOD:
3497       exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3498       break;
3499
3500    case TGSI_OPCODE_UMUL:
3501       exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3502       break;
3503
3504    case TGSI_OPCODE_USEQ:
3505       exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3506       break;
3507
3508    case TGSI_OPCODE_USGE:
3509       exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3510       break;
3511
3512    case TGSI_OPCODE_USHR:
3513       exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3514       break;
3515
3516    case TGSI_OPCODE_USLT:
3517       exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3518       break;
3519
3520    case TGSI_OPCODE_USNE:
3521       exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3522       break;
3523
3524    case TGSI_OPCODE_SWITCH:
3525       exec_switch(mach, inst);
3526       break;
3527
3528    case TGSI_OPCODE_CASE:
3529       exec_case(mach, inst);
3530       break;
3531
3532    case TGSI_OPCODE_DEFAULT:
3533       exec_default(mach);
3534       break;
3535
3536    case TGSI_OPCODE_ENDSWITCH:
3537       exec_endswitch(mach);
3538       break;
3539
3540    default:
3541       assert( 0 );
3542    }
3543 }
3544
3545
/**
 * Compile-time switch for execution tracing in tgsi_exec_machine_run().
 * When set to a non-zero value, the run loop dumps every instruction before
 * executing it and afterwards prints each TEMP/OUT register whose contents
 * differ from the previous snapshot.
 */
#define DEBUG_EXECUTION 0
3547
3548
3549 /**
3550  * Run TGSI interpreter.
3551  * \return bitmask of "alive" quad components
3552  */
3553 uint
3554 tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3555 {
3556    uint i;
3557    int pc = 0;
3558
3559    mach->CondMask = 0xf;
3560    mach->LoopMask = 0xf;
3561    mach->ContMask = 0xf;
3562    mach->FuncMask = 0xf;
3563    mach->ExecMask = 0xf;
3564
3565    mach->Switch.mask = 0xf;
3566
3567    assert(mach->CondStackTop == 0);
3568    assert(mach->LoopStackTop == 0);
3569    assert(mach->ContStackTop == 0);
3570    assert(mach->SwitchStackTop == 0);
3571    assert(mach->BreakStackTop == 0);
3572    assert(mach->CallStackTop == 0);
3573
3574    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3575    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3576
3577    if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3578       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3579       mach->Primitives[0] = 0;
3580    }
3581
3582    for (i = 0; i < QUAD_SIZE; i++) {
3583       mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3584          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3585          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3586          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3587          (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3588    }
3589
3590    /* execute declarations (interpolants) */
3591    for (i = 0; i < mach->NumDeclarations; i++) {
3592       exec_declaration( mach, mach->Declarations+i );
3593    }
3594
3595    {
3596 #if DEBUG_EXECUTION
3597       struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3598       struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3599       uint inst = 1;
3600
3601       memcpy(temps, mach->Temps, sizeof(temps));
3602       memcpy(outputs, mach->Outputs, sizeof(outputs));
3603 #endif
3604
3605       /* execute instructions, until pc is set to -1 */
3606       while (pc != -1) {
3607
3608 #if DEBUG_EXECUTION
3609          uint i;
3610
3611          tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3612 #endif
3613
3614          assert(pc < (int) mach->NumInstructions);
3615          exec_instruction(mach, mach->Instructions + pc, &pc);
3616
3617 #if DEBUG_EXECUTION
3618          for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3619             if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3620                uint j;
3621
3622                memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3623                debug_printf("TEMP[%2u] = ", i);
3624                for (j = 0; j < 4; j++) {
3625                   if (j > 0) {
3626                      debug_printf("           ");
3627                   }
3628                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3629                                temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
3630                                temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
3631                                temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
3632                                temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
3633                }
3634             }
3635          }
3636          for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3637             if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3638                uint j;
3639
3640                memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3641                debug_printf("OUT[%2u] =  ", i);
3642                for (j = 0; j < 4; j++) {
3643                   if (j > 0) {
3644                      debug_printf("           ");
3645                   }
3646                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3647                                outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
3648                                outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
3649                                outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
3650                                outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
3651                }
3652             }
3653          }
3654 #endif
3655       }
3656    }
3657
3658 #if 0
3659    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3660    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3661       /*
3662        * Scale back depth component.
3663        */
3664       for (i = 0; i < 4; i++)
3665          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3666    }
3667 #endif
3668
3669    assert(mach->CondStackTop == 0);
3670    assert(mach->LoopStackTop == 0);
3671    assert(mach->ContStackTop == 0);
3672    assert(mach->SwitchStackTop == 0);
3673    assert(mach->BreakStackTop == 0);
3674    assert(mach->CallStackTop == 0);
3675
3676    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3677 }