src/gallium/auxiliary/tgsi/tgsi_exec.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 VMware, Inc.
   4  * All Rights Reserved.
   5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the
   9  * "Software"), to deal in the Software without restriction, including
  10  * without limitation the rights to use, copy, modify, merge, publish,
  11  * distribute, sub license, and/or sell copies of the Software, and to
  12  * permit persons to whom the Software is furnished to do so, subject to
  13  * the following conditions:
  14  *
  15  * The above copyright notice and this permission notice (including the
  16  * next paragraph) shall be included in all copies or substantial portions
  17  * of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  22  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  26  *
  27  **************************************************************************/
  28
  29 /**
  30  * TGSI interpreter/executor.
  31  *
  32  * Flow control information:
  33  *
  34  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
  35  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
  36  * care since a condition may be true for some quad components but false
  37  * for other components.
  38  *
  39  * We basically execute all statements (even if they're in the part of
  40  * an IF/ELSE clause that's "not taken") and use a special mask to
  41  * control writing to destination registers.  This is the ExecMask.
  42  * See store_dest().
  43  *
 * The ExecMask is computed from several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK) which are
 * controlled by the flow control instructions (e.g. IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
  47  *
  48  *
  49  * Authors:
  50  *   Michal Krol
  51  *   Brian Paul
  52  */
  53
  54 #include "pipe/p_compiler.h"
  55 #include "pipe/p_state.h"
  56 #include "pipe/p_shader_tokens.h"
  57 #include "tgsi/tgsi_dump.h"
  58 #include "tgsi/tgsi_parse.h"
  59 #include "tgsi/tgsi_util.h"
  60 #include "tgsi_exec.h"
  61 #include "util/u_half.h"
  62 #include "util/u_memory.h"
  63 #include "util/u_math.h"
  64
  65
/* Compile-time debug switch, disabled by default.
 * NOTE(review): presumably enables execution tracing -- confirm at its users.
 */
#define DEBUG_EXECUTION 0


/* When non-zero, micro_exp2() and micro_lg2() call util_fast_exp2() /
 * util_fast_log2() instead of powf() / logf().
 */
#define FAST_MATH 0
  70
/* Indices of the four elements of a quad channel by position; micro_ddx()
 * and micro_ddy() use them to pick the elements they subtract.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
  75
/**
 * One double-precision channel of a quad: TGSI_QUAD_SIZE values that can
 * be accessed either as doubles or as a pair of unsigned words each.
 */
union tgsi_double_channel {
   double d[TGSI_QUAD_SIZE];        /* value view */
   unsigned u[TGSI_QUAD_SIZE][2];   /* two unsigned words per element; assumes
                                     * sizeof(double) == 2 * sizeof(unsigned)
                                     * -- TODO confirm */
};
  80
/**
 * A pair of double channels.
 * NOTE(review): presumably the two halves (xy / zw) of a 4-component
 * register when it holds doubles -- confirm against the users of this type.
 */
struct tgsi_double_vector {
   union tgsi_double_channel xy;
   union tgsi_double_channel zw;
};
  85
  86 static void
  87 micro_abs(union tgsi_exec_channel *dst,
  88           const union tgsi_exec_channel *src)
  89 {
  90    dst->f[0] = fabsf(src->f[0]);
  91    dst->f[1] = fabsf(src->f[1]);
  92    dst->f[2] = fabsf(src->f[2]);
  93    dst->f[3] = fabsf(src->f[3]);
  94 }
  95
  96 static void
  97 micro_arl(union tgsi_exec_channel *dst,
  98           const union tgsi_exec_channel *src)
  99 {
 100    dst->i[0] = (int)floorf(src->f[0]);
 101    dst->i[1] = (int)floorf(src->f[1]);
 102    dst->i[2] = (int)floorf(src->f[2]);
 103    dst->i[3] = (int)floorf(src->f[3]);
 104 }
 105
 106 static void
 107 micro_arr(union tgsi_exec_channel *dst,
 108           const union tgsi_exec_channel *src)
 109 {
 110    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
 111    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
 112    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
 113    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
 114 }
 115
 116 static void
 117 micro_ceil(union tgsi_exec_channel *dst,
 118            const union tgsi_exec_channel *src)
 119 {
 120    dst->f[0] = ceilf(src->f[0]);
 121    dst->f[1] = ceilf(src->f[1]);
 122    dst->f[2] = ceilf(src->f[2]);
 123    dst->f[3] = ceilf(src->f[3]);
 124 }
 125
 126 static void
 127 micro_clamp(union tgsi_exec_channel *dst,
 128             const union tgsi_exec_channel *src0,
 129             const union tgsi_exec_channel *src1,
 130             const union tgsi_exec_channel *src2)
 131 {
 132    dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
 133    dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
 134    dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
 135    dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
 136 }
 137
 138 static void
 139 micro_cmp(union tgsi_exec_channel *dst,
 140           const union tgsi_exec_channel *src0,
 141           const union tgsi_exec_channel *src1,
 142           const union tgsi_exec_channel *src2)
 143 {
 144    dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
 145    dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
 146    dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
 147    dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
 148 }
 149
 150 static void
 151 micro_cos(union tgsi_exec_channel *dst,
 152           const union tgsi_exec_channel *src)
 153 {
 154    dst->f[0] = cosf(src->f[0]);
 155    dst->f[1] = cosf(src->f[1]);
 156    dst->f[2] = cosf(src->f[2]);
 157    dst->f[3] = cosf(src->f[3]);
 158 }
 159
 160 static void
 161 micro_d2f(union tgsi_exec_channel *dst,
 162           const union tgsi_double_channel *src)
 163 {
 164    dst->f[0] = (float)src->d[0];
 165    dst->f[1] = (float)src->d[1];
 166    dst->f[2] = (float)src->d[2];
 167    dst->f[3] = (float)src->d[3];
 168 }
 169
 170 static void
 171 micro_d2i(union tgsi_exec_channel *dst,
 172           const union tgsi_double_channel *src)
 173 {
 174    dst->i[0] = (int)src->d[0];
 175    dst->i[1] = (int)src->d[1];
 176    dst->i[2] = (int)src->d[2];
 177    dst->i[3] = (int)src->d[3];
 178 }
 179
 180 static void
 181 micro_d2u(union tgsi_exec_channel *dst,
 182           const union tgsi_double_channel *src)
 183 {
 184    dst->u[0] = (unsigned)src->d[0];
 185    dst->u[1] = (unsigned)src->d[1];
 186    dst->u[2] = (unsigned)src->d[2];
 187    dst->u[3] = (unsigned)src->d[3];
 188 }
 189 static void
 190 micro_dabs(union tgsi_double_channel *dst,
 191            const union tgsi_double_channel *src)
 192 {
 193    dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
 194    dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
 195    dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
 196    dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
 197 }
 198
 199 static void
 200 micro_dadd(union tgsi_double_channel *dst,
 201           const union tgsi_double_channel *src)
 202 {
 203    dst->d[0] = src[0].d[0] + src[1].d[0];
 204    dst->d[1] = src[0].d[1] + src[1].d[1];
 205    dst->d[2] = src[0].d[2] + src[1].d[2];
 206    dst->d[3] = src[0].d[3] + src[1].d[3];
 207 }
 208
 209 static void
 210 micro_ddx(union tgsi_exec_channel *dst,
 211           const union tgsi_exec_channel *src)
 212 {
 213    dst->f[0] =
 214    dst->f[1] =
 215    dst->f[2] =
 216    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 217 }
 218
 219 static void
 220 micro_ddy(union tgsi_exec_channel *dst,
 221           const union tgsi_exec_channel *src)
 222 {
 223    dst->f[0] =
 224    dst->f[1] =
 225    dst->f[2] =
 226    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
 227 }
 228
 229 static void
 230 micro_dmul(union tgsi_double_channel *dst,
 231            const union tgsi_double_channel *src)
 232 {
 233    dst->d[0] = src[0].d[0] * src[1].d[0];
 234    dst->d[1] = src[0].d[1] * src[1].d[1];
 235    dst->d[2] = src[0].d[2] * src[1].d[2];
 236    dst->d[3] = src[0].d[3] * src[1].d[3];
 237 }
 238
 239 static void
 240 micro_dmax(union tgsi_double_channel *dst,
 241            const union tgsi_double_channel *src)
 242 {
 243    dst->d[0] = src[0].d[0] > src[1].d[0] ? src[0].d[0] : src[1].d[0];
 244    dst->d[1] = src[0].d[1] > src[1].d[1] ? src[0].d[1] : src[1].d[1];
 245    dst->d[2] = src[0].d[2] > src[1].d[2] ? src[0].d[2] : src[1].d[2];
 246    dst->d[3] = src[0].d[3] > src[1].d[3] ? src[0].d[3] : src[1].d[3];
 247 }
 248
 249 static void
 250 micro_dmin(union tgsi_double_channel *dst,
 251            const union tgsi_double_channel *src)
 252 {
 253    dst->d[0] = src[0].d[0] < src[1].d[0] ? src[0].d[0] : src[1].d[0];
 254    dst->d[1] = src[0].d[1] < src[1].d[1] ? src[0].d[1] : src[1].d[1];
 255    dst->d[2] = src[0].d[2] < src[1].d[2] ? src[0].d[2] : src[1].d[2];
 256    dst->d[3] = src[0].d[3] < src[1].d[3] ? src[0].d[3] : src[1].d[3];
 257 }
 258
 259 static void
 260 micro_dneg(union tgsi_double_channel *dst,
 261            const union tgsi_double_channel *src)
 262 {
 263    dst->d[0] = -src->d[0];
 264    dst->d[1] = -src->d[1];
 265    dst->d[2] = -src->d[2];
 266    dst->d[3] = -src->d[3];
 267 }
 268
 269 static void
 270 micro_dslt(union tgsi_double_channel *dst,
 271            const union tgsi_double_channel *src)
 272 {
 273    dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
 274    dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
 275    dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
 276    dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
 277 }
 278
 279 static void
 280 micro_dsne(union tgsi_double_channel *dst,
 281            const union tgsi_double_channel *src)
 282 {
 283    dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
 284    dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
 285    dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
 286    dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
 287 }
 288
 289 static void
 290 micro_dsge(union tgsi_double_channel *dst,
 291            const union tgsi_double_channel *src)
 292 {
 293    dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
 294    dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
 295    dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
 296    dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
 297 }
 298
 299 static void
 300 micro_dseq(union tgsi_double_channel *dst,
 301            const union tgsi_double_channel *src)
 302 {
 303    dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
 304    dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
 305    dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
 306    dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
 307 }
 308
 309 static void
 310 micro_drcp(union tgsi_double_channel *dst,
 311            const union tgsi_double_channel *src)
 312 {
 313    dst->d[0] = 1.0 / src->d[0];
 314    dst->d[1] = 1.0 / src->d[1];
 315    dst->d[2] = 1.0 / src->d[2];
 316    dst->d[3] = 1.0 / src->d[3];
 317 }
 318
 319 static void
 320 micro_dsqrt(union tgsi_double_channel *dst,
 321             const union tgsi_double_channel *src)
 322 {
 323    dst->d[0] = sqrt(src->d[0]);
 324    dst->d[1] = sqrt(src->d[1]);
 325    dst->d[2] = sqrt(src->d[2]);
 326    dst->d[3] = sqrt(src->d[3]);
 327 }
 328
 329 static void
 330 micro_drsq(union tgsi_double_channel *dst,
 331           const union tgsi_double_channel *src)
 332 {
 333    dst->d[0] = 1.0 / sqrt(src->d[0]);
 334    dst->d[1] = 1.0 / sqrt(src->d[1]);
 335    dst->d[2] = 1.0 / sqrt(src->d[2]);
 336    dst->d[3] = 1.0 / sqrt(src->d[3]);
 337 }
 338
 339 static void
 340 micro_dmad(union tgsi_double_channel *dst,
 341            const union tgsi_double_channel *src)
 342 {
 343    dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
 344    dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
 345    dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
 346    dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
 347 }
 348
 349 static void
 350 micro_dfrac(union tgsi_double_channel *dst,
 351             const union tgsi_double_channel *src)
 352 {
 353    dst->d[0] = src->d[0] - floor(src->d[0]);
 354    dst->d[1] = src->d[1] - floor(src->d[1]);
 355    dst->d[2] = src->d[2] - floor(src->d[2]);
 356    dst->d[3] = src->d[3] - floor(src->d[3]);
 357 }
 358
 359 static void
 360 micro_dldexp(union tgsi_double_channel *dst,
 361              const union tgsi_double_channel *src0,
 362              union tgsi_exec_channel *src1)
 363 {
 364    dst->d[0] = ldexp(src0->d[0], src1->i[0]);
 365    dst->d[1] = ldexp(src0->d[1], src1->i[1]);
 366    dst->d[2] = ldexp(src0->d[2], src1->i[2]);
 367    dst->d[3] = ldexp(src0->d[3], src1->i[3]);
 368 }
 369
 370 static void
 371 micro_dfracexp(union tgsi_double_channel *dst,
 372                union tgsi_exec_channel *dst_exp,
 373                const union tgsi_double_channel *src)
 374 {
 375    dst->d[0] = frexp(src->d[0], &dst_exp->i[0]);
 376    dst->d[1] = frexp(src->d[1], &dst_exp->i[1]);
 377    dst->d[2] = frexp(src->d[2], &dst_exp->i[2]);
 378    dst->d[3] = frexp(src->d[3], &dst_exp->i[3]);
 379 }
 380
 381 static void
 382 micro_exp2(union tgsi_exec_channel *dst,
 383            const union tgsi_exec_channel *src)
 384 {
 385 #if FAST_MATH
 386    dst->f[0] = util_fast_exp2(src->f[0]);
 387    dst->f[1] = util_fast_exp2(src->f[1]);
 388    dst->f[2] = util_fast_exp2(src->f[2]);
 389    dst->f[3] = util_fast_exp2(src->f[3]);
 390 #else
 391 #if DEBUG
 392    /* Inf is okay for this instruction, so clamp it to silence assertions. */
 393    uint i;
 394    union tgsi_exec_channel clamped;
 395
 396    for (i = 0; i < 4; i++) {
 397       if (src->f[i] > 127.99999f) {
 398          clamped.f[i] = 127.99999f;
 399       } else if (src->f[i] < -126.99999f) {
 400          clamped.f[i] = -126.99999f;
 401       } else {
 402          clamped.f[i] = src->f[i];
 403       }
 404    }
 405    src = &clamped;
 406 #endif /* DEBUG */
 407
 408    dst->f[0] = powf(2.0f, src->f[0]);
 409    dst->f[1] = powf(2.0f, src->f[1]);
 410    dst->f[2] = powf(2.0f, src->f[2]);
 411    dst->f[3] = powf(2.0f, src->f[3]);
 412 #endif /* FAST_MATH */
 413 }
 414
 415 static void
 416 micro_f2d(union tgsi_double_channel *dst,
 417           const union tgsi_exec_channel *src)
 418 {
 419    dst->d[0] = (double)src->f[0];
 420    dst->d[1] = (double)src->f[1];
 421    dst->d[2] = (double)src->f[2];
 422    dst->d[3] = (double)src->f[3];
 423 }
 424
 425 static void
 426 micro_flr(union tgsi_exec_channel *dst,
 427           const union tgsi_exec_channel *src)
 428 {
 429    dst->f[0] = floorf(src->f[0]);
 430    dst->f[1] = floorf(src->f[1]);
 431    dst->f[2] = floorf(src->f[2]);
 432    dst->f[3] = floorf(src->f[3]);
 433 }
 434
 435 static void
 436 micro_frc(union tgsi_exec_channel *dst,
 437           const union tgsi_exec_channel *src)
 438 {
 439    dst->f[0] = src->f[0] - floorf(src->f[0]);
 440    dst->f[1] = src->f[1] - floorf(src->f[1]);
 441    dst->f[2] = src->f[2] - floorf(src->f[2]);
 442    dst->f[3] = src->f[3] - floorf(src->f[3]);
 443 }
 444
 445 static void
 446 micro_i2d(union tgsi_double_channel *dst,
 447           const union tgsi_exec_channel *src)
 448 {
 449    dst->d[0] = (double)src->i[0];
 450    dst->d[1] = (double)src->i[1];
 451    dst->d[2] = (double)src->i[2];
 452    dst->d[3] = (double)src->i[3];
 453 }
 454
 455 static void
 456 micro_iabs(union tgsi_exec_channel *dst,
 457            const union tgsi_exec_channel *src)
 458 {
 459    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
 460    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
 461    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
 462    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
 463 }
 464
 465 static void
 466 micro_ineg(union tgsi_exec_channel *dst,
 467            const union tgsi_exec_channel *src)
 468 {
 469    dst->i[0] = -src->i[0];
 470    dst->i[1] = -src->i[1];
 471    dst->i[2] = -src->i[2];
 472    dst->i[3] = -src->i[3];
 473 }
 474
 475 static void
 476 micro_lg2(union tgsi_exec_channel *dst,
 477           const union tgsi_exec_channel *src)
 478 {
 479 #if FAST_MATH
 480    dst->f[0] = util_fast_log2(src->f[0]);
 481    dst->f[1] = util_fast_log2(src->f[1]);
 482    dst->f[2] = util_fast_log2(src->f[2]);
 483    dst->f[3] = util_fast_log2(src->f[3]);
 484 #else
 485    dst->f[0] = logf(src->f[0]) * 1.442695f;
 486    dst->f[1] = logf(src->f[1]) * 1.442695f;
 487    dst->f[2] = logf(src->f[2]) * 1.442695f;
 488    dst->f[3] = logf(src->f[3]) * 1.442695f;
 489 #endif
 490 }
 491
 492 static void
 493 micro_lrp(union tgsi_exec_channel *dst,
 494           const union tgsi_exec_channel *src0,
 495           const union tgsi_exec_channel *src1,
 496           const union tgsi_exec_channel *src2)
 497 {
 498    dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
 499    dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
 500    dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
 501    dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
 502 }
 503
 504 static void
 505 micro_mad(union tgsi_exec_channel *dst,
 506           const union tgsi_exec_channel *src0,
 507           const union tgsi_exec_channel *src1,
 508           const union tgsi_exec_channel *src2)
 509 {
 510    dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
 511    dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
 512    dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
 513    dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
 514 }
 515
 516 static void
 517 micro_mov(union tgsi_exec_channel *dst,
 518           const union tgsi_exec_channel *src)
 519 {
 520    dst->u[0] = src->u[0];
 521    dst->u[1] = src->u[1];
 522    dst->u[2] = src->u[2];
 523    dst->u[3] = src->u[3];
 524 }
 525
 526 static void
 527 micro_rcp(union tgsi_exec_channel *dst,
 528           const union tgsi_exec_channel *src)
 529 {
 530 #if 0 /* for debugging */
 531    assert(src->f[0] != 0.0f);
 532    assert(src->f[1] != 0.0f);
 533    assert(src->f[2] != 0.0f);
 534    assert(src->f[3] != 0.0f);
 535 #endif
 536    dst->f[0] = 1.0f / src->f[0];
 537    dst->f[1] = 1.0f / src->f[1];
 538    dst->f[2] = 1.0f / src->f[2];
 539    dst->f[3] = 1.0f / src->f[3];
 540 }
 541
 542 static void
 543 micro_rnd(union tgsi_exec_channel *dst,
 544           const union tgsi_exec_channel *src)
 545 {
 546    dst->f[0] = floorf(src->f[0] + 0.5f);
 547    dst->f[1] = floorf(src->f[1] + 0.5f);
 548    dst->f[2] = floorf(src->f[2] + 0.5f);
 549    dst->f[3] = floorf(src->f[3] + 0.5f);
 550 }
 551
 552 static void
 553 micro_rsq(union tgsi_exec_channel *dst,
 554           const union tgsi_exec_channel *src)
 555 {
 556 #if 0 /* for debugging */
 557    assert(src->f[0] != 0.0f);
 558    assert(src->f[1] != 0.0f);
 559    assert(src->f[2] != 0.0f);
 560    assert(src->f[3] != 0.0f);
 561 #endif
 562    dst->f[0] = 1.0f / sqrtf(src->f[0]);
 563    dst->f[1] = 1.0f / sqrtf(src->f[1]);
 564    dst->f[2] = 1.0f / sqrtf(src->f[2]);
 565    dst->f[3] = 1.0f / sqrtf(src->f[3]);
 566 }
 567
 568 static void
 569 micro_sqrt(union tgsi_exec_channel *dst,
 570            const union tgsi_exec_channel *src)
 571 {
 572    dst->f[0] = sqrtf(src->f[0]);
 573    dst->f[1] = sqrtf(src->f[1]);
 574    dst->f[2] = sqrtf(src->f[2]);
 575    dst->f[3] = sqrtf(src->f[3]);
 576 }
 577
 578 static void
 579 micro_seq(union tgsi_exec_channel *dst,
 580           const union tgsi_exec_channel *src0,
 581           const union tgsi_exec_channel *src1)
 582 {
 583    dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
 584    dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
 585    dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
 586    dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
 587 }
 588
 589 static void
 590 micro_sge(union tgsi_exec_channel *dst,
 591           const union tgsi_exec_channel *src0,
 592           const union tgsi_exec_channel *src1)
 593 {
 594    dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
 595    dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
 596    dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
 597    dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
 598 }
 599
 600 static void
 601 micro_sgn(union tgsi_exec_channel *dst,
 602           const union tgsi_exec_channel *src)
 603 {
 604    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
 605    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
 606    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
 607    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
 608 }
 609
 610 static void
 611 micro_isgn(union tgsi_exec_channel *dst,
 612           const union tgsi_exec_channel *src)
 613 {
 614    dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
 615    dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
 616    dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
 617    dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
 618 }
 619
 620 static void
 621 micro_sgt(union tgsi_exec_channel *dst,
 622           const union tgsi_exec_channel *src0,
 623           const union tgsi_exec_channel *src1)
 624 {
 625    dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
 626    dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
 627    dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
 628    dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
 629 }
 630
 631 static void
 632 micro_sin(union tgsi_exec_channel *dst,
 633           const union tgsi_exec_channel *src)
 634 {
 635    dst->f[0] = sinf(src->f[0]);
 636    dst->f[1] = sinf(src->f[1]);
 637    dst->f[2] = sinf(src->f[2]);
 638    dst->f[3] = sinf(src->f[3]);
 639 }
 640
 641 static void
 642 micro_sle(union tgsi_exec_channel *dst,
 643           const union tgsi_exec_channel *src0,
 644           const union tgsi_exec_channel *src1)
 645 {
 646    dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
 647    dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
 648    dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
 649    dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
 650 }
 651
 652 static void
 653 micro_slt(union tgsi_exec_channel *dst,
 654           const union tgsi_exec_channel *src0,
 655           const union tgsi_exec_channel *src1)
 656 {
 657    dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
 658    dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
 659    dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
 660    dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
 661 }
 662
 663 static void
 664 micro_sne(union tgsi_exec_channel *dst,
 665           const union tgsi_exec_channel *src0,
 666           const union tgsi_exec_channel *src1)
 667 {
 668    dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
 669    dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
 670    dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
 671    dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
 672 }
 673
 674 static void
 675 micro_trunc(union tgsi_exec_channel *dst,
 676             const union tgsi_exec_channel *src)
 677 {
 678    dst->f[0] = (float)(int)src->f[0];
 679    dst->f[1] = (float)(int)src->f[1];
 680    dst->f[2] = (float)(int)src->f[2];
 681    dst->f[3] = (float)(int)src->f[3];
 682 }
 683
 684 static void
 685 micro_u2d(union tgsi_double_channel *dst,
 686           const union tgsi_exec_channel *src)
 687 {
 688    dst->d[0] = (double)src->u[0];
 689    dst->d[1] = (double)src->u[1];
 690    dst->d[2] = (double)src->u[2];
 691    dst->d[3] = (double)src->u[3];
 692 }
 693
/**
 * Tags for the data types the executor works with.
 * NOTE(review): presumably selects how operands are fetched and results
 * stored -- confirm against the users of this enum.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT,
   TGSI_EXEC_DATA_DOUBLE
};
 700
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 * Each name is a plain alias for the TGSI_EXEC_-prefixed constant of the
 * same name.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
 710
 711
 712 /** The execution mask depends on the conditional mask and the loop mask */
 713 #define UPDATE_EXEC_MASK(MACH) \
 714       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
 715
 716
 717 static const union tgsi_exec_channel ZeroVec =
 718    { { 0.0, 0.0, 0.0, 0.0 } };
 719
 720 static const union tgsi_exec_channel OneVec = {
 721    {1.0f, 1.0f, 1.0f, 1.0f}
 722 };
 723
 724 static const union tgsi_exec_channel P128Vec = {
 725    {128.0f, 128.0f, 128.0f, 128.0f}
 726 };
 727
 728 static const union tgsi_exec_channel M128Vec = {
 729    {-128.0f, -128.0f, -128.0f, -128.0f}
 730 };
 731
 732
/**
 * Assert that none of the float values in 'chan' are infinite or NaN.
 * NaN and Inf may occur normally during program execution and should
 * not lead to crashes, etc.  But when debugging, it's helpful to catch
 * them.
 *
 * Each of the four elements gets its own assert() so a failure identifies
 * the offending element.
 *
 * \param chan  channel whose .f[0..3] values are tested with
 *              util_is_inf_or_nan()
 */
static inline void
check_inf_or_nan(const union tgsi_exec_channel *chan)
{
   assert(!util_is_inf_or_nan((chan)->f[0]));
   assert(!util_is_inf_or_nan((chan)->f[1]));
   assert(!util_is_inf_or_nan((chan)->f[2]));
   assert(!util_is_inf_or_nan((chan)->f[3]));
}
 747
 748
 749 #ifdef DEBUG
 750 static void
 751 print_chan(const char *msg, const union tgsi_exec_channel *chan)
 752 {
 753    debug_printf("%s = {%f, %f, %f, %f}\n",
 754                 msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
 755 }
 756 #endif
 757
 758
 759 #ifdef DEBUG
 760 static void
 761 print_temp(const struct tgsi_exec_machine *mach, uint index)
 762 {
 763    const struct tgsi_exec_vector *tmp = &mach->Temps[index];
 764    int i;
 765    debug_printf("Temp[%u] =\n", index);
 766    for (i = 0; i < 4; i++) {
 767       debug_printf("  %c: { %f, %f, %f, %f }\n",
 768                    "XYZW"[i],
 769                    tmp->xyzw[i].f[0],
 770                    tmp->xyzw[i].f[1],
 771                    tmp->xyzw[i].f[2],
 772                    tmp->xyzw[i].f[3]);
 773    }
 774 }
 775 #endif
 776
 777
 778 void
 779 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
 780                                unsigned num_bufs,
 781                                const void **bufs,
 782                                const unsigned *buf_sizes)
 783 {
 784    unsigned i;
 785
 786    for (i = 0; i < num_bufs; i++) {
 787       mach->Consts[i] = bufs[i];
 788       mach->ConstsSize[i] = buf_sizes[i];
 789    }
 790 }
 791
 792
 793 /**
 794  * Check if there's a potential src/dst register data dependency when
 795  * using SOA execution.
 796  * Example:
 797  *   MOV T, T.yxwz;
 798  * This would expand into:
 799  *   MOV t0, t1;
 800  *   MOV t1, t0;
 801  *   MOV t2, t3;
 802  *   MOV t3, t2;
 803  * The second instruction will have the wrong value for t0 if executed as-is.
 804  */
 805 boolean
 806 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
 807 {
 808    uint i, chan;
 809
 810    uint writemask = inst->Dst[0].Register.WriteMask;
 811    if (writemask == TGSI_WRITEMASK_X ||
 812        writemask == TGSI_WRITEMASK_Y ||
 813        writemask == TGSI_WRITEMASK_Z ||
 814        writemask == TGSI_WRITEMASK_W ||
 815        writemask == TGSI_WRITEMASK_NONE) {
 816       /* no chance of data dependency */
 817       return FALSE;
 818    }
 819
 820    /* loop over src regs */
 821    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 822       if ((inst->Src[i].Register.File ==
 823            inst->Dst[0].Register.File) &&
 824           ((inst->Src[i].Register.Index ==
 825             inst->Dst[0].Register.Index) ||
 826            inst->Src[i].Register.Indirect ||
 827            inst->Dst[0].Register.Indirect)) {
 828          /* loop over dest channels */
 829          uint channelsWritten = 0x0;
 830          for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 831             if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 832                /* check if we're reading a channel that's been written */
 833                uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
 834                if (channelsWritten & (1 << swizzle)) {
 835                   return TRUE;
 836                }
 837
 838                channelsWritten |= (1 << chan);
 839             }
 840          }
 841       }
 842    }
 843    return FALSE;
 844 }
 845
 846
 847 /**
 848  * Initialize machine state by expanding tokens to full instructions,
 849  * allocating temporary storage, setting up constants, etc.
 850  * After this, we can call tgsi_exec_machine_run() many times.
 851  */
 852 void
 853 tgsi_exec_machine_bind_shader(
 854    struct tgsi_exec_machine *mach,
 855    const struct tgsi_token *tokens,
 856    struct tgsi_sampler *sampler,
 857    struct tgsi_image *image,
 858    struct tgsi_buffer *buffer)
 859 {
 860    uint k;
 861    struct tgsi_parse_context parse;
 862    struct tgsi_full_instruction *instructions;
 863    struct tgsi_full_declaration *declarations;
 864    uint maxInstructions = 10, numInstructions = 0;
 865    uint maxDeclarations = 10, numDeclarations = 0;
 866
 867 #if 0
 868    tgsi_dump(tokens, 0);
 869 #endif
 870
 871    util_init_math();
 872
 873
 874    mach->Tokens = tokens;
 875    mach->Sampler = sampler;
 876    mach->Image = image;
 877    mach->Buffer = buffer;
 878
 879    if (!tokens) {
 880       /* unbind and free all */
 881       FREE(mach->Declarations);
 882       mach->Declarations = NULL;
 883       mach->NumDeclarations = 0;
 884
 885       FREE(mach->Instructions);
 886       mach->Instructions = NULL;
 887       mach->NumInstructions = 0;
 888
 889       return;
 890    }
 891
 892    k = tgsi_parse_init (&parse, mach->Tokens);
 893    if (k != TGSI_PARSE_OK) {
 894       debug_printf( "Problem parsing!\n" );
 895       return;
 896    }
 897
 898    mach->ImmLimit = 0;
 899    mach->NumOutputs = 0;
 900
 901    if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
 902        !mach->UsedGeometryShader) {
 903       struct tgsi_exec_vector *inputs;
 904       struct tgsi_exec_vector *outputs;
 905
 906       inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
 907                             TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
 908                             16);
 909
 910       if (!inputs)
 911          return;
 912
 913       outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
 914                              TGSI_MAX_TOTAL_VERTICES, 16);
 915
 916       if (!outputs) {
 917          align_free(inputs);
 918          return;
 919       }
 920
 921       align_free(mach->Inputs);
 922       align_free(mach->Outputs);
 923
 924       mach->Inputs = inputs;
 925       mach->Outputs = outputs;
 926       mach->UsedGeometryShader = TRUE;
 927    }
 928
 929    declarations = (struct tgsi_full_declaration *)
 930       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
 931
 932    if (!declarations) {
 933       return;
 934    }
 935
 936    instructions = (struct tgsi_full_instruction *)
 937       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
 938
 939    if (!instructions) {
 940       FREE( declarations );
 941       return;
 942    }
 943
 944    while( !tgsi_parse_end_of_tokens( &parse ) ) {
 945       uint i;
 946
 947       tgsi_parse_token( &parse );
 948       switch( parse.FullToken.Token.Type ) {
 949       case TGSI_TOKEN_TYPE_DECLARATION:
 950          /* save expanded declaration */
 951          if (numDeclarations == maxDeclarations) {
 952             declarations = REALLOC(declarations,
 953                                    maxDeclarations
 954                                    * sizeof(struct tgsi_full_declaration),
 955                                    (maxDeclarations + 10)
 956                                    * sizeof(struct tgsi_full_declaration));
 957             maxDeclarations += 10;
 958          }
 959          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
 960             unsigned reg;
 961             for (reg = parse.FullToken.FullDeclaration.Range.First;
 962                  reg <= parse.FullToken.FullDeclaration.Range.Last;
 963                  ++reg) {
 964                ++mach->NumOutputs;
 965             }
 966          }
 967          else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
 968             const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
 969             mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
 970          }
 971
 972          memcpy(declarations + numDeclarations,
 973                 &parse.FullToken.FullDeclaration,
 974                 sizeof(declarations[0]));
 975          numDeclarations++;
 976          break;
 977
 978       case TGSI_TOKEN_TYPE_IMMEDIATE:
 979          {
 980             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
 981             assert( size <= 4 );
 982             assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
 983
 984             for( i = 0; i < size; i++ ) {
 985                mach->Imms[mach->ImmLimit][i] =
 986                   parse.FullToken.FullImmediate.u[i].Float;
 987             }
 988             mach->ImmLimit += 1;
 989          }
 990          break;
 991
 992       case TGSI_TOKEN_TYPE_INSTRUCTION:
 993
 994          /* save expanded instruction */
 995          if (numInstructions == maxInstructions) {
 996             instructions = REALLOC(instructions,
 997                                    maxInstructions
 998                                    * sizeof(struct tgsi_full_instruction),
 999                                    (maxInstructions + 10)
1000                                    * sizeof(struct tgsi_full_instruction));
1001             maxInstructions += 10;
1002          }
1003
1004          memcpy(instructions + numInstructions,
1005                 &parse.FullToken.FullInstruction,
1006                 sizeof(instructions[0]));
1007
1008          numInstructions++;
1009          break;
1010
1011       case TGSI_TOKEN_TYPE_PROPERTY:
1012          if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
1013             if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
1014                mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
1015             }
1016          }
1017          break;
1018
1019       default:
1020          assert( 0 );
1021       }
1022    }
1023    tgsi_parse_free (&parse);
1024
1025    FREE(mach->Declarations);
1026    mach->Declarations = declarations;
1027    mach->NumDeclarations = numDeclarations;
1028
1029    FREE(mach->Instructions);
1030    mach->Instructions = instructions;
1031    mach->NumInstructions = numInstructions;
1032 }
1033
1034
1035 struct tgsi_exec_machine *
1036 tgsi_exec_machine_create(enum pipe_shader_type shader_type)
1037 {
1038    struct tgsi_exec_machine *mach;
1039    uint i;
1040
1041    mach = align_malloc( sizeof *mach, 16 );
1042    if (!mach)
1043       goto fail;
1044
1045    memset(mach, 0, sizeof(*mach));
1046
1047    mach->ShaderType = shader_type;
1048    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
1049    mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
1050    mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
1051
1052    if (shader_type != PIPE_SHADER_COMPUTE) {
1053       mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
1054       mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
1055       if (!mach->Inputs || !mach->Outputs)
1056          goto fail;
1057    }
1058
1059    /* Setup constants needed by the SSE2 executor. */
1060    for( i = 0; i < 4; i++ ) {
1061       mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
1062       mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
1063       mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
1064       mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
1065       mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
1066       mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
1067       mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
1068       mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
1069       mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
1070       mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
1071    }
1072
1073 #ifdef DEBUG
1074    /* silence warnings */
1075    (void) print_chan;
1076    (void) print_temp;
1077 #endif
1078
1079    return mach;
1080
1081 fail:
1082    if (mach) {
1083       align_free(mach->Inputs);
1084       align_free(mach->Outputs);
1085       align_free(mach);
1086    }
1087    return NULL;
1088 }
1089
1090
1091 void
1092 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
1093 {
1094    if (mach) {
1095       FREE(mach->Instructions);
1096       FREE(mach->Declarations);
1097
1098       align_free(mach->Inputs);
1099       align_free(mach->Outputs);
1100
1101       align_free(mach);
1102    }
1103 }
1104
1105 static void
1106 micro_add(union tgsi_exec_channel *dst,
1107           const union tgsi_exec_channel *src0,
1108           const union tgsi_exec_channel *src1)
1109 {
1110    dst->f[0] = src0->f[0] + src1->f[0];
1111    dst->f[1] = src0->f[1] + src1->f[1];
1112    dst->f[2] = src0->f[2] + src1->f[2];
1113    dst->f[3] = src0->f[3] + src1->f[3];
1114 }
1115
1116 static void
1117 micro_div(
1118    union tgsi_exec_channel *dst,
1119    const union tgsi_exec_channel *src0,
1120    const union tgsi_exec_channel *src1 )
1121 {
1122    if (src1->f[0] != 0) {
1123       dst->f[0] = src0->f[0] / src1->f[0];
1124    }
1125    if (src1->f[1] != 0) {
1126       dst->f[1] = src0->f[1] / src1->f[1];
1127    }
1128    if (src1->f[2] != 0) {
1129       dst->f[2] = src0->f[2] / src1->f[2];
1130    }
1131    if (src1->f[3] != 0) {
1132       dst->f[3] = src0->f[3] / src1->f[3];
1133    }
1134 }
1135
1136 static void
1137 micro_lt(
1138    union tgsi_exec_channel *dst,
1139    const union tgsi_exec_channel *src0,
1140    const union tgsi_exec_channel *src1,
1141    const union tgsi_exec_channel *src2,
1142    const union tgsi_exec_channel *src3 )
1143 {
1144    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
1145    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
1146    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
1147    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
1148 }
1149
1150 static void
1151 micro_max(union tgsi_exec_channel *dst,
1152           const union tgsi_exec_channel *src0,
1153           const union tgsi_exec_channel *src1)
1154 {
1155    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
1156    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
1157    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
1158    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
1159 }
1160
1161 static void
1162 micro_min(union tgsi_exec_channel *dst,
1163           const union tgsi_exec_channel *src0,
1164           const union tgsi_exec_channel *src1)
1165 {
1166    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
1167    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
1168    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
1169    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
1170 }
1171
1172 static void
1173 micro_mul(union tgsi_exec_channel *dst,
1174           const union tgsi_exec_channel *src0,
1175           const union tgsi_exec_channel *src1)
1176 {
1177    dst->f[0] = src0->f[0] * src1->f[0];
1178    dst->f[1] = src0->f[1] * src1->f[1];
1179    dst->f[2] = src0->f[2] * src1->f[2];
1180    dst->f[3] = src0->f[3] * src1->f[3];
1181 }
1182
1183 static void
1184 micro_neg(
1185    union tgsi_exec_channel *dst,
1186    const union tgsi_exec_channel *src )
1187 {
1188    dst->f[0] = -src->f[0];
1189    dst->f[1] = -src->f[1];
1190    dst->f[2] = -src->f[2];
1191    dst->f[3] = -src->f[3];
1192 }
1193
1194 static void
1195 micro_pow(
1196    union tgsi_exec_channel *dst,
1197    const union tgsi_exec_channel *src0,
1198    const union tgsi_exec_channel *src1 )
1199 {
1200 #if FAST_MATH
1201    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1202    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1203    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1204    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1205 #else
1206    dst->f[0] = powf( src0->f[0], src1->f[0] );
1207    dst->f[1] = powf( src0->f[1], src1->f[1] );
1208    dst->f[2] = powf( src0->f[2], src1->f[2] );
1209    dst->f[3] = powf( src0->f[3], src1->f[3] );
1210 #endif
1211 }
1212
1213 static void
1214 micro_sub(union tgsi_exec_channel *dst,
1215           const union tgsi_exec_channel *src0,
1216           const union tgsi_exec_channel *src1)
1217 {
1218    dst->f[0] = src0->f[0] - src1->f[0];
1219    dst->f[1] = src0->f[1] - src1->f[1];
1220    dst->f[2] = src0->f[2] - src1->f[2];
1221    dst->f[3] = src0->f[3] - src1->f[3];
1222 }
1223
1224 static void
1225 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1226                        const uint chan_index,
1227                        const uint file,
1228                        const uint swizzle,
1229                        const union tgsi_exec_channel *index,
1230                        const union tgsi_exec_channel *index2D,
1231                        union tgsi_exec_channel *chan)
1232 {
1233    uint i;
1234
1235    assert(swizzle < 4);
1236
1237    switch (file) {
1238    case TGSI_FILE_CONSTANT:
1239       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1240          assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1241          assert(mach->Consts[index2D->i[i]]);
1242
1243          if (index->i[i] < 0) {
1244             chan->u[i] = 0;
1245          } else {
1246             /* NOTE: copying the const value as a uint instead of float */
1247             const uint constbuf = index2D->i[i];
1248             const uint *buf = (const uint *)mach->Consts[constbuf];
1249             const int pos = index->i[i] * 4 + swizzle;
1250             /* const buffer bounds check */
1251             if (pos < 0 || pos >= (int) mach->ConstsSize[constbuf]) {
1252                if (0) {
1253                   /* Debug: print warning */
1254                   static int count = 0;
1255                   if (count++ < 100)
1256                      debug_printf("TGSI Exec: const buffer index %d"
1257                                   " out of bounds\n", pos);
1258                }
1259                chan->u[i] = 0;
1260             }
1261             else
1262                chan->u[i] = buf[pos];
1263          }
1264       }
1265       break;
1266
1267    case TGSI_FILE_INPUT:
1268       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1269          /*
1270          if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1271             debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1272                          index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1273                          index2D->i[i], index->i[i]);
1274                          }*/
1275          int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1276          assert(pos >= 0);
1277          assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1278          chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1279       }
1280       break;
1281
1282    case TGSI_FILE_SYSTEM_VALUE:
1283       /* XXX no swizzling at this point.  Will be needed if we put
1284        * gl_FragCoord, for example, in a sys value register.
1285        */
1286       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1287          chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
1288       }
1289       break;
1290
1291    case TGSI_FILE_TEMPORARY:
1292       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1293          assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1294          assert(index2D->i[i] == 0);
1295
1296          chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1297       }
1298       break;
1299
1300    case TGSI_FILE_IMMEDIATE:
1301       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1302          assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1303          assert(index2D->i[i] == 0);
1304
1305          chan->f[i] = mach->Imms[index->i[i]][swizzle];
1306       }
1307       break;
1308
1309    case TGSI_FILE_ADDRESS:
1310       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1311          assert(index->i[i] >= 0);
1312          assert(index2D->i[i] == 0);
1313
1314          chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1315       }
1316       break;
1317
1318    case TGSI_FILE_PREDICATE:
1319       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1320          assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1321          assert(index2D->i[i] == 0);
1322
1323          chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1324       }
1325       break;
1326
1327    case TGSI_FILE_OUTPUT:
1328       /* vertex/fragment output vars can be read too */
1329       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1330          assert(index->i[i] >= 0);
1331          assert(index2D->i[i] == 0);
1332
1333          chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1334       }
1335       break;
1336
1337    default:
1338       assert(0);
1339       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1340          chan->u[i] = 0;
1341       }
1342    }
1343 }
1344
1345 static void
1346 fetch_source_d(const struct tgsi_exec_machine *mach,
1347                union tgsi_exec_channel *chan,
1348                const struct tgsi_full_src_register *reg,
1349                const uint chan_index,
1350                enum tgsi_exec_datatype src_datatype)
1351 {
1352    union tgsi_exec_channel index;
1353    union tgsi_exec_channel index2D;
1354    uint swizzle;
1355
1356    /* We start with a direct index into a register file.
1357     *
1358     *    file[1],
1359     *    where:
1360     *       file = Register.File
1361     *       [1] = Register.Index
1362     */
1363    index.i[0] =
1364    index.i[1] =
1365    index.i[2] =
1366    index.i[3] = reg->Register.Index;
1367
1368    /* There is an extra source register that indirectly subscripts
1369     * a register file. The direct index now becomes an offset
1370     * that is being added to the indirect register.
1371     *
1372     *    file[ind[2].x+1],
1373     *    where:
1374     *       ind = Indirect.File
1375     *       [2] = Indirect.Index
1376     *       .x = Indirect.SwizzleX
1377     */
1378    if (reg->Register.Indirect) {
1379       union tgsi_exec_channel index2;
1380       union tgsi_exec_channel indir_index;
1381       const uint execmask = mach->ExecMask;
1382       uint i;
1383
1384       /* which address register (always zero now) */
1385       index2.i[0] =
1386       index2.i[1] =
1387       index2.i[2] =
1388       index2.i[3] = reg->Indirect.Index;
1389       /* get current value of address register[swizzle] */
1390       swizzle = reg->Indirect.Swizzle;
1391       fetch_src_file_channel(mach,
1392                              chan_index,
1393                              reg->Indirect.File,
1394                              swizzle,
1395                              &index2,
1396                              &ZeroVec,
1397                              &indir_index);
1398
1399       /* add value of address register to the offset */
1400       index.i[0] += indir_index.i[0];
1401       index.i[1] += indir_index.i[1];
1402       index.i[2] += indir_index.i[2];
1403       index.i[3] += indir_index.i[3];
1404
1405       /* for disabled execution channels, zero-out the index to
1406        * avoid using a potential garbage value.
1407        */
1408       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1409          if ((execmask & (1 << i)) == 0)
1410             index.i[i] = 0;
1411       }
1412    }
1413
1414    /* There is an extra source register that is a second
1415     * subscript to a register file. Effectively it means that
1416     * the register file is actually a 2D array of registers.
1417     *
1418     *    file[3][1],
1419     *    where:
1420     *       [3] = Dimension.Index
1421     */
1422    if (reg->Register.Dimension) {
1423       index2D.i[0] =
1424       index2D.i[1] =
1425       index2D.i[2] =
1426       index2D.i[3] = reg->Dimension.Index;
1427
1428       /* Again, the second subscript index can be addressed indirectly
1429        * identically to the first one.
1430        * Nothing stops us from indirectly addressing the indirect register,
1431        * but there is no need for that, so we won't exercise it.
1432        *
1433        *    file[ind[4].y+3][1],
1434        *    where:
1435        *       ind = DimIndirect.File
1436        *       [4] = DimIndirect.Index
1437        *       .y = DimIndirect.SwizzleX
1438        */
1439       if (reg->Dimension.Indirect) {
1440          union tgsi_exec_channel index2;
1441          union tgsi_exec_channel indir_index;
1442          const uint execmask = mach->ExecMask;
1443          uint i;
1444
1445          index2.i[0] =
1446          index2.i[1] =
1447          index2.i[2] =
1448          index2.i[3] = reg->DimIndirect.Index;
1449
1450          swizzle = reg->DimIndirect.Swizzle;
1451          fetch_src_file_channel(mach,
1452                                 chan_index,
1453                                 reg->DimIndirect.File,
1454                                 swizzle,
1455                                 &index2,
1456                                 &ZeroVec,
1457                                 &indir_index);
1458
1459          index2D.i[0] += indir_index.i[0];
1460          index2D.i[1] += indir_index.i[1];
1461          index2D.i[2] += indir_index.i[2];
1462          index2D.i[3] += indir_index.i[3];
1463
1464          /* for disabled execution channels, zero-out the index to
1465           * avoid using a potential garbage value.
1466           */
1467          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1468             if ((execmask & (1 << i)) == 0) {
1469                index2D.i[i] = 0;
1470             }
1471          }
1472       }
1473
1474       /* If by any chance there was a need for a 3D array of register
1475        * files, we would have to check whether Dimension is followed
1476        * by a dimension register and continue the saga.
1477        */
1478    } else {
1479       index2D.i[0] =
1480       index2D.i[1] =
1481       index2D.i[2] =
1482       index2D.i[3] = 0;
1483    }
1484
1485    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1486    fetch_src_file_channel(mach,
1487                           chan_index,
1488                           reg->Register.File,
1489                           swizzle,
1490                           &index,
1491                           &index2D,
1492                           chan);
1493 }
1494
1495 static void
1496 fetch_source(const struct tgsi_exec_machine *mach,
1497              union tgsi_exec_channel *chan,
1498              const struct tgsi_full_src_register *reg,
1499              const uint chan_index,
1500              enum tgsi_exec_datatype src_datatype)
1501 {
1502    fetch_source_d(mach, chan, reg, chan_index, src_datatype);
1503
1504    if (reg->Register.Absolute) {
1505       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1506          micro_abs(chan, chan);
1507       } else {
1508          micro_iabs(chan, chan);
1509       }
1510    }
1511
1512    if (reg->Register.Negate) {
1513       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1514          micro_neg(chan, chan);
1515       } else {
1516          micro_ineg(chan, chan);
1517       }
1518    }
1519 }
1520
1521 static union tgsi_exec_channel *
1522 store_dest_dstret(struct tgsi_exec_machine *mach,
1523                  const union tgsi_exec_channel *chan,
1524                  const struct tgsi_full_dst_register *reg,
1525                  const struct tgsi_full_instruction *inst,
1526                  uint chan_index,
1527                  enum tgsi_exec_datatype dst_datatype)
1528 {
1529    uint i;
1530    static union tgsi_exec_channel null;
1531    union tgsi_exec_channel *dst;
1532    union tgsi_exec_channel index2D;
1533    uint execmask = mach->ExecMask;
1534    int offset = 0;  /* indirection offset */
1535    int index;
1536
1537    /* for debugging */
1538    if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1539       check_inf_or_nan(chan);
1540    }
1541
1542    /* There is an extra source register that indirectly subscripts
1543     * a register file. The direct index now becomes an offset
1544     * that is being added to the indirect register.
1545     *
1546     *    file[ind[2].x+1],
1547     *    where:
1548     *       ind = Indirect.File
1549     *       [2] = Indirect.Index
1550     *       .x = Indirect.SwizzleX
1551     */
1552    if (reg->Register.Indirect) {
1553       union tgsi_exec_channel index;
1554       union tgsi_exec_channel indir_index;
1555       uint swizzle;
1556
1557       /* which address register (always zero for now) */
1558       index.i[0] =
1559       index.i[1] =
1560       index.i[2] =
1561       index.i[3] = reg->Indirect.Index;
1562
1563       /* get current value of address register[swizzle] */
1564       swizzle = reg->Indirect.Swizzle;
1565
1566       /* fetch values from the address/indirection register */
1567       fetch_src_file_channel(mach,
1568                              chan_index,
1569                              reg->Indirect.File,
1570                              swizzle,
1571                              &index,
1572                              &ZeroVec,
1573                              &indir_index);
1574
1575       /* save indirection offset */
1576       offset = indir_index.i[0];
1577    }
1578
1579    /* There is an extra source register that is a second
1580     * subscript to a register file. Effectively it means that
1581     * the register file is actually a 2D array of registers.
1582     *
1583     *    file[3][1],
1584     *    where:
1585     *       [3] = Dimension.Index
1586     */
1587    if (reg->Register.Dimension) {
1588       index2D.i[0] =
1589       index2D.i[1] =
1590       index2D.i[2] =
1591       index2D.i[3] = reg->Dimension.Index;
1592
1593       /* Again, the second subscript index can be addressed indirectly
1594        * identically to the first one.
1595        * Nothing stops us from indirectly addressing the indirect register,
1596        * but there is no need for that, so we won't exercise it.
1597        *
1598        *    file[ind[4].y+3][1],
1599        *    where:
1600        *       ind = DimIndirect.File
1601        *       [4] = DimIndirect.Index
1602        *       .y = DimIndirect.SwizzleX
1603        */
1604       if (reg->Dimension.Indirect) {
1605          union tgsi_exec_channel index2;
1606          union tgsi_exec_channel indir_index;
1607          const uint execmask = mach->ExecMask;
1608          unsigned swizzle;
1609          uint i;
1610
1611          index2.i[0] =
1612          index2.i[1] =
1613          index2.i[2] =
1614          index2.i[3] = reg->DimIndirect.Index;
1615
1616          swizzle = reg->DimIndirect.Swizzle;
1617          fetch_src_file_channel(mach,
1618                                 chan_index,
1619                                 reg->DimIndirect.File,
1620                                 swizzle,
1621                                 &index2,
1622                                 &ZeroVec,
1623                                 &indir_index);
1624
1625          index2D.i[0] += indir_index.i[0];
1626          index2D.i[1] += indir_index.i[1];
1627          index2D.i[2] += indir_index.i[2];
1628          index2D.i[3] += indir_index.i[3];
1629
1630          /* for disabled execution channels, zero-out the index to
1631           * avoid using a potential garbage value.
1632           */
1633          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1634             if ((execmask & (1 << i)) == 0) {
1635                index2D.i[i] = 0;
1636             }
1637          }
1638       }
1639
1640       /* If by any chance there was a need for a 3D array of register
1641        * files, we would have to check whether Dimension is followed
1642        * by a dimension register and continue the saga.
1643        */
1644    } else {
1645       index2D.i[0] =
1646       index2D.i[1] =
1647       index2D.i[2] =
1648       index2D.i[3] = 0;
1649    }
1650
1651    switch (reg->Register.File) {
1652    case TGSI_FILE_NULL:
1653       dst = &null;
1654       break;
1655
1656    case TGSI_FILE_OUTPUT:
1657       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1658          + reg->Register.Index;
1659       dst = &mach->Outputs[offset + index].xyzw[chan_index];
1660 #if 0
1661       debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
1662                    mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
1663                    reg->Register.Index);
1664       if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1665          debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1666          for (i = 0; i < TGSI_QUAD_SIZE; i++)
1667             if (execmask & (1 << i))
1668                debug_printf("%f, ", chan->f[i]);
1669          debug_printf(")\n");
1670       }
1671 #endif
1672       break;
1673
1674    case TGSI_FILE_TEMPORARY:
1675       index = reg->Register.Index;
1676       assert( index < TGSI_EXEC_NUM_TEMPS );
1677       dst = &mach->Temps[offset + index].xyzw[chan_index];
1678       break;
1679
1680    case TGSI_FILE_ADDRESS:
1681       index = reg->Register.Index;
1682       dst = &mach->Addrs[index].xyzw[chan_index];
1683       break;
1684
1685    case TGSI_FILE_PREDICATE:
1686       index = reg->Register.Index;
1687       assert(index < TGSI_EXEC_NUM_PREDS);
1688       dst = &mach->Predicates[index].xyzw[chan_index];
1689       break;
1690
1691    default:
1692       assert( 0 );
1693       return NULL;
1694    }
1695
1696    if (inst->Instruction.Predicate) {
1697       uint swizzle;
1698       union tgsi_exec_channel *pred;
1699
1700       switch (chan_index) {
1701       case TGSI_CHAN_X:
1702          swizzle = inst->Predicate.SwizzleX;
1703          break;
1704       case TGSI_CHAN_Y:
1705          swizzle = inst->Predicate.SwizzleY;
1706          break;
1707       case TGSI_CHAN_Z:
1708          swizzle = inst->Predicate.SwizzleZ;
1709          break;
1710       case TGSI_CHAN_W:
1711          swizzle = inst->Predicate.SwizzleW;
1712          break;
1713       default:
1714          assert(0);
1715          return NULL;
1716       }
1717
1718       assert(inst->Predicate.Index == 0);
1719
1720       pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1721
1722       if (inst->Predicate.Negate) {
1723          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1724             if (pred->u[i]) {
1725                execmask &= ~(1 << i);
1726             }
1727          }
1728       } else {
1729          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1730             if (!pred->u[i]) {
1731                execmask &= ~(1 << i);
1732             }
1733          }
1734       }
1735    }
1736
1737    return dst;
1738 }
1739
1740 static void
1741 store_dest_double(struct tgsi_exec_machine *mach,
1742                  const union tgsi_exec_channel *chan,
1743                  const struct tgsi_full_dst_register *reg,
1744                  const struct tgsi_full_instruction *inst,
1745                  uint chan_index,
1746                  enum tgsi_exec_datatype dst_datatype)
1747 {
1748    union tgsi_exec_channel *dst;
1749    const uint execmask = mach->ExecMask;
1750    int i;
1751
1752    dst = store_dest_dstret(mach, chan, reg, inst, chan_index,
1753                            dst_datatype);
1754    if (!dst)
1755       return;
1756
1757    /* doubles path */
1758    for (i = 0; i < TGSI_QUAD_SIZE; i++)
1759       if (execmask & (1 << i))
1760          dst->i[i] = chan->i[i];
1761 }
1762
1763 static void
1764 store_dest(struct tgsi_exec_machine *mach,
1765            const union tgsi_exec_channel *chan,
1766            const struct tgsi_full_dst_register *reg,
1767            const struct tgsi_full_instruction *inst,
1768            uint chan_index,
1769            enum tgsi_exec_datatype dst_datatype)
1770 {
1771    union tgsi_exec_channel *dst;
1772    const uint execmask = mach->ExecMask;
1773    int i;
1774
1775    dst = store_dest_dstret(mach, chan, reg, inst, chan_index,
1776                     dst_datatype);
1777    if (!dst)
1778       return;
1779
1780    if (!inst->Instruction.Saturate) {
1781       for (i = 0; i < TGSI_QUAD_SIZE; i++)
1782          if (execmask & (1 << i))
1783             dst->i[i] = chan->i[i];
1784    }
1785    else {
1786       for (i = 0; i < TGSI_QUAD_SIZE; i++)
1787          if (execmask & (1 << i)) {
1788             if (chan->f[i] < 0.0f)
1789                dst->f[i] = 0.0f;
1790             else if (chan->f[i] > 1.0f)
1791                dst->f[i] = 1.0f;
1792             else
1793                dst->i[i] = chan->i[i];
1794          }
1795    }
1796 }
1797
/* Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL, applying the float flavour of the source modifiers.
 * Expands to a fetch_source() call and expects variables named `mach` and
 * `inst` to be in scope at the point of use.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/* Same as FETCH, but the source modifiers are applied in their integer
 * flavour.
 */
#define IFETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
1803
1804
1805 /**
1806  * Execute ARB-style KIL which is predicated by a src register.
1807  * Kill fragment if any of the four values is less than zero.
1808  */
1809 static void
1810 exec_kill_if(struct tgsi_exec_machine *mach,
1811              const struct tgsi_full_instruction *inst)
1812 {
1813    uint uniquemask;
1814    uint chan_index;
1815    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1816    union tgsi_exec_channel r[1];
1817
1818    /* This mask stores component bits that were already tested. */
1819    uniquemask = 0;
1820
1821    for (chan_index = 0; chan_index < 4; chan_index++)
1822    {
1823       uint swizzle;
1824       uint i;
1825
1826       /* unswizzle channel */
1827       swizzle = tgsi_util_get_full_src_register_swizzle (
1828                         &inst->Src[0],
1829                         chan_index);
1830
1831       /* check if the component has not been already tested */
1832       if (uniquemask & (1 << swizzle))
1833          continue;
1834       uniquemask |= 1 << swizzle;
1835
1836       FETCH(&r[0], 0, chan_index);
1837       for (i = 0; i < 4; i++)
1838          if (r[0].f[i] < 0.0f)
1839             kilmask |= 1 << i;
1840    }
1841
1842    /* restrict to fragments currently executing */
1843    kilmask &= mach->ExecMask;
1844
1845    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1846 }
1847
1848 /**
1849  * Unconditional fragment kill/discard.
1850  */
1851 static void
1852 exec_kill(struct tgsi_exec_machine *mach,
1853           const struct tgsi_full_instruction *inst)
1854 {
1855    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1856
1857    /* kill fragment for all fragments currently executing */
1858    kilmask = mach->ExecMask;
1859    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1860 }
1861
1862 static void
1863 emit_vertex(struct tgsi_exec_machine *mach)
1864 {
1865    /* FIXME: check for exec mask correctly
1866    unsigned i;
1867    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
1868          if ((mach->ExecMask & (1 << i)))
1869    */
1870    if (mach->ExecMask) {
1871       if (mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] >= mach->MaxOutputVertices)
1872          return;
1873
1874       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1875       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1876    }
1877 }
1878
1879 static void
1880 emit_primitive(struct tgsi_exec_machine *mach)
1881 {
1882    unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1883    /* FIXME: check for exec mask correctly
1884    unsigned i;
1885    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
1886          if ((mach->ExecMask & (1 << i)))
1887    */
1888    if (mach->ExecMask) {
1889       ++(*prim_count);
1890       debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1891       mach->Primitives[*prim_count] = 0;
1892    }
1893 }
1894
1895 static void
1896 conditional_emit_primitive(struct tgsi_exec_machine *mach)
1897 {
1898    if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1899       int emitted_verts =
1900          mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1901       if (emitted_verts) {
1902          emit_primitive(mach);
1903       }
1904    }
1905 }
1906
1907
1908 /*
1909  * Fetch four texture samples using STR texture coordinates.
1910  */
1911 static void
1912 fetch_texel( struct tgsi_sampler *sampler,
1913              const unsigned sview_idx,
1914              const unsigned sampler_idx,
1915              const union tgsi_exec_channel *s,
1916              const union tgsi_exec_channel *t,
1917              const union tgsi_exec_channel *p,
1918              const union tgsi_exec_channel *c0,
1919              const union tgsi_exec_channel *c1,
1920              float derivs[3][2][TGSI_QUAD_SIZE],
1921              const int8_t offset[3],
1922              enum tgsi_sampler_control control,
1923              union tgsi_exec_channel *r,
1924              union tgsi_exec_channel *g,
1925              union tgsi_exec_channel *b,
1926              union tgsi_exec_channel *a )
1927 {
1928    uint j;
1929    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
1930
1931    /* FIXME: handle explicit derivs, offsets */
1932    sampler->get_samples(sampler, sview_idx, sampler_idx,
1933                         s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
1934
1935    for (j = 0; j < 4; j++) {
1936       r->f[j] = rgba[0][j];
1937       g->f[j] = rgba[1][j];
1938       b->f[j] = rgba[2][j];
1939       a->f[j] = rgba[3][j];
1940    }
1941 }
1942
1943
/*
 * Values for the `modifier' argument of exec_tex() / exec_sample().
 */
enum {
   TEX_MODIFIER_NONE         = 0,  /* no extra operand */
   TEX_MODIFIER_PROJECTED    = 1,  /* coordinates are divided by the modifier */
   TEX_MODIFIER_LOD_BIAS     = 2,  /* selects TGSI_SAMPLER_LOD_BIAS */
   TEX_MODIFIER_EXPLICIT_LOD = 3,  /* selects TGSI_SAMPLER_LOD_EXPLICIT */
   TEX_MODIFIER_LEVEL_ZERO   = 4,  /* selects TGSI_SAMPLER_LOD_ZERO */
   TEX_MODIFIER_GATHER       = 5   /* selects TGSI_SAMPLER_GATHER */
};
1950
1951 /*
1952  * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
1953  */
1954 static void
1955 fetch_texel_offsets(struct tgsi_exec_machine *mach,
1956                     const struct tgsi_full_instruction *inst,
1957                     int8_t offsets[3])
1958 {
1959    if (inst->Texture.NumOffsets == 1) {
1960       union tgsi_exec_channel index;
1961       union tgsi_exec_channel offset[3];
1962       index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
1963       fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
1964                              inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
1965       fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
1966                              inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
1967       fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
1968                              inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
1969      offsets[0] = offset[0].i[0];
1970      offsets[1] = offset[1].i[0];
1971      offsets[2] = offset[2].i[0];
1972    } else {
1973      assert(inst->Texture.NumOffsets == 0);
1974      offsets[0] = offsets[1] = offsets[2] = 0;
1975    }
1976 }
1977
1978
1979 /*
1980  * Fetch dx and dy values for one channel (s, t or r).
1981  * Put dx values into one float array, dy values into another.
1982  */
1983 static void
1984 fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
1985                            const struct tgsi_full_instruction *inst,
1986                            unsigned regdsrcx,
1987                            unsigned chan,
1988                            float derivs[2][TGSI_QUAD_SIZE])
1989 {
1990    union tgsi_exec_channel d;
1991    FETCH(&d, regdsrcx, chan);
1992    derivs[0][0] = d.f[0];
1993    derivs[0][1] = d.f[1];
1994    derivs[0][2] = d.f[2];
1995    derivs[0][3] = d.f[3];
1996    FETCH(&d, regdsrcx + 1, chan);
1997    derivs[1][0] = d.f[0];
1998    derivs[1][1] = d.f[1];
1999    derivs[1][2] = d.f[2];
2000    derivs[1][3] = d.f[3];
2001 }
2002
2003 static uint
2004 fetch_sampler_unit(struct tgsi_exec_machine *mach,
2005                    const struct tgsi_full_instruction *inst,
2006                    uint sampler)
2007 {
2008    uint unit = 0;
2009    int i;
2010    if (inst->Src[sampler].Register.Indirect) {
2011       const struct tgsi_full_src_register *reg = &inst->Src[sampler];
2012       union tgsi_exec_channel indir_index, index2;
2013       const uint execmask = mach->ExecMask;
2014       index2.i[0] =
2015       index2.i[1] =
2016       index2.i[2] =
2017       index2.i[3] = reg->Indirect.Index;
2018
2019       fetch_src_file_channel(mach,
2020                              0,
2021                              reg->Indirect.File,
2022                              reg->Indirect.Swizzle,
2023                              &index2,
2024                              &ZeroVec,
2025                              &indir_index);
2026       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2027          if (execmask & (1 << i)) {
2028             unit = inst->Src[sampler].Register.Index + indir_index.i[i];
2029             break;
2030          }
2031       }
2032
2033    } else {
2034       unit = inst->Src[sampler].Register.Index;
2035    }
2036    return unit;
2037 }
2038
2039 /*
2040  * execute a texture instruction.
2041  *
2042  * modifier is used to control the channel routing for the
2043  * instruction variants like proj, lod, and texture with lod bias.
2044  * sampler indicates which src register the sampler is contained in.
2045  */
2046 static void
2047 exec_tex(struct tgsi_exec_machine *mach,
2048          const struct tgsi_full_instruction *inst,
2049          uint modifier, uint sampler)
2050 {
2051    const union tgsi_exec_channel *args[5], *proj = NULL;
2052    union tgsi_exec_channel r[5];
2053    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2054    uint chan;
2055    uint unit;
2056    int8_t offsets[3];
2057    int dim, shadow_ref, i;
2058
2059    unit = fetch_sampler_unit(mach, inst, sampler);
2060    /* always fetch all 3 offsets, overkill but keeps code simple */
2061    fetch_texel_offsets(mach, inst, offsets);
2062
2063    assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
2064    assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
2065
2066    dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2067    shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
2068
2069    assert(dim <= 4);
2070    if (shadow_ref >= 0)
2071       assert(shadow_ref >= dim && shadow_ref < Elements(args));
2072
2073    /* fetch modifier to the last argument */
2074    if (modifier != TEX_MODIFIER_NONE) {
2075       const int last = Elements(args) - 1;
2076
2077       /* fetch modifier from src0.w or src1.x */
2078       if (sampler == 1) {
2079          assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
2080          FETCH(&r[last], 0, TGSI_CHAN_W);
2081       }
2082       else {
2083          assert(shadow_ref != 4);
2084          FETCH(&r[last], 1, TGSI_CHAN_X);
2085       }
2086
2087       if (modifier != TEX_MODIFIER_PROJECTED) {
2088          args[last] = &r[last];
2089       }
2090       else {
2091          proj = &r[last];
2092          args[last] = &ZeroVec;
2093       }
2094
2095       /* point unused arguments to zero vector */
2096       for (i = dim; i < last; i++)
2097          args[i] = &ZeroVec;
2098
2099       if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
2100          control = TGSI_SAMPLER_LOD_EXPLICIT;
2101       else if (modifier == TEX_MODIFIER_LOD_BIAS)
2102          control = TGSI_SAMPLER_LOD_BIAS;
2103       else if (modifier == TEX_MODIFIER_GATHER)
2104          control = TGSI_SAMPLER_GATHER;
2105    }
2106    else {
2107       for (i = dim; i < Elements(args); i++)
2108          args[i] = &ZeroVec;
2109    }
2110
2111    /* fetch coordinates */
2112    for (i = 0; i < dim; i++) {
2113       FETCH(&r[i], 0, TGSI_CHAN_X + i);
2114
2115       if (proj)
2116          micro_div(&r[i], &r[i], proj);
2117
2118       args[i] = &r[i];
2119    }
2120
2121    /* fetch reference value */
2122    if (shadow_ref >= 0) {
2123       FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));
2124
2125       if (proj)
2126          micro_div(&r[shadow_ref], &r[shadow_ref], proj);
2127
2128       args[shadow_ref] = &r[shadow_ref];
2129    }
2130
2131    fetch_texel(mach->Sampler, unit, unit,
2132          args[0], args[1], args[2], args[3], args[4],
2133          NULL, offsets, control,
2134          &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2135
2136 #if 0
2137    debug_printf("fetch r: %g %g %g %g\n",
2138          r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
2139    debug_printf("fetch g: %g %g %g %g\n",
2140          r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
2141    debug_printf("fetch b: %g %g %g %g\n",
2142          r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
2143    debug_printf("fetch a: %g %g %g %g\n",
2144          r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
2145 #endif
2146
2147    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2148       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2149          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2150       }
2151    }
2152 }
2153
2154 static void
2155 exec_lodq(struct tgsi_exec_machine *mach,
2156           const struct tgsi_full_instruction *inst)
2157 {
2158    uint unit;
2159    int dim;
2160    int i;
2161    union tgsi_exec_channel coords[4];
2162    const union tgsi_exec_channel *args[Elements(coords)];
2163    union tgsi_exec_channel r[2];
2164
2165    unit = fetch_sampler_unit(mach, inst, 1);
2166    dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2167    assert(dim <= Elements(coords));
2168    /* fetch coordinates */
2169    for (i = 0; i < dim; i++) {
2170       FETCH(&coords[i], 0, TGSI_CHAN_X + i);
2171       args[i] = &coords[i];
2172    }
2173    for (i = dim; i < Elements(coords); i++) {
2174       args[i] = &ZeroVec;
2175    }
2176    mach->Sampler->query_lod(mach->Sampler, unit, unit,
2177                             args[0]->f,
2178                             args[1]->f,
2179                             args[2]->f,
2180                             args[3]->f,
2181                             TGSI_SAMPLER_LOD_NONE,
2182                             r[0].f,
2183                             r[1].f);
2184
2185    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2186       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2187                  TGSI_EXEC_DATA_FLOAT);
2188    }
2189    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2190       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2191                  TGSI_EXEC_DATA_FLOAT);
2192    }
2193 }
2194
2195 static void
2196 exec_txd(struct tgsi_exec_machine *mach,
2197          const struct tgsi_full_instruction *inst)
2198 {
2199    union tgsi_exec_channel r[4];
2200    float derivs[3][2][TGSI_QUAD_SIZE];
2201    uint chan;
2202    uint unit;
2203    int8_t offsets[3];
2204
2205    unit = fetch_sampler_unit(mach, inst, 3);
2206    /* always fetch all 3 offsets, overkill but keeps code simple */
2207    fetch_texel_offsets(mach, inst, offsets);
2208
2209    switch (inst->Texture.Texture) {
2210    case TGSI_TEXTURE_1D:
2211       FETCH(&r[0], 0, TGSI_CHAN_X);
2212
2213       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2214
2215       fetch_texel(mach->Sampler, unit, unit,
2216                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2217                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2218                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2219       break;
2220
2221    case TGSI_TEXTURE_SHADOW1D:
2222    case TGSI_TEXTURE_1D_ARRAY:
2223    case TGSI_TEXTURE_SHADOW1D_ARRAY:
2224       /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
2225       FETCH(&r[0], 0, TGSI_CHAN_X);
2226       FETCH(&r[1], 0, TGSI_CHAN_Y);
2227       FETCH(&r[2], 0, TGSI_CHAN_Z);
2228
2229       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2230
2231       fetch_texel(mach->Sampler, unit, unit,
2232                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2233                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2234                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2235       break;
2236
2237    case TGSI_TEXTURE_2D:
2238    case TGSI_TEXTURE_RECT:
2239       FETCH(&r[0], 0, TGSI_CHAN_X);
2240       FETCH(&r[1], 0, TGSI_CHAN_Y);
2241
2242       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2243       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2244
2245       fetch_texel(mach->Sampler, unit, unit,
2246                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2247                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2248                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2249       break;
2250
2251
2252    case TGSI_TEXTURE_SHADOW2D:
2253    case TGSI_TEXTURE_SHADOWRECT:
2254    case TGSI_TEXTURE_2D_ARRAY:
2255    case TGSI_TEXTURE_SHADOW2D_ARRAY:
2256       /* only SHADOW2D_ARRAY actually needs W */
2257       FETCH(&r[0], 0, TGSI_CHAN_X);
2258       FETCH(&r[1], 0, TGSI_CHAN_Y);
2259       FETCH(&r[2], 0, TGSI_CHAN_Z);
2260       FETCH(&r[3], 0, TGSI_CHAN_W);
2261
2262       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2263       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2264
2265       fetch_texel(mach->Sampler, unit, unit,
2266                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2267                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2268                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2269       break;
2270
2271    case TGSI_TEXTURE_3D:
2272    case TGSI_TEXTURE_CUBE:
2273    case TGSI_TEXTURE_CUBE_ARRAY:
2274    case TGSI_TEXTURE_SHADOWCUBE:
2275       /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
2276       FETCH(&r[0], 0, TGSI_CHAN_X);
2277       FETCH(&r[1], 0, TGSI_CHAN_Y);
2278       FETCH(&r[2], 0, TGSI_CHAN_Z);
2279       FETCH(&r[3], 0, TGSI_CHAN_W);
2280
2281       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2282       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2283       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
2284
2285       fetch_texel(mach->Sampler, unit, unit,
2286                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2287                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2288                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2289       break;
2290
2291    default:
2292       assert(0);
2293    }
2294
2295    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2296       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2297          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2298       }
2299    }
2300 }
2301
2302
2303 static void
2304 exec_txf(struct tgsi_exec_machine *mach,
2305          const struct tgsi_full_instruction *inst)
2306 {
2307    union tgsi_exec_channel r[4];
2308    uint chan;
2309    uint unit;
2310    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2311    int j;
2312    int8_t offsets[3];
2313    unsigned target;
2314
2315    unit = fetch_sampler_unit(mach, inst, 1);
2316    /* always fetch all 3 offsets, overkill but keeps code simple */
2317    fetch_texel_offsets(mach, inst, offsets);
2318
2319    IFETCH(&r[3], 0, TGSI_CHAN_W);
2320
2321    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2322        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2323       target = mach->SamplerViews[unit].Resource;
2324    }
2325    else {
2326       target = inst->Texture.Texture;
2327    }
2328    switch(target) {
2329    case TGSI_TEXTURE_3D:
2330    case TGSI_TEXTURE_2D_ARRAY:
2331    case TGSI_TEXTURE_SHADOW2D_ARRAY:
2332    case TGSI_TEXTURE_2D_ARRAY_MSAA:
2333       IFETCH(&r[2], 0, TGSI_CHAN_Z);
2334       /* fallthrough */
2335    case TGSI_TEXTURE_2D:
2336    case TGSI_TEXTURE_RECT:
2337    case TGSI_TEXTURE_SHADOW1D_ARRAY:
2338    case TGSI_TEXTURE_SHADOW2D:
2339    case TGSI_TEXTURE_SHADOWRECT:
2340    case TGSI_TEXTURE_1D_ARRAY:
2341    case TGSI_TEXTURE_2D_MSAA:
2342       IFETCH(&r[1], 0, TGSI_CHAN_Y);
2343       /* fallthrough */
2344    case TGSI_TEXTURE_BUFFER:
2345    case TGSI_TEXTURE_1D:
2346    case TGSI_TEXTURE_SHADOW1D:
2347       IFETCH(&r[0], 0, TGSI_CHAN_X);
2348       break;
2349    default:
2350       assert(0);
2351       break;
2352    }
2353
2354    mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
2355                             offsets, rgba);
2356
2357    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
2358       r[0].f[j] = rgba[0][j];
2359       r[1].f[j] = rgba[1][j];
2360       r[2].f[j] = rgba[2][j];
2361       r[3].f[j] = rgba[3][j];
2362    }
2363
2364    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2365        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2366       unsigned char swizzles[4];
2367       swizzles[0] = inst->Src[1].Register.SwizzleX;
2368       swizzles[1] = inst->Src[1].Register.SwizzleY;
2369       swizzles[2] = inst->Src[1].Register.SwizzleZ;
2370       swizzles[3] = inst->Src[1].Register.SwizzleW;
2371
2372       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2373          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2374             store_dest(mach, &r[swizzles[chan]],
2375                        &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2376          }
2377       }
2378    }
2379    else {
2380       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2381          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2382             store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2383          }
2384       }
2385    }
2386 }
2387
2388 static void
2389 exec_txq(struct tgsi_exec_machine *mach,
2390          const struct tgsi_full_instruction *inst)
2391 {
2392    int result[4];
2393    union tgsi_exec_channel r[4], src;
2394    uint chan;
2395    uint unit;
2396    int i,j;
2397
2398    unit = fetch_sampler_unit(mach, inst, 1);
2399
2400    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
2401
2402    /* XXX: This interface can't return per-pixel values */
2403    mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
2404
2405    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2406       for (j = 0; j < 4; j++) {
2407          r[j].i[i] = result[j];
2408       }
2409    }
2410
2411    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2412       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2413          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2414                     TGSI_EXEC_DATA_INT);
2415       }
2416    }
2417 }
2418
2419 static void
2420 exec_sample(struct tgsi_exec_machine *mach,
2421             const struct tgsi_full_instruction *inst,
2422             uint modifier, boolean compare)
2423 {
2424    const uint resource_unit = inst->Src[1].Register.Index;
2425    const uint sampler_unit = inst->Src[2].Register.Index;
2426    union tgsi_exec_channel r[5], c1;
2427    const union tgsi_exec_channel *lod = &ZeroVec;
2428    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2429    uint chan;
2430    unsigned char swizzles[4];
2431    int8_t offsets[3];
2432
2433    /* always fetch all 3 offsets, overkill but keeps code simple */
2434    fetch_texel_offsets(mach, inst, offsets);
2435
2436    assert(modifier != TEX_MODIFIER_PROJECTED);
2437
2438    if (modifier != TEX_MODIFIER_NONE) {
2439       if (modifier == TEX_MODIFIER_LOD_BIAS) {
2440          FETCH(&c1, 3, TGSI_CHAN_X);
2441          lod = &c1;
2442          control = TGSI_SAMPLER_LOD_BIAS;
2443       }
2444       else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2445          FETCH(&c1, 3, TGSI_CHAN_X);
2446          lod = &c1;
2447          control = TGSI_SAMPLER_LOD_EXPLICIT;
2448       }
2449       else {
2450          assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
2451          control = TGSI_SAMPLER_LOD_ZERO;
2452       }
2453    }
2454
2455    FETCH(&r[0], 0, TGSI_CHAN_X);
2456
2457    switch (mach->SamplerViews[resource_unit].Resource) {
2458    case TGSI_TEXTURE_1D:
2459       if (compare) {
2460          FETCH(&r[2], 3, TGSI_CHAN_X);
2461          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2462                      &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
2463                      NULL, offsets, control,
2464                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2465       }
2466       else {
2467          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2468                      &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
2469                      NULL, offsets, control,
2470                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2471       }
2472       break;
2473
2474    case TGSI_TEXTURE_1D_ARRAY:
2475    case TGSI_TEXTURE_2D:
2476    case TGSI_TEXTURE_RECT:
2477       FETCH(&r[1], 0, TGSI_CHAN_Y);
2478       if (compare) {
2479          FETCH(&r[2], 3, TGSI_CHAN_X);
2480          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2481                      &r[0], &r[1], &r[2], &ZeroVec, lod,    /* S, T, P, C, LOD */
2482                      NULL, offsets, control,
2483                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2484       }
2485       else {
2486          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2487                      &r[0], &r[1], &ZeroVec, &ZeroVec, lod,    /* S, T, P, C, LOD */
2488                      NULL, offsets, control,
2489                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2490       }
2491       break;
2492
2493    case TGSI_TEXTURE_2D_ARRAY:
2494    case TGSI_TEXTURE_3D:
2495    case TGSI_TEXTURE_CUBE:
2496       FETCH(&r[1], 0, TGSI_CHAN_Y);
2497       FETCH(&r[2], 0, TGSI_CHAN_Z);
2498       if(compare) {
2499          FETCH(&r[3], 3, TGSI_CHAN_X);
2500          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2501                      &r[0], &r[1], &r[2], &r[3], lod,
2502                      NULL, offsets, control,
2503                      &r[0], &r[1], &r[2], &r[3]);
2504       }
2505       else {
2506          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2507                      &r[0], &r[1], &r[2], &ZeroVec, lod,
2508                      NULL, offsets, control,
2509                      &r[0], &r[1], &r[2], &r[3]);
2510       }
2511       break;
2512
2513    case TGSI_TEXTURE_CUBE_ARRAY:
2514       FETCH(&r[1], 0, TGSI_CHAN_Y);
2515       FETCH(&r[2], 0, TGSI_CHAN_Z);
2516       FETCH(&r[3], 0, TGSI_CHAN_W);
2517       if(compare) {
2518          FETCH(&r[4], 3, TGSI_CHAN_X);
2519          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2520                      &r[0], &r[1], &r[2], &r[3], &r[4],
2521                      NULL, offsets, control,
2522                      &r[0], &r[1], &r[2], &r[3]);
2523       }
2524       else {
2525          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2526                      &r[0], &r[1], &r[2], &r[3], lod,
2527                      NULL, offsets, control,
2528                      &r[0], &r[1], &r[2], &r[3]);
2529       }
2530       break;
2531
2532
2533    default:
2534       assert(0);
2535    }
2536
2537    swizzles[0] = inst->Src[1].Register.SwizzleX;
2538    swizzles[1] = inst->Src[1].Register.SwizzleY;
2539    swizzles[2] = inst->Src[1].Register.SwizzleZ;
2540    swizzles[3] = inst->Src[1].Register.SwizzleW;
2541
2542    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2543       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2544          store_dest(mach, &r[swizzles[chan]],
2545                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2546       }
2547    }
2548 }
2549
2550 static void
2551 exec_sample_d(struct tgsi_exec_machine *mach,
2552               const struct tgsi_full_instruction *inst)
2553 {
2554    const uint resource_unit = inst->Src[1].Register.Index;
2555    const uint sampler_unit = inst->Src[2].Register.Index;
2556    union tgsi_exec_channel r[4];
2557    float derivs[3][2][TGSI_QUAD_SIZE];
2558    uint chan;
2559    unsigned char swizzles[4];
2560    int8_t offsets[3];
2561
2562    /* always fetch all 3 offsets, overkill but keeps code simple */
2563    fetch_texel_offsets(mach, inst, offsets);
2564
2565    FETCH(&r[0], 0, TGSI_CHAN_X);
2566
2567    switch (mach->SamplerViews[resource_unit].Resource) {
2568    case TGSI_TEXTURE_1D:
2569    case TGSI_TEXTURE_1D_ARRAY:
2570       /* only 1D array actually needs Y */
2571       FETCH(&r[1], 0, TGSI_CHAN_Y);
2572
2573       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2574
2575       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2576                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2577                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2578                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2579       break;
2580
2581    case TGSI_TEXTURE_2D:
2582    case TGSI_TEXTURE_RECT:
2583    case TGSI_TEXTURE_2D_ARRAY:
2584       /* only 2D array actually needs Z */
2585       FETCH(&r[1], 0, TGSI_CHAN_Y);
2586       FETCH(&r[2], 0, TGSI_CHAN_Z);
2587
2588       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2589       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2590
2591       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2592                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* inputs */
2593                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2594                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2595       break;
2596
2597    case TGSI_TEXTURE_3D:
2598    case TGSI_TEXTURE_CUBE:
2599    case TGSI_TEXTURE_CUBE_ARRAY:
2600       /* only cube array actually needs W */
2601       FETCH(&r[1], 0, TGSI_CHAN_Y);
2602       FETCH(&r[2], 0, TGSI_CHAN_Z);
2603       FETCH(&r[3], 0, TGSI_CHAN_W);
2604
2605       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2606       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2607       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
2608
2609       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2610                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,
2611                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2612                   &r[0], &r[1], &r[2], &r[3]);
2613       break;
2614
2615    default:
2616       assert(0);
2617    }
2618
2619    swizzles[0] = inst->Src[1].Register.SwizzleX;
2620    swizzles[1] = inst->Src[1].Register.SwizzleY;
2621    swizzles[2] = inst->Src[1].Register.SwizzleZ;
2622    swizzles[3] = inst->Src[1].Register.SwizzleW;
2623
2624    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2625       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2626          store_dest(mach, &r[swizzles[chan]],
2627                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2628       }
2629    }
2630 }
2631
2632
2633 /**
2634  * Evaluate a constant-valued coefficient at the position of the
2635  * current quad.
2636  */
2637 static void
2638 eval_constant_coef(
2639    struct tgsi_exec_machine *mach,
2640    unsigned attrib,
2641    unsigned chan )
2642 {
2643    unsigned i;
2644
2645    for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
2646       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2647    }
2648 }
2649
2650 /**
2651  * Evaluate a linear-valued coefficient at the position of the
2652  * current quad.
2653  */
2654 static void
2655 eval_linear_coef(
2656    struct tgsi_exec_machine *mach,
2657    unsigned attrib,
2658    unsigned chan )
2659 {
2660    const float x = mach->QuadPos.xyzw[0].f[0];
2661    const float y = mach->QuadPos.xyzw[1].f[0];
2662    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2663    const float dady = mach->InterpCoefs[attrib].dady[chan];
2664    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2665    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2666    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2667    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2668    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2669 }
2670
2671 /**
2672  * Evaluate a perspective-valued coefficient at the position of the
2673  * current quad.
2674  */
2675 static void
2676 eval_perspective_coef(
2677    struct tgsi_exec_machine *mach,
2678    unsigned attrib,
2679    unsigned chan )
2680 {
2681    const float x = mach->QuadPos.xyzw[0].f[0];
2682    const float y = mach->QuadPos.xyzw[1].f[0];
2683    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2684    const float dady = mach->InterpCoefs[attrib].dady[chan];
2685    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2686    const float *w = mach->QuadPos.xyzw[3].f;
2687    /* divide by W here */
2688    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2689    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2690    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2691    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2692 }
2693
2694
/*
 * Common signature of the eval_*_coef() helpers: evaluate channel `chan'
 * of input attribute `attrib' for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
2699
2700 static void
2701 exec_declaration(struct tgsi_exec_machine *mach,
2702                  const struct tgsi_full_declaration *decl)
2703 {
2704    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
2705       mach->SamplerViews[decl->Range.First] = decl->SamplerView;
2706       return;
2707    }
2708
2709    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
2710       if (decl->Declaration.File == TGSI_FILE_INPUT) {
2711          uint first, last, mask;
2712
2713          first = decl->Range.First;
2714          last = decl->Range.Last;
2715          mask = decl->Declaration.UsageMask;
2716
2717          /* XXX we could remove this special-case code since
2718           * mach->InterpCoefs[first].a0 should already have the
2719           * front/back-face value.  But we should first update the
2720           * ureg code to emit the right UsageMask value (WRITEMASK_X).
2721           * Then, we could remove the tgsi_exec_machine::Face field.
2722           */
2723          /* XXX make FACE a system value */
2724          if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2725             uint i;
2726
2727             assert(decl->Semantic.Index == 0);
2728             assert(first == last);
2729
2730             for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2731                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2732             }
2733          } else {
2734             eval_coef_func eval;
2735             uint i, j;
2736
2737             switch (decl->Interp.Interpolate) {
2738             case TGSI_INTERPOLATE_CONSTANT:
2739                eval = eval_constant_coef;
2740                break;
2741
2742             case TGSI_INTERPOLATE_LINEAR:
2743                eval = eval_linear_coef;
2744                break;
2745
2746             case TGSI_INTERPOLATE_PERSPECTIVE:
2747                eval = eval_perspective_coef;
2748                break;
2749
2750             case TGSI_INTERPOLATE_COLOR:
2751                eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
2752                break;
2753
2754             default:
2755                assert(0);
2756                return;
2757             }
2758
2759             for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
2760                if (mask & (1 << j)) {
2761                   for (i = first; i <= last; i++) {
2762                      eval(mach, i, j);
2763                   }
2764                }
2765             }
2766          }
2767
2768          if (DEBUG_EXECUTION) {
2769             uint i, j;
2770             for (i = first; i <= last; ++i) {
2771                debug_printf("IN[%2u] = ", i);
2772                for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
2773                   if (j > 0) {
2774                      debug_printf("         ");
2775                   }
2776                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
2777                                mach->Inputs[i].xyzw[0].f[j], mach->Inputs[i].xyzw[0].u[j],
2778                                mach->Inputs[i].xyzw[1].f[j], mach->Inputs[i].xyzw[1].u[j],
2779                                mach->Inputs[i].xyzw[2].f[j], mach->Inputs[i].xyzw[2].u[j],
2780                                mach->Inputs[i].xyzw[3].f[j], mach->Inputs[i].xyzw[3].u[j]);
2781                }
2782             }
2783          }
2784       }
2785    }
2786
2787 }
2788
/** Channel-wide operation with a single source operand. */
typedef void (*micro_unary_op)(union tgsi_exec_channel *dst,
                               const union tgsi_exec_channel *src);
2791
2792 static void
2793 exec_scalar_unary(struct tgsi_exec_machine *mach,
2794                   const struct tgsi_full_instruction *inst,
2795                   micro_unary_op op,
2796                   enum tgsi_exec_datatype dst_datatype,
2797                   enum tgsi_exec_datatype src_datatype)
2798 {
2799    unsigned int chan;
2800    union tgsi_exec_channel src;
2801    union tgsi_exec_channel dst;
2802
2803    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
2804    op(&dst, &src);
2805    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2806       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2807          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2808       }
2809    }
2810 }
2811
2812 static void
2813 exec_vector_unary(struct tgsi_exec_machine *mach,
2814                   const struct tgsi_full_instruction *inst,
2815                   micro_unary_op op,
2816                   enum tgsi_exec_datatype dst_datatype,
2817                   enum tgsi_exec_datatype src_datatype)
2818 {
2819    unsigned int chan;
2820    struct tgsi_exec_vector dst;
2821
2822    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2823       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2824          union tgsi_exec_channel src;
2825
2826          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2827          op(&dst.xyzw[chan], &src);
2828       }
2829    }
2830    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2831       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2832          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2833       }
2834    }
2835 }
2836
/** Channel-wide operation with two source operands. */
typedef void (*micro_binary_op)(union tgsi_exec_channel *dst,
                                const union tgsi_exec_channel *src0,
                                const union tgsi_exec_channel *src1);
2840
2841 static void
2842 exec_scalar_binary(struct tgsi_exec_machine *mach,
2843                    const struct tgsi_full_instruction *inst,
2844                    micro_binary_op op,
2845                    enum tgsi_exec_datatype dst_datatype,
2846                    enum tgsi_exec_datatype src_datatype)
2847 {
2848    unsigned int chan;
2849    union tgsi_exec_channel src[2];
2850    union tgsi_exec_channel dst;
2851
2852    fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
2853    fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, src_datatype);
2854    op(&dst, &src[0], &src[1]);
2855    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2856       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2857          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2858       }
2859    }
2860 }
2861
2862 static void
2863 exec_vector_binary(struct tgsi_exec_machine *mach,
2864                    const struct tgsi_full_instruction *inst,
2865                    micro_binary_op op,
2866                    enum tgsi_exec_datatype dst_datatype,
2867                    enum tgsi_exec_datatype src_datatype)
2868 {
2869    unsigned int chan;
2870    struct tgsi_exec_vector dst;
2871
2872    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2873       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2874          union tgsi_exec_channel src[2];
2875
2876          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2877          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2878          op(&dst.xyzw[chan], &src[0], &src[1]);
2879       }
2880    }
2881    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2882       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2883          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2884       }
2885    }
2886 }
2887
/** Channel-wide operation with three source operands. */
typedef void (*micro_trinary_op)(union tgsi_exec_channel *dst,
                                 const union tgsi_exec_channel *src0,
                                 const union tgsi_exec_channel *src1,
                                 const union tgsi_exec_channel *src2);
2892
2893 static void
2894 exec_vector_trinary(struct tgsi_exec_machine *mach,
2895                     const struct tgsi_full_instruction *inst,
2896                     micro_trinary_op op,
2897                     enum tgsi_exec_datatype dst_datatype,
2898                     enum tgsi_exec_datatype src_datatype)
2899 {
2900    unsigned int chan;
2901    struct tgsi_exec_vector dst;
2902
2903    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2904       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2905          union tgsi_exec_channel src[3];
2906
2907          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2908          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2909          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2910          op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2911       }
2912    }
2913    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2914       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2915          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2916       }
2917    }
2918 }
2919
/** Channel-wide operation with four source operands. */
typedef void (*micro_quaternary_op)(union tgsi_exec_channel *dst,
                                    const union tgsi_exec_channel *src0,
                                    const union tgsi_exec_channel *src1,
                                    const union tgsi_exec_channel *src2,
                                    const union tgsi_exec_channel *src3);
2925
2926 static void
2927 exec_vector_quaternary(struct tgsi_exec_machine *mach,
2928                        const struct tgsi_full_instruction *inst,
2929                        micro_quaternary_op op,
2930                        enum tgsi_exec_datatype dst_datatype,
2931                        enum tgsi_exec_datatype src_datatype)
2932 {
2933    unsigned int chan;
2934    struct tgsi_exec_vector dst;
2935
2936    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2937       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2938          union tgsi_exec_channel src[4];
2939
2940          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2941          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2942          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2943          fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
2944          op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
2945       }
2946    }
2947    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2948       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2949          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2950       }
2951    }
2952 }
2953
2954 static void
2955 exec_dp3(struct tgsi_exec_machine *mach,
2956          const struct tgsi_full_instruction *inst)
2957 {
2958    unsigned int chan;
2959    union tgsi_exec_channel arg[3];
2960
2961    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2962    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2963    micro_mul(&arg[2], &arg[0], &arg[1]);
2964
2965    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
2966       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2967       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2968       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2969    }
2970
2971    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2972       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2973          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2974       }
2975    }
2976 }
2977
2978 static void
2979 exec_dp4(struct tgsi_exec_machine *mach,
2980          const struct tgsi_full_instruction *inst)
2981 {
2982    unsigned int chan;
2983    union tgsi_exec_channel arg[3];
2984
2985    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2986    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2987    micro_mul(&arg[2], &arg[0], &arg[1]);
2988
2989    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
2990       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2991       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2992       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2993    }
2994
2995    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2996       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2997          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2998       }
2999    }
3000 }
3001
3002 static void
3003 exec_dp2a(struct tgsi_exec_machine *mach,
3004           const struct tgsi_full_instruction *inst)
3005 {
3006    unsigned int chan;
3007    union tgsi_exec_channel arg[3];
3008
3009    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3010    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3011    micro_mul(&arg[2], &arg[0], &arg[1]);
3012
3013    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3014    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3015    micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
3016
3017    fetch_source(mach, &arg[1], &inst->Src[2], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3018    micro_add(&arg[0], &arg[0], &arg[1]);
3019
3020    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3021       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3022          store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3023       }
3024    }
3025 }
3026
3027 static void
3028 exec_dph(struct tgsi_exec_machine *mach,
3029          const struct tgsi_full_instruction *inst)
3030 {
3031    unsigned int chan;
3032    union tgsi_exec_channel arg[3];
3033
3034    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3035    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3036    micro_mul(&arg[2], &arg[0], &arg[1]);
3037
3038    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3039    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3040    micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3041
3042    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3043    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3044    micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
3045
3046    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3047    micro_add(&arg[0], &arg[0], &arg[1]);
3048
3049    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3050       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3051          store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3052       }
3053    }
3054 }
3055
3056 static void
3057 exec_dp2(struct tgsi_exec_machine *mach,
3058          const struct tgsi_full_instruction *inst)
3059 {
3060    unsigned int chan;
3061    union tgsi_exec_channel arg[3];
3062
3063    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3064    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3065    micro_mul(&arg[2], &arg[0], &arg[1]);
3066
3067    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3068    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3069    micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3070
3071    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3072       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3073          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3074       }
3075    }
3076 }
3077
3078 static void
3079 exec_pk2h(struct tgsi_exec_machine *mach,
3080           const struct tgsi_full_instruction *inst)
3081 {
3082    unsigned chan;
3083    union tgsi_exec_channel arg[2], dst;
3084
3085    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3086    fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3087    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3088       dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
3089          (util_float_to_half(arg[1].f[chan]) << 16);
3090    }
3091    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3092       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3093          store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
3094       }
3095    }
3096 }
3097
3098 static void
3099 exec_up2h(struct tgsi_exec_machine *mach,
3100           const struct tgsi_full_instruction *inst)
3101 {
3102    unsigned chan;
3103    union tgsi_exec_channel arg, dst[2];
3104
3105    fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3106    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3107       dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
3108       dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
3109    }
3110    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3111       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3112          store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3113       }
3114    }
3115 }
3116
3117 static void
3118 exec_scs(struct tgsi_exec_machine *mach,
3119          const struct tgsi_full_instruction *inst)
3120 {
3121    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
3122       union tgsi_exec_channel arg;
3123       union tgsi_exec_channel result;
3124
3125       fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3126
3127       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3128          micro_cos(&result, &arg);
3129          store_dest(mach, &result, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3130       }
3131       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3132          micro_sin(&result, &arg);
3133          store_dest(mach, &result, &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3134       }
3135    }
3136    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3137       store_dest(mach, &ZeroVec, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3138    }
3139    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3140       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3141    }
3142 }
3143
3144 static void
3145 exec_xpd(struct tgsi_exec_machine *mach,
3146          const struct tgsi_full_instruction *inst)
3147 {
3148    union tgsi_exec_channel r[6];
3149    union tgsi_exec_channel d[3];
3150
3151    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3152    fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3153
3154    micro_mul(&r[2], &r[0], &r[1]);
3155
3156    fetch_source(mach, &r[3], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3157    fetch_source(mach, &r[4], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3158
3159    micro_mul(&r[5], &r[3], &r[4] );
3160    micro_sub(&d[TGSI_CHAN_X], &r[2], &r[5]);
3161
3162    fetch_source(mach, &r[2], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3163
3164    micro_mul(&r[3], &r[3], &r[2]);
3165
3166    fetch_source(mach, &r[5], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3167
3168    micro_mul(&r[1], &r[1], &r[5]);
3169    micro_sub(&d[TGSI_CHAN_Y], &r[3], &r[1]);
3170
3171    micro_mul(&r[5], &r[5], &r[4]);
3172    micro_mul(&r[0], &r[0], &r[2]);
3173    micro_sub(&d[TGSI_CHAN_Z], &r[5], &r[0]);
3174
3175    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3176       store_dest(mach, &d[TGSI_CHAN_X], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3177    }
3178    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3179       store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3180    }
3181    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3182       store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3183    }
3184    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3185       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3186    }
3187 }
3188
3189 static void
3190 exec_dst(struct tgsi_exec_machine *mach,
3191          const struct tgsi_full_instruction *inst)
3192 {
3193    union tgsi_exec_channel r[2];
3194    union tgsi_exec_channel d[4];
3195
3196    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3197       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3198       fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3199       micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
3200    }
3201    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3202       fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3203    }
3204    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3205       fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3206    }
3207
3208    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3209       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3210    }
3211    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3212       store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3213    }
3214    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3215       store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3216    }
3217    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3218       store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3219    }
3220 }
3221
3222 static void
3223 exec_log(struct tgsi_exec_machine *mach,
3224          const struct tgsi_full_instruction *inst)
3225 {
3226    union tgsi_exec_channel r[3];
3227
3228    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3229    micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
3230    micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
3231    micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
3232    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3233       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3234    }
3235    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3236       micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
3237       micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
3238       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3239    }
3240    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3241       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3242    }
3243    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3244       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3245    }
3246 }
3247
3248 static void
3249 exec_exp(struct tgsi_exec_machine *mach,
3250          const struct tgsi_full_instruction *inst)
3251 {
3252    union tgsi_exec_channel r[3];
3253
3254    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3255    micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
3256    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3257       micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
3258       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3259    }
3260    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3261       micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
3262       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3263    }
3264    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3265       micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
3266       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3267    }
3268    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3269       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3270    }
3271 }
3272
3273 static void
3274 exec_lit(struct tgsi_exec_machine *mach,
3275          const struct tgsi_full_instruction *inst)
3276 {
3277    union tgsi_exec_channel r[3];
3278    union tgsi_exec_channel d[3];
3279
3280    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3281       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3282       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3283          fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3284          micro_max(&r[1], &r[1], &ZeroVec);
3285
3286          fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3287          micro_min(&r[2], &r[2], &P128Vec);
3288          micro_max(&r[2], &r[2], &M128Vec);
3289          micro_pow(&r[1], &r[1], &r[2]);
3290          micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3291          store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3292       }
3293       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3294          micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
3295          store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3296       }
3297    }
3298    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3299       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3300    }
3301
3302    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3303       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3304    }
3305 }
3306
3307 static void
3308 exec_break(struct tgsi_exec_machine *mach)
3309 {
3310    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3311       /* turn off loop channels for each enabled exec channel */
3312       mach->LoopMask &= ~mach->ExecMask;
3313       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3314       UPDATE_EXEC_MASK(mach);
3315    } else {
3316       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3317
3318       mach->Switch.mask = 0x0;
3319
3320       UPDATE_EXEC_MASK(mach);
3321    }
3322 }
3323
3324 static void
3325 exec_switch(struct tgsi_exec_machine *mach,
3326             const struct tgsi_full_instruction *inst)
3327 {
3328    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3329    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3330
3331    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3332    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3333    mach->Switch.mask = 0x0;
3334    mach->Switch.defaultMask = 0x0;
3335
3336    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3337    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3338
3339    UPDATE_EXEC_MASK(mach);
3340 }
3341
3342 static void
3343 exec_case(struct tgsi_exec_machine *mach,
3344           const struct tgsi_full_instruction *inst)
3345 {
3346    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3347    union tgsi_exec_channel src;
3348    uint mask = 0;
3349
3350    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3351
3352    if (mach->Switch.selector.u[0] == src.u[0]) {
3353       mask |= 0x1;
3354    }
3355    if (mach->Switch.selector.u[1] == src.u[1]) {
3356       mask |= 0x2;
3357    }
3358    if (mach->Switch.selector.u[2] == src.u[2]) {
3359       mask |= 0x4;
3360    }
3361    if (mach->Switch.selector.u[3] == src.u[3]) {
3362       mask |= 0x8;
3363    }
3364
3365    mach->Switch.defaultMask |= mask;
3366
3367    mach->Switch.mask |= mask & prevMask;
3368
3369    UPDATE_EXEC_MASK(mach);
3370 }
3371
3372 /* FIXME: this will only work if default is last */
3373 static void
3374 exec_default(struct tgsi_exec_machine *mach)
3375 {
3376    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3377
3378    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3379
3380    UPDATE_EXEC_MASK(mach);
3381 }
3382
3383 static void
3384 exec_endswitch(struct tgsi_exec_machine *mach)
3385 {
3386    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3387    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3388
3389    UPDATE_EXEC_MASK(mach);
3390 }
3391
/**
 * Double-precision operation.  'src' points at one operand, or at an
 * array of operands for the binary and trinary executors.
 */
typedef void (*micro_dop)(union tgsi_double_channel *dst,
                          const union tgsi_double_channel *src);
3394
3395 static void
3396 fetch_double_channel(struct tgsi_exec_machine *mach,
3397                      union tgsi_double_channel *chan,
3398                      const struct tgsi_full_src_register *reg,
3399                      uint chan_0,
3400                      uint chan_1)
3401 {
3402    union tgsi_exec_channel src[2];
3403    uint i;
3404
3405    fetch_source_d(mach, &src[0], reg, chan_0, TGSI_EXEC_DATA_UINT);
3406    fetch_source_d(mach, &src[1], reg, chan_1, TGSI_EXEC_DATA_UINT);
3407
3408    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3409       chan->u[i][0] = src[0].u[i];
3410       chan->u[i][1] = src[1].u[i];
3411    }
3412    if (reg->Register.Absolute) {
3413       micro_dabs(chan, chan);
3414    }
3415    if (reg->Register.Negate) {
3416       micro_dneg(chan, chan);
3417    }
3418 }
3419
3420 static void
3421 store_double_channel(struct tgsi_exec_machine *mach,
3422                      const union tgsi_double_channel *chan,
3423                      const struct tgsi_full_dst_register *reg,
3424                      const struct tgsi_full_instruction *inst,
3425                      uint chan_0,
3426                      uint chan_1)
3427 {
3428    union tgsi_exec_channel dst[2];
3429    uint i;
3430    union tgsi_double_channel temp;
3431    const uint execmask = mach->ExecMask;
3432
3433    if (!inst->Instruction.Saturate) {
3434       for (i = 0; i < TGSI_QUAD_SIZE; i++)
3435          if (execmask & (1 << i)) {
3436             dst[0].u[i] = chan->u[i][0];
3437             dst[1].u[i] = chan->u[i][1];
3438          }
3439    }
3440    else {
3441       for (i = 0; i < TGSI_QUAD_SIZE; i++)
3442          if (execmask & (1 << i)) {
3443             if (chan->d[i] < 0.0)
3444                temp.d[i] = 0.0;
3445             else if (chan->d[i] > 1.0)
3446                temp.d[i] = 1.0;
3447             else
3448                temp.d[i] = chan->d[i];
3449
3450             dst[0].u[i] = temp.u[i][0];
3451             dst[1].u[i] = temp.u[i][1];
3452          }
3453    }
3454
3455    store_dest_double(mach, &dst[0], reg, inst, chan_0, TGSI_EXEC_DATA_UINT);
3456    if (chan_1 != -1)
3457       store_dest_double(mach, &dst[1], reg, inst, chan_1, TGSI_EXEC_DATA_UINT);
3458 }
3459
3460 static void
3461 exec_double_unary(struct tgsi_exec_machine *mach,
3462                   const struct tgsi_full_instruction *inst,
3463                   micro_dop op)
3464 {
3465    union tgsi_double_channel src;
3466    union tgsi_double_channel dst;
3467
3468    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3469       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3470       op(&dst, &src);
3471       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3472    }
3473    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3474       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3475       op(&dst, &src);
3476       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3477    }
3478 }
3479
3480 static void
3481 exec_double_binary(struct tgsi_exec_machine *mach,
3482                    const struct tgsi_full_instruction *inst,
3483                    micro_dop op,
3484                    enum tgsi_exec_datatype dst_datatype)
3485 {
3486    union tgsi_double_channel src[2];
3487    union tgsi_double_channel dst;
3488    int first_dest_chan, second_dest_chan;
3489    int wmask;
3490
3491    wmask = inst->Dst[0].Register.WriteMask;
3492    /* these are & because of the way DSLT etc store their destinations */
3493    if (wmask & TGSI_WRITEMASK_XY) {
3494       first_dest_chan = TGSI_CHAN_X;
3495       second_dest_chan = TGSI_CHAN_Y;
3496       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3497          first_dest_chan = (wmask & TGSI_WRITEMASK_X) ? TGSI_CHAN_X : TGSI_CHAN_Y;
3498          second_dest_chan = -1;
3499       }
3500
3501       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3502       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3503       op(&dst, src);
3504       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3505    }
3506
3507    if (wmask & TGSI_WRITEMASK_ZW) {
3508       first_dest_chan = TGSI_CHAN_Z;
3509       second_dest_chan = TGSI_CHAN_W;
3510       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3511          first_dest_chan = (wmask & TGSI_WRITEMASK_Z) ? TGSI_CHAN_Z : TGSI_CHAN_W;
3512          second_dest_chan = -1;
3513       }
3514
3515       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3516       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3517       op(&dst, src);
3518       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3519    }
3520 }
3521
3522 static void
3523 exec_double_trinary(struct tgsi_exec_machine *mach,
3524                     const struct tgsi_full_instruction *inst,
3525                     micro_dop op)
3526 {
3527    union tgsi_double_channel src[3];
3528    union tgsi_double_channel dst;
3529
3530    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3531       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3532       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3533       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_X, TGSI_CHAN_Y);
3534       op(&dst, src);
3535       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3536    }
3537    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3538       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3539       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3540       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_CHAN_W);
3541       op(&dst, src);
3542       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3543    }
3544 }
3545
3546 static void
3547 exec_f2d(struct tgsi_exec_machine *mach,
3548          const struct tgsi_full_instruction *inst)
3549 {
3550    union tgsi_exec_channel src;
3551    union tgsi_double_channel dst;
3552
3553    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3554       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3555       micro_f2d(&dst, &src);
3556       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3557    }
3558    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3559       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3560       micro_f2d(&dst, &src);
3561       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3562    }
3563 }
3564
3565 static void
3566 exec_d2f(struct tgsi_exec_machine *mach,
3567          const struct tgsi_full_instruction *inst)
3568 {
3569    union tgsi_double_channel src;
3570    union tgsi_exec_channel dst;
3571    int wm = inst->Dst[0].Register.WriteMask;
3572    int i;
3573    int bit;
3574    for (i = 0; i < 2; i++) {
3575       bit = ffs(wm);
3576       if (bit) {
3577          wm &= ~(1 << (bit - 1));
3578          if (i == 0)
3579             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3580          else
3581             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3582          micro_d2f(&dst, &src);
3583          store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1, TGSI_EXEC_DATA_FLOAT);
3584       }
3585    }
3586 }
3587
3588 static void
3589 exec_i2d(struct tgsi_exec_machine *mach,
3590          const struct tgsi_full_instruction *inst)
3591 {
3592    union tgsi_exec_channel src;
3593    union tgsi_double_channel dst;
3594
3595    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3596       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3597       micro_i2d(&dst, &src);
3598       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3599    }
3600    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3601       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_INT);
3602       micro_i2d(&dst, &src);
3603       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3604    }
3605 }
3606
3607 static void
3608 exec_d2i(struct tgsi_exec_machine *mach,
3609          const struct tgsi_full_instruction *inst)
3610 {
3611    union tgsi_double_channel src;
3612    union tgsi_exec_channel dst;
3613    int wm = inst->Dst[0].Register.WriteMask;
3614    int i;
3615    int bit;
3616    for (i = 0; i < 2; i++) {
3617       bit = ffs(wm);
3618       if (bit) {
3619          wm &= ~(1 << (bit - 1));
3620          if (i == 0)
3621             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3622          else
3623             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3624          micro_d2i(&dst, &src);
3625          store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1, TGSI_EXEC_DATA_INT);
3626       }
3627    }
3628 }
3629 static void
3630 exec_u2d(struct tgsi_exec_machine *mach,
3631          const struct tgsi_full_instruction *inst)
3632 {
3633    union tgsi_exec_channel src;
3634    union tgsi_double_channel dst;
3635
3636    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3637       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3638       micro_u2d(&dst, &src);
3639       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3640    }
3641    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3642       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_UINT);
3643       micro_u2d(&dst, &src);
3644       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3645    }
3646 }
3647
3648 static void
3649 exec_d2u(struct tgsi_exec_machine *mach,
3650          const struct tgsi_full_instruction *inst)
3651 {
3652    union tgsi_double_channel src;
3653    union tgsi_exec_channel dst;
3654    int wm = inst->Dst[0].Register.WriteMask;
3655    int i;
3656    int bit;
3657    for (i = 0; i < 2; i++) {
3658       bit = ffs(wm);
3659       if (bit) {
3660          wm &= ~(1 << (bit - 1));
3661          if (i == 0)
3662             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3663          else
3664             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3665          micro_d2u(&dst, &src);
3666          store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1, TGSI_EXEC_DATA_UINT);
3667       }
3668    }
3669 }
3670
3671 static void
3672 exec_dldexp(struct tgsi_exec_machine *mach,
3673             const struct tgsi_full_instruction *inst)
3674 {
3675    union tgsi_double_channel src0;
3676    union tgsi_exec_channel src1;
3677    union tgsi_double_channel dst;
3678    int wmask;
3679
3680    wmask = inst->Dst[0].Register.WriteMask;
3681    if (wmask & TGSI_WRITEMASK_XY) {
3682       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3683       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3684       micro_dldexp(&dst, &src0, &src1);
3685       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3686    }
3687
3688    if (wmask & TGSI_WRITEMASK_ZW) {
3689       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3690       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3691       micro_dldexp(&dst, &src0, &src1);
3692       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3693    }
3694 }
3695
3696 static void
3697 exec_dfracexp(struct tgsi_exec_machine *mach,
3698               const struct tgsi_full_instruction *inst)
3699 {
3700    union tgsi_double_channel src;
3701    union tgsi_double_channel dst;
3702    union tgsi_exec_channel dst_exp;
3703
3704    if (((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)) {
3705       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3706       micro_dfracexp(&dst, &dst_exp, &src);
3707       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3708       store_dest(mach, &dst_exp, &inst->Dst[1], inst, ffs(inst->Dst[1].Register.WriteMask) - 1, TGSI_EXEC_DATA_INT);
3709    }
3710    if (((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)) {
3711       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3712       micro_dfracexp(&dst, &dst_exp, &src);
3713       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3714       store_dest(mach, &dst_exp, &inst->Dst[1], inst, ffs(inst->Dst[1].Register.WriteMask) - 1, TGSI_EXEC_DATA_INT);
3715    }
3716 }
3717
3718 static int
3719 get_image_coord_dim(unsigned tgsi_tex)
3720 {
3721    int dim;
3722    switch (tgsi_tex) {
3723    case TGSI_TEXTURE_BUFFER:
3724    case TGSI_TEXTURE_1D:
3725       dim = 1;
3726       break;
3727    case TGSI_TEXTURE_2D:
3728    case TGSI_TEXTURE_RECT:
3729    case TGSI_TEXTURE_1D_ARRAY:
3730    case TGSI_TEXTURE_2D_MSAA:
3731       dim = 2;
3732       break;
3733    case TGSI_TEXTURE_3D:
3734    case TGSI_TEXTURE_CUBE:
3735    case TGSI_TEXTURE_2D_ARRAY:
3736    case TGSI_TEXTURE_2D_ARRAY_MSAA:
3737    case TGSI_TEXTURE_CUBE_ARRAY:
3738       dim = 3;
3739       break;
3740    default:
3741       assert(!"unknown texture target");
3742       dim = 0;
3743       break;
3744    }
3745
3746    return dim;
3747 }
3748
3749 static int
3750 get_image_coord_sample(unsigned tgsi_tex)
3751 {
3752    int sample = 0;
3753    switch (tgsi_tex) {
3754    case TGSI_TEXTURE_2D_MSAA:
3755       sample = 3;
3756       break;
3757    case TGSI_TEXTURE_2D_ARRAY_MSAA:
3758       sample = 4;
3759       break;
3760    default:
3761       break;
3762    }
3763    return sample;
3764 }
3765
3766 static void
3767 exec_load_img(struct tgsi_exec_machine *mach,
3768               const struct tgsi_full_instruction *inst)
3769 {
3770    union tgsi_exec_channel r[4], sample_r;
3771    uint unit;
3772    int sample;
3773    int i, j;
3774    int dim;
3775    uint chan;
3776    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3777    struct tgsi_image_params params;
3778    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3779
3780    unit = fetch_sampler_unit(mach, inst, 0);
3781    dim = get_image_coord_dim(inst->Memory.Texture);
3782    sample = get_image_coord_sample(inst->Memory.Texture);
3783    assert(dim <= 3);
3784
3785    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3786    params.unit = unit;
3787    params.tgsi_tex_instr = inst->Memory.Texture;
3788    params.format = inst->Memory.Format;
3789
3790    for (i = 0; i < dim; i++) {
3791       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
3792    }
3793
3794    if (sample)
3795       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
3796
3797    mach->Image->load(mach->Image, &params,
3798                      r[0].i, r[1].i, r[2].i, sample_r.i,
3799                      rgba);
3800    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3801       r[0].f[j] = rgba[0][j];
3802       r[1].f[j] = rgba[1][j];
3803       r[2].f[j] = rgba[2][j];
3804       r[3].f[j] = rgba[3][j];
3805    }
3806    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3807       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3808          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3809       }
3810    }
3811 }
3812
3813 static void
3814 exec_load_buf(struct tgsi_exec_machine *mach,
3815               const struct tgsi_full_instruction *inst)
3816 {
3817    union tgsi_exec_channel r[4];
3818    uint unit;
3819    int j;
3820    uint chan;
3821    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3822    struct tgsi_buffer_params params;
3823    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3824
3825    unit = fetch_sampler_unit(mach, inst, 0);
3826
3827    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3828    params.unit = unit;
3829    IFETCH(&r[0], 1, TGSI_CHAN_X);
3830
3831    mach->Buffer->load(mach->Buffer, &params,
3832                       r[0].i, rgba);
3833    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3834       r[0].f[j] = rgba[0][j];
3835       r[1].f[j] = rgba[1][j];
3836       r[2].f[j] = rgba[2][j];
3837       r[3].f[j] = rgba[3][j];
3838    }
3839    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3840       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3841          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3842       }
3843    }
3844 }
3845
3846 static void
3847 exec_load_mem(struct tgsi_exec_machine *mach,
3848               const struct tgsi_full_instruction *inst)
3849 {
3850    union tgsi_exec_channel r[3];
3851    uint chan;
3852    char *ptr = mach->LocalMem;
3853    uint32_t offset;
3854    int j;
3855
3856    IFETCH(&r[0], 1, TGSI_CHAN_X);
3857    if (r[0].u[0] >= mach->LocalMemSize)
3858       return;
3859
3860    offset = r[0].u[0];
3861    ptr += offset;
3862
3863    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3864       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3865          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3866             memcpy(&r[chan].u[j], ptr + (4 * chan), 4);
3867          }
3868       }
3869    }
3870
3871    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3872       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3873          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3874       }
3875    }
3876 }
3877
3878 static void
3879 exec_load(struct tgsi_exec_machine *mach,
3880           const struct tgsi_full_instruction *inst)
3881 {
3882    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
3883       exec_load_img(mach, inst);
3884    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
3885       exec_load_buf(mach, inst);
3886    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
3887       exec_load_mem(mach, inst);
3888 }
3889
3890 static void
3891 exec_store_img(struct tgsi_exec_machine *mach,
3892                const struct tgsi_full_instruction *inst)
3893 {
3894    union tgsi_exec_channel r[3], sample_r;
3895    union tgsi_exec_channel value[4];
3896    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3897    struct tgsi_image_params params;
3898    int dim;
3899    int sample;
3900    int i, j;
3901    uint unit;
3902    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3903    unit = inst->Dst[0].Register.Index;
3904    dim = get_image_coord_dim(inst->Memory.Texture);
3905    sample = get_image_coord_sample(inst->Memory.Texture);
3906    assert(dim <= 3);
3907
3908    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3909    params.unit = unit;
3910    params.tgsi_tex_instr = inst->Memory.Texture;
3911    params.format = inst->Memory.Format;
3912
3913    for (i = 0; i < dim; i++) {
3914       IFETCH(&r[i], 0, TGSI_CHAN_X + i);
3915    }
3916
3917    for (i = 0; i < 4; i++) {
3918       FETCH(&value[i], 1, TGSI_CHAN_X + i);
3919    }
3920    if (sample)
3921       IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
3922
3923    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3924       rgba[0][j] = value[0].f[j];
3925       rgba[1][j] = value[1].f[j];
3926       rgba[2][j] = value[2].f[j];
3927       rgba[3][j] = value[3].f[j];
3928    }
3929
3930    mach->Image->store(mach->Image, &params,
3931                       r[0].i, r[1].i, r[2].i, sample_r.i,
3932                       rgba);
3933 }
3934
3935 static void
3936 exec_store_buf(struct tgsi_exec_machine *mach,
3937                const struct tgsi_full_instruction *inst)
3938 {
3939    union tgsi_exec_channel r[3];
3940    union tgsi_exec_channel value[4];
3941    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3942    struct tgsi_buffer_params params;
3943    int i, j;
3944    uint unit;
3945    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3946
3947    unit = inst->Dst[0].Register.Index;
3948
3949    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3950    params.unit = unit;
3951    params.writemask = inst->Dst[0].Register.WriteMask;
3952
3953    IFETCH(&r[0], 0, TGSI_CHAN_X);
3954    for (i = 0; i < 4; i++) {
3955       FETCH(&value[i], 1, TGSI_CHAN_X + i);
3956    }
3957
3958    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3959       rgba[0][j] = value[0].f[j];
3960       rgba[1][j] = value[1].f[j];
3961       rgba[2][j] = value[2].f[j];
3962       rgba[3][j] = value[3].f[j];
3963    }
3964
3965    mach->Buffer->store(mach->Buffer, &params,
3966                       r[0].i,
3967                       rgba);
3968 }
3969
3970 static void
3971 exec_store_mem(struct tgsi_exec_machine *mach,
3972                const struct tgsi_full_instruction *inst)
3973 {
3974    union tgsi_exec_channel r[3];
3975    union tgsi_exec_channel value[4];
3976    uint i, chan;
3977    char *ptr = mach->LocalMem;
3978    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3979    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3980
3981    IFETCH(&r[0], 0, TGSI_CHAN_X);
3982
3983    for (i = 0; i < 4; i++) {
3984       FETCH(&value[i], 1, TGSI_CHAN_X + i);
3985    }
3986
3987    if (r[0].u[0] >= mach->LocalMemSize)
3988       return;
3989    ptr += r[0].u[0];
3990
3991    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3992       if (execmask & (1 << i)) {
3993          for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3994             if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3995                memcpy(ptr + (chan * 4), &value[chan].u[0], 4);
3996             }
3997          }
3998       }
3999    }
4000 }
4001
4002 static void
4003 exec_store(struct tgsi_exec_machine *mach,
4004            const struct tgsi_full_instruction *inst)
4005 {
4006    if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
4007       exec_store_img(mach, inst);
4008    else if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
4009       exec_store_buf(mach, inst);
4010    else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
4011       exec_store_mem(mach, inst);
4012 }
4013
4014 static void
4015 exec_atomop_img(struct tgsi_exec_machine *mach,
4016                 const struct tgsi_full_instruction *inst)
4017 {
4018    union tgsi_exec_channel r[4], sample_r;
4019    union tgsi_exec_channel value[4], value2[4];
4020    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4021    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4022    struct tgsi_image_params params;
4023    int dim;
4024    int sample;
4025    int i, j;
4026    uint unit, chan;
4027    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4028    unit = fetch_sampler_unit(mach, inst, 0);
4029    dim = get_image_coord_dim(inst->Memory.Texture);
4030    sample = get_image_coord_sample(inst->Memory.Texture);
4031    assert(dim <= 3);
4032
4033    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4034    params.unit = unit;
4035    params.tgsi_tex_instr = inst->Memory.Texture;
4036    params.format = inst->Memory.Format;
4037
4038    for (i = 0; i < dim; i++) {
4039       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
4040    }
4041
4042    for (i = 0; i < 4; i++) {
4043       FETCH(&value[i], 2, TGSI_CHAN_X + i);
4044       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4045          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4046    }
4047    if (sample)
4048       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
4049
4050    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4051       rgba[0][j] = value[0].f[j];
4052       rgba[1][j] = value[1].f[j];
4053       rgba[2][j] = value[2].f[j];
4054       rgba[3][j] = value[3].f[j];
4055    }
4056    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4057       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4058          rgba2[0][j] = value2[0].f[j];
4059          rgba2[1][j] = value2[1].f[j];
4060          rgba2[2][j] = value2[2].f[j];
4061          rgba2[3][j] = value2[3].f[j];
4062       }
4063    }
4064
4065    mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
4066                    r[0].i, r[1].i, r[2].i, sample_r.i,
4067                    rgba, rgba2);
4068
4069    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4070       r[0].f[j] = rgba[0][j];
4071       r[1].f[j] = rgba[1][j];
4072       r[2].f[j] = rgba[2][j];
4073       r[3].f[j] = rgba[3][j];
4074    }
4075    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4076       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4077          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4078       }
4079    }
4080 }
4081
4082 static void
4083 exec_atomop_buf(struct tgsi_exec_machine *mach,
4084                 const struct tgsi_full_instruction *inst)
4085 {
4086    union tgsi_exec_channel r[4];
4087    union tgsi_exec_channel value[4], value2[4];
4088    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4089    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4090    struct tgsi_buffer_params params;
4091    int i, j;
4092    uint unit, chan;
4093    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4094
4095    unit = fetch_sampler_unit(mach, inst, 0);
4096
4097    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4098    params.unit = unit;
4099    params.writemask = inst->Dst[0].Register.WriteMask;
4100
4101    IFETCH(&r[0], 1, TGSI_CHAN_X);
4102
4103    for (i = 0; i < 4; i++) {
4104       FETCH(&value[i], 2, TGSI_CHAN_X + i);
4105       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4106          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4107    }
4108
4109    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4110       rgba[0][j] = value[0].f[j];
4111       rgba[1][j] = value[1].f[j];
4112       rgba[2][j] = value[2].f[j];
4113       rgba[3][j] = value[3].f[j];
4114    }
4115    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4116       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4117          rgba2[0][j] = value2[0].f[j];
4118          rgba2[1][j] = value2[1].f[j];
4119          rgba2[2][j] = value2[2].f[j];
4120          rgba2[3][j] = value2[3].f[j];
4121       }
4122    }
4123
4124    mach->Buffer->op(mach->Buffer, &params, inst->Instruction.Opcode,
4125                    r[0].i,
4126                    rgba, rgba2);
4127
4128    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4129       r[0].f[j] = rgba[0][j];
4130       r[1].f[j] = rgba[1][j];
4131       r[2].f[j] = rgba[2][j];
4132       r[3].f[j] = rgba[3][j];
4133    }
4134    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4135       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4136          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4137       }
4138    }
4139 }
4140
4141 static void
4142 exec_atomop_mem(struct tgsi_exec_machine *mach,
4143                 const struct tgsi_full_instruction *inst)
4144 {
4145    union tgsi_exec_channel r[4];
4146    union tgsi_exec_channel value[4], value2[4];
4147    char *ptr = mach->LocalMem;
4148    uint32_t val;
4149    uint chan, i;
4150    uint32_t offset;
4151    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4152    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4153    IFETCH(&r[0], 1, TGSI_CHAN_X);
4154
4155    if (r[0].u[0] >= mach->LocalMemSize)
4156       return;
4157
4158    offset = r[0].u[0];
4159    ptr += offset;
4160    for (i = 0; i < 4; i++) {
4161       FETCH(&value[i], 2, TGSI_CHAN_X + i);
4162       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4163          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4164    }
4165
4166    memcpy(&r[0].u[0], ptr, 4);
4167    val = r[0].u[0];
4168    switch (inst->Instruction.Opcode) {
4169    case TGSI_OPCODE_ATOMUADD:
4170       val += value[0].u[0];
4171       break;
4172    case TGSI_OPCODE_ATOMXOR:
4173       val ^= value[0].u[0];
4174       break;
4175    case TGSI_OPCODE_ATOMOR:
4176       val |= value[0].u[0];
4177       break;
4178    case TGSI_OPCODE_ATOMAND:
4179       val &= value[0].u[0];
4180       break;
4181    case TGSI_OPCODE_ATOMUMIN:
4182       val = MIN2(val, value[0].u[0]);
4183       break;
4184    case TGSI_OPCODE_ATOMUMAX:
4185       val = MAX2(val, value[0].u[0]);
4186       break;
4187    case TGSI_OPCODE_ATOMIMIN:
4188       val = MIN2(r[0].i[0], value[0].i[0]);
4189       break;
4190    case TGSI_OPCODE_ATOMIMAX:
4191       val = MAX2(r[0].i[0], value[0].i[0]);
4192       break;
4193    case TGSI_OPCODE_ATOMXCHG:
4194       val = value[0].i[0];
4195       break;
4196    case TGSI_OPCODE_ATOMCAS:
4197       if (val == value[0].u[0])
4198          val = value2[0].u[0];
4199       break;
4200    default:
4201       break;
4202    }
4203    for (i = 0; i < TGSI_QUAD_SIZE; i++)
4204       if (execmask & (1 << i))
4205          memcpy(ptr, &val, 4);
4206
4207    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4208       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4209          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4210       }
4211    }
4212 }
4213
4214 static void
4215 exec_atomop(struct tgsi_exec_machine *mach,
4216             const struct tgsi_full_instruction *inst)
4217 {
4218    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4219       exec_atomop_img(mach, inst);
4220    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
4221       exec_atomop_buf(mach, inst);
4222    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
4223       exec_atomop_mem(mach, inst);
4224 }
4225
4226 static void
4227 exec_resq_img(struct tgsi_exec_machine *mach,
4228               const struct tgsi_full_instruction *inst)
4229 {
4230    int result[4];
4231    union tgsi_exec_channel r[4];
4232    uint unit;
4233    int i, chan, j;
4234    struct tgsi_image_params params;
4235    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4236
4237    unit = fetch_sampler_unit(mach, inst, 0);
4238
4239    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4240    params.unit = unit;
4241    params.tgsi_tex_instr = inst->Memory.Texture;
4242    params.format = inst->Memory.Format;
4243
4244    mach->Image->get_dims(mach->Image, &params, result);
4245
4246    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4247       for (j = 0; j < 4; j++) {
4248          r[j].i[i] = result[j];
4249       }
4250    }
4251
4252    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4253       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4254          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4255                     TGSI_EXEC_DATA_INT);
4256       }
4257    }
4258 }
4259
4260 static void
4261 exec_resq_buf(struct tgsi_exec_machine *mach,
4262               const struct tgsi_full_instruction *inst)
4263 {
4264    int result;
4265    union tgsi_exec_channel r[4];
4266    uint unit;
4267    int i, chan;
4268    struct tgsi_buffer_params params;
4269    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4270
4271    unit = fetch_sampler_unit(mach, inst, 0);
4272
4273    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4274    params.unit = unit;
4275
4276    mach->Buffer->get_dims(mach->Buffer, &params, &result);
4277
4278    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4279       r[0].i[i] = result;
4280    }
4281
4282    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4283       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4284          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4285                     TGSI_EXEC_DATA_INT);
4286       }
4287    }
4288 }
4289
4290 static void
4291 exec_resq(struct tgsi_exec_machine *mach,
4292           const struct tgsi_full_instruction *inst)
4293 {
4294    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4295       exec_resq_img(mach, inst);
4296    else
4297       exec_resq_buf(mach, inst);
4298 }
4299
4300 static void
4301 micro_i2f(union tgsi_exec_channel *dst,
4302           const union tgsi_exec_channel *src)
4303 {
4304    dst->f[0] = (float)src->i[0];
4305    dst->f[1] = (float)src->i[1];
4306    dst->f[2] = (float)src->i[2];
4307    dst->f[3] = (float)src->i[3];
4308 }
4309
4310 static void
4311 micro_not(union tgsi_exec_channel *dst,
4312           const union tgsi_exec_channel *src)
4313 {
4314    dst->u[0] = ~src->u[0];
4315    dst->u[1] = ~src->u[1];
4316    dst->u[2] = ~src->u[2];
4317    dst->u[3] = ~src->u[3];
4318 }
4319
4320 static void
4321 micro_shl(union tgsi_exec_channel *dst,
4322           const union tgsi_exec_channel *src0,
4323           const union tgsi_exec_channel *src1)
4324 {
4325    unsigned masked_count;
4326    masked_count = src1->u[0] & 0x1f;
4327    dst->u[0] = src0->u[0] << masked_count;
4328    masked_count = src1->u[1] & 0x1f;
4329    dst->u[1] = src0->u[1] << masked_count;
4330    masked_count = src1->u[2] & 0x1f;
4331    dst->u[2] = src0->u[2] << masked_count;
4332    masked_count = src1->u[3] & 0x1f;
4333    dst->u[3] = src0->u[3] << masked_count;
4334 }
4335
4336 static void
4337 micro_and(union tgsi_exec_channel *dst,
4338           const union tgsi_exec_channel *src0,
4339           const union tgsi_exec_channel *src1)
4340 {
4341    dst->u[0] = src0->u[0] & src1->u[0];
4342    dst->u[1] = src0->u[1] & src1->u[1];
4343    dst->u[2] = src0->u[2] & src1->u[2];
4344    dst->u[3] = src0->u[3] & src1->u[3];
4345 }
4346
4347 static void
4348 micro_or(union tgsi_exec_channel *dst,
4349          const union tgsi_exec_channel *src0,
4350          const union tgsi_exec_channel *src1)
4351 {
4352    dst->u[0] = src0->u[0] | src1->u[0];
4353    dst->u[1] = src0->u[1] | src1->u[1];
4354    dst->u[2] = src0->u[2] | src1->u[2];
4355    dst->u[3] = src0->u[3] | src1->u[3];
4356 }
4357
4358 static void
4359 micro_xor(union tgsi_exec_channel *dst,
4360           const union tgsi_exec_channel *src0,
4361           const union tgsi_exec_channel *src1)
4362 {
4363    dst->u[0] = src0->u[0] ^ src1->u[0];
4364    dst->u[1] = src0->u[1] ^ src1->u[1];
4365    dst->u[2] = src0->u[2] ^ src1->u[2];
4366    dst->u[3] = src0->u[3] ^ src1->u[3];
4367 }
4368
4369 static void
4370 micro_mod(union tgsi_exec_channel *dst,
4371           const union tgsi_exec_channel *src0,
4372           const union tgsi_exec_channel *src1)
4373 {
4374    dst->i[0] = src0->i[0] % src1->i[0];
4375    dst->i[1] = src0->i[1] % src1->i[1];
4376    dst->i[2] = src0->i[2] % src1->i[2];
4377    dst->i[3] = src0->i[3] % src1->i[3];
4378 }
4379
4380 static void
4381 micro_f2i(union tgsi_exec_channel *dst,
4382           const union tgsi_exec_channel *src)
4383 {
4384    dst->i[0] = (int)src->f[0];
4385    dst->i[1] = (int)src->f[1];
4386    dst->i[2] = (int)src->f[2];
4387    dst->i[3] = (int)src->f[3];
4388 }
4389
4390 static void
4391 micro_fseq(union tgsi_exec_channel *dst,
4392            const union tgsi_exec_channel *src0,
4393            const union tgsi_exec_channel *src1)
4394 {
4395    dst->u[0] = src0->f[0] == src1->f[0] ? ~0 : 0;
4396    dst->u[1] = src0->f[1] == src1->f[1] ? ~0 : 0;
4397    dst->u[2] = src0->f[2] == src1->f[2] ? ~0 : 0;
4398    dst->u[3] = src0->f[3] == src1->f[3] ? ~0 : 0;
4399 }
4400
4401 static void
4402 micro_fsge(union tgsi_exec_channel *dst,
4403            const union tgsi_exec_channel *src0,
4404            const union tgsi_exec_channel *src1)
4405 {
4406    dst->u[0] = src0->f[0] >= src1->f[0] ? ~0 : 0;
4407    dst->u[1] = src0->f[1] >= src1->f[1] ? ~0 : 0;
4408    dst->u[2] = src0->f[2] >= src1->f[2] ? ~0 : 0;
4409    dst->u[3] = src0->f[3] >= src1->f[3] ? ~0 : 0;
4410 }
4411
4412 static void
4413 micro_fslt(union tgsi_exec_channel *dst,
4414            const union tgsi_exec_channel *src0,
4415            const union tgsi_exec_channel *src1)
4416 {
4417    dst->u[0] = src0->f[0] < src1->f[0] ? ~0 : 0;
4418    dst->u[1] = src0->f[1] < src1->f[1] ? ~0 : 0;
4419    dst->u[2] = src0->f[2] < src1->f[2] ? ~0 : 0;
4420    dst->u[3] = src0->f[3] < src1->f[3] ? ~0 : 0;
4421 }
4422
4423 static void
4424 micro_fsne(union tgsi_exec_channel *dst,
4425            const union tgsi_exec_channel *src0,
4426            const union tgsi_exec_channel *src1)
4427 {
4428    dst->u[0] = src0->f[0] != src1->f[0] ? ~0 : 0;
4429    dst->u[1] = src0->f[1] != src1->f[1] ? ~0 : 0;
4430    dst->u[2] = src0->f[2] != src1->f[2] ? ~0 : 0;
4431    dst->u[3] = src0->f[3] != src1->f[3] ? ~0 : 0;
4432 }
4433
4434 static void
4435 micro_idiv(union tgsi_exec_channel *dst,
4436            const union tgsi_exec_channel *src0,
4437            const union tgsi_exec_channel *src1)
4438 {
4439    dst->i[0] = src1->i[0] ? src0->i[0] / src1->i[0] : 0;
4440    dst->i[1] = src1->i[1] ? src0->i[1] / src1->i[1] : 0;
4441    dst->i[2] = src1->i[2] ? src0->i[2] / src1->i[2] : 0;
4442    dst->i[3] = src1->i[3] ? src0->i[3] / src1->i[3] : 0;
4443 }
4444
4445 static void
4446 micro_imax(union tgsi_exec_channel *dst,
4447            const union tgsi_exec_channel *src0,
4448            const union tgsi_exec_channel *src1)
4449 {
4450    dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
4451    dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
4452    dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
4453    dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
4454 }
4455
4456 static void
4457 micro_imin(union tgsi_exec_channel *dst,
4458            const union tgsi_exec_channel *src0,
4459            const union tgsi_exec_channel *src1)
4460 {
4461    dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
4462    dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
4463    dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
4464    dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
4465 }
4466
4467 static void
4468 micro_isge(union tgsi_exec_channel *dst,
4469            const union tgsi_exec_channel *src0,
4470            const union tgsi_exec_channel *src1)
4471 {
4472    dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
4473    dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
4474    dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
4475    dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
4476 }
4477
4478 static void
4479 micro_ishr(union tgsi_exec_channel *dst,
4480            const union tgsi_exec_channel *src0,
4481            const union tgsi_exec_channel *src1)
4482 {
4483    unsigned masked_count;
4484    masked_count = src1->i[0] & 0x1f;
4485    dst->i[0] = src0->i[0] >> masked_count;
4486    masked_count = src1->i[1] & 0x1f;
4487    dst->i[1] = src0->i[1] >> masked_count;
4488    masked_count = src1->i[2] & 0x1f;
4489    dst->i[2] = src0->i[2] >> masked_count;
4490    masked_count = src1->i[3] & 0x1f;
4491    dst->i[3] = src0->i[3] >> masked_count;
4492 }
4493
4494 static void
4495 micro_islt(union tgsi_exec_channel *dst,
4496            const union tgsi_exec_channel *src0,
4497            const union tgsi_exec_channel *src1)
4498 {
4499    dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
4500    dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
4501    dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
4502    dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
4503 }
4504
4505 static void
4506 micro_f2u(union tgsi_exec_channel *dst,
4507           const union tgsi_exec_channel *src)
4508 {
4509    dst->u[0] = (uint)src->f[0];
4510    dst->u[1] = (uint)src->f[1];
4511    dst->u[2] = (uint)src->f[2];
4512    dst->u[3] = (uint)src->f[3];
4513 }
4514
4515 static void
4516 micro_u2f(union tgsi_exec_channel *dst,
4517           const union tgsi_exec_channel *src)
4518 {
4519    dst->f[0] = (float)src->u[0];
4520    dst->f[1] = (float)src->u[1];
4521    dst->f[2] = (float)src->u[2];
4522    dst->f[3] = (float)src->u[3];
4523 }
4524
4525 static void
4526 micro_uadd(union tgsi_exec_channel *dst,
4527            const union tgsi_exec_channel *src0,
4528            const union tgsi_exec_channel *src1)
4529 {
4530    dst->u[0] = src0->u[0] + src1->u[0];
4531    dst->u[1] = src0->u[1] + src1->u[1];
4532    dst->u[2] = src0->u[2] + src1->u[2];
4533    dst->u[3] = src0->u[3] + src1->u[3];
4534 }
4535
4536 static void
4537 micro_udiv(union tgsi_exec_channel *dst,
4538            const union tgsi_exec_channel *src0,
4539            const union tgsi_exec_channel *src1)
4540 {
4541    dst->u[0] = src1->u[0] ? src0->u[0] / src1->u[0] : ~0u;
4542    dst->u[1] = src1->u[1] ? src0->u[1] / src1->u[1] : ~0u;
4543    dst->u[2] = src1->u[2] ? src0->u[2] / src1->u[2] : ~0u;
4544    dst->u[3] = src1->u[3] ? src0->u[3] / src1->u[3] : ~0u;
4545 }
4546
4547 static void
4548 micro_umad(union tgsi_exec_channel *dst,
4549            const union tgsi_exec_channel *src0,
4550            const union tgsi_exec_channel *src1,
4551            const union tgsi_exec_channel *src2)
4552 {
4553    dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
4554    dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
4555    dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
4556    dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
4557 }
4558
4559 static void
4560 micro_umax(union tgsi_exec_channel *dst,
4561            const union tgsi_exec_channel *src0,
4562            const union tgsi_exec_channel *src1)
4563 {
4564    dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
4565    dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
4566    dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
4567    dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
4568 }
4569
4570 static void
4571 micro_umin(union tgsi_exec_channel *dst,
4572            const union tgsi_exec_channel *src0,
4573            const union tgsi_exec_channel *src1)
4574 {
4575    dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
4576    dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
4577    dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
4578    dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
4579 }
4580
4581 static void
4582 micro_umod(union tgsi_exec_channel *dst,
4583            const union tgsi_exec_channel *src0,
4584            const union tgsi_exec_channel *src1)
4585 {
4586    dst->u[0] = src1->u[0] ? src0->u[0] % src1->u[0] : ~0u;
4587    dst->u[1] = src1->u[1] ? src0->u[1] % src1->u[1] : ~0u;
4588    dst->u[2] = src1->u[2] ? src0->u[2] % src1->u[2] : ~0u;
4589    dst->u[3] = src1->u[3] ? src0->u[3] % src1->u[3] : ~0u;
4590 }
4591
4592 static void
4593 micro_umul(union tgsi_exec_channel *dst,
4594            const union tgsi_exec_channel *src0,
4595            const union tgsi_exec_channel *src1)
4596 {
4597    dst->u[0] = src0->u[0] * src1->u[0];
4598    dst->u[1] = src0->u[1] * src1->u[1];
4599    dst->u[2] = src0->u[2] * src1->u[2];
4600    dst->u[3] = src0->u[3] * src1->u[3];
4601 }
4602
4603 static void
4604 micro_imul_hi(union tgsi_exec_channel *dst,
4605               const union tgsi_exec_channel *src0,
4606               const union tgsi_exec_channel *src1)
4607 {
4608 #define I64M(x, y) ((((int64_t)x) * ((int64_t)y)) >> 32)
4609    dst->i[0] = I64M(src0->i[0], src1->i[0]);
4610    dst->i[1] = I64M(src0->i[1], src1->i[1]);
4611    dst->i[2] = I64M(src0->i[2], src1->i[2]);
4612    dst->i[3] = I64M(src0->i[3], src1->i[3]);
4613 #undef I64M
4614 }
4615
4616 static void
4617 micro_umul_hi(union tgsi_exec_channel *dst,
4618               const union tgsi_exec_channel *src0,
4619               const union tgsi_exec_channel *src1)
4620 {
4621 #define U64M(x, y) ((((uint64_t)x) * ((uint64_t)y)) >> 32)
4622    dst->u[0] = U64M(src0->u[0], src1->u[0]);
4623    dst->u[1] = U64M(src0->u[1], src1->u[1]);
4624    dst->u[2] = U64M(src0->u[2], src1->u[2]);
4625    dst->u[3] = U64M(src0->u[3], src1->u[3]);
4626 #undef U64M
4627 }
4628
4629 static void
4630 micro_useq(union tgsi_exec_channel *dst,
4631            const union tgsi_exec_channel *src0,
4632            const union tgsi_exec_channel *src1)
4633 {
4634    dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
4635    dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
4636    dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
4637    dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
4638 }
4639
4640 static void
4641 micro_usge(union tgsi_exec_channel *dst,
4642            const union tgsi_exec_channel *src0,
4643            const union tgsi_exec_channel *src1)
4644 {
4645    dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
4646    dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
4647    dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
4648    dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
4649 }
4650
4651 static void
4652 micro_ushr(union tgsi_exec_channel *dst,
4653            const union tgsi_exec_channel *src0,
4654            const union tgsi_exec_channel *src1)
4655 {
4656    unsigned masked_count;
4657    masked_count = src1->u[0] & 0x1f;
4658    dst->u[0] = src0->u[0] >> masked_count;
4659    masked_count = src1->u[1] & 0x1f;
4660    dst->u[1] = src0->u[1] >> masked_count;
4661    masked_count = src1->u[2] & 0x1f;
4662    dst->u[2] = src0->u[2] >> masked_count;
4663    masked_count = src1->u[3] & 0x1f;
4664    dst->u[3] = src0->u[3] >> masked_count;
4665 }
4666
4667 static void
4668 micro_uslt(union tgsi_exec_channel *dst,
4669            const union tgsi_exec_channel *src0,
4670            const union tgsi_exec_channel *src1)
4671 {
4672    dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
4673    dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
4674    dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
4675    dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
4676 }
4677
4678 static void
4679 micro_usne(union tgsi_exec_channel *dst,
4680            const union tgsi_exec_channel *src0,
4681            const union tgsi_exec_channel *src1)
4682 {
4683    dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
4684    dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
4685    dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
4686    dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
4687 }
4688
4689 static void
4690 micro_uarl(union tgsi_exec_channel *dst,
4691            const union tgsi_exec_channel *src)
4692 {
4693    dst->i[0] = src->u[0];
4694    dst->i[1] = src->u[1];
4695    dst->i[2] = src->u[2];
4696    dst->i[3] = src->u[3];
4697 }
4698
4699 static void
4700 micro_ucmp(union tgsi_exec_channel *dst,
4701            const union tgsi_exec_channel *src0,
4702            const union tgsi_exec_channel *src1,
4703            const union tgsi_exec_channel *src2)
4704 {
4705    dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
4706    dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
4707    dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
4708    dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
4709 }
4710
4711 /**
4712  * Signed bitfield extract (i.e. sign-extend the extracted bits)
4713  */
4714 static void
4715 micro_ibfe(union tgsi_exec_channel *dst,
4716            const union tgsi_exec_channel *src0,
4717            const union tgsi_exec_channel *src1,
4718            const union tgsi_exec_channel *src2)
4719 {
4720    int i;
4721    for (i = 0; i < 4; i++) {
4722       int width = src2->i[i] & 0x1f;
4723       int offset = src1->i[i] & 0x1f;
4724       if (width == 0)
4725          dst->i[i] = 0;
4726       else if (width + offset < 32)
4727          dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
4728       else
4729          dst->i[i] = src0->i[i] >> offset;
4730    }
4731 }
4732
4733 /**
4734  * Unsigned bitfield extract
4735  */
4736 static void
4737 micro_ubfe(union tgsi_exec_channel *dst,
4738            const union tgsi_exec_channel *src0,
4739            const union tgsi_exec_channel *src1,
4740            const union tgsi_exec_channel *src2)
4741 {
4742    int i;
4743    for (i = 0; i < 4; i++) {
4744       int width = src2->u[i] & 0x1f;
4745       int offset = src1->u[i] & 0x1f;
4746       if (width == 0)
4747          dst->u[i] = 0;
4748       else if (width + offset < 32)
4749          dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
4750       else
4751          dst->u[i] = src0->u[i] >> offset;
4752    }
4753 }
4754
4755 /**
4756  * Bitfield insert: copy low bits from src1 into a region of src0.
4757  */
4758 static void
4759 micro_bfi(union tgsi_exec_channel *dst,
4760           const union tgsi_exec_channel *src0,
4761           const union tgsi_exec_channel *src1,
4762           const union tgsi_exec_channel *src2,
4763           const union tgsi_exec_channel *src3)
4764 {
4765    int i;
4766    for (i = 0; i < 4; i++) {
4767       int width = src3->u[i] & 0x1f;
4768       int offset = src2->u[i] & 0x1f;
4769       int bitmask = ((1 << width) - 1) << offset;
4770       dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
4771    }
4772 }
4773
4774 static void
4775 micro_brev(union tgsi_exec_channel *dst,
4776            const union tgsi_exec_channel *src)
4777 {
4778    dst->u[0] = util_bitreverse(src->u[0]);
4779    dst->u[1] = util_bitreverse(src->u[1]);
4780    dst->u[2] = util_bitreverse(src->u[2]);
4781    dst->u[3] = util_bitreverse(src->u[3]);
4782 }
4783
4784 static void
4785 micro_popc(union tgsi_exec_channel *dst,
4786            const union tgsi_exec_channel *src)
4787 {
4788    dst->u[0] = util_bitcount(src->u[0]);
4789    dst->u[1] = util_bitcount(src->u[1]);
4790    dst->u[2] = util_bitcount(src->u[2]);
4791    dst->u[3] = util_bitcount(src->u[3]);
4792 }
4793
4794 static void
4795 micro_lsb(union tgsi_exec_channel *dst,
4796           const union tgsi_exec_channel *src)
4797 {
4798    dst->i[0] = ffs(src->u[0]) - 1;
4799    dst->i[1] = ffs(src->u[1]) - 1;
4800    dst->i[2] = ffs(src->u[2]) - 1;
4801    dst->i[3] = ffs(src->u[3]) - 1;
4802 }
4803
4804 static void
4805 micro_imsb(union tgsi_exec_channel *dst,
4806            const union tgsi_exec_channel *src)
4807 {
4808    dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
4809    dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
4810    dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
4811    dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
4812 }
4813
4814 static void
4815 micro_umsb(union tgsi_exec_channel *dst,
4816            const union tgsi_exec_channel *src)
4817 {
4818    dst->i[0] = util_last_bit(src->u[0]) - 1;
4819    dst->i[1] = util_last_bit(src->u[1]) - 1;
4820    dst->i[2] = util_last_bit(src->u[2]) - 1;
4821    dst->i[3] = util_last_bit(src->u[3]) - 1;
4822 }
4823
4824 static void
4825 exec_instruction(
4826    struct tgsi_exec_machine *mach,
4827    const struct tgsi_full_instruction *inst,
4828    int *pc )
4829 {
4830    union tgsi_exec_channel r[10];
4831
4832    (*pc)++;
4833
4834    switch (inst->Instruction.Opcode) {
4835    case TGSI_OPCODE_ARL:
4836       exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
4837       break;
4838
4839    case TGSI_OPCODE_MOV:
4840       exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
4841       break;
4842
4843    case TGSI_OPCODE_LIT:
4844       exec_lit(mach, inst);
4845       break;
4846
4847    case TGSI_OPCODE_RCP:
4848       exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4849       break;
4850
4851    case TGSI_OPCODE_RSQ:
4852       exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4853       break;
4854
4855    case TGSI_OPCODE_EXP:
4856       exec_exp(mach, inst);
4857       break;
4858
4859    case TGSI_OPCODE_LOG:
4860       exec_log(mach, inst);
4861       break;
4862
4863    case TGSI_OPCODE_MUL:
4864       exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4865       break;
4866
4867    case TGSI_OPCODE_ADD:
4868       exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4869       break;
4870
4871    case TGSI_OPCODE_DP3:
4872       exec_dp3(mach, inst);
4873       break;
4874
4875    case TGSI_OPCODE_DP4:
4876       exec_dp4(mach, inst);
4877       break;
4878
4879    case TGSI_OPCODE_DST:
4880       exec_dst(mach, inst);
4881       break;
4882
4883    case TGSI_OPCODE_MIN:
4884       exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4885       break;
4886
4887    case TGSI_OPCODE_MAX:
4888       exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4889       break;
4890
4891    case TGSI_OPCODE_SLT:
4892       exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4893       break;
4894
4895    case TGSI_OPCODE_SGE:
4896       exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4897       break;
4898
4899    case TGSI_OPCODE_MAD:
4900       exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4901       break;
4902
4903    case TGSI_OPCODE_SUB:
4904       exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4905       break;
4906
4907    case TGSI_OPCODE_LRP:
4908       exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4909       break;
4910
4911    case TGSI_OPCODE_SQRT:
4912       exec_scalar_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4913       break;
4914
4915    case TGSI_OPCODE_DP2A:
4916       exec_dp2a(mach, inst);
4917       break;
4918
4919    case TGSI_OPCODE_FRC:
4920       exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4921       break;
4922
4923    case TGSI_OPCODE_CLAMP:
4924       exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4925       break;
4926
4927    case TGSI_OPCODE_FLR:
4928       exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4929       break;
4930
4931    case TGSI_OPCODE_ROUND:
4932       exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4933       break;
4934
4935    case TGSI_OPCODE_EX2:
4936       exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4937       break;
4938
4939    case TGSI_OPCODE_LG2:
4940       exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4941       break;
4942
4943    case TGSI_OPCODE_POW:
4944       exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4945       break;
4946
4947    case TGSI_OPCODE_XPD:
4948       exec_xpd(mach, inst);
4949       break;
4950
4951    case TGSI_OPCODE_ABS:
4952       exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4953       break;
4954
4955    case TGSI_OPCODE_DPH:
4956       exec_dph(mach, inst);
4957       break;
4958
4959    case TGSI_OPCODE_COS:
4960       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4961       break;
4962
4963    case TGSI_OPCODE_DDX:
4964       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4965       break;
4966
4967    case TGSI_OPCODE_DDY:
4968       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4969       break;
4970
4971    case TGSI_OPCODE_KILL:
4972       exec_kill (mach, inst);
4973       break;
4974
4975    case TGSI_OPCODE_KILL_IF:
4976       exec_kill_if (mach, inst);
4977       break;
4978
4979    case TGSI_OPCODE_PK2H:
4980       exec_pk2h(mach, inst);
4981       break;
4982
4983    case TGSI_OPCODE_PK2US:
4984       assert (0);
4985       break;
4986
4987    case TGSI_OPCODE_PK4B:
4988       assert (0);
4989       break;
4990
4991    case TGSI_OPCODE_PK4UB:
4992       assert (0);
4993       break;
4994
4995    case TGSI_OPCODE_SEQ:
4996       exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
4997       break;
4998
4999    case TGSI_OPCODE_SGT:
5000       exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5001       break;
5002
5003    case TGSI_OPCODE_SIN:
5004       exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5005       break;
5006
5007    case TGSI_OPCODE_SLE:
5008       exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5009       break;
5010
5011    case TGSI_OPCODE_SNE:
5012       exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5013       break;
5014
5015    case TGSI_OPCODE_TEX:
5016       /* simple texture lookup */
5017       /* src[0] = texcoord */
5018       /* src[1] = sampler unit */
5019       exec_tex(mach, inst, TEX_MODIFIER_NONE, 1);
5020       break;
5021
5022    case TGSI_OPCODE_TXB:
5023       /* Texture lookup with lod bias */
5024       /* src[0] = texcoord (src[0].w = LOD bias) */
5025       /* src[1] = sampler unit */
5026       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 1);
5027       break;
5028
5029    case TGSI_OPCODE_TXD:
5030       /* Texture lookup with explicit partial derivatives */
5031       /* src[0] = texcoord */
5032       /* src[1] = d[strq]/dx */
5033       /* src[2] = d[strq]/dy */
5034       /* src[3] = sampler unit */
5035       exec_txd(mach, inst);
5036       break;
5037
5038    case TGSI_OPCODE_TXL:
5039       /* Texture lookup with explicit LOD */
5040       /* src[0] = texcoord (src[0].w = LOD) */
5041       /* src[1] = sampler unit */
5042       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 1);
5043       break;
5044
5045    case TGSI_OPCODE_TXP:
5046       /* Texture lookup with projection */
5047       /* src[0] = texcoord (src[0].w = projection) */
5048       /* src[1] = sampler unit */
5049       exec_tex(mach, inst, TEX_MODIFIER_PROJECTED, 1);
5050       break;
5051
5052    case TGSI_OPCODE_TG4:
5053       /* src[0] = texcoord */
5054       /* src[1] = component */
5055       /* src[2] = sampler unit */
5056       exec_tex(mach, inst, TEX_MODIFIER_GATHER, 2);
5057       break;
5058
5059    case TGSI_OPCODE_LODQ:
5060       /* src[0] = texcoord */
5061       /* src[1] = sampler unit */
5062       exec_lodq(mach, inst);
5063       break;
5064
5065    case TGSI_OPCODE_UP2H:
5066       exec_up2h(mach, inst);
5067       break;
5068
5069    case TGSI_OPCODE_UP2US:
5070       assert (0);
5071       break;
5072
5073    case TGSI_OPCODE_UP4B:
5074       assert (0);
5075       break;
5076
5077    case TGSI_OPCODE_UP4UB:
5078       assert (0);
5079       break;
5080
5081    case TGSI_OPCODE_ARR:
5082       exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5083       break;
5084
5085    case TGSI_OPCODE_CAL:
5086       /* skip the call if no execution channels are enabled */
5087       if (mach->ExecMask) {
5088          /* do the call */
5089
5090          /* First, record the depths of the execution stacks.
5091           * This is important for deeply nested/looped return statements.
5092           * We have to unwind the stacks by the correct amount.  For a
5093           * real code generator, we could determine the number of entries
5094           * to pop off each stack with simple static analysis and avoid
5095           * implementing this data structure at run time.
5096           */
5097          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
5098          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
5099          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
5100          mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
5101          mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
5102          /* note that PC was already incremented above */
5103          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
5104
5105          mach->CallStackTop++;
5106
5107          /* Second, push the Cond, Loop, Cont, Func stacks */
5108          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5109          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5110          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5111          assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
5112          assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5113          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
5114
5115          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5116          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5117          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5118          mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
5119          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5120          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
5121
5122          /* Finally, jump to the subroutine.  The label is a pointer
5123           * (an instruction number) to the BGNSUB instruction.
5124           */
5125          *pc = inst->Label.Label;
5126          assert(mach->Instructions[*pc].Instruction.Opcode
5127                 == TGSI_OPCODE_BGNSUB);
5128       }
5129       break;
5130
5131    case TGSI_OPCODE_RET:
5132       mach->FuncMask &= ~mach->ExecMask;
5133       UPDATE_EXEC_MASK(mach);
5134
5135       if (mach->FuncMask == 0x0) {
5136          /* really return now (otherwise, keep executing) */
5137
5138          if (mach->CallStackTop == 0) {
5139             /* returning from main() */
5140             mach->CondStackTop = 0;
5141             mach->LoopStackTop = 0;
5142             *pc = -1;
5143             return;
5144          }
5145
5146          assert(mach->CallStackTop > 0);
5147          mach->CallStackTop--;
5148
5149          mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5150          mach->CondMask = mach->CondStack[mach->CondStackTop];
5151
5152          mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5153          mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5154
5155          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5156          mach->ContMask = mach->ContStack[mach->ContStackTop];
5157
5158          mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5159          mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5160
5161          mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5162          mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5163
5164          assert(mach->FuncStackTop > 0);
5165          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5166
5167          *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5168
5169          UPDATE_EXEC_MASK(mach);
5170       }
5171       break;
5172
5173    case TGSI_OPCODE_SSG:
5174       exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5175       break;
5176
5177    case TGSI_OPCODE_CMP:
5178       exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5179       break;
5180
5181    case TGSI_OPCODE_SCS:
5182       exec_scs(mach, inst);
5183       break;
5184
5185    case TGSI_OPCODE_DIV:
5186       exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5187       break;
5188
5189    case TGSI_OPCODE_DP2:
5190       exec_dp2(mach, inst);
5191       break;
5192
5193    case TGSI_OPCODE_IF:
5194       /* push CondMask */
5195       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5196       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5197       FETCH( &r[0], 0, TGSI_CHAN_X );
5198       /* update CondMask */
5199       if( ! r[0].f[0] ) {
5200          mach->CondMask &= ~0x1;
5201       }
5202       if( ! r[0].f[1] ) {
5203          mach->CondMask &= ~0x2;
5204       }
5205       if( ! r[0].f[2] ) {
5206          mach->CondMask &= ~0x4;
5207       }
5208       if( ! r[0].f[3] ) {
5209          mach->CondMask &= ~0x8;
5210       }
5211       UPDATE_EXEC_MASK(mach);
5212       /* Todo: If CondMask==0, jump to ELSE */
5213       break;
5214
5215    case TGSI_OPCODE_UIF:
5216       /* push CondMask */
5217       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5218       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5219       IFETCH( &r[0], 0, TGSI_CHAN_X );
5220       /* update CondMask */
5221       if( ! r[0].u[0] ) {
5222          mach->CondMask &= ~0x1;
5223       }
5224       if( ! r[0].u[1] ) {
5225          mach->CondMask &= ~0x2;
5226       }
5227       if( ! r[0].u[2] ) {
5228          mach->CondMask &= ~0x4;
5229       }
5230       if( ! r[0].u[3] ) {
5231          mach->CondMask &= ~0x8;
5232       }
5233       UPDATE_EXEC_MASK(mach);
5234       /* Todo: If CondMask==0, jump to ELSE */
5235       break;
5236
5237    case TGSI_OPCODE_ELSE:
5238       /* invert CondMask wrt previous mask */
5239       {
5240          uint prevMask;
5241          assert(mach->CondStackTop > 0);
5242          prevMask = mach->CondStack[mach->CondStackTop - 1];
5243          mach->CondMask = ~mach->CondMask & prevMask;
5244          UPDATE_EXEC_MASK(mach);
5245          /* Todo: If CondMask==0, jump to ENDIF */
5246       }
5247       break;
5248
5249    case TGSI_OPCODE_ENDIF:
5250       /* pop CondMask */
5251       assert(mach->CondStackTop > 0);
5252       mach->CondMask = mach->CondStack[--mach->CondStackTop];
5253       UPDATE_EXEC_MASK(mach);
5254       break;
5255
5256    case TGSI_OPCODE_END:
5257       /* make sure we end primitives which haven't
5258        * been explicitly emitted */
5259       conditional_emit_primitive(mach);
5260       /* halt execution */
5261       *pc = -1;
5262       break;
5263
5264    case TGSI_OPCODE_PUSHA:
5265       assert (0);
5266       break;
5267
5268    case TGSI_OPCODE_POPA:
5269       assert (0);
5270       break;
5271
5272    case TGSI_OPCODE_CEIL:
5273       exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5274       break;
5275
5276    case TGSI_OPCODE_I2F:
5277       exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
5278       break;
5279
5280    case TGSI_OPCODE_NOT:
5281       exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5282       break;
5283
5284    case TGSI_OPCODE_TRUNC:
5285       exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5286       break;
5287
5288    case TGSI_OPCODE_SHL:
5289       exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5290       break;
5291
5292    case TGSI_OPCODE_AND:
5293       exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5294       break;
5295
5296    case TGSI_OPCODE_OR:
5297       exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5298       break;
5299
5300    case TGSI_OPCODE_MOD:
5301       exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5302       break;
5303
5304    case TGSI_OPCODE_XOR:
5305       exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5306       break;
5307
5308    case TGSI_OPCODE_SAD:
5309       assert (0);
5310       break;
5311
5312    case TGSI_OPCODE_TXF:
5313       exec_txf(mach, inst);
5314       break;
5315
5316    case TGSI_OPCODE_TXQ:
5317       exec_txq(mach, inst);
5318       break;
5319
5320    case TGSI_OPCODE_EMIT:
5321       emit_vertex(mach);
5322       break;
5323
5324    case TGSI_OPCODE_ENDPRIM:
5325       emit_primitive(mach);
5326       break;
5327
5328    case TGSI_OPCODE_BGNLOOP:
5329       /* push LoopMask and ContMasks */
5330       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5331       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5332       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5333       assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5334
5335       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5336       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5337       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
5338       mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5339       mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
5340       break;
5341
5342    case TGSI_OPCODE_ENDLOOP:
5343       /* Restore ContMask, but don't pop */
5344       assert(mach->ContStackTop > 0);
5345       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
5346       UPDATE_EXEC_MASK(mach);
5347       if (mach->ExecMask) {
5348          /* repeat loop: jump to instruction just past BGNLOOP */
5349          assert(mach->LoopLabelStackTop > 0);
5350          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
5351       }
5352       else {
5353          /* exit loop: pop LoopMask */
5354          assert(mach->LoopStackTop > 0);
5355          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
5356          /* pop ContMask */
5357          assert(mach->ContStackTop > 0);
5358          mach->ContMask = mach->ContStack[--mach->ContStackTop];
5359          assert(mach->LoopLabelStackTop > 0);
5360          --mach->LoopLabelStackTop;
5361
5362          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
5363       }
5364       UPDATE_EXEC_MASK(mach);
5365       break;
5366
5367    case TGSI_OPCODE_BRK:
5368       exec_break(mach);
5369       break;
5370
5371    case TGSI_OPCODE_CONT:
5372       /* turn off cont channels for each enabled exec channel */
5373       mach->ContMask &= ~mach->ExecMask;
5374       /* Todo: if mach->LoopMask == 0, jump to end of loop */
5375       UPDATE_EXEC_MASK(mach);
5376       break;
5377
5378    case TGSI_OPCODE_BGNSUB:
5379       /* no-op */
5380       break;
5381
5382    case TGSI_OPCODE_ENDSUB:
5383       /*
5384        * XXX: This really should be a no-op. We should never reach this opcode.
5385        */
5386
5387       assert(mach->CallStackTop > 0);
5388       mach->CallStackTop--;
5389
5390       mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5391       mach->CondMask = mach->CondStack[mach->CondStackTop];
5392
5393       mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5394       mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5395
5396       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5397       mach->ContMask = mach->ContStack[mach->ContStackTop];
5398
5399       mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5400       mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5401
5402       mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5403       mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5404
5405       assert(mach->FuncStackTop > 0);
5406       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5407
5408       *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5409
5410       UPDATE_EXEC_MASK(mach);
5411       break;
5412
5413    case TGSI_OPCODE_NOP:
5414       break;
5415
5416    case TGSI_OPCODE_BREAKC:
5417       IFETCH(&r[0], 0, TGSI_CHAN_X);
5418       /* update CondMask */
5419       if (r[0].u[0] && (mach->ExecMask & 0x1)) {
5420          mach->LoopMask &= ~0x1;
5421       }
5422       if (r[0].u[1] && (mach->ExecMask & 0x2)) {
5423          mach->LoopMask &= ~0x2;
5424       }
5425       if (r[0].u[2] && (mach->ExecMask & 0x4)) {
5426          mach->LoopMask &= ~0x4;
5427       }
5428       if (r[0].u[3] && (mach->ExecMask & 0x8)) {
5429          mach->LoopMask &= ~0x8;
5430       }
5431       /* Todo: if mach->LoopMask == 0, jump to end of loop */
5432       UPDATE_EXEC_MASK(mach);
5433       break;
5434
5435    case TGSI_OPCODE_F2I:
5436       exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5437       break;
5438
5439    case TGSI_OPCODE_FSEQ:
5440       exec_vector_binary(mach, inst, micro_fseq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5441       break;
5442
5443    case TGSI_OPCODE_FSGE:
5444       exec_vector_binary(mach, inst, micro_fsge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5445       break;
5446
5447    case TGSI_OPCODE_FSLT:
5448       exec_vector_binary(mach, inst, micro_fslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5449       break;
5450
5451    case TGSI_OPCODE_FSNE:
5452       exec_vector_binary(mach, inst, micro_fsne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5453       break;
5454
5455    case TGSI_OPCODE_IDIV:
5456       exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5457       break;
5458
5459    case TGSI_OPCODE_IMAX:
5460       exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5461       break;
5462
5463    case TGSI_OPCODE_IMIN:
5464       exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5465       break;
5466
5467    case TGSI_OPCODE_INEG:
5468       exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5469       break;
5470
5471    case TGSI_OPCODE_ISGE:
5472       exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5473       break;
5474
5475    case TGSI_OPCODE_ISHR:
5476       exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5477       break;
5478
5479    case TGSI_OPCODE_ISLT:
5480       exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5481       break;
5482
5483    case TGSI_OPCODE_F2U:
5484       exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5485       break;
5486
5487    case TGSI_OPCODE_U2F:
5488       exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
5489       break;
5490
5491    case TGSI_OPCODE_UADD:
5492       exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5493       break;
5494
5495    case TGSI_OPCODE_UDIV:
5496       exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5497       break;
5498
5499    case TGSI_OPCODE_UMAD:
5500       exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5501       break;
5502
5503    case TGSI_OPCODE_UMAX:
5504       exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5505       break;
5506
5507    case TGSI_OPCODE_UMIN:
5508       exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5509       break;
5510
5511    case TGSI_OPCODE_UMOD:
5512       exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5513       break;
5514
5515    case TGSI_OPCODE_UMUL:
5516       exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5517       break;
5518
5519    case TGSI_OPCODE_IMUL_HI:
5520       exec_vector_binary(mach, inst, micro_imul_hi, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5521       break;
5522
5523    case TGSI_OPCODE_UMUL_HI:
5524       exec_vector_binary(mach, inst, micro_umul_hi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5525       break;
5526
5527    case TGSI_OPCODE_USEQ:
5528       exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5529       break;
5530
5531    case TGSI_OPCODE_USGE:
5532       exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5533       break;
5534
5535    case TGSI_OPCODE_USHR:
5536       exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5537       break;
5538
5539    case TGSI_OPCODE_USLT:
5540       exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5541       break;
5542
5543    case TGSI_OPCODE_USNE:
5544       exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5545       break;
5546
5547    case TGSI_OPCODE_SWITCH:
5548       exec_switch(mach, inst);
5549       break;
5550
5551    case TGSI_OPCODE_CASE:
5552       exec_case(mach, inst);
5553       break;
5554
5555    case TGSI_OPCODE_DEFAULT:
5556       exec_default(mach);
5557       break;
5558
5559    case TGSI_OPCODE_ENDSWITCH:
5560       exec_endswitch(mach);
5561       break;
5562
5563    case TGSI_OPCODE_SAMPLE_I:
5564       exec_txf(mach, inst);
5565       break;
5566
5567    case TGSI_OPCODE_SAMPLE_I_MS:
5568       exec_txf(mach, inst);
5569       break;
5570
5571    case TGSI_OPCODE_SAMPLE:
5572       exec_sample(mach, inst, TEX_MODIFIER_NONE, FALSE);
5573       break;
5574
5575    case TGSI_OPCODE_SAMPLE_B:
5576       exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS, FALSE);
5577       break;
5578
5579    case TGSI_OPCODE_SAMPLE_C:
5580       exec_sample(mach, inst, TEX_MODIFIER_NONE, TRUE);
5581       break;
5582
5583    case TGSI_OPCODE_SAMPLE_C_LZ:
5584       exec_sample(mach, inst, TEX_MODIFIER_LEVEL_ZERO, TRUE);
5585       break;
5586
5587    case TGSI_OPCODE_SAMPLE_D:
5588       exec_sample_d(mach, inst);
5589       break;
5590
5591    case TGSI_OPCODE_SAMPLE_L:
5592       exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, FALSE);
5593       break;
5594
5595    case TGSI_OPCODE_GATHER4:
5596       assert(0);
5597       break;
5598
5599    case TGSI_OPCODE_SVIEWINFO:
5600       exec_txq(mach, inst);
5601       break;
5602
5603    case TGSI_OPCODE_SAMPLE_POS:
5604       assert(0);
5605       break;
5606
5607    case TGSI_OPCODE_SAMPLE_INFO:
5608       assert(0);
5609       break;
5610
5611    case TGSI_OPCODE_UARL:
5612       exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
5613       break;
5614
5615    case TGSI_OPCODE_UCMP:
5616       exec_vector_trinary(mach, inst, micro_ucmp, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5617       break;
5618
5619    case TGSI_OPCODE_IABS:
5620       exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5621       break;
5622
5623    case TGSI_OPCODE_ISSG:
5624       exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5625       break;
5626
5627    case TGSI_OPCODE_TEX2:
5628       /* simple texture lookup */
5629       /* src[0] = texcoord */
5630       /* src[1] = compare */
5631       /* src[2] = sampler unit */
5632       exec_tex(mach, inst, TEX_MODIFIER_NONE, 2);
5633       break;
5634    case TGSI_OPCODE_TXB2:
5635       /* simple texture lookup */
5636       /* src[0] = texcoord */
5637       /* src[1] = bias */
5638       /* src[2] = sampler unit */
5639       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 2);
5640       break;
5641    case TGSI_OPCODE_TXL2:
5642       /* simple texture lookup */
5643       /* src[0] = texcoord */
5644       /* src[1] = lod */
5645       /* src[2] = sampler unit */
5646       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
5647       break;
5648
5649    case TGSI_OPCODE_IBFE:
5650       exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5651       break;
5652    case TGSI_OPCODE_UBFE:
5653       exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5654       break;
5655    case TGSI_OPCODE_BFI:
5656       exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5657       break;
5658    case TGSI_OPCODE_BREV:
5659       exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5660       break;
5661    case TGSI_OPCODE_POPC:
5662       exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5663       break;
5664    case TGSI_OPCODE_LSB:
5665       exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
5666       break;
5667    case TGSI_OPCODE_IMSB:
5668       exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5669       break;
5670    case TGSI_OPCODE_UMSB:
5671       exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
5672       break;
5673
5674    case TGSI_OPCODE_F2D:
5675       exec_f2d(mach, inst);
5676       break;
5677
5678    case TGSI_OPCODE_D2F:
5679       exec_d2f(mach, inst);
5680       break;
5681
5682    case TGSI_OPCODE_DABS:
5683       exec_double_unary(mach, inst, micro_dabs);
5684       break;
5685
5686    case TGSI_OPCODE_DNEG:
5687       exec_double_unary(mach, inst, micro_dneg);
5688       break;
5689
5690    case TGSI_OPCODE_DADD:
5691       exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
5692       break;
5693
5694    case TGSI_OPCODE_DMUL:
5695       exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
5696       break;
5697
5698    case TGSI_OPCODE_DMAX:
5699       exec_double_binary(mach, inst, micro_dmax, TGSI_EXEC_DATA_DOUBLE);
5700       break;
5701
5702    case TGSI_OPCODE_DMIN:
5703       exec_double_binary(mach, inst, micro_dmin, TGSI_EXEC_DATA_DOUBLE);
5704       break;
5705
5706    case TGSI_OPCODE_DSLT:
5707       exec_double_binary(mach, inst, micro_dslt, TGSI_EXEC_DATA_UINT);
5708       break;
5709
5710    case TGSI_OPCODE_DSGE:
5711       exec_double_binary(mach, inst, micro_dsge, TGSI_EXEC_DATA_UINT);
5712       break;
5713
5714    case TGSI_OPCODE_DSEQ:
5715       exec_double_binary(mach, inst, micro_dseq, TGSI_EXEC_DATA_UINT);
5716       break;
5717
5718    case TGSI_OPCODE_DSNE:
5719       exec_double_binary(mach, inst, micro_dsne, TGSI_EXEC_DATA_UINT);
5720       break;
5721
5722    case TGSI_OPCODE_DRCP:
5723       exec_double_unary(mach, inst, micro_drcp);
5724       break;
5725
5726    case TGSI_OPCODE_DSQRT:
5727       exec_double_unary(mach, inst, micro_dsqrt);
5728       break;
5729
5730    case TGSI_OPCODE_DRSQ:
5731       exec_double_unary(mach, inst, micro_drsq);
5732       break;
5733
5734    case TGSI_OPCODE_DMAD:
5735       exec_double_trinary(mach, inst, micro_dmad);
5736       break;
5737
5738    case TGSI_OPCODE_DFRAC:
5739       exec_double_unary(mach, inst, micro_dfrac);
5740       break;
5741
5742    case TGSI_OPCODE_DLDEXP:
5743       exec_dldexp(mach, inst);
5744       break;
5745
5746    case TGSI_OPCODE_DFRACEXP:
5747       exec_dfracexp(mach, inst);
5748       break;
5749
5750    case TGSI_OPCODE_I2D:
5751       exec_i2d(mach, inst);
5752       break;
5753
5754    case TGSI_OPCODE_D2I:
5755       exec_d2i(mach, inst);
5756       break;
5757
5758    case TGSI_OPCODE_U2D:
5759       exec_u2d(mach, inst);
5760       break;
5761
5762    case TGSI_OPCODE_D2U:
5763       exec_d2u(mach, inst);
5764       break;
5765
5766    case TGSI_OPCODE_LOAD:
5767       exec_load(mach, inst);
5768       break;
5769
5770    case TGSI_OPCODE_STORE:
5771       exec_store(mach, inst);
5772       break;
5773
5774    case TGSI_OPCODE_ATOMUADD:
5775    case TGSI_OPCODE_ATOMXCHG:
5776    case TGSI_OPCODE_ATOMCAS:
5777    case TGSI_OPCODE_ATOMAND:
5778    case TGSI_OPCODE_ATOMOR:
5779    case TGSI_OPCODE_ATOMXOR:
5780    case TGSI_OPCODE_ATOMUMIN:
5781    case TGSI_OPCODE_ATOMUMAX:
5782    case TGSI_OPCODE_ATOMIMIN:
5783    case TGSI_OPCODE_ATOMIMAX:
5784       exec_atomop(mach, inst);
5785       break;
5786
5787    case TGSI_OPCODE_RESQ:
5788       exec_resq(mach, inst);
5789       break;
5790    case TGSI_OPCODE_BARRIER:
5791    case TGSI_OPCODE_MEMBAR:
5792       break;
5793    default:
5794       assert( 0 );
5795    }
5796 }
5797
5798 static void
5799 tgsi_exec_machine_setup_masks(struct tgsi_exec_machine *mach)
5800 {
5801    uint default_mask = 0xf;
5802
5803    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
5804    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
5805
5806    if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
5807       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
5808       mach->Primitives[0] = 0;
5809       /* GS runs on a single primitive for now */
5810       default_mask = 0x1;
5811    }
5812
5813    if (mach->NonHelperMask == 0)
5814       mach->NonHelperMask = default_mask;
5815    mach->CondMask = default_mask;
5816    mach->LoopMask = default_mask;
5817    mach->ContMask = default_mask;
5818    mach->FuncMask = default_mask;
5819    mach->ExecMask = default_mask;
5820
5821    mach->Switch.mask = default_mask;
5822
5823    assert(mach->CondStackTop == 0);
5824    assert(mach->LoopStackTop == 0);
5825    assert(mach->ContStackTop == 0);
5826    assert(mach->SwitchStackTop == 0);
5827    assert(mach->BreakStackTop == 0);
5828    assert(mach->CallStackTop == 0);
5829 }
5830
5831 /**
5832  * Run TGSI interpreter.
5833  * \return bitmask of "alive" quad components
5834  */
5835 uint
5836 tgsi_exec_machine_run( struct tgsi_exec_machine *mach, int start_pc )
5837 {
5838    uint i;
5839    int pc = 0;
5840
5841    tgsi_exec_machine_setup_masks(mach);
5842
5843    /* execute declarations (interpolants) */
5844    for (i = 0; i < mach->NumDeclarations; i++) {
5845       exec_declaration( mach, mach->Declarations+i );
5846    }
5847
5848    {
5849 #if DEBUG_EXECUTION
5850       struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
5851       struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
5852       uint inst = 1;
5853
5854       memset(mach->Temps, 0, sizeof(temps));
5855       if (mach->Outputs)
5856          memset(mach->Outputs, 0, sizeof(outputs));
5857       memset(temps, 0, sizeof(temps));
5858       memset(outputs, 0, sizeof(outputs));
5859 #endif
5860
5861       /* execute instructions, until pc is set to -1 */
5862       while (pc != -1) {
5863
5864 #if DEBUG_EXECUTION
5865          uint i;
5866
5867          tgsi_dump_instruction(&mach->Instructions[pc], inst++);
5868 #endif
5869
5870          assert(pc < (int) mach->NumInstructions);
5871          exec_instruction(mach, mach->Instructions + pc, &pc);
5872
5873 #if DEBUG_EXECUTION
5874          for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
5875             if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
5876                uint j;
5877
5878                memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
5879                debug_printf("TEMP[%2u] = ", i);
5880                for (j = 0; j < 4; j++) {
5881                   if (j > 0) {
5882                      debug_printf("           ");
5883                   }
5884                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
5885                                temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
5886                                temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
5887                                temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
5888                                temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
5889                }
5890             }
5891          }
5892          if (mach->Outputs) {
5893             for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
5894                if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
5895                   uint j;
5896
5897                   memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
5898                   debug_printf("OUT[%2u] =  ", i);
5899                   for (j = 0; j < 4; j++) {
5900                      if (j > 0) {
5901                         debug_printf("           ");
5902                      }
5903                      debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
5904                                   outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
5905                                   outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
5906                                   outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
5907                                   outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
5908                   }
5909                }
5910             }
5911          }
5912 #endif
5913       }
5914    }
5915
5916 #if 0
5917    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
5918    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
5919       /*
5920        * Scale back depth component.
5921        */
5922       for (i = 0; i < 4; i++)
5923          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
5924    }
5925 #endif
5926
5927    /* Strictly speaking, these assertions aren't really needed but they
5928     * can potentially catch some bugs in the control flow code.
5929     */
5930    assert(mach->CondStackTop == 0);
5931    assert(mach->LoopStackTop == 0);
5932    assert(mach->ContStackTop == 0);
5933    assert(mach->SwitchStackTop == 0);
5934    assert(mach->BreakStackTop == 0);
5935    assert(mach->CallStackTop == 0);
5936
5937    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
5938 }