src/gallium/auxiliary/tgsi/tgsi_exec.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 VMware, Inc.
   4  * All Rights Reserved.
   5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the
   9  * "Software"), to deal in the Software without restriction, including
  10  * without limitation the rights to use, copy, modify, merge, publish,
  11  * distribute, sub license, and/or sell copies of the Software, and to
  12  * permit persons to whom the Software is furnished to do so, subject to
  13  * the following conditions:
  14  *
  15  * The above copyright notice and this permission notice (including the
  16  * next paragraph) shall be included in all copies or substantial portions
  17  * of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  22  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  26  *
  27  **************************************************************************/
  28
  29 /**
  30  * TGSI interpreter/executor.
  31  *
  32  * Flow control information:
  33  *
  34  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
  35  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
  36  * care since a condition may be true for some quad components but false
  37  * for other components.
  38  *
  39  * We basically execute all statements (even if they're in the part of
  40  * an IF/ELSE clause that's "not taken") and use a special mask to
  41  * control writing to destination registers.  This is the ExecMask.
  42  * See store_dest().
  43  *
  44  * The ExecMask is computed from three other masks (CondMask, LoopMask and
  45  * ContMask) which are controlled by the flow control instructions (namely:
  46  * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
  47  *
  48  *
  49  * Authors:
  50  *   Michal Krol
  51  *   Brian Paul
  52  */
  53
  54 #include "pipe/p_compiler.h"
  55 #include "pipe/p_state.h"
  56 #include "pipe/p_shader_tokens.h"
  57 #include "tgsi/tgsi_dump.h"
  58 #include "tgsi/tgsi_parse.h"
  59 #include "tgsi/tgsi_util.h"
  60 #include "tgsi_exec.h"
  61 #include "util/u_half.h"
  62 #include "util/u_memory.h"
  63 #include "util/u_math.h"
  64 #include "util/rounding.h"
  65
  66
/* Compile-time debug switch.  It is not referenced in this part of the file;
 * presumably it enables execution tracing -- confirm at its use sites. */
#define DEBUG_EXECUTION 0


/* When non-zero, micro_exp2() and micro_lg2() call util_fast_exp2() and
 * util_fast_log2() instead of the powf()/logf() based paths. */
#define FAST_MATH 0

/* Channel indices of the four entries of a quad, as consumed by the
 * micro_ddx*() and micro_ddy*() derivative helpers. */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
  76
/*
 * One 64-bit value per quad channel, overlaid with several views of the
 * same storage so helpers can treat it as a double, as a pair of unsigned
 * words, or as an unsigned/signed 64-bit integer.
 */
union tgsi_double_channel {
   double d[TGSI_QUAD_SIZE];      /* double-precision view */
   unsigned u[TGSI_QUAD_SIZE][2]; /* two words per channel; the compare helpers write their ~0/0 mask to word [0] */
   uint64_t u64[TGSI_QUAD_SIZE];  /* unsigned 64-bit integer view */
   int64_t i64[TGSI_QUAD_SIZE];   /* signed 64-bit integer view */
};
  83
/*
 * A pair of 64-bit channels.
 * NOTE(review): the member names suggest each one covers two 32-bit
 * register components (xy and zw) -- confirm against the users of this type.
 */
struct tgsi_double_vector {
   union tgsi_double_channel xy;
   union tgsi_double_channel zw;
};
  88
  89 static void
  90 micro_abs(union tgsi_exec_channel *dst,
  91           const union tgsi_exec_channel *src)
  92 {
  93    dst->f[0] = fabsf(src->f[0]);
  94    dst->f[1] = fabsf(src->f[1]);
  95    dst->f[2] = fabsf(src->f[2]);
  96    dst->f[3] = fabsf(src->f[3]);
  97 }
  98
  99 static void
 100 micro_arl(union tgsi_exec_channel *dst,
 101           const union tgsi_exec_channel *src)
 102 {
 103    dst->i[0] = (int)floorf(src->f[0]);
 104    dst->i[1] = (int)floorf(src->f[1]);
 105    dst->i[2] = (int)floorf(src->f[2]);
 106    dst->i[3] = (int)floorf(src->f[3]);
 107 }
 108
 109 static void
 110 micro_arr(union tgsi_exec_channel *dst,
 111           const union tgsi_exec_channel *src)
 112 {
 113    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
 114    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
 115    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
 116    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
 117 }
 118
 119 static void
 120 micro_ceil(union tgsi_exec_channel *dst,
 121            const union tgsi_exec_channel *src)
 122 {
 123    dst->f[0] = ceilf(src->f[0]);
 124    dst->f[1] = ceilf(src->f[1]);
 125    dst->f[2] = ceilf(src->f[2]);
 126    dst->f[3] = ceilf(src->f[3]);
 127 }
 128
 129 static void
 130 micro_cmp(union tgsi_exec_channel *dst,
 131           const union tgsi_exec_channel *src0,
 132           const union tgsi_exec_channel *src1,
 133           const union tgsi_exec_channel *src2)
 134 {
 135    dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
 136    dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
 137    dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
 138    dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
 139 }
 140
 141 static void
 142 micro_cos(union tgsi_exec_channel *dst,
 143           const union tgsi_exec_channel *src)
 144 {
 145    dst->f[0] = cosf(src->f[0]);
 146    dst->f[1] = cosf(src->f[1]);
 147    dst->f[2] = cosf(src->f[2]);
 148    dst->f[3] = cosf(src->f[3]);
 149 }
 150
 151 static void
 152 micro_d2f(union tgsi_exec_channel *dst,
 153           const union tgsi_double_channel *src)
 154 {
 155    dst->f[0] = (float)src->d[0];
 156    dst->f[1] = (float)src->d[1];
 157    dst->f[2] = (float)src->d[2];
 158    dst->f[3] = (float)src->d[3];
 159 }
 160
 161 static void
 162 micro_d2i(union tgsi_exec_channel *dst,
 163           const union tgsi_double_channel *src)
 164 {
 165    dst->i[0] = (int)src->d[0];
 166    dst->i[1] = (int)src->d[1];
 167    dst->i[2] = (int)src->d[2];
 168    dst->i[3] = (int)src->d[3];
 169 }
 170
 171 static void
 172 micro_d2u(union tgsi_exec_channel *dst,
 173           const union tgsi_double_channel *src)
 174 {
 175    dst->u[0] = (unsigned)src->d[0];
 176    dst->u[1] = (unsigned)src->d[1];
 177    dst->u[2] = (unsigned)src->d[2];
 178    dst->u[3] = (unsigned)src->d[3];
 179 }
 180 static void
 181 micro_dabs(union tgsi_double_channel *dst,
 182            const union tgsi_double_channel *src)
 183 {
 184    dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
 185    dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
 186    dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
 187    dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
 188 }
 189
 190 static void
 191 micro_dadd(union tgsi_double_channel *dst,
 192           const union tgsi_double_channel *src)
 193 {
 194    dst->d[0] = src[0].d[0] + src[1].d[0];
 195    dst->d[1] = src[0].d[1] + src[1].d[1];
 196    dst->d[2] = src[0].d[2] + src[1].d[2];
 197    dst->d[3] = src[0].d[3] + src[1].d[3];
 198 }
 199
 200 static void
 201 micro_ddiv(union tgsi_double_channel *dst,
 202           const union tgsi_double_channel *src)
 203 {
 204    dst->d[0] = src[0].d[0] / src[1].d[0];
 205    dst->d[1] = src[0].d[1] / src[1].d[1];
 206    dst->d[2] = src[0].d[2] / src[1].d[2];
 207    dst->d[3] = src[0].d[3] / src[1].d[3];
 208 }
 209
 210 static void
 211 micro_ddx(union tgsi_exec_channel *dst,
 212           const union tgsi_exec_channel *src)
 213 {
 214    dst->f[0] =
 215    dst->f[1] =
 216    dst->f[2] =
 217    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 218 }
 219
 220 static void
 221 micro_ddx_fine(union tgsi_exec_channel *dst,
 222           const union tgsi_exec_channel *src)
 223 {
 224    dst->f[0] =
 225    dst->f[1] = src->f[TILE_TOP_RIGHT] - src->f[TILE_TOP_LEFT];
 226    dst->f[2] =
 227    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 228 }
 229
 230
 231 static void
 232 micro_ddy(union tgsi_exec_channel *dst,
 233           const union tgsi_exec_channel *src)
 234 {
 235    dst->f[0] =
 236    dst->f[1] =
 237    dst->f[2] =
 238    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
 239 }
 240
 241 static void
 242 micro_ddy_fine(union tgsi_exec_channel *dst,
 243           const union tgsi_exec_channel *src)
 244 {
 245    dst->f[0] =
 246    dst->f[2] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
 247    dst->f[1] =
 248    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_TOP_RIGHT];
 249 }
 250
 251 static void
 252 micro_dmul(union tgsi_double_channel *dst,
 253            const union tgsi_double_channel *src)
 254 {
 255    dst->d[0] = src[0].d[0] * src[1].d[0];
 256    dst->d[1] = src[0].d[1] * src[1].d[1];
 257    dst->d[2] = src[0].d[2] * src[1].d[2];
 258    dst->d[3] = src[0].d[3] * src[1].d[3];
 259 }
 260
 261 static void
 262 micro_dmax(union tgsi_double_channel *dst,
 263            const union tgsi_double_channel *src)
 264 {
 265    dst->d[0] = src[0].d[0] > src[1].d[0] ? src[0].d[0] : src[1].d[0];
 266    dst->d[1] = src[0].d[1] > src[1].d[1] ? src[0].d[1] : src[1].d[1];
 267    dst->d[2] = src[0].d[2] > src[1].d[2] ? src[0].d[2] : src[1].d[2];
 268    dst->d[3] = src[0].d[3] > src[1].d[3] ? src[0].d[3] : src[1].d[3];
 269 }
 270
 271 static void
 272 micro_dmin(union tgsi_double_channel *dst,
 273            const union tgsi_double_channel *src)
 274 {
 275    dst->d[0] = src[0].d[0] < src[1].d[0] ? src[0].d[0] : src[1].d[0];
 276    dst->d[1] = src[0].d[1] < src[1].d[1] ? src[0].d[1] : src[1].d[1];
 277    dst->d[2] = src[0].d[2] < src[1].d[2] ? src[0].d[2] : src[1].d[2];
 278    dst->d[3] = src[0].d[3] < src[1].d[3] ? src[0].d[3] : src[1].d[3];
 279 }
 280
 281 static void
 282 micro_dneg(union tgsi_double_channel *dst,
 283            const union tgsi_double_channel *src)
 284 {
 285    dst->d[0] = -src->d[0];
 286    dst->d[1] = -src->d[1];
 287    dst->d[2] = -src->d[2];
 288    dst->d[3] = -src->d[3];
 289 }
 290
 291 static void
 292 micro_dslt(union tgsi_double_channel *dst,
 293            const union tgsi_double_channel *src)
 294 {
 295    dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
 296    dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
 297    dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
 298    dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
 299 }
 300
 301 static void
 302 micro_dsne(union tgsi_double_channel *dst,
 303            const union tgsi_double_channel *src)
 304 {
 305    dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
 306    dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
 307    dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
 308    dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
 309 }
 310
 311 static void
 312 micro_dsge(union tgsi_double_channel *dst,
 313            const union tgsi_double_channel *src)
 314 {
 315    dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
 316    dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
 317    dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
 318    dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
 319 }
 320
 321 static void
 322 micro_dseq(union tgsi_double_channel *dst,
 323            const union tgsi_double_channel *src)
 324 {
 325    dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
 326    dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
 327    dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
 328    dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
 329 }
 330
 331 static void
 332 micro_drcp(union tgsi_double_channel *dst,
 333            const union tgsi_double_channel *src)
 334 {
 335    dst->d[0] = 1.0 / src->d[0];
 336    dst->d[1] = 1.0 / src->d[1];
 337    dst->d[2] = 1.0 / src->d[2];
 338    dst->d[3] = 1.0 / src->d[3];
 339 }
 340
 341 static void
 342 micro_dsqrt(union tgsi_double_channel *dst,
 343             const union tgsi_double_channel *src)
 344 {
 345    dst->d[0] = sqrt(src->d[0]);
 346    dst->d[1] = sqrt(src->d[1]);
 347    dst->d[2] = sqrt(src->d[2]);
 348    dst->d[3] = sqrt(src->d[3]);
 349 }
 350
 351 static void
 352 micro_drsq(union tgsi_double_channel *dst,
 353           const union tgsi_double_channel *src)
 354 {
 355    dst->d[0] = 1.0 / sqrt(src->d[0]);
 356    dst->d[1] = 1.0 / sqrt(src->d[1]);
 357    dst->d[2] = 1.0 / sqrt(src->d[2]);
 358    dst->d[3] = 1.0 / sqrt(src->d[3]);
 359 }
 360
 361 static void
 362 micro_dmad(union tgsi_double_channel *dst,
 363            const union tgsi_double_channel *src)
 364 {
 365    dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
 366    dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
 367    dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
 368    dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
 369 }
 370
 371 static void
 372 micro_dfrac(union tgsi_double_channel *dst,
 373             const union tgsi_double_channel *src)
 374 {
 375    dst->d[0] = src->d[0] - floor(src->d[0]);
 376    dst->d[1] = src->d[1] - floor(src->d[1]);
 377    dst->d[2] = src->d[2] - floor(src->d[2]);
 378    dst->d[3] = src->d[3] - floor(src->d[3]);
 379 }
 380
 381 static void
 382 micro_dldexp(union tgsi_double_channel *dst,
 383              const union tgsi_double_channel *src0,
 384              union tgsi_exec_channel *src1)
 385 {
 386    dst->d[0] = ldexp(src0->d[0], src1->i[0]);
 387    dst->d[1] = ldexp(src0->d[1], src1->i[1]);
 388    dst->d[2] = ldexp(src0->d[2], src1->i[2]);
 389    dst->d[3] = ldexp(src0->d[3], src1->i[3]);
 390 }
 391
 392 static void
 393 micro_dfracexp(union tgsi_double_channel *dst,
 394                union tgsi_exec_channel *dst_exp,
 395                const union tgsi_double_channel *src)
 396 {
 397    dst->d[0] = frexp(src->d[0], &dst_exp->i[0]);
 398    dst->d[1] = frexp(src->d[1], &dst_exp->i[1]);
 399    dst->d[2] = frexp(src->d[2], &dst_exp->i[2]);
 400    dst->d[3] = frexp(src->d[3], &dst_exp->i[3]);
 401 }
 402
 403 static void
 404 micro_exp2(union tgsi_exec_channel *dst,
 405            const union tgsi_exec_channel *src)
 406 {
 407 #if FAST_MATH
 408    dst->f[0] = util_fast_exp2(src->f[0]);
 409    dst->f[1] = util_fast_exp2(src->f[1]);
 410    dst->f[2] = util_fast_exp2(src->f[2]);
 411    dst->f[3] = util_fast_exp2(src->f[3]);
 412 #else
 413 #if DEBUG
 414    /* Inf is okay for this instruction, so clamp it to silence assertions. */
 415    uint i;
 416    union tgsi_exec_channel clamped;
 417
 418    for (i = 0; i < 4; i++) {
 419       if (src->f[i] > 127.99999f) {
 420          clamped.f[i] = 127.99999f;
 421       } else if (src->f[i] < -126.99999f) {
 422          clamped.f[i] = -126.99999f;
 423       } else {
 424          clamped.f[i] = src->f[i];
 425       }
 426    }
 427    src = &clamped;
 428 #endif /* DEBUG */
 429
 430    dst->f[0] = powf(2.0f, src->f[0]);
 431    dst->f[1] = powf(2.0f, src->f[1]);
 432    dst->f[2] = powf(2.0f, src->f[2]);
 433    dst->f[3] = powf(2.0f, src->f[3]);
 434 #endif /* FAST_MATH */
 435 }
 436
 437 static void
 438 micro_f2d(union tgsi_double_channel *dst,
 439           const union tgsi_exec_channel *src)
 440 {
 441    dst->d[0] = (double)src->f[0];
 442    dst->d[1] = (double)src->f[1];
 443    dst->d[2] = (double)src->f[2];
 444    dst->d[3] = (double)src->f[3];
 445 }
 446
 447 static void
 448 micro_flr(union tgsi_exec_channel *dst,
 449           const union tgsi_exec_channel *src)
 450 {
 451    dst->f[0] = floorf(src->f[0]);
 452    dst->f[1] = floorf(src->f[1]);
 453    dst->f[2] = floorf(src->f[2]);
 454    dst->f[3] = floorf(src->f[3]);
 455 }
 456
 457 static void
 458 micro_frc(union tgsi_exec_channel *dst,
 459           const union tgsi_exec_channel *src)
 460 {
 461    dst->f[0] = src->f[0] - floorf(src->f[0]);
 462    dst->f[1] = src->f[1] - floorf(src->f[1]);
 463    dst->f[2] = src->f[2] - floorf(src->f[2]);
 464    dst->f[3] = src->f[3] - floorf(src->f[3]);
 465 }
 466
 467 static void
 468 micro_i2d(union tgsi_double_channel *dst,
 469           const union tgsi_exec_channel *src)
 470 {
 471    dst->d[0] = (double)src->i[0];
 472    dst->d[1] = (double)src->i[1];
 473    dst->d[2] = (double)src->i[2];
 474    dst->d[3] = (double)src->i[3];
 475 }
 476
 477 static void
 478 micro_iabs(union tgsi_exec_channel *dst,
 479            const union tgsi_exec_channel *src)
 480 {
 481    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
 482    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
 483    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
 484    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
 485 }
 486
 487 static void
 488 micro_ineg(union tgsi_exec_channel *dst,
 489            const union tgsi_exec_channel *src)
 490 {
 491    dst->i[0] = -src->i[0];
 492    dst->i[1] = -src->i[1];
 493    dst->i[2] = -src->i[2];
 494    dst->i[3] = -src->i[3];
 495 }
 496
 497 static void
 498 micro_lg2(union tgsi_exec_channel *dst,
 499           const union tgsi_exec_channel *src)
 500 {
 501 #if FAST_MATH
 502    dst->f[0] = util_fast_log2(src->f[0]);
 503    dst->f[1] = util_fast_log2(src->f[1]);
 504    dst->f[2] = util_fast_log2(src->f[2]);
 505    dst->f[3] = util_fast_log2(src->f[3]);
 506 #else
 507    dst->f[0] = logf(src->f[0]) * 1.442695f;
 508    dst->f[1] = logf(src->f[1]) * 1.442695f;
 509    dst->f[2] = logf(src->f[2]) * 1.442695f;
 510    dst->f[3] = logf(src->f[3]) * 1.442695f;
 511 #endif
 512 }
 513
 514 static void
 515 micro_lrp(union tgsi_exec_channel *dst,
 516           const union tgsi_exec_channel *src0,
 517           const union tgsi_exec_channel *src1,
 518           const union tgsi_exec_channel *src2)
 519 {
 520    dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
 521    dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
 522    dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
 523    dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
 524 }
 525
 526 static void
 527 micro_mad(union tgsi_exec_channel *dst,
 528           const union tgsi_exec_channel *src0,
 529           const union tgsi_exec_channel *src1,
 530           const union tgsi_exec_channel *src2)
 531 {
 532    dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
 533    dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
 534    dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
 535    dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
 536 }
 537
 538 static void
 539 micro_mov(union tgsi_exec_channel *dst,
 540           const union tgsi_exec_channel *src)
 541 {
 542    dst->u[0] = src->u[0];
 543    dst->u[1] = src->u[1];
 544    dst->u[2] = src->u[2];
 545    dst->u[3] = src->u[3];
 546 }
 547
 548 static void
 549 micro_rcp(union tgsi_exec_channel *dst,
 550           const union tgsi_exec_channel *src)
 551 {
 552 #if 0 /* for debugging */
 553    assert(src->f[0] != 0.0f);
 554    assert(src->f[1] != 0.0f);
 555    assert(src->f[2] != 0.0f);
 556    assert(src->f[3] != 0.0f);
 557 #endif
 558    dst->f[0] = 1.0f / src->f[0];
 559    dst->f[1] = 1.0f / src->f[1];
 560    dst->f[2] = 1.0f / src->f[2];
 561    dst->f[3] = 1.0f / src->f[3];
 562 }
 563
 564 static void
 565 micro_rnd(union tgsi_exec_channel *dst,
 566           const union tgsi_exec_channel *src)
 567 {
 568    dst->f[0] = _mesa_roundevenf(src->f[0]);
 569    dst->f[1] = _mesa_roundevenf(src->f[1]);
 570    dst->f[2] = _mesa_roundevenf(src->f[2]);
 571    dst->f[3] = _mesa_roundevenf(src->f[3]);
 572 }
 573
 574 static void
 575 micro_rsq(union tgsi_exec_channel *dst,
 576           const union tgsi_exec_channel *src)
 577 {
 578 #if 0 /* for debugging */
 579    assert(src->f[0] != 0.0f);
 580    assert(src->f[1] != 0.0f);
 581    assert(src->f[2] != 0.0f);
 582    assert(src->f[3] != 0.0f);
 583 #endif
 584    dst->f[0] = 1.0f / sqrtf(src->f[0]);
 585    dst->f[1] = 1.0f / sqrtf(src->f[1]);
 586    dst->f[2] = 1.0f / sqrtf(src->f[2]);
 587    dst->f[3] = 1.0f / sqrtf(src->f[3]);
 588 }
 589
 590 static void
 591 micro_sqrt(union tgsi_exec_channel *dst,
 592            const union tgsi_exec_channel *src)
 593 {
 594    dst->f[0] = sqrtf(src->f[0]);
 595    dst->f[1] = sqrtf(src->f[1]);
 596    dst->f[2] = sqrtf(src->f[2]);
 597    dst->f[3] = sqrtf(src->f[3]);
 598 }
 599
 600 static void
 601 micro_seq(union tgsi_exec_channel *dst,
 602           const union tgsi_exec_channel *src0,
 603           const union tgsi_exec_channel *src1)
 604 {
 605    dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
 606    dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
 607    dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
 608    dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
 609 }
 610
 611 static void
 612 micro_sge(union tgsi_exec_channel *dst,
 613           const union tgsi_exec_channel *src0,
 614           const union tgsi_exec_channel *src1)
 615 {
 616    dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
 617    dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
 618    dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
 619    dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
 620 }
 621
 622 static void
 623 micro_sgn(union tgsi_exec_channel *dst,
 624           const union tgsi_exec_channel *src)
 625 {
 626    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
 627    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
 628    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
 629    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
 630 }
 631
 632 static void
 633 micro_isgn(union tgsi_exec_channel *dst,
 634           const union tgsi_exec_channel *src)
 635 {
 636    dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
 637    dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
 638    dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
 639    dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
 640 }
 641
 642 static void
 643 micro_sgt(union tgsi_exec_channel *dst,
 644           const union tgsi_exec_channel *src0,
 645           const union tgsi_exec_channel *src1)
 646 {
 647    dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
 648    dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
 649    dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
 650    dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
 651 }
 652
 653 static void
 654 micro_sin(union tgsi_exec_channel *dst,
 655           const union tgsi_exec_channel *src)
 656 {
 657    dst->f[0] = sinf(src->f[0]);
 658    dst->f[1] = sinf(src->f[1]);
 659    dst->f[2] = sinf(src->f[2]);
 660    dst->f[3] = sinf(src->f[3]);
 661 }
 662
 663 static void
 664 micro_sle(union tgsi_exec_channel *dst,
 665           const union tgsi_exec_channel *src0,
 666           const union tgsi_exec_channel *src1)
 667 {
 668    dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
 669    dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
 670    dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
 671    dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
 672 }
 673
 674 static void
 675 micro_slt(union tgsi_exec_channel *dst,
 676           const union tgsi_exec_channel *src0,
 677           const union tgsi_exec_channel *src1)
 678 {
 679    dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
 680    dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
 681    dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
 682    dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
 683 }
 684
 685 static void
 686 micro_sne(union tgsi_exec_channel *dst,
 687           const union tgsi_exec_channel *src0,
 688           const union tgsi_exec_channel *src1)
 689 {
 690    dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
 691    dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
 692    dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
 693    dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
 694 }
 695
 696 static void
 697 micro_trunc(union tgsi_exec_channel *dst,
 698             const union tgsi_exec_channel *src)
 699 {
 700    dst->f[0] = truncf(src->f[0]);
 701    dst->f[1] = truncf(src->f[1]);
 702    dst->f[2] = truncf(src->f[2]);
 703    dst->f[3] = truncf(src->f[3]);
 704 }
 705
 706 static void
 707 micro_u2d(union tgsi_double_channel *dst,
 708           const union tgsi_exec_channel *src)
 709 {
 710    dst->d[0] = (double)src->u[0];
 711    dst->d[1] = (double)src->u[1];
 712    dst->d[2] = (double)src->u[2];
 713    dst->d[3] = (double)src->u[3];
 714 }
 715
 716 static void
 717 micro_i64abs(union tgsi_double_channel *dst,
 718              const union tgsi_double_channel *src)
 719 {
 720    dst->i64[0] = src->i64[0] >= 0.0 ? src->i64[0] : -src->i64[0];
 721    dst->i64[1] = src->i64[1] >= 0.0 ? src->i64[1] : -src->i64[1];
 722    dst->i64[2] = src->i64[2] >= 0.0 ? src->i64[2] : -src->i64[2];
 723    dst->i64[3] = src->i64[3] >= 0.0 ? src->i64[3] : -src->i64[3];
 724 }
 725
 726 static void
 727 micro_i64sgn(union tgsi_double_channel *dst,
 728              const union tgsi_double_channel *src)
 729 {
 730    dst->i64[0] = src->i64[0] < 0 ? -1 : src->i64[0] > 0 ? 1 : 0;
 731    dst->i64[1] = src->i64[1] < 0 ? -1 : src->i64[1] > 0 ? 1 : 0;
 732    dst->i64[2] = src->i64[2] < 0 ? -1 : src->i64[2] > 0 ? 1 : 0;
 733    dst->i64[3] = src->i64[3] < 0 ? -1 : src->i64[3] > 0 ? 1 : 0;
 734 }
 735
 736 static void
 737 micro_i64neg(union tgsi_double_channel *dst,
 738              const union tgsi_double_channel *src)
 739 {
 740    dst->i64[0] = -src->i64[0];
 741    dst->i64[1] = -src->i64[1];
 742    dst->i64[2] = -src->i64[2];
 743    dst->i64[3] = -src->i64[3];
 744 }
 745
 746 static void
 747 micro_u64seq(union tgsi_double_channel *dst,
 748            const union tgsi_double_channel *src)
 749 {
 750    dst->u[0][0] = src[0].u64[0] == src[1].u64[0] ? ~0U : 0U;
 751    dst->u[1][0] = src[0].u64[1] == src[1].u64[1] ? ~0U : 0U;
 752    dst->u[2][0] = src[0].u64[2] == src[1].u64[2] ? ~0U : 0U;
 753    dst->u[3][0] = src[0].u64[3] == src[1].u64[3] ? ~0U : 0U;
 754 }
 755
 756 static void
 757 micro_u64sne(union tgsi_double_channel *dst,
 758              const union tgsi_double_channel *src)
 759 {
 760    dst->u[0][0] = src[0].u64[0] != src[1].u64[0] ? ~0U : 0U;
 761    dst->u[1][0] = src[0].u64[1] != src[1].u64[1] ? ~0U : 0U;
 762    dst->u[2][0] = src[0].u64[2] != src[1].u64[2] ? ~0U : 0U;
 763    dst->u[3][0] = src[0].u64[3] != src[1].u64[3] ? ~0U : 0U;
 764 }
 765
 766 static void
 767 micro_i64slt(union tgsi_double_channel *dst,
 768              const union tgsi_double_channel *src)
 769 {
 770    dst->u[0][0] = src[0].i64[0] < src[1].i64[0] ? ~0U : 0U;
 771    dst->u[1][0] = src[0].i64[1] < src[1].i64[1] ? ~0U : 0U;
 772    dst->u[2][0] = src[0].i64[2] < src[1].i64[2] ? ~0U : 0U;
 773    dst->u[3][0] = src[0].i64[3] < src[1].i64[3] ? ~0U : 0U;
 774 }
 775
 776 static void
 777 micro_u64slt(union tgsi_double_channel *dst,
 778              const union tgsi_double_channel *src)
 779 {
 780    dst->u[0][0] = src[0].u64[0] < src[1].u64[0] ? ~0U : 0U;
 781    dst->u[1][0] = src[0].u64[1] < src[1].u64[1] ? ~0U : 0U;
 782    dst->u[2][0] = src[0].u64[2] < src[1].u64[2] ? ~0U : 0U;
 783    dst->u[3][0] = src[0].u64[3] < src[1].u64[3] ? ~0U : 0U;
 784 }
 785
 786 static void
 787 micro_i64sge(union tgsi_double_channel *dst,
 788            const union tgsi_double_channel *src)
 789 {
 790    dst->u[0][0] = src[0].i64[0] >= src[1].i64[0] ? ~0U : 0U;
 791    dst->u[1][0] = src[0].i64[1] >= src[1].i64[1] ? ~0U : 0U;
 792    dst->u[2][0] = src[0].i64[2] >= src[1].i64[2] ? ~0U : 0U;
 793    dst->u[3][0] = src[0].i64[3] >= src[1].i64[3] ? ~0U : 0U;
 794 }
 795
 796 static void
 797 micro_u64sge(union tgsi_double_channel *dst,
 798              const union tgsi_double_channel *src)
 799 {
 800    dst->u[0][0] = src[0].u64[0] >= src[1].u64[0] ? ~0U : 0U;
 801    dst->u[1][0] = src[0].u64[1] >= src[1].u64[1] ? ~0U : 0U;
 802    dst->u[2][0] = src[0].u64[2] >= src[1].u64[2] ? ~0U : 0U;
 803    dst->u[3][0] = src[0].u64[3] >= src[1].u64[3] ? ~0U : 0U;
 804 }
 805
 806 static void
 807 micro_u64max(union tgsi_double_channel *dst,
 808              const union tgsi_double_channel *src)
 809 {
 810    dst->u64[0] = src[0].u64[0] > src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
 811    dst->u64[1] = src[0].u64[1] > src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
 812    dst->u64[2] = src[0].u64[2] > src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
 813    dst->u64[3] = src[0].u64[3] > src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
 814 }
 815
 816 static void
 817 micro_i64max(union tgsi_double_channel *dst,
 818              const union tgsi_double_channel *src)
 819 {
 820    dst->i64[0] = src[0].i64[0] > src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
 821    dst->i64[1] = src[0].i64[1] > src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
 822    dst->i64[2] = src[0].i64[2] > src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
 823    dst->i64[3] = src[0].i64[3] > src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
 824 }
 825
 826 static void
 827 micro_u64min(union tgsi_double_channel *dst,
 828              const union tgsi_double_channel *src)
 829 {
 830    dst->u64[0] = src[0].u64[0] < src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
 831    dst->u64[1] = src[0].u64[1] < src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
 832    dst->u64[2] = src[0].u64[2] < src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
 833    dst->u64[3] = src[0].u64[3] < src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
 834 }
 835
 836 static void
 837 micro_i64min(union tgsi_double_channel *dst,
 838              const union tgsi_double_channel *src)
 839 {
 840    dst->i64[0] = src[0].i64[0] < src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
 841    dst->i64[1] = src[0].i64[1] < src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
 842    dst->i64[2] = src[0].i64[2] < src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
 843    dst->i64[3] = src[0].i64[3] < src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
 844 }
 845
 846 static void
 847 micro_u64add(union tgsi_double_channel *dst,
 848              const union tgsi_double_channel *src)
 849 {
 850    dst->u64[0] = src[0].u64[0] + src[1].u64[0];
 851    dst->u64[1] = src[0].u64[1] + src[1].u64[1];
 852    dst->u64[2] = src[0].u64[2] + src[1].u64[2];
 853    dst->u64[3] = src[0].u64[3] + src[1].u64[3];
 854 }
 855
 856 static void
 857 micro_u64mul(union tgsi_double_channel *dst,
 858              const union tgsi_double_channel *src)
 859 {
 860    dst->u64[0] = src[0].u64[0] * src[1].u64[0];
 861    dst->u64[1] = src[0].u64[1] * src[1].u64[1];
 862    dst->u64[2] = src[0].u64[2] * src[1].u64[2];
 863    dst->u64[3] = src[0].u64[3] * src[1].u64[3];
 864 }
 865
 866 static void
 867 micro_u64div(union tgsi_double_channel *dst,
 868              const union tgsi_double_channel *src)
 869 {
 870    dst->u64[0] = src[1].u64[0] ? src[0].u64[0] / src[1].u64[0] : ~0ull;
 871    dst->u64[1] = src[1].u64[1] ? src[0].u64[1] / src[1].u64[1] : ~0ull;
 872    dst->u64[2] = src[1].u64[2] ? src[0].u64[2] / src[1].u64[2] : ~0ull;
 873    dst->u64[3] = src[1].u64[3] ? src[0].u64[3] / src[1].u64[3] : ~0ull;
 874 }
 875
 876 static void
 877 micro_i64div(union tgsi_double_channel *dst,
 878              const union tgsi_double_channel *src)
 879 {
 880    dst->i64[0] = src[1].i64[0] ? src[0].i64[0] / src[1].i64[0] : 0;
 881    dst->i64[1] = src[1].i64[1] ? src[0].i64[1] / src[1].i64[1] : 0;
 882    dst->i64[2] = src[1].i64[2] ? src[0].i64[2] / src[1].i64[2] : 0;
 883    dst->i64[3] = src[1].i64[3] ? src[0].i64[3] / src[1].i64[3] : 0;
 884 }
 885
 886 static void
 887 micro_u64mod(union tgsi_double_channel *dst,
 888              const union tgsi_double_channel *src)
 889 {
 890    dst->u64[0] = src[1].u64[0] ? src[0].u64[0] % src[1].u64[0] : ~0ull;
 891    dst->u64[1] = src[1].u64[1] ? src[0].u64[1] % src[1].u64[1] : ~0ull;
 892    dst->u64[2] = src[1].u64[2] ? src[0].u64[2] % src[1].u64[2] : ~0ull;
 893    dst->u64[3] = src[1].u64[3] ? src[0].u64[3] % src[1].u64[3] : ~0ull;
 894 }
 895
 896 static void
 897 micro_i64mod(union tgsi_double_channel *dst,
 898              const union tgsi_double_channel *src)
 899 {
 900    dst->i64[0] = src[1].i64[0] ? src[0].i64[0] % src[1].i64[0] : ~0ll;
 901    dst->i64[1] = src[1].i64[1] ? src[0].i64[1] % src[1].i64[1] : ~0ll;
 902    dst->i64[2] = src[1].i64[2] ? src[0].i64[2] % src[1].i64[2] : ~0ll;
 903    dst->i64[3] = src[1].i64[3] ? src[0].i64[3] % src[1].i64[3] : ~0ll;
 904 }
 905
 906 static void
 907 micro_u64shl(union tgsi_double_channel *dst,
 908              const union tgsi_double_channel *src0,
 909              union tgsi_exec_channel *src1)
 910 {
 911    unsigned masked_count;
 912    masked_count = src1->u[0] & 0x3f;
 913    dst->u64[0] = src0->u64[0] << masked_count;
 914    masked_count = src1->u[1] & 0x3f;
 915    dst->u64[1] = src0->u64[1] << masked_count;
 916    masked_count = src1->u[2] & 0x3f;
 917    dst->u64[2] = src0->u64[2] << masked_count;
 918    masked_count = src1->u[3] & 0x3f;
 919    dst->u64[3] = src0->u64[3] << masked_count;
 920 }
 921
 922 static void
 923 micro_i64shr(union tgsi_double_channel *dst,
 924              const union tgsi_double_channel *src0,
 925              union tgsi_exec_channel *src1)
 926 {
 927    unsigned masked_count;
 928    masked_count = src1->u[0] & 0x3f;
 929    dst->i64[0] = src0->i64[0] >> masked_count;
 930    masked_count = src1->u[1] & 0x3f;
 931    dst->i64[1] = src0->i64[1] >> masked_count;
 932    masked_count = src1->u[2] & 0x3f;
 933    dst->i64[2] = src0->i64[2] >> masked_count;
 934    masked_count = src1->u[3] & 0x3f;
 935    dst->i64[3] = src0->i64[3] >> masked_count;
 936 }
 937
 938 static void
 939 micro_u64shr(union tgsi_double_channel *dst,
 940              const union tgsi_double_channel *src0,
 941              union tgsi_exec_channel *src1)
 942 {
 943    unsigned masked_count;
 944    masked_count = src1->u[0] & 0x3f;
 945    dst->u64[0] = src0->u64[0] >> masked_count;
 946    masked_count = src1->u[1] & 0x3f;
 947    dst->u64[1] = src0->u64[1] >> masked_count;
 948    masked_count = src1->u[2] & 0x3f;
 949    dst->u64[2] = src0->u64[2] >> masked_count;
 950    masked_count = src1->u[3] & 0x3f;
 951    dst->u64[3] = src0->u64[3] >> masked_count;
 952 }
 953
/**
 * Tags naming the data type a channel's bits should be treated as.
 * They are passed to the fetch/store helpers; fetch_source(), for example,
 * uses the tag to pick the float or the integer abs/negate routine when
 * applying source modifiers.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT,
   TGSI_EXEC_DATA_DOUBLE,
   TGSI_EXEC_DATA_INT64,
   TGSI_EXEC_DATA_UINT64,
};
 962
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 * Each macro is a plain alias for the TGSI_EXEC_-prefixed constant of the
 * same name, so the two spellings are interchangeable.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_PRIMITIVE_S1_I   TGSI_EXEC_TEMP_PRIMITIVE_S1_I
#define TEMP_PRIMITIVE_S1_C   TGSI_EXEC_TEMP_PRIMITIVE_S1_C
#define TEMP_PRIMITIVE_S2_I   TGSI_EXEC_TEMP_PRIMITIVE_S2_I
#define TEMP_PRIMITIVE_S2_C   TGSI_EXEC_TEMP_PRIMITIVE_S2_C
#define TEMP_PRIMITIVE_S3_I   TGSI_EXEC_TEMP_PRIMITIVE_S3_I
#define TEMP_PRIMITIVE_S3_C   TGSI_EXEC_TEMP_PRIMITIVE_S3_C
 978
 979 static const struct {
 980    int idx;
 981    int chan;
 982 } temp_prim_idxs[] = {
 983    { TEMP_PRIMITIVE_I, TEMP_PRIMITIVE_C },
 984    { TEMP_PRIMITIVE_S1_I, TEMP_PRIMITIVE_S1_C },
 985    { TEMP_PRIMITIVE_S2_I, TEMP_PRIMITIVE_S2_C },
 986    { TEMP_PRIMITIVE_S3_I, TEMP_PRIMITIVE_S3_C },
 987 };
 988
 989 /** The execution mask depends on the conditional mask and the loop mask */
 990 #define UPDATE_EXEC_MASK(MACH) \
 991       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
 992
 993
 994 static const union tgsi_exec_channel ZeroVec =
 995    { { 0.0, 0.0, 0.0, 0.0 } };
 996
 997 static const union tgsi_exec_channel OneVec = {
 998    {1.0f, 1.0f, 1.0f, 1.0f}
 999 };
1000
1001 static const union tgsi_exec_channel P128Vec = {
1002    {128.0f, 128.0f, 128.0f, 128.0f}
1003 };
1004
1005 static const union tgsi_exec_channel M128Vec = {
1006    {-128.0f, -128.0f, -128.0f, -128.0f}
1007 };
1008
1009
1010 /**
1011  * Assert that none of the float values in 'chan' are infinite or NaN.
1012  * NaN and Inf may occur normally during program execution and should
1013  * not lead to crashes, etc.  But when debugging, it's helpful to catch
1014  * them.
1015  */
1016 static inline void
1017 check_inf_or_nan(const union tgsi_exec_channel *chan)
1018 {
1019    assert(!util_is_inf_or_nan((chan)->f[0]));
1020    assert(!util_is_inf_or_nan((chan)->f[1]));
1021    assert(!util_is_inf_or_nan((chan)->f[2]));
1022    assert(!util_is_inf_or_nan((chan)->f[3]));
1023 }
1024
1025
#ifdef DEBUG
/**
 * Debug helper: print 'msg' followed by the four float values of 'chan'.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg,
                chan->f[0],
                chan->f[1],
                chan->f[2],
                chan->f[3]);
}
#endif
1034
1035
#ifdef DEBUG
/**
 * Debug helper: dump temporary register 'index' of 'mach', one line per
 * X/Y/Z/W channel with that channel's four float values.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int c = 0;

   debug_printf("Temp[%u] =\n", index);
   while (c < 4) {
      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   "XYZW"[c],
                   vec->xyzw[c].f[0],
                   vec->xyzw[c].f[1],
                   vec->xyzw[c].f[2],
                   vec->xyzw[c].f[3]);
      c++;
   }
}
#endif
1053
1054
1055 void
1056 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
1057                                unsigned num_bufs,
1058                                const void **bufs,
1059                                const unsigned *buf_sizes)
1060 {
1061    unsigned i;
1062
1063    for (i = 0; i < num_bufs; i++) {
1064       mach->Consts[i] = bufs[i];
1065       mach->ConstsSize[i] = buf_sizes[i];
1066    }
1067 }
1068
1069 /**
1070  * Initialize machine state by expanding tokens to full instructions,
1071  * allocating temporary storage, setting up constants, etc.
1072  * After this, we can call tgsi_exec_machine_run() many times.
1073  */
1074 void
1075 tgsi_exec_machine_bind_shader(
1076    struct tgsi_exec_machine *mach,
1077    const struct tgsi_token *tokens,
1078    struct tgsi_sampler *sampler,
1079    struct tgsi_image *image,
1080    struct tgsi_buffer *buffer)
1081 {
1082    uint k;
1083    struct tgsi_parse_context parse;
1084    struct tgsi_full_instruction *instructions;
1085    struct tgsi_full_declaration *declarations;
1086    uint maxInstructions = 10, numInstructions = 0;
1087    uint maxDeclarations = 10, numDeclarations = 0;
1088
1089 #if 0
1090    tgsi_dump(tokens, 0);
1091 #endif
1092
1093    util_init_math();
1094
1095
1096    mach->Tokens = tokens;
1097    mach->Sampler = sampler;
1098    mach->Image = image;
1099    mach->Buffer = buffer;
1100
1101    if (!tokens) {
1102       /* unbind and free all */
1103       FREE(mach->Declarations);
1104       mach->Declarations = NULL;
1105       mach->NumDeclarations = 0;
1106
1107       FREE(mach->Instructions);
1108       mach->Instructions = NULL;
1109       mach->NumInstructions = 0;
1110
1111       return;
1112    }
1113
1114    k = tgsi_parse_init (&parse, mach->Tokens);
1115    if (k != TGSI_PARSE_OK) {
1116       debug_printf( "Problem parsing!\n" );
1117       return;
1118    }
1119
1120    mach->ImmLimit = 0;
1121    mach->NumOutputs = 0;
1122
1123    for (k = 0; k < TGSI_SEMANTIC_COUNT; k++)
1124       mach->SysSemanticToIndex[k] = -1;
1125
1126    if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
1127        !mach->UsedGeometryShader) {
1128       struct tgsi_exec_vector *inputs;
1129       struct tgsi_exec_vector *outputs;
1130
1131       inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1132                             TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
1133                             16);
1134
1135       if (!inputs)
1136          return;
1137
1138       outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1139                              TGSI_MAX_TOTAL_VERTICES, 16);
1140
1141       if (!outputs) {
1142          align_free(inputs);
1143          return;
1144       }
1145
1146       align_free(mach->Inputs);
1147       align_free(mach->Outputs);
1148
1149       mach->Inputs = inputs;
1150       mach->Outputs = outputs;
1151       mach->UsedGeometryShader = TRUE;
1152    }
1153
1154    declarations = (struct tgsi_full_declaration *)
1155       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
1156
1157    if (!declarations) {
1158       return;
1159    }
1160
1161    instructions = (struct tgsi_full_instruction *)
1162       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
1163
1164    if (!instructions) {
1165       FREE( declarations );
1166       return;
1167    }
1168
1169    while( !tgsi_parse_end_of_tokens( &parse ) ) {
1170       uint i;
1171
1172       tgsi_parse_token( &parse );
1173       switch( parse.FullToken.Token.Type ) {
1174       case TGSI_TOKEN_TYPE_DECLARATION:
1175          /* save expanded declaration */
1176          if (numDeclarations == maxDeclarations) {
1177             declarations = REALLOC(declarations,
1178                                    maxDeclarations
1179                                    * sizeof(struct tgsi_full_declaration),
1180                                    (maxDeclarations + 10)
1181                                    * sizeof(struct tgsi_full_declaration));
1182             maxDeclarations += 10;
1183          }
1184          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
1185             unsigned reg;
1186             for (reg = parse.FullToken.FullDeclaration.Range.First;
1187                  reg <= parse.FullToken.FullDeclaration.Range.Last;
1188                  ++reg) {
1189                ++mach->NumOutputs;
1190             }
1191          }
1192          else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1193             const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
1194             mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
1195          }
1196
1197          memcpy(declarations + numDeclarations,
1198                 &parse.FullToken.FullDeclaration,
1199                 sizeof(declarations[0]));
1200          numDeclarations++;
1201          break;
1202
1203       case TGSI_TOKEN_TYPE_IMMEDIATE:
1204          {
1205             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
1206             assert( size <= 4 );
1207             if (mach->ImmLimit >= mach->ImmsReserved) {
1208                unsigned newReserved = mach->ImmsReserved ? 2 * mach->ImmsReserved : 128;
1209                float4 *imms = REALLOC(mach->Imms, mach->ImmsReserved, newReserved * sizeof(float4));
1210                if (imms) {
1211                   mach->ImmsReserved = newReserved;
1212                   mach->Imms = imms;
1213                } else {
1214                   debug_printf("Unable to (re)allocate space for immidiate constants\n");
1215                   break;
1216                }
1217             }
1218
1219             for( i = 0; i < size; i++ ) {
1220                mach->Imms[mach->ImmLimit][i] =
1221                   parse.FullToken.FullImmediate.u[i].Float;
1222             }
1223             mach->ImmLimit += 1;
1224          }
1225          break;
1226
1227       case TGSI_TOKEN_TYPE_INSTRUCTION:
1228
1229          /* save expanded instruction */
1230          if (numInstructions == maxInstructions) {
1231             instructions = REALLOC(instructions,
1232                                    maxInstructions
1233                                    * sizeof(struct tgsi_full_instruction),
1234                                    (maxInstructions + 10)
1235                                    * sizeof(struct tgsi_full_instruction));
1236             maxInstructions += 10;
1237          }
1238
1239          memcpy(instructions + numInstructions,
1240                 &parse.FullToken.FullInstruction,
1241                 sizeof(instructions[0]));
1242
1243          numInstructions++;
1244          break;
1245
1246       case TGSI_TOKEN_TYPE_PROPERTY:
1247          if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
1248             if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
1249                mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
1250             }
1251          }
1252          break;
1253
1254       default:
1255          assert( 0 );
1256       }
1257    }
1258    tgsi_parse_free (&parse);
1259
1260    FREE(mach->Declarations);
1261    mach->Declarations = declarations;
1262    mach->NumDeclarations = numDeclarations;
1263
1264    FREE(mach->Instructions);
1265    mach->Instructions = instructions;
1266    mach->NumInstructions = numInstructions;
1267 }
1268
1269
1270 struct tgsi_exec_machine *
1271 tgsi_exec_machine_create(enum pipe_shader_type shader_type)
1272 {
1273    struct tgsi_exec_machine *mach;
1274    uint i;
1275
1276    mach = align_malloc( sizeof *mach, 16 );
1277    if (!mach)
1278       goto fail;
1279
1280    memset(mach, 0, sizeof(*mach));
1281
1282    mach->ShaderType = shader_type;
1283    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
1284    mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
1285
1286    if (shader_type != PIPE_SHADER_COMPUTE) {
1287       mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
1288       mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
1289       if (!mach->Inputs || !mach->Outputs)
1290          goto fail;
1291    }
1292
1293    if (shader_type == PIPE_SHADER_FRAGMENT) {
1294       mach->InputSampleOffsetApply = align_malloc(sizeof(apply_sample_offset_func) * PIPE_MAX_SHADER_INPUTS, 16);
1295       if (!mach->InputSampleOffsetApply)
1296          goto fail;
1297    }
1298
1299 #ifdef DEBUG
1300    /* silence warnings */
1301    (void) print_chan;
1302    (void) print_temp;
1303 #endif
1304
1305    return mach;
1306
1307 fail:
1308    if (mach) {
1309       align_free(mach->InputSampleOffsetApply);
1310       align_free(mach->Inputs);
1311       align_free(mach->Outputs);
1312       align_free(mach);
1313    }
1314    return NULL;
1315 }
1316
1317
1318 void
1319 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
1320 {
1321    if (mach) {
1322       FREE(mach->Instructions);
1323       FREE(mach->Declarations);
1324       FREE(mach->Imms);
1325
1326       align_free(mach->InputSampleOffsetApply);
1327       align_free(mach->Inputs);
1328       align_free(mach->Outputs);
1329
1330       align_free(mach);
1331    }
1332 }
1333
1334 static void
1335 micro_add(union tgsi_exec_channel *dst,
1336           const union tgsi_exec_channel *src0,
1337           const union tgsi_exec_channel *src1)
1338 {
1339    dst->f[0] = src0->f[0] + src1->f[0];
1340    dst->f[1] = src0->f[1] + src1->f[1];
1341    dst->f[2] = src0->f[2] + src1->f[2];
1342    dst->f[3] = src0->f[3] + src1->f[3];
1343 }
1344
1345 static void
1346 micro_div(
1347    union tgsi_exec_channel *dst,
1348    const union tgsi_exec_channel *src0,
1349    const union tgsi_exec_channel *src1 )
1350 {
1351    if (src1->f[0] != 0) {
1352       dst->f[0] = src0->f[0] / src1->f[0];
1353    }
1354    if (src1->f[1] != 0) {
1355       dst->f[1] = src0->f[1] / src1->f[1];
1356    }
1357    if (src1->f[2] != 0) {
1358       dst->f[2] = src0->f[2] / src1->f[2];
1359    }
1360    if (src1->f[3] != 0) {
1361       dst->f[3] = src0->f[3] / src1->f[3];
1362    }
1363 }
1364
1365 static void
1366 micro_lt(
1367    union tgsi_exec_channel *dst,
1368    const union tgsi_exec_channel *src0,
1369    const union tgsi_exec_channel *src1,
1370    const union tgsi_exec_channel *src2,
1371    const union tgsi_exec_channel *src3 )
1372 {
1373    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
1374    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
1375    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
1376    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
1377 }
1378
1379 static void
1380 micro_max(union tgsi_exec_channel *dst,
1381           const union tgsi_exec_channel *src0,
1382           const union tgsi_exec_channel *src1)
1383 {
1384    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
1385    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
1386    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
1387    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
1388 }
1389
1390 static void
1391 micro_min(union tgsi_exec_channel *dst,
1392           const union tgsi_exec_channel *src0,
1393           const union tgsi_exec_channel *src1)
1394 {
1395    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
1396    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
1397    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
1398    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
1399 }
1400
1401 static void
1402 micro_mul(union tgsi_exec_channel *dst,
1403           const union tgsi_exec_channel *src0,
1404           const union tgsi_exec_channel *src1)
1405 {
1406    dst->f[0] = src0->f[0] * src1->f[0];
1407    dst->f[1] = src0->f[1] * src1->f[1];
1408    dst->f[2] = src0->f[2] * src1->f[2];
1409    dst->f[3] = src0->f[3] * src1->f[3];
1410 }
1411
1412 static void
1413 micro_neg(
1414    union tgsi_exec_channel *dst,
1415    const union tgsi_exec_channel *src )
1416 {
1417    dst->f[0] = -src->f[0];
1418    dst->f[1] = -src->f[1];
1419    dst->f[2] = -src->f[2];
1420    dst->f[3] = -src->f[3];
1421 }
1422
1423 static void
1424 micro_pow(
1425    union tgsi_exec_channel *dst,
1426    const union tgsi_exec_channel *src0,
1427    const union tgsi_exec_channel *src1 )
1428 {
1429 #if FAST_MATH
1430    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1431    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1432    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1433    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1434 #else
1435    dst->f[0] = powf( src0->f[0], src1->f[0] );
1436    dst->f[1] = powf( src0->f[1], src1->f[1] );
1437    dst->f[2] = powf( src0->f[2], src1->f[2] );
1438    dst->f[3] = powf( src0->f[3], src1->f[3] );
1439 #endif
1440 }
1441
1442 static void
1443 micro_ldexp(union tgsi_exec_channel *dst,
1444             const union tgsi_exec_channel *src0,
1445             const union tgsi_exec_channel *src1)
1446 {
1447    dst->f[0] = ldexpf(src0->f[0], src1->i[0]);
1448    dst->f[1] = ldexpf(src0->f[1], src1->i[1]);
1449    dst->f[2] = ldexpf(src0->f[2], src1->i[2]);
1450    dst->f[3] = ldexpf(src0->f[3], src1->i[3]);
1451 }
1452
1453 static void
1454 micro_sub(union tgsi_exec_channel *dst,
1455           const union tgsi_exec_channel *src0,
1456           const union tgsi_exec_channel *src1)
1457 {
1458    dst->f[0] = src0->f[0] - src1->f[0];
1459    dst->f[1] = src0->f[1] - src1->f[1];
1460    dst->f[2] = src0->f[2] - src1->f[2];
1461    dst->f[3] = src0->f[3] - src1->f[3];
1462 }
1463
1464 static void
1465 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1466                        const uint file,
1467                        const uint swizzle,
1468                        const union tgsi_exec_channel *index,
1469                        const union tgsi_exec_channel *index2D,
1470                        union tgsi_exec_channel *chan)
1471 {
1472    uint i;
1473
1474    assert(swizzle < 4);
1475
1476    switch (file) {
1477    case TGSI_FILE_CONSTANT:
1478       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1479          assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1480          assert(mach->Consts[index2D->i[i]]);
1481
1482          if (index->i[i] < 0) {
1483             chan->u[i] = 0;
1484          } else {
1485             /* NOTE: copying the const value as a uint instead of float */
1486             const uint constbuf = index2D->i[i];
1487             const uint *buf = (const uint *)mach->Consts[constbuf];
1488             const int pos = index->i[i] * 4 + swizzle;
1489             /* const buffer bounds check */
1490             if (pos < 0 || pos >= (int) mach->ConstsSize[constbuf]) {
1491                if (0) {
1492                   /* Debug: print warning */
1493                   static int count = 0;
1494                   if (count++ < 100)
1495                      debug_printf("TGSI Exec: const buffer index %d"
1496                                   " out of bounds\n", pos);
1497                }
1498                chan->u[i] = 0;
1499             }
1500             else
1501                chan->u[i] = buf[pos];
1502          }
1503       }
1504       break;
1505
1506    case TGSI_FILE_INPUT:
1507       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1508          /*
1509          if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1510             debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1511                          index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1512                          index2D->i[i], index->i[i]);
1513                          }*/
1514          int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1515          assert(pos >= 0);
1516          assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1517          chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1518       }
1519       break;
1520
1521    case TGSI_FILE_SYSTEM_VALUE:
1522       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1523          chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
1524       }
1525       break;
1526
1527    case TGSI_FILE_TEMPORARY:
1528       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1529          assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1530          assert(index2D->i[i] == 0);
1531
1532          chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1533       }
1534       break;
1535
1536    case TGSI_FILE_IMMEDIATE:
1537       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1538          assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1539          assert(index2D->i[i] == 0);
1540
1541          chan->f[i] = mach->Imms[index->i[i]][swizzle];
1542       }
1543       break;
1544
1545    case TGSI_FILE_ADDRESS:
1546       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1547          assert(index->i[i] >= 0);
1548          assert(index2D->i[i] == 0);
1549
1550          chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1551       }
1552       break;
1553
1554    case TGSI_FILE_OUTPUT:
1555       /* vertex/fragment output vars can be read too */
1556       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1557          assert(index->i[i] >= 0);
1558          assert(index2D->i[i] == 0);
1559
1560          chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1561       }
1562       break;
1563
1564    default:
1565       assert(0);
1566       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1567          chan->u[i] = 0;
1568       }
1569    }
1570 }
1571
1572 static void
1573 get_index_registers(const struct tgsi_exec_machine *mach,
1574                     const struct tgsi_full_src_register *reg,
1575                     union tgsi_exec_channel *index,
1576                     union tgsi_exec_channel *index2D)
1577 {
1578    uint swizzle;
1579
1580    /* We start with a direct index into a register file.
1581     *
1582     *    file[1],
1583     *    where:
1584     *       file = Register.File
1585     *       [1] = Register.Index
1586     */
1587    index->i[0] =
1588    index->i[1] =
1589    index->i[2] =
1590    index->i[3] = reg->Register.Index;
1591
1592    /* There is an extra source register that indirectly subscripts
1593     * a register file. The direct index now becomes an offset
1594     * that is being added to the indirect register.
1595     *
1596     *    file[ind[2].x+1],
1597     *    where:
1598     *       ind = Indirect.File
1599     *       [2] = Indirect.Index
1600     *       .x = Indirect.SwizzleX
1601     */
1602    if (reg->Register.Indirect) {
1603       union tgsi_exec_channel index2;
1604       union tgsi_exec_channel indir_index;
1605       const uint execmask = mach->ExecMask;
1606       uint i;
1607
1608       /* which address register (always zero now) */
1609       index2.i[0] =
1610       index2.i[1] =
1611       index2.i[2] =
1612       index2.i[3] = reg->Indirect.Index;
1613       /* get current value of address register[swizzle] */
1614       swizzle = reg->Indirect.Swizzle;
1615       fetch_src_file_channel(mach,
1616                              reg->Indirect.File,
1617                              swizzle,
1618                              &index2,
1619                              &ZeroVec,
1620                              &indir_index);
1621
1622       /* add value of address register to the offset */
1623       index->i[0] += indir_index.i[0];
1624       index->i[1] += indir_index.i[1];
1625       index->i[2] += indir_index.i[2];
1626       index->i[3] += indir_index.i[3];
1627
1628       /* for disabled execution channels, zero-out the index to
1629        * avoid using a potential garbage value.
1630        */
1631       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1632          if ((execmask & (1 << i)) == 0)
1633             index->i[i] = 0;
1634       }
1635    }
1636
1637    /* There is an extra source register that is a second
1638     * subscript to a register file. Effectively it means that
1639     * the register file is actually a 2D array of registers.
1640     *
1641     *    file[3][1],
1642     *    where:
1643     *       [3] = Dimension.Index
1644     */
1645    if (reg->Register.Dimension) {
1646       index2D->i[0] =
1647       index2D->i[1] =
1648       index2D->i[2] =
1649       index2D->i[3] = reg->Dimension.Index;
1650
1651       /* Again, the second subscript index can be addressed indirectly
1652        * identically to the first one.
1653        * Nothing stops us from indirectly addressing the indirect register,
1654        * but there is no need for that, so we won't exercise it.
1655        *
1656        *    file[ind[4].y+3][1],
1657        *    where:
1658        *       ind = DimIndirect.File
1659        *       [4] = DimIndirect.Index
1660        *       .y = DimIndirect.SwizzleX
1661        */
1662       if (reg->Dimension.Indirect) {
1663          union tgsi_exec_channel index2;
1664          union tgsi_exec_channel indir_index;
1665          const uint execmask = mach->ExecMask;
1666          uint i;
1667
1668          index2.i[0] =
1669          index2.i[1] =
1670          index2.i[2] =
1671          index2.i[3] = reg->DimIndirect.Index;
1672
1673          swizzle = reg->DimIndirect.Swizzle;
1674          fetch_src_file_channel(mach,
1675                                 reg->DimIndirect.File,
1676                                 swizzle,
1677                                 &index2,
1678                                 &ZeroVec,
1679                                 &indir_index);
1680
1681          index2D->i[0] += indir_index.i[0];
1682          index2D->i[1] += indir_index.i[1];
1683          index2D->i[2] += indir_index.i[2];
1684          index2D->i[3] += indir_index.i[3];
1685
1686          /* for disabled execution channels, zero-out the index to
1687           * avoid using a potential garbage value.
1688           */
1689          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1690             if ((execmask & (1 << i)) == 0) {
1691                index2D->i[i] = 0;
1692             }
1693          }
1694       }
1695
1696       /* If by any chance there was a need for a 3D array of register
1697        * files, we would have to check whether Dimension is followed
1698        * by a dimension register and continue the saga.
1699        */
1700    } else {
1701       index2D->i[0] =
1702       index2D->i[1] =
1703       index2D->i[2] =
1704       index2D->i[3] = 0;
1705    }
1706 }
1707
1708
1709 static void
1710 fetch_source_d(const struct tgsi_exec_machine *mach,
1711                union tgsi_exec_channel *chan,
1712                const struct tgsi_full_src_register *reg,
1713                const uint chan_index)
1714 {
1715    union tgsi_exec_channel index;
1716    union tgsi_exec_channel index2D;
1717    uint swizzle;
1718
1719    get_index_registers(mach, reg, &index, &index2D);
1720
1721
1722    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1723    fetch_src_file_channel(mach,
1724                           reg->Register.File,
1725                           swizzle,
1726                           &index,
1727                           &index2D,
1728                           chan);
1729 }
1730
1731 static void
1732 fetch_source(const struct tgsi_exec_machine *mach,
1733              union tgsi_exec_channel *chan,
1734              const struct tgsi_full_src_register *reg,
1735              const uint chan_index,
1736              enum tgsi_exec_datatype src_datatype)
1737 {
1738    fetch_source_d(mach, chan, reg, chan_index);
1739
1740    if (reg->Register.Absolute) {
1741       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1742          micro_abs(chan, chan);
1743       } else {
1744          micro_iabs(chan, chan);
1745       }
1746    }
1747
1748    if (reg->Register.Negate) {
1749       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1750          micro_neg(chan, chan);
1751       } else {
1752          micro_ineg(chan, chan);
1753       }
1754    }
1755 }
1756
1757 static union tgsi_exec_channel *
1758 store_dest_dstret(struct tgsi_exec_machine *mach,
1759                  const union tgsi_exec_channel *chan,
1760                  const struct tgsi_full_dst_register *reg,
1761                  uint chan_index,
1762                  enum tgsi_exec_datatype dst_datatype)
1763 {
1764    static union tgsi_exec_channel null;
1765    union tgsi_exec_channel *dst;
1766    union tgsi_exec_channel index2D;
1767    int offset = 0;  /* indirection offset */
1768    int index;
1769
1770    /* for debugging */
1771    if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1772       check_inf_or_nan(chan);
1773    }
1774
1775    /* There is an extra source register that indirectly subscripts
1776     * a register file. The direct index now becomes an offset
1777     * that is being added to the indirect register.
1778     *
1779     *    file[ind[2].x+1],
1780     *    where:
1781     *       ind = Indirect.File
1782     *       [2] = Indirect.Index
1783     *       .x = Indirect.SwizzleX
1784     */
1785    if (reg->Register.Indirect) {
1786       union tgsi_exec_channel index;
1787       union tgsi_exec_channel indir_index;
1788       uint swizzle;
1789
1790       /* which address register (always zero for now) */
1791       index.i[0] =
1792       index.i[1] =
1793       index.i[2] =
1794       index.i[3] = reg->Indirect.Index;
1795
1796       /* get current value of address register[swizzle] */
1797       swizzle = reg->Indirect.Swizzle;
1798
1799       /* fetch values from the address/indirection register */
1800       fetch_src_file_channel(mach,
1801                              reg->Indirect.File,
1802                              swizzle,
1803                              &index,
1804                              &ZeroVec,
1805                              &indir_index);
1806
1807       /* save indirection offset */
1808       offset = indir_index.i[0];
1809    }
1810
1811    /* There is an extra source register that is a second
1812     * subscript to a register file. Effectively it means that
1813     * the register file is actually a 2D array of registers.
1814     *
1815     *    file[3][1],
1816     *    where:
1817     *       [3] = Dimension.Index
1818     */
1819    if (reg->Register.Dimension) {
1820       index2D.i[0] =
1821       index2D.i[1] =
1822       index2D.i[2] =
1823       index2D.i[3] = reg->Dimension.Index;
1824
1825       /* Again, the second subscript index can be addressed indirectly
1826        * identically to the first one.
1827        * Nothing stops us from indirectly addressing the indirect register,
1828        * but there is no need for that, so we won't exercise it.
1829        *
1830        *    file[ind[4].y+3][1],
1831        *    where:
1832        *       ind = DimIndirect.File
1833        *       [4] = DimIndirect.Index
1834        *       .y = DimIndirect.SwizzleX
1835        */
1836       if (reg->Dimension.Indirect) {
1837          union tgsi_exec_channel index2;
1838          union tgsi_exec_channel indir_index;
1839          const uint execmask = mach->ExecMask;
1840          unsigned swizzle;
1841          uint i;
1842
1843          index2.i[0] =
1844          index2.i[1] =
1845          index2.i[2] =
1846          index2.i[3] = reg->DimIndirect.Index;
1847
1848          swizzle = reg->DimIndirect.Swizzle;
1849          fetch_src_file_channel(mach,
1850                                 reg->DimIndirect.File,
1851                                 swizzle,
1852                                 &index2,
1853                                 &ZeroVec,
1854                                 &indir_index);
1855
1856          index2D.i[0] += indir_index.i[0];
1857          index2D.i[1] += indir_index.i[1];
1858          index2D.i[2] += indir_index.i[2];
1859          index2D.i[3] += indir_index.i[3];
1860
1861          /* for disabled execution channels, zero-out the index to
1862           * avoid using a potential garbage value.
1863           */
1864          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1865             if ((execmask & (1 << i)) == 0) {
1866                index2D.i[i] = 0;
1867             }
1868          }
1869       }
1870
1871       /* If by any chance there was a need for a 3D array of register
1872        * files, we would have to check whether Dimension is followed
1873        * by a dimension register and continue the saga.
1874        */
1875    } else {
1876       index2D.i[0] =
1877       index2D.i[1] =
1878       index2D.i[2] =
1879       index2D.i[3] = 0;
1880    }
1881
1882    switch (reg->Register.File) {
1883    case TGSI_FILE_NULL:
1884       dst = &null;
1885       break;
1886
1887    case TGSI_FILE_OUTPUT:
1888       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1889          + reg->Register.Index;
1890       dst = &mach->Outputs[offset + index].xyzw[chan_index];
1891 #if 0
1892       debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
1893                    mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
1894                    reg->Register.Index);
1895       if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1896          debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1897          for (i = 0; i < TGSI_QUAD_SIZE; i++)
1898             if (execmask & (1 << i))
1899                debug_printf("%f, ", chan->f[i]);
1900          debug_printf(")\n");
1901       }
1902 #endif
1903       break;
1904
1905    case TGSI_FILE_TEMPORARY:
1906       index = reg->Register.Index;
1907       assert( index < TGSI_EXEC_NUM_TEMPS );
1908       dst = &mach->Temps[offset + index].xyzw[chan_index];
1909       break;
1910
1911    case TGSI_FILE_ADDRESS:
1912       index = reg->Register.Index;
1913       dst = &mach->Addrs[index].xyzw[chan_index];
1914       break;
1915
1916    default:
1917       assert( 0 );
1918       return NULL;
1919    }
1920
1921    return dst;
1922 }
1923
1924 static void
1925 store_dest_double(struct tgsi_exec_machine *mach,
1926                  const union tgsi_exec_channel *chan,
1927                  const struct tgsi_full_dst_register *reg,
1928                  uint chan_index,
1929                  enum tgsi_exec_datatype dst_datatype)
1930 {
1931    union tgsi_exec_channel *dst;
1932    const uint execmask = mach->ExecMask;
1933    int i;
1934
1935    dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
1936    if (!dst)
1937       return;
1938
1939    /* doubles path */
1940    for (i = 0; i < TGSI_QUAD_SIZE; i++)
1941       if (execmask & (1 << i))
1942          dst->i[i] = chan->i[i];
1943 }
1944
1945 static void
1946 store_dest(struct tgsi_exec_machine *mach,
1947            const union tgsi_exec_channel *chan,
1948            const struct tgsi_full_dst_register *reg,
1949            const struct tgsi_full_instruction *inst,
1950            uint chan_index,
1951            enum tgsi_exec_datatype dst_datatype)
1952 {
1953    union tgsi_exec_channel *dst;
1954    const uint execmask = mach->ExecMask;
1955    int i;
1956
1957    dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
1958    if (!dst)
1959       return;
1960
1961    if (!inst->Instruction.Saturate) {
1962       for (i = 0; i < TGSI_QUAD_SIZE; i++)
1963          if (execmask & (1 << i))
1964             dst->i[i] = chan->i[i];
1965    }
1966    else {
1967       for (i = 0; i < TGSI_QUAD_SIZE; i++)
1968          if (execmask & (1 << i)) {
1969             if (chan->f[i] < 0.0f)
1970                dst->f[i] = 0.0f;
1971             else if (chan->f[i] > 1.0f)
1972                dst->f[i] = 1.0f;
1973             else
1974                dst->i[i] = chan->i[i];
1975          }
1976    }
1977 }
1978
/*
 * Shorthands for fetch_source() on source operand INDEX, channel CHAN,
 * of the current instruction.  Both expect variables named `mach` and
 * `inst` to be in scope at the point of use.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_FLOAT)

#define IFETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_INT)
1984
1985
1986 /**
1987  * Execute ARB-style KIL which is predicated by a src register.
1988  * Kill fragment if any of the four values is less than zero.
1989  */
1990 static void
1991 exec_kill_if(struct tgsi_exec_machine *mach,
1992              const struct tgsi_full_instruction *inst)
1993 {
1994    uint uniquemask;
1995    uint chan_index;
1996    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1997    union tgsi_exec_channel r[1];
1998
1999    /* This mask stores component bits that were already tested. */
2000    uniquemask = 0;
2001
2002    for (chan_index = 0; chan_index < 4; chan_index++)
2003    {
2004       uint swizzle;
2005       uint i;
2006
2007       /* unswizzle channel */
2008       swizzle = tgsi_util_get_full_src_register_swizzle (
2009                         &inst->Src[0],
2010                         chan_index);
2011
2012       /* check if the component has not been already tested */
2013       if (uniquemask & (1 << swizzle))
2014          continue;
2015       uniquemask |= 1 << swizzle;
2016
2017       FETCH(&r[0], 0, chan_index);
2018       for (i = 0; i < 4; i++)
2019          if (r[0].f[i] < 0.0f)
2020             kilmask |= 1 << i;
2021    }
2022
2023    /* restrict to fragments currently executing */
2024    kilmask &= mach->ExecMask;
2025
2026    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2027 }
2028
2029 /**
2030  * Unconditional fragment kill/discard.
2031  */
2032 static void
2033 exec_kill(struct tgsi_exec_machine *mach)
2034 {
2035    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
2036
2037    /* kill fragment for all fragments currently executing */
2038    kilmask = mach->ExecMask;
2039    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2040 }
2041
2042 static void
2043 emit_vertex(struct tgsi_exec_machine *mach,
2044             const struct tgsi_full_instruction *inst)
2045 {
2046    union tgsi_exec_channel r[1];
2047    unsigned stream_id;
2048    unsigned *prim_count;
2049    /* FIXME: check for exec mask correctly
2050    unsigned i;
2051    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2052          if ((mach->ExecMask & (1 << i)))
2053    */
2054    IFETCH(&r[0], 0, TGSI_CHAN_X);
2055    stream_id = r[0].u[0];
2056    prim_count = &mach->Temps[temp_prim_idxs[stream_id].idx].xyzw[temp_prim_idxs[stream_id].chan].u[0];
2057    if (mach->ExecMask) {
2058       if (mach->Primitives[stream_id][*prim_count] >= mach->MaxOutputVertices)
2059          return;
2060
2061       if (mach->Primitives[stream_id][*prim_count] == 0)
2062          mach->PrimitiveOffsets[stream_id][*prim_count] = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0];
2063       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
2064       mach->Primitives[stream_id][*prim_count]++;
2065    }
2066 }
2067
2068 static void
2069 emit_primitive(struct tgsi_exec_machine *mach,
2070                const struct tgsi_full_instruction *inst)
2071 {
2072    unsigned *prim_count;
2073    union tgsi_exec_channel r[1];
2074    unsigned stream_id = 0;
2075    /* FIXME: check for exec mask correctly
2076    unsigned i;
2077    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2078          if ((mach->ExecMask & (1 << i)))
2079    */
2080    if (inst) {
2081       IFETCH(&r[0], 0, TGSI_CHAN_X);
2082       stream_id = r[0].u[0];
2083    }
2084    prim_count = &mach->Temps[temp_prim_idxs[stream_id].idx].xyzw[temp_prim_idxs[stream_id].chan].u[0];
2085    if (mach->ExecMask) {
2086       ++(*prim_count);
2087       debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
2088       mach->Primitives[stream_id][*prim_count] = 0;
2089    }
2090 }
2091
2092 static void
2093 conditional_emit_primitive(struct tgsi_exec_machine *mach)
2094 {
2095    if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
2096       int emitted_verts =
2097          mach->Primitives[0][mach->Temps[temp_prim_idxs[0].idx].xyzw[temp_prim_idxs[0].chan].u[0]];
2098       if (emitted_verts) {
2099          emit_primitive(mach, NULL);
2100       }
2101    }
2102 }
2103
2104
2105 /*
2106  * Fetch four texture samples using STR texture coordinates.
2107  */
2108 static void
2109 fetch_texel( struct tgsi_sampler *sampler,
2110              const unsigned sview_idx,
2111              const unsigned sampler_idx,
2112              const union tgsi_exec_channel *s,
2113              const union tgsi_exec_channel *t,
2114              const union tgsi_exec_channel *p,
2115              const union tgsi_exec_channel *c0,
2116              const union tgsi_exec_channel *c1,
2117              float derivs[3][2][TGSI_QUAD_SIZE],
2118              const int8_t offset[3],
2119              enum tgsi_sampler_control control,
2120              union tgsi_exec_channel *r,
2121              union tgsi_exec_channel *g,
2122              union tgsi_exec_channel *b,
2123              union tgsi_exec_channel *a )
2124 {
2125    uint j;
2126    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2127
2128    /* FIXME: handle explicit derivs, offsets */
2129    sampler->get_samples(sampler, sview_idx, sampler_idx,
2130                         s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
2131
2132    for (j = 0; j < 4; j++) {
2133       r->f[j] = rgba[0][j];
2134       g->f[j] = rgba[1][j];
2135       b->f[j] = rgba[2][j];
2136       a->f[j] = rgba[3][j];
2137    }
2138 }
2139
2140
/*
 * Modifier codes passed to the texture execution helpers; they select
 * how the extra instruction argument is interpreted.
 */
enum {
   TEX_MODIFIER_NONE         = 0,
   TEX_MODIFIER_PROJECTED    = 1,
   TEX_MODIFIER_LOD_BIAS     = 2,
   TEX_MODIFIER_EXPLICIT_LOD = 3,
   TEX_MODIFIER_LEVEL_ZERO   = 4,
   TEX_MODIFIER_GATHER       = 5
};
2147
2148 /*
2149  * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
2150  */
2151 static void
2152 fetch_texel_offsets(struct tgsi_exec_machine *mach,
2153                     const struct tgsi_full_instruction *inst,
2154                     int8_t offsets[3])
2155 {
2156    if (inst->Texture.NumOffsets == 1) {
2157       union tgsi_exec_channel index;
2158       union tgsi_exec_channel offset[3];
2159       index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
2160       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2161                              inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2162       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2163                              inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2164       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2165                              inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2166      offsets[0] = offset[0].i[0];
2167      offsets[1] = offset[1].i[0];
2168      offsets[2] = offset[2].i[0];
2169    } else {
2170      assert(inst->Texture.NumOffsets == 0);
2171      offsets[0] = offsets[1] = offsets[2] = 0;
2172    }
2173 }
2174
2175
2176 /*
2177  * Fetch dx and dy values for one channel (s, t or r).
2178  * Put dx values into one float array, dy values into another.
2179  */
2180 static void
2181 fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
2182                            const struct tgsi_full_instruction *inst,
2183                            unsigned regdsrcx,
2184                            unsigned chan,
2185                            float derivs[2][TGSI_QUAD_SIZE])
2186 {
2187    union tgsi_exec_channel d;
2188    FETCH(&d, regdsrcx, chan);
2189    derivs[0][0] = d.f[0];
2190    derivs[0][1] = d.f[1];
2191    derivs[0][2] = d.f[2];
2192    derivs[0][3] = d.f[3];
2193    FETCH(&d, regdsrcx + 1, chan);
2194    derivs[1][0] = d.f[0];
2195    derivs[1][1] = d.f[1];
2196    derivs[1][2] = d.f[2];
2197    derivs[1][3] = d.f[3];
2198 }
2199
2200 static uint
2201 fetch_sampler_unit(struct tgsi_exec_machine *mach,
2202                    const struct tgsi_full_instruction *inst,
2203                    uint sampler)
2204 {
2205    uint unit = 0;
2206    int i;
2207    if (inst->Src[sampler].Register.Indirect) {
2208       const struct tgsi_full_src_register *reg = &inst->Src[sampler];
2209       union tgsi_exec_channel indir_index, index2;
2210       const uint execmask = mach->ExecMask;
2211       index2.i[0] =
2212       index2.i[1] =
2213       index2.i[2] =
2214       index2.i[3] = reg->Indirect.Index;
2215
2216       fetch_src_file_channel(mach,
2217                              reg->Indirect.File,
2218                              reg->Indirect.Swizzle,
2219                              &index2,
2220                              &ZeroVec,
2221                              &indir_index);
2222       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2223          if (execmask & (1 << i)) {
2224             unit = inst->Src[sampler].Register.Index + indir_index.i[i];
2225             break;
2226          }
2227       }
2228
2229    } else {
2230       unit = inst->Src[sampler].Register.Index;
2231    }
2232    return unit;
2233 }
2234
2235 /*
2236  * execute a texture instruction.
2237  *
2238  * modifier is used to control the channel routing for the
2239  * instruction variants like proj, lod, and texture with lod bias.
2240  * sampler indicates which src register the sampler is contained in.
2241  */
2242 static void
2243 exec_tex(struct tgsi_exec_machine *mach,
2244          const struct tgsi_full_instruction *inst,
2245          uint modifier, uint sampler)
2246 {
2247    const union tgsi_exec_channel *args[5], *proj = NULL;
2248    union tgsi_exec_channel r[5];
2249    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2250    uint chan;
2251    uint unit;
2252    int8_t offsets[3];
2253    int dim, shadow_ref, i;
2254
2255    unit = fetch_sampler_unit(mach, inst, sampler);
2256    /* always fetch all 3 offsets, overkill but keeps code simple */
2257    fetch_texel_offsets(mach, inst, offsets);
2258
2259    assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
2260    assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
2261
2262    dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2263    shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
2264
2265    assert(dim <= 4);
2266    if (shadow_ref >= 0)
2267       assert(shadow_ref >= dim && shadow_ref < (int)ARRAY_SIZE(args));
2268
2269    /* fetch modifier to the last argument */
2270    if (modifier != TEX_MODIFIER_NONE) {
2271       const int last = ARRAY_SIZE(args) - 1;
2272
2273       /* fetch modifier from src0.w or src1.x */
2274       if (sampler == 1) {
2275          assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
2276          FETCH(&r[last], 0, TGSI_CHAN_W);
2277       }
2278       else {
2279          FETCH(&r[last], 1, TGSI_CHAN_X);
2280       }
2281
2282       if (modifier != TEX_MODIFIER_PROJECTED) {
2283          args[last] = &r[last];
2284       }
2285       else {
2286          proj = &r[last];
2287          args[last] = &ZeroVec;
2288       }
2289
2290       /* point unused arguments to zero vector */
2291       for (i = dim; i < last; i++)
2292          args[i] = &ZeroVec;
2293
2294       if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
2295          control = TGSI_SAMPLER_LOD_EXPLICIT;
2296       else if (modifier == TEX_MODIFIER_LOD_BIAS)
2297          control = TGSI_SAMPLER_LOD_BIAS;
2298       else if (modifier == TEX_MODIFIER_GATHER)
2299          control = TGSI_SAMPLER_GATHER;
2300    }
2301    else {
2302       for (i = dim; i < (int)ARRAY_SIZE(args); i++)
2303          args[i] = &ZeroVec;
2304    }
2305
2306    /* fetch coordinates */
2307    for (i = 0; i < dim; i++) {
2308       FETCH(&r[i], 0, TGSI_CHAN_X + i);
2309
2310       if (proj)
2311          micro_div(&r[i], &r[i], proj);
2312
2313       args[i] = &r[i];
2314    }
2315
2316    /* fetch reference value */
2317    if (shadow_ref >= 0) {
2318       FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));
2319
2320       if (proj)
2321          micro_div(&r[shadow_ref], &r[shadow_ref], proj);
2322
2323       args[shadow_ref] = &r[shadow_ref];
2324    }
2325
2326    fetch_texel(mach->Sampler, unit, unit,
2327          args[0], args[1], args[2], args[3], args[4],
2328          NULL, offsets, control,
2329          &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2330
2331 #if 0
2332    debug_printf("fetch r: %g %g %g %g\n",
2333          r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
2334    debug_printf("fetch g: %g %g %g %g\n",
2335          r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
2336    debug_printf("fetch b: %g %g %g %g\n",
2337          r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
2338    debug_printf("fetch a: %g %g %g %g\n",
2339          r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
2340 #endif
2341
2342    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2343       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2344          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2345       }
2346    }
2347 }
2348
2349 static void
2350 exec_lodq(struct tgsi_exec_machine *mach,
2351           const struct tgsi_full_instruction *inst)
2352 {
2353    uint resource_unit, sampler_unit;
2354    unsigned dim;
2355    unsigned i;
2356    union tgsi_exec_channel coords[4];
2357    const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
2358    union tgsi_exec_channel r[2];
2359
2360    resource_unit = fetch_sampler_unit(mach, inst, 1);
2361    if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2362       uint target = mach->SamplerViews[resource_unit].Resource;
2363       dim = tgsi_util_get_texture_coord_dim(target);
2364       sampler_unit = fetch_sampler_unit(mach, inst, 2);
2365    } else {
2366       dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2367       sampler_unit = resource_unit;
2368    }
2369    assert(dim <= ARRAY_SIZE(coords));
2370    /* fetch coordinates */
2371    for (i = 0; i < dim; i++) {
2372       FETCH(&coords[i], 0, TGSI_CHAN_X + i);
2373       args[i] = &coords[i];
2374    }
2375    for (i = dim; i < ARRAY_SIZE(coords); i++) {
2376       args[i] = &ZeroVec;
2377    }
2378    mach->Sampler->query_lod(mach->Sampler, resource_unit, sampler_unit,
2379                             args[0]->f,
2380                             args[1]->f,
2381                             args[2]->f,
2382                             args[3]->f,
2383                             TGSI_SAMPLER_LOD_NONE,
2384                             r[0].f,
2385                             r[1].f);
2386
2387    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2388       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2389                  TGSI_EXEC_DATA_FLOAT);
2390    }
2391    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2392       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2393                  TGSI_EXEC_DATA_FLOAT);
2394    }
2395    if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2396       unsigned char swizzles[4];
2397       unsigned chan;
2398       swizzles[0] = inst->Src[1].Register.SwizzleX;
2399       swizzles[1] = inst->Src[1].Register.SwizzleY;
2400       swizzles[2] = inst->Src[1].Register.SwizzleZ;
2401       swizzles[3] = inst->Src[1].Register.SwizzleW;
2402
2403       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2404          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2405             if (swizzles[chan] >= 2) {
2406                store_dest(mach, &ZeroVec,
2407                           &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2408             } else {
2409                store_dest(mach, &r[swizzles[chan]],
2410                           &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2411             }
2412          }
2413       }
2414    } else {
2415       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2416          store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2417                     TGSI_EXEC_DATA_FLOAT);
2418       }
2419       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2420          store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2421                     TGSI_EXEC_DATA_FLOAT);
2422       }
2423    }
2424 }
2425
2426 static void
2427 exec_txd(struct tgsi_exec_machine *mach,
2428          const struct tgsi_full_instruction *inst)
2429 {
2430    union tgsi_exec_channel r[4];
2431    float derivs[3][2][TGSI_QUAD_SIZE];
2432    uint chan;
2433    uint unit;
2434    int8_t offsets[3];
2435
2436    unit = fetch_sampler_unit(mach, inst, 3);
2437    /* always fetch all 3 offsets, overkill but keeps code simple */
2438    fetch_texel_offsets(mach, inst, offsets);
2439
2440    switch (inst->Texture.Texture) {
2441    case TGSI_TEXTURE_1D:
2442       FETCH(&r[0], 0, TGSI_CHAN_X);
2443
2444       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2445
2446       fetch_texel(mach->Sampler, unit, unit,
2447                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2448                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2449                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2450       break;
2451
2452    case TGSI_TEXTURE_SHADOW1D:
2453    case TGSI_TEXTURE_1D_ARRAY:
2454    case TGSI_TEXTURE_SHADOW1D_ARRAY:
2455       /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
2456       FETCH(&r[0], 0, TGSI_CHAN_X);
2457       FETCH(&r[1], 0, TGSI_CHAN_Y);
2458       FETCH(&r[2], 0, TGSI_CHAN_Z);
2459
2460       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2461
2462       fetch_texel(mach->Sampler, unit, unit,
2463                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2464                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2465                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2466       break;
2467
2468    case TGSI_TEXTURE_2D:
2469    case TGSI_TEXTURE_RECT:
2470       FETCH(&r[0], 0, TGSI_CHAN_X);
2471       FETCH(&r[1], 0, TGSI_CHAN_Y);
2472
2473       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2474       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2475
2476       fetch_texel(mach->Sampler, unit, unit,
2477                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2478                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2479                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2480       break;
2481
2482
2483    case TGSI_TEXTURE_SHADOW2D:
2484    case TGSI_TEXTURE_SHADOWRECT:
2485    case TGSI_TEXTURE_2D_ARRAY:
2486    case TGSI_TEXTURE_SHADOW2D_ARRAY:
2487       /* only SHADOW2D_ARRAY actually needs W */
2488       FETCH(&r[0], 0, TGSI_CHAN_X);
2489       FETCH(&r[1], 0, TGSI_CHAN_Y);
2490       FETCH(&r[2], 0, TGSI_CHAN_Z);
2491       FETCH(&r[3], 0, TGSI_CHAN_W);
2492
2493       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2494       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2495
2496       fetch_texel(mach->Sampler, unit, unit,
2497                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2498                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2499                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2500       break;
2501
2502    case TGSI_TEXTURE_3D:
2503    case TGSI_TEXTURE_CUBE:
2504    case TGSI_TEXTURE_CUBE_ARRAY:
2505    case TGSI_TEXTURE_SHADOWCUBE:
2506       /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
2507       FETCH(&r[0], 0, TGSI_CHAN_X);
2508       FETCH(&r[1], 0, TGSI_CHAN_Y);
2509       FETCH(&r[2], 0, TGSI_CHAN_Z);
2510       FETCH(&r[3], 0, TGSI_CHAN_W);
2511
2512       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2513       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2514       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
2515
2516       fetch_texel(mach->Sampler, unit, unit,
2517                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2518                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2519                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2520       break;
2521
2522    default:
2523       assert(0);
2524    }
2525
2526    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2527       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2528          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2529       }
2530    }
2531 }
2532
2533
2534 static void
2535 exec_txf(struct tgsi_exec_machine *mach,
2536          const struct tgsi_full_instruction *inst)
2537 {
2538    union tgsi_exec_channel r[4];
2539    uint chan;
2540    uint unit;
2541    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2542    int j;
2543    int8_t offsets[3];
2544    unsigned target;
2545
2546    unit = fetch_sampler_unit(mach, inst, 1);
2547    /* always fetch all 3 offsets, overkill but keeps code simple */
2548    fetch_texel_offsets(mach, inst, offsets);
2549
2550    IFETCH(&r[3], 0, TGSI_CHAN_W);
2551
2552    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2553        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2554       target = mach->SamplerViews[unit].Resource;
2555    }
2556    else {
2557       target = inst->Texture.Texture;
2558    }
2559    switch(target) {
2560    case TGSI_TEXTURE_3D:
2561    case TGSI_TEXTURE_2D_ARRAY:
2562    case TGSI_TEXTURE_SHADOW2D_ARRAY:
2563    case TGSI_TEXTURE_2D_ARRAY_MSAA:
2564       IFETCH(&r[2], 0, TGSI_CHAN_Z);
2565       /* fallthrough */
2566    case TGSI_TEXTURE_2D:
2567    case TGSI_TEXTURE_RECT:
2568    case TGSI_TEXTURE_SHADOW1D_ARRAY:
2569    case TGSI_TEXTURE_SHADOW2D:
2570    case TGSI_TEXTURE_SHADOWRECT:
2571    case TGSI_TEXTURE_1D_ARRAY:
2572    case TGSI_TEXTURE_2D_MSAA:
2573       IFETCH(&r[1], 0, TGSI_CHAN_Y);
2574       /* fallthrough */
2575    case TGSI_TEXTURE_BUFFER:
2576    case TGSI_TEXTURE_1D:
2577    case TGSI_TEXTURE_SHADOW1D:
2578       IFETCH(&r[0], 0, TGSI_CHAN_X);
2579       break;
2580    default:
2581       assert(0);
2582       break;
2583    }
2584
2585    mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
2586                             offsets, rgba);
2587
2588    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
2589       r[0].f[j] = rgba[0][j];
2590       r[1].f[j] = rgba[1][j];
2591       r[2].f[j] = rgba[2][j];
2592       r[3].f[j] = rgba[3][j];
2593    }
2594
2595    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2596        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2597       unsigned char swizzles[4];
2598       swizzles[0] = inst->Src[1].Register.SwizzleX;
2599       swizzles[1] = inst->Src[1].Register.SwizzleY;
2600       swizzles[2] = inst->Src[1].Register.SwizzleZ;
2601       swizzles[3] = inst->Src[1].Register.SwizzleW;
2602
2603       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2604          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2605             store_dest(mach, &r[swizzles[chan]],
2606                        &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2607          }
2608       }
2609    }
2610    else {
2611       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2612          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2613             store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2614          }
2615       }
2616    }
2617 }
2618
2619 static void
2620 exec_txq(struct tgsi_exec_machine *mach,
2621          const struct tgsi_full_instruction *inst)
2622 {
2623    int result[4];
2624    union tgsi_exec_channel r[4], src;
2625    uint chan;
2626    uint unit;
2627    int i,j;
2628
2629    unit = fetch_sampler_unit(mach, inst, 1);
2630
2631    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
2632
2633    /* XXX: This interface can't return per-pixel values */
2634    mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
2635
2636    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2637       for (j = 0; j < 4; j++) {
2638          r[j].i[i] = result[j];
2639       }
2640    }
2641
2642    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2643       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2644          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2645                     TGSI_EXEC_DATA_INT);
2646       }
2647    }
2648 }
2649
2650 static void
2651 exec_sample(struct tgsi_exec_machine *mach,
2652             const struct tgsi_full_instruction *inst,
2653             uint modifier, boolean compare)
2654 {
2655    const uint resource_unit = inst->Src[1].Register.Index;
2656    const uint sampler_unit = inst->Src[2].Register.Index;
2657    union tgsi_exec_channel r[5], c1;
2658    const union tgsi_exec_channel *lod = &ZeroVec;
2659    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2660    uint chan;
2661    unsigned char swizzles[4];
2662    int8_t offsets[3];
2663
2664    /* always fetch all 3 offsets, overkill but keeps code simple */
2665    fetch_texel_offsets(mach, inst, offsets);
2666
2667    assert(modifier != TEX_MODIFIER_PROJECTED);
2668
2669    if (modifier != TEX_MODIFIER_NONE) {
2670       if (modifier == TEX_MODIFIER_LOD_BIAS) {
2671          FETCH(&c1, 3, TGSI_CHAN_X);
2672          lod = &c1;
2673          control = TGSI_SAMPLER_LOD_BIAS;
2674       }
2675       else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2676          FETCH(&c1, 3, TGSI_CHAN_X);
2677          lod = &c1;
2678          control = TGSI_SAMPLER_LOD_EXPLICIT;
2679       }
2680       else if (modifier == TEX_MODIFIER_GATHER) {
2681          control = TGSI_SAMPLER_GATHER;
2682       }
2683       else {
2684          assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
2685          control = TGSI_SAMPLER_LOD_ZERO;
2686       }
2687    }
2688
2689    FETCH(&r[0], 0, TGSI_CHAN_X);
2690
2691    switch (mach->SamplerViews[resource_unit].Resource) {
2692    case TGSI_TEXTURE_1D:
2693       if (compare) {
2694          FETCH(&r[2], 3, TGSI_CHAN_X);
2695          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2696                      &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
2697                      NULL, offsets, control,
2698                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2699       }
2700       else {
2701          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2702                      &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
2703                      NULL, offsets, control,
2704                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2705       }
2706       break;
2707
2708    case TGSI_TEXTURE_1D_ARRAY:
2709    case TGSI_TEXTURE_2D:
2710    case TGSI_TEXTURE_RECT:
2711       FETCH(&r[1], 0, TGSI_CHAN_Y);
2712       if (compare) {
2713          FETCH(&r[2], 3, TGSI_CHAN_X);
2714          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2715                      &r[0], &r[1], &r[2], &ZeroVec, lod,    /* S, T, P, C, LOD */
2716                      NULL, offsets, control,
2717                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2718       }
2719       else {
2720          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2721                      &r[0], &r[1], &ZeroVec, &ZeroVec, lod,    /* S, T, P, C, LOD */
2722                      NULL, offsets, control,
2723                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2724       }
2725       break;
2726
2727    case TGSI_TEXTURE_2D_ARRAY:
2728    case TGSI_TEXTURE_3D:
2729    case TGSI_TEXTURE_CUBE:
2730       FETCH(&r[1], 0, TGSI_CHAN_Y);
2731       FETCH(&r[2], 0, TGSI_CHAN_Z);
2732       if(compare) {
2733          FETCH(&r[3], 3, TGSI_CHAN_X);
2734          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2735                      &r[0], &r[1], &r[2], &r[3], lod,
2736                      NULL, offsets, control,
2737                      &r[0], &r[1], &r[2], &r[3]);
2738       }
2739       else {
2740          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2741                      &r[0], &r[1], &r[2], &ZeroVec, lod,
2742                      NULL, offsets, control,
2743                      &r[0], &r[1], &r[2], &r[3]);
2744       }
2745       break;
2746
2747    case TGSI_TEXTURE_CUBE_ARRAY:
2748       FETCH(&r[1], 0, TGSI_CHAN_Y);
2749       FETCH(&r[2], 0, TGSI_CHAN_Z);
2750       FETCH(&r[3], 0, TGSI_CHAN_W);
2751       if(compare) {
2752          FETCH(&r[4], 3, TGSI_CHAN_X);
2753          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2754                      &r[0], &r[1], &r[2], &r[3], &r[4],
2755                      NULL, offsets, control,
2756                      &r[0], &r[1], &r[2], &r[3]);
2757       }
2758       else {
2759          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2760                      &r[0], &r[1], &r[2], &r[3], lod,
2761                      NULL, offsets, control,
2762                      &r[0], &r[1], &r[2], &r[3]);
2763       }
2764       break;
2765
2766
2767    default:
2768       assert(0);
2769    }
2770
2771    swizzles[0] = inst->Src[1].Register.SwizzleX;
2772    swizzles[1] = inst->Src[1].Register.SwizzleY;
2773    swizzles[2] = inst->Src[1].Register.SwizzleZ;
2774    swizzles[3] = inst->Src[1].Register.SwizzleW;
2775
2776    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2777       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2778          store_dest(mach, &r[swizzles[chan]],
2779                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2780       }
2781    }
2782 }
2783
2784 static void
2785 exec_sample_d(struct tgsi_exec_machine *mach,
2786               const struct tgsi_full_instruction *inst)
2787 {
2788    const uint resource_unit = inst->Src[1].Register.Index;
2789    const uint sampler_unit = inst->Src[2].Register.Index;
2790    union tgsi_exec_channel r[4];
2791    float derivs[3][2][TGSI_QUAD_SIZE];
2792    uint chan;
2793    unsigned char swizzles[4];
2794    int8_t offsets[3];
2795
2796    /* always fetch all 3 offsets, overkill but keeps code simple */
2797    fetch_texel_offsets(mach, inst, offsets);
2798
2799    FETCH(&r[0], 0, TGSI_CHAN_X);
2800
2801    switch (mach->SamplerViews[resource_unit].Resource) {
2802    case TGSI_TEXTURE_1D:
2803    case TGSI_TEXTURE_1D_ARRAY:
2804       /* only 1D array actually needs Y */
2805       FETCH(&r[1], 0, TGSI_CHAN_Y);
2806
2807       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2808
2809       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2810                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2811                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2812                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2813       break;
2814
2815    case TGSI_TEXTURE_2D:
2816    case TGSI_TEXTURE_RECT:
2817    case TGSI_TEXTURE_2D_ARRAY:
2818       /* only 2D array actually needs Z */
2819       FETCH(&r[1], 0, TGSI_CHAN_Y);
2820       FETCH(&r[2], 0, TGSI_CHAN_Z);
2821
2822       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2823       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2824
2825       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2826                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* inputs */
2827                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2828                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2829       break;
2830
2831    case TGSI_TEXTURE_3D:
2832    case TGSI_TEXTURE_CUBE:
2833    case TGSI_TEXTURE_CUBE_ARRAY:
2834       /* only cube array actually needs W */
2835       FETCH(&r[1], 0, TGSI_CHAN_Y);
2836       FETCH(&r[2], 0, TGSI_CHAN_Z);
2837       FETCH(&r[3], 0, TGSI_CHAN_W);
2838
2839       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2840       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2841       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
2842
2843       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2844                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,
2845                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2846                   &r[0], &r[1], &r[2], &r[3]);
2847       break;
2848
2849    default:
2850       assert(0);
2851    }
2852
2853    swizzles[0] = inst->Src[1].Register.SwizzleX;
2854    swizzles[1] = inst->Src[1].Register.SwizzleY;
2855    swizzles[2] = inst->Src[1].Register.SwizzleZ;
2856    swizzles[3] = inst->Src[1].Register.SwizzleW;
2857
2858    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2859       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2860          store_dest(mach, &r[swizzles[chan]],
2861                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2862       }
2863    }
2864 }
2865
2866
2867 /**
2868  * Evaluate a constant-valued coefficient at the position of the
2869  * current quad.
2870  */
2871 static void
2872 eval_constant_coef(
2873    struct tgsi_exec_machine *mach,
2874    unsigned attrib,
2875    unsigned chan )
2876 {
2877    unsigned i;
2878
2879    for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
2880       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2881    }
2882 }
2883
2884 static void
2885 interp_constant_offset(
2886       UNUSED const struct tgsi_exec_machine *mach,
2887       UNUSED unsigned attrib,
2888       UNUSED unsigned chan,
2889       UNUSED float ofs_x,
2890       UNUSED float ofs_y,
2891       UNUSED union tgsi_exec_channel *out_chan)
2892 {
2893 }
2894
2895 /**
2896  * Evaluate a linear-valued coefficient at the position of the
2897  * current quad.
2898  */
2899 static void
2900 interp_linear_offset(
2901       const struct tgsi_exec_machine *mach,
2902       unsigned attrib,
2903       unsigned chan,
2904       float ofs_x,
2905       float ofs_y,
2906       union tgsi_exec_channel *out_chan)
2907 {
2908    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2909    const float dady = mach->InterpCoefs[attrib].dady[chan];
2910    const float delta = ofs_x * dadx + ofs_y * dady;
2911    out_chan->f[0] += delta;
2912    out_chan->f[1] += delta;
2913    out_chan->f[2] += delta;
2914    out_chan->f[3] += delta;
2915 }
2916
2917 static void
2918 eval_linear_coef(struct tgsi_exec_machine *mach,
2919                  unsigned attrib,
2920                  unsigned chan)
2921 {
2922    const float x = mach->QuadPos.xyzw[0].f[0];
2923    const float y = mach->QuadPos.xyzw[1].f[0];
2924    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2925    const float dady = mach->InterpCoefs[attrib].dady[chan];
2926    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2927
2928    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2929    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2930    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2931    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2932 }
2933
2934 /**
2935  * Evaluate a perspective-valued coefficient at the position of the
2936  * current quad.
2937  */
2938
2939 static void
2940 interp_perspective_offset(
2941    const struct tgsi_exec_machine *mach,
2942    unsigned attrib,
2943    unsigned chan,
2944    float ofs_x,
2945    float ofs_y,
2946    union tgsi_exec_channel *out_chan)
2947 {
2948    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2949    const float dady = mach->InterpCoefs[attrib].dady[chan];
2950    const float *w = mach->QuadPos.xyzw[3].f;
2951    const float delta = ofs_x * dadx + ofs_y * dady;
2952    out_chan->f[0] += delta / w[0];
2953    out_chan->f[1] += delta / w[1];
2954    out_chan->f[2] += delta / w[2];
2955    out_chan->f[3] += delta / w[3];
2956 }
2957
2958 static void
2959 eval_perspective_coef(
2960    struct tgsi_exec_machine *mach,
2961    unsigned attrib,
2962    unsigned chan )
2963 {
2964    const float x = mach->QuadPos.xyzw[0].f[0];
2965    const float y = mach->QuadPos.xyzw[1].f[0];
2966    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2967    const float dady = mach->InterpCoefs[attrib].dady[chan];
2968    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2969    const float *w = mach->QuadPos.xyzw[3].f;
2970    /* divide by W here */
2971    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2972    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2973    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2974    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2975 }
2976
2977
/** Signature shared by the eval_*_coef() input evaluators. */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
2982
2983 static void
2984 exec_declaration(struct tgsi_exec_machine *mach,
2985                  const struct tgsi_full_declaration *decl)
2986 {
2987    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
2988       mach->SamplerViews[decl->Range.First] = decl->SamplerView;
2989       return;
2990    }
2991
2992    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
2993       if (decl->Declaration.File == TGSI_FILE_INPUT) {
2994          uint first, last, mask;
2995
2996          first = decl->Range.First;
2997          last = decl->Range.Last;
2998          mask = decl->Declaration.UsageMask;
2999
3000          /* XXX we could remove this special-case code since
3001           * mach->InterpCoefs[first].a0 should already have the
3002           * front/back-face value.  But we should first update the
3003           * ureg code to emit the right UsageMask value (WRITEMASK_X).
3004           * Then, we could remove the tgsi_exec_machine::Face field.
3005           */
3006          /* XXX make FACE a system value */
3007          if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
3008             uint i;
3009
3010             assert(decl->Semantic.Index == 0);
3011             assert(first == last);
3012
3013             for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3014                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
3015             }
3016          } else {
3017             eval_coef_func eval;
3018             apply_sample_offset_func interp;
3019             uint i, j;
3020
3021             switch (decl->Interp.Interpolate) {
3022             case TGSI_INTERPOLATE_CONSTANT:
3023                eval = eval_constant_coef;
3024                interp = interp_constant_offset;
3025                break;
3026
3027             case TGSI_INTERPOLATE_LINEAR:
3028                eval = eval_linear_coef;
3029                interp = interp_linear_offset;
3030                break;
3031
3032             case TGSI_INTERPOLATE_PERSPECTIVE:
3033                eval = eval_perspective_coef;
3034                interp = interp_perspective_offset;
3035                break;
3036
3037             case TGSI_INTERPOLATE_COLOR:
3038                eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
3039                interp = mach->flatshade_color ? interp_constant_offset : interp_perspective_offset;
3040                break;
3041
3042             default:
3043                assert(0);
3044                return;
3045             }
3046
3047             for (i = first; i <= last; i++)
3048                mach->InputSampleOffsetApply[i] = interp;
3049
3050             for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
3051                if (mask & (1 << j)) {
3052                   for (i = first; i <= last; i++) {
3053                      eval(mach, i, j);
3054                   }
3055                }
3056             }
3057          }
3058
3059          if (DEBUG_EXECUTION) {
3060             uint i, j;
3061             for (i = first; i <= last; ++i) {
3062                debug_printf("IN[%2u] = ", i);
3063                for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
3064                   if (j > 0) {
3065                      debug_printf("         ");
3066                   }
3067                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3068                                mach->Inputs[i].xyzw[0].f[j], mach->Inputs[i].xyzw[0].u[j],
3069                                mach->Inputs[i].xyzw[1].f[j], mach->Inputs[i].xyzw[1].u[j],
3070                                mach->Inputs[i].xyzw[2].f[j], mach->Inputs[i].xyzw[2].u[j],
3071                                mach->Inputs[i].xyzw[3].f[j], mach->Inputs[i].xyzw[3].u[j]);
3072                }
3073             }
3074          }
3075       }
3076    }
3077
3078 }
3079
/** Per-channel operation taking one source channel and writing one result. */
typedef void (*micro_unary_op)(union tgsi_exec_channel *result,
                               const union tgsi_exec_channel *operand);
3082
3083 static void
3084 exec_scalar_unary(struct tgsi_exec_machine *mach,
3085                   const struct tgsi_full_instruction *inst,
3086                   micro_unary_op op,
3087                   enum tgsi_exec_datatype dst_datatype,
3088                   enum tgsi_exec_datatype src_datatype)
3089 {
3090    unsigned int chan;
3091    union tgsi_exec_channel src;
3092    union tgsi_exec_channel dst;
3093
3094    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
3095    op(&dst, &src);
3096    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3097       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3098          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
3099       }
3100    }
3101 }
3102
3103 static void
3104 exec_vector_unary(struct tgsi_exec_machine *mach,
3105                   const struct tgsi_full_instruction *inst,
3106                   micro_unary_op op,
3107                   enum tgsi_exec_datatype dst_datatype,
3108                   enum tgsi_exec_datatype src_datatype)
3109 {
3110    unsigned int chan;
3111    struct tgsi_exec_vector dst;
3112
3113    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3114       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3115          union tgsi_exec_channel src;
3116
3117          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
3118          op(&dst.xyzw[chan], &src);
3119       }
3120    }
3121    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3122       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3123          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3124       }
3125    }
3126 }
3127
/** Per-channel operation taking two source channels and writing one result. */
typedef void (*micro_binary_op)(union tgsi_exec_channel *result,
                                const union tgsi_exec_channel *lhs,
                                const union tgsi_exec_channel *rhs);
3131
3132 static void
3133 exec_scalar_binary(struct tgsi_exec_machine *mach,
3134                    const struct tgsi_full_instruction *inst,
3135                    micro_binary_op op,
3136                    enum tgsi_exec_datatype dst_datatype,
3137                    enum tgsi_exec_datatype src_datatype)
3138 {
3139    unsigned int chan;
3140    union tgsi_exec_channel src[2];
3141    union tgsi_exec_channel dst;
3142
3143    fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
3144    fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, src_datatype);
3145    op(&dst, &src[0], &src[1]);
3146    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3147       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3148          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
3149       }
3150    }
3151 }
3152
3153 static void
3154 exec_vector_binary(struct tgsi_exec_machine *mach,
3155                    const struct tgsi_full_instruction *inst,
3156                    micro_binary_op op,
3157                    enum tgsi_exec_datatype dst_datatype,
3158                    enum tgsi_exec_datatype src_datatype)
3159 {
3160    unsigned int chan;
3161    struct tgsi_exec_vector dst;
3162
3163    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3164       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3165          union tgsi_exec_channel src[2];
3166
3167          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3168          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3169          op(&dst.xyzw[chan], &src[0], &src[1]);
3170       }
3171    }
3172    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3173       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3174          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3175       }
3176    }
3177 }
3178
/** Per-channel operation taking three source channels and writing one result. */
typedef void (*micro_trinary_op)(union tgsi_exec_channel *result,
                                 const union tgsi_exec_channel *a,
                                 const union tgsi_exec_channel *b,
                                 const union tgsi_exec_channel *c);
3183
3184 static void
3185 exec_vector_trinary(struct tgsi_exec_machine *mach,
3186                     const struct tgsi_full_instruction *inst,
3187                     micro_trinary_op op,
3188                     enum tgsi_exec_datatype dst_datatype,
3189                     enum tgsi_exec_datatype src_datatype)
3190 {
3191    unsigned int chan;
3192    struct tgsi_exec_vector dst;
3193
3194    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3195       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3196          union tgsi_exec_channel src[3];
3197
3198          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3199          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3200          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3201          op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3202       }
3203    }
3204    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3205       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3206          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3207       }
3208    }
3209 }
3210
/** Per-channel operation taking four source channels and writing one result. */
typedef void (*micro_quaternary_op)(union tgsi_exec_channel *result,
                                    const union tgsi_exec_channel *a,
                                    const union tgsi_exec_channel *b,
                                    const union tgsi_exec_channel *c,
                                    const union tgsi_exec_channel *d);
3216
3217 static void
3218 exec_vector_quaternary(struct tgsi_exec_machine *mach,
3219                        const struct tgsi_full_instruction *inst,
3220                        micro_quaternary_op op,
3221                        enum tgsi_exec_datatype dst_datatype,
3222                        enum tgsi_exec_datatype src_datatype)
3223 {
3224    unsigned int chan;
3225    struct tgsi_exec_vector dst;
3226
3227    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3228       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3229          union tgsi_exec_channel src[4];
3230
3231          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3232          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3233          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3234          fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
3235          op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
3236       }
3237    }
3238    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3239       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3240          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3241       }
3242    }
3243 }
3244
3245 static void
3246 exec_dp3(struct tgsi_exec_machine *mach,
3247          const struct tgsi_full_instruction *inst)
3248 {
3249    unsigned int chan;
3250    union tgsi_exec_channel arg[3];
3251
3252    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3253    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3254    micro_mul(&arg[2], &arg[0], &arg[1]);
3255
3256    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
3257       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3258       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3259       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3260    }
3261
3262    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3263       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3264          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3265       }
3266    }
3267 }
3268
3269 static void
3270 exec_dp4(struct tgsi_exec_machine *mach,
3271          const struct tgsi_full_instruction *inst)
3272 {
3273    unsigned int chan;
3274    union tgsi_exec_channel arg[3];
3275
3276    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3277    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3278    micro_mul(&arg[2], &arg[0], &arg[1]);
3279
3280    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
3281       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3282       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3283       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3284    }
3285
3286    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3287       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3288          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3289       }
3290    }
3291 }
3292
3293 static void
3294 exec_dp2(struct tgsi_exec_machine *mach,
3295          const struct tgsi_full_instruction *inst)
3296 {
3297    unsigned int chan;
3298    union tgsi_exec_channel arg[3];
3299
3300    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3301    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3302    micro_mul(&arg[2], &arg[0], &arg[1]);
3303
3304    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3305    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3306    micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3307
3308    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3309       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3310          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3311       }
3312    }
3313 }
3314
3315 static void
3316 exec_pk2h(struct tgsi_exec_machine *mach,
3317           const struct tgsi_full_instruction *inst)
3318 {
3319    unsigned chan;
3320    union tgsi_exec_channel arg[2], dst;
3321
3322    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3323    fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3324    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3325       dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
3326          (util_float_to_half(arg[1].f[chan]) << 16);
3327    }
3328    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3329       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3330          store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
3331       }
3332    }
3333 }
3334
3335 static void
3336 exec_up2h(struct tgsi_exec_machine *mach,
3337           const struct tgsi_full_instruction *inst)
3338 {
3339    unsigned chan;
3340    union tgsi_exec_channel arg, dst[2];
3341
3342    fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3343    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3344       dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
3345       dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
3346    }
3347    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3348       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3349          store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3350       }
3351    }
3352 }
3353
3354 static void
3355 micro_ucmp(union tgsi_exec_channel *dst,
3356            const union tgsi_exec_channel *src0,
3357            const union tgsi_exec_channel *src1,
3358            const union tgsi_exec_channel *src2)
3359 {
3360    dst->f[0] = src0->u[0] ? src1->f[0] : src2->f[0];
3361    dst->f[1] = src0->u[1] ? src1->f[1] : src2->f[1];
3362    dst->f[2] = src0->u[2] ? src1->f[2] : src2->f[2];
3363    dst->f[3] = src0->u[3] ? src1->f[3] : src2->f[3];
3364 }
3365
3366 static void
3367 exec_ucmp(struct tgsi_exec_machine *mach,
3368           const struct tgsi_full_instruction *inst)
3369 {
3370    unsigned int chan;
3371    struct tgsi_exec_vector dst;
3372
3373    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3374       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3375          union tgsi_exec_channel src[3];
3376
3377          fetch_source(mach, &src[0], &inst->Src[0], chan,
3378                       TGSI_EXEC_DATA_UINT);
3379          fetch_source(mach, &src[1], &inst->Src[1], chan,
3380                       TGSI_EXEC_DATA_FLOAT);
3381          fetch_source(mach, &src[2], &inst->Src[2], chan,
3382                       TGSI_EXEC_DATA_FLOAT);
3383          micro_ucmp(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3384       }
3385    }
3386    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3387       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3388          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan,
3389                     TGSI_EXEC_DATA_FLOAT);
3390       }
3391    }
3392 }
3393
3394 static void
3395 exec_dst(struct tgsi_exec_machine *mach,
3396          const struct tgsi_full_instruction *inst)
3397 {
3398    union tgsi_exec_channel r[2];
3399    union tgsi_exec_channel d[4];
3400
3401    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3402       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3403       fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3404       micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
3405    }
3406    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3407       fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3408    }
3409    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3410       fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3411    }
3412
3413    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3414       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3415    }
3416    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3417       store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3418    }
3419    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3420       store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3421    }
3422    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3423       store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3424    }
3425 }
3426
3427 static void
3428 exec_log(struct tgsi_exec_machine *mach,
3429          const struct tgsi_full_instruction *inst)
3430 {
3431    union tgsi_exec_channel r[3];
3432
3433    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3434    micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
3435    micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
3436    micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
3437    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3438       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3439    }
3440    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3441       micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
3442       micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
3443       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3444    }
3445    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3446       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3447    }
3448    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3449       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3450    }
3451 }
3452
3453 static void
3454 exec_exp(struct tgsi_exec_machine *mach,
3455          const struct tgsi_full_instruction *inst)
3456 {
3457    union tgsi_exec_channel r[3];
3458
3459    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3460    micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
3461    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3462       micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
3463       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3464    }
3465    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3466       micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
3467       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3468    }
3469    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3470       micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
3471       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3472    }
3473    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3474       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3475    }
3476 }
3477
3478 static void
3479 exec_lit(struct tgsi_exec_machine *mach,
3480          const struct tgsi_full_instruction *inst)
3481 {
3482    union tgsi_exec_channel r[3];
3483    union tgsi_exec_channel d[3];
3484
3485    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3486       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3487       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3488          fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3489          micro_max(&r[1], &r[1], &ZeroVec);
3490
3491          fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3492          micro_min(&r[2], &r[2], &P128Vec);
3493          micro_max(&r[2], &r[2], &M128Vec);
3494          micro_pow(&r[1], &r[1], &r[2]);
3495          micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3496          store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3497       }
3498       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3499          micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
3500          store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3501       }
3502    }
3503    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3504       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3505    }
3506
3507    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3508       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3509    }
3510 }
3511
3512 static void
3513 exec_break(struct tgsi_exec_machine *mach)
3514 {
3515    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3516       /* turn off loop channels for each enabled exec channel */
3517       mach->LoopMask &= ~mach->ExecMask;
3518       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3519       UPDATE_EXEC_MASK(mach);
3520    } else {
3521       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3522
3523       mach->Switch.mask = 0x0;
3524
3525       UPDATE_EXEC_MASK(mach);
3526    }
3527 }
3528
3529 static void
3530 exec_switch(struct tgsi_exec_machine *mach,
3531             const struct tgsi_full_instruction *inst)
3532 {
3533    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3534    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3535
3536    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3537    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3538    mach->Switch.mask = 0x0;
3539    mach->Switch.defaultMask = 0x0;
3540
3541    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3542    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3543
3544    UPDATE_EXEC_MASK(mach);
3545 }
3546
3547 static void
3548 exec_case(struct tgsi_exec_machine *mach,
3549           const struct tgsi_full_instruction *inst)
3550 {
3551    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3552    union tgsi_exec_channel src;
3553    uint mask = 0;
3554
3555    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3556
3557    if (mach->Switch.selector.u[0] == src.u[0]) {
3558       mask |= 0x1;
3559    }
3560    if (mach->Switch.selector.u[1] == src.u[1]) {
3561       mask |= 0x2;
3562    }
3563    if (mach->Switch.selector.u[2] == src.u[2]) {
3564       mask |= 0x4;
3565    }
3566    if (mach->Switch.selector.u[3] == src.u[3]) {
3567       mask |= 0x8;
3568    }
3569
3570    mach->Switch.defaultMask |= mask;
3571
3572    mach->Switch.mask |= mask & prevMask;
3573
3574    UPDATE_EXEC_MASK(mach);
3575 }
3576
3577 /* FIXME: this will only work if default is last */
3578 static void
3579 exec_default(struct tgsi_exec_machine *mach)
3580 {
3581    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3582
3583    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3584
3585    UPDATE_EXEC_MASK(mach);
3586 }
3587
3588 static void
3589 exec_endswitch(struct tgsi_exec_machine *mach)
3590 {
3591    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3592    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3593
3594    UPDATE_EXEC_MASK(mach);
3595 }
3596
/*
 * Function-pointer types for the per-quad "micro" operations that
 * involve 64-bit channels.
 */

/*
 * 64-bit destination, 64-bit source(s).  Callers in this file pass either
 * a pointer to a single source or a pointer to the first element of an
 * array of two or three sources, depending on the operation.
 */
typedef void (* micro_dop)(union tgsi_double_channel *dst,
                           const union tgsi_double_channel *src);

/* 64-bit destination, one 64-bit source and one 32-bit source. */
typedef void (* micro_dop_sop)(union tgsi_double_channel *dst,
                               const union tgsi_double_channel *src0,
                               union tgsi_exec_channel *src1);

/* 64-bit destination computed from a 32-bit source. */
typedef void (* micro_dop_s)(union tgsi_double_channel *dst,
                             const union tgsi_exec_channel *src);

/* 32-bit destination computed from a 64-bit source. */
typedef void (* micro_sop_d)(union tgsi_exec_channel *dst,
                             const union tgsi_double_channel *src);
3609
3610 static void
3611 fetch_double_channel(struct tgsi_exec_machine *mach,
3612                      union tgsi_double_channel *chan,
3613                      const struct tgsi_full_src_register *reg,
3614                      uint chan_0,
3615                      uint chan_1)
3616 {
3617    union tgsi_exec_channel src[2];
3618    uint i;
3619
3620    fetch_source_d(mach, &src[0], reg, chan_0);
3621    fetch_source_d(mach, &src[1], reg, chan_1);
3622
3623    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3624       chan->u[i][0] = src[0].u[i];
3625       chan->u[i][1] = src[1].u[i];
3626    }
3627    if (reg->Register.Absolute) {
3628       micro_dabs(chan, chan);
3629    }
3630    if (reg->Register.Negate) {
3631       micro_dneg(chan, chan);
3632    }
3633 }
3634
3635 static void
3636 store_double_channel(struct tgsi_exec_machine *mach,
3637                      const union tgsi_double_channel *chan,
3638                      const struct tgsi_full_dst_register *reg,
3639                      const struct tgsi_full_instruction *inst,
3640                      uint chan_0,
3641                      uint chan_1)
3642 {
3643    union tgsi_exec_channel dst[2];
3644    uint i;
3645    union tgsi_double_channel temp;
3646    const uint execmask = mach->ExecMask;
3647
3648    if (!inst->Instruction.Saturate) {
3649       for (i = 0; i < TGSI_QUAD_SIZE; i++)
3650          if (execmask & (1 << i)) {
3651             dst[0].u[i] = chan->u[i][0];
3652             dst[1].u[i] = chan->u[i][1];
3653          }
3654    }
3655    else {
3656       for (i = 0; i < TGSI_QUAD_SIZE; i++)
3657          if (execmask & (1 << i)) {
3658             if (chan->d[i] < 0.0)
3659                temp.d[i] = 0.0;
3660             else if (chan->d[i] > 1.0)
3661                temp.d[i] = 1.0;
3662             else
3663                temp.d[i] = chan->d[i];
3664
3665             dst[0].u[i] = temp.u[i][0];
3666             dst[1].u[i] = temp.u[i][1];
3667          }
3668    }
3669
3670    store_dest_double(mach, &dst[0], reg, chan_0, TGSI_EXEC_DATA_UINT);
3671    if (chan_1 != (unsigned)-1)
3672       store_dest_double(mach, &dst[1], reg, chan_1, TGSI_EXEC_DATA_UINT);
3673 }
3674
3675 static void
3676 exec_double_unary(struct tgsi_exec_machine *mach,
3677                   const struct tgsi_full_instruction *inst,
3678                   micro_dop op)
3679 {
3680    union tgsi_double_channel src;
3681    union tgsi_double_channel dst;
3682
3683    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3684       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3685       op(&dst, &src);
3686       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3687    }
3688    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3689       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3690       op(&dst, &src);
3691       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3692    }
3693 }
3694
3695 static void
3696 exec_double_binary(struct tgsi_exec_machine *mach,
3697                    const struct tgsi_full_instruction *inst,
3698                    micro_dop op,
3699                    enum tgsi_exec_datatype dst_datatype)
3700 {
3701    union tgsi_double_channel src[2];
3702    union tgsi_double_channel dst;
3703    int first_dest_chan, second_dest_chan;
3704    int wmask;
3705
3706    wmask = inst->Dst[0].Register.WriteMask;
3707    /* these are & because of the way DSLT etc store their destinations */
3708    if (wmask & TGSI_WRITEMASK_XY) {
3709       first_dest_chan = TGSI_CHAN_X;
3710       second_dest_chan = TGSI_CHAN_Y;
3711       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3712          first_dest_chan = (wmask & TGSI_WRITEMASK_X) ? TGSI_CHAN_X : TGSI_CHAN_Y;
3713          second_dest_chan = -1;
3714       }
3715
3716       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3717       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3718       op(&dst, src);
3719       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3720    }
3721
3722    if (wmask & TGSI_WRITEMASK_ZW) {
3723       first_dest_chan = TGSI_CHAN_Z;
3724       second_dest_chan = TGSI_CHAN_W;
3725       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3726          first_dest_chan = (wmask & TGSI_WRITEMASK_Z) ? TGSI_CHAN_Z : TGSI_CHAN_W;
3727          second_dest_chan = -1;
3728       }
3729
3730       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3731       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3732       op(&dst, src);
3733       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3734    }
3735 }
3736
3737 static void
3738 exec_double_trinary(struct tgsi_exec_machine *mach,
3739                     const struct tgsi_full_instruction *inst,
3740                     micro_dop op)
3741 {
3742    union tgsi_double_channel src[3];
3743    union tgsi_double_channel dst;
3744
3745    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3746       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3747       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3748       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_X, TGSI_CHAN_Y);
3749       op(&dst, src);
3750       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3751    }
3752    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3753       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3754       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3755       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_CHAN_W);
3756       op(&dst, src);
3757       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3758    }
3759 }
3760
3761 static void
3762 exec_dldexp(struct tgsi_exec_machine *mach,
3763             const struct tgsi_full_instruction *inst)
3764 {
3765    union tgsi_double_channel src0;
3766    union tgsi_exec_channel src1;
3767    union tgsi_double_channel dst;
3768    int wmask;
3769
3770    wmask = inst->Dst[0].Register.WriteMask;
3771    if (wmask & TGSI_WRITEMASK_XY) {
3772       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3773       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3774       micro_dldexp(&dst, &src0, &src1);
3775       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3776    }
3777
3778    if (wmask & TGSI_WRITEMASK_ZW) {
3779       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3780       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3781       micro_dldexp(&dst, &src0, &src1);
3782       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3783    }
3784 }
3785
3786 static void
3787 exec_dfracexp(struct tgsi_exec_machine *mach,
3788               const struct tgsi_full_instruction *inst)
3789 {
3790    union tgsi_double_channel src;
3791    union tgsi_double_channel dst;
3792    union tgsi_exec_channel dst_exp;
3793
3794    fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3795    micro_dfracexp(&dst, &dst_exp, &src);
3796    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)
3797       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3798    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)
3799       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3800    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3801       if (inst->Dst[1].Register.WriteMask & (1 << chan))
3802          store_dest(mach, &dst_exp, &inst->Dst[1], inst, chan, TGSI_EXEC_DATA_INT);
3803    }
3804 }
3805
3806 static void
3807 exec_arg0_64_arg1_32(struct tgsi_exec_machine *mach,
3808             const struct tgsi_full_instruction *inst,
3809             micro_dop_sop op)
3810 {
3811    union tgsi_double_channel src0;
3812    union tgsi_exec_channel src1;
3813    union tgsi_double_channel dst;
3814    int wmask;
3815
3816    wmask = inst->Dst[0].Register.WriteMask;
3817    if (wmask & TGSI_WRITEMASK_XY) {
3818       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3819       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3820       op(&dst, &src0, &src1);
3821       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3822    }
3823
3824    if (wmask & TGSI_WRITEMASK_ZW) {
3825       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3826       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3827       op(&dst, &src0, &src1);
3828       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3829    }
3830 }
3831
3832 static int
3833 get_image_coord_dim(unsigned tgsi_tex)
3834 {
3835    int dim;
3836    switch (tgsi_tex) {
3837    case TGSI_TEXTURE_BUFFER:
3838    case TGSI_TEXTURE_1D:
3839       dim = 1;
3840       break;
3841    case TGSI_TEXTURE_2D:
3842    case TGSI_TEXTURE_RECT:
3843    case TGSI_TEXTURE_1D_ARRAY:
3844    case TGSI_TEXTURE_2D_MSAA:
3845       dim = 2;
3846       break;
3847    case TGSI_TEXTURE_3D:
3848    case TGSI_TEXTURE_CUBE:
3849    case TGSI_TEXTURE_2D_ARRAY:
3850    case TGSI_TEXTURE_2D_ARRAY_MSAA:
3851    case TGSI_TEXTURE_CUBE_ARRAY:
3852       dim = 3;
3853       break;
3854    default:
3855       assert(!"unknown texture target");
3856       dim = 0;
3857       break;
3858    }
3859
3860    return dim;
3861 }
3862
3863 static int
3864 get_image_coord_sample(unsigned tgsi_tex)
3865 {
3866    int sample = 0;
3867    switch (tgsi_tex) {
3868    case TGSI_TEXTURE_2D_MSAA:
3869       sample = 3;
3870       break;
3871    case TGSI_TEXTURE_2D_ARRAY_MSAA:
3872       sample = 4;
3873       break;
3874    default:
3875       break;
3876    }
3877    return sample;
3878 }
3879
3880 static void
3881 exec_load_img(struct tgsi_exec_machine *mach,
3882               const struct tgsi_full_instruction *inst)
3883 {
3884    union tgsi_exec_channel r[4], sample_r;
3885    uint unit;
3886    int sample;
3887    int i, j;
3888    int dim;
3889    uint chan;
3890    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3891    struct tgsi_image_params params;
3892    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3893
3894    unit = fetch_sampler_unit(mach, inst, 0);
3895    dim = get_image_coord_dim(inst->Memory.Texture);
3896    sample = get_image_coord_sample(inst->Memory.Texture);
3897    assert(dim <= 3);
3898
3899    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3900    params.unit = unit;
3901    params.tgsi_tex_instr = inst->Memory.Texture;
3902    params.format = inst->Memory.Format;
3903
3904    for (i = 0; i < dim; i++) {
3905       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
3906    }
3907
3908    if (sample)
3909       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
3910
3911    mach->Image->load(mach->Image, &params,
3912                      r[0].i, r[1].i, r[2].i, sample_r.i,
3913                      rgba);
3914    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3915       r[0].f[j] = rgba[0][j];
3916       r[1].f[j] = rgba[1][j];
3917       r[2].f[j] = rgba[2][j];
3918       r[3].f[j] = rgba[3][j];
3919    }
3920    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3921       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3922          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3923       }
3924    }
3925 }
3926
3927 static void
3928 exec_load_buf(struct tgsi_exec_machine *mach,
3929               const struct tgsi_full_instruction *inst)
3930 {
3931    union tgsi_exec_channel r[4];
3932    uint unit;
3933    int j;
3934    uint chan;
3935    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3936    struct tgsi_buffer_params params;
3937    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3938
3939    unit = fetch_sampler_unit(mach, inst, 0);
3940
3941    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3942    params.unit = unit;
3943    IFETCH(&r[0], 1, TGSI_CHAN_X);
3944
3945    mach->Buffer->load(mach->Buffer, &params,
3946                       r[0].i, rgba);
3947    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3948       r[0].f[j] = rgba[0][j];
3949       r[1].f[j] = rgba[1][j];
3950       r[2].f[j] = rgba[2][j];
3951       r[3].f[j] = rgba[3][j];
3952    }
3953    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3954       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3955          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3956       }
3957    }
3958 }
3959
3960 static void
3961 exec_load_mem(struct tgsi_exec_machine *mach,
3962               const struct tgsi_full_instruction *inst)
3963 {
3964    union tgsi_exec_channel r[4];
3965    uint chan;
3966    char *ptr = mach->LocalMem;
3967    uint32_t offset;
3968    int j;
3969
3970    IFETCH(&r[0], 1, TGSI_CHAN_X);
3971    if (r[0].u[0] >= mach->LocalMemSize)
3972       return;
3973
3974    offset = r[0].u[0];
3975    ptr += offset;
3976
3977    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3978       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3979          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3980             memcpy(&r[chan].u[j], ptr + (4 * chan), 4);
3981          }
3982       }
3983    }
3984
3985    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3986       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3987          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3988       }
3989    }
3990 }
3991
3992 static void
3993 exec_load(struct tgsi_exec_machine *mach,
3994           const struct tgsi_full_instruction *inst)
3995 {
3996    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
3997       exec_load_img(mach, inst);
3998    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
3999       exec_load_buf(mach, inst);
4000    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
4001       exec_load_mem(mach, inst);
4002 }
4003
4004 static uint
4005 fetch_store_img_unit(struct tgsi_exec_machine *mach,
4006                      const struct tgsi_full_dst_register *dst)
4007 {
4008    uint unit = 0;
4009    int i;
4010    if (dst->Register.Indirect) {
4011       union tgsi_exec_channel indir_index, index2;
4012       const uint execmask = mach->ExecMask;
4013       index2.i[0] =
4014       index2.i[1] =
4015       index2.i[2] =
4016       index2.i[3] = dst->Indirect.Index;
4017
4018       fetch_src_file_channel(mach,
4019                              dst->Indirect.File,
4020                              dst->Indirect.Swizzle,
4021                              &index2,
4022                              &ZeroVec,
4023                              &indir_index);
4024       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4025          if (execmask & (1 << i)) {
4026             unit = dst->Register.Index + indir_index.i[i];
4027             break;
4028          }
4029       }
4030    } else {
4031       unit = dst->Register.Index;
4032    }
4033    return unit;
4034 }
4035
4036 static void
4037 exec_store_img(struct tgsi_exec_machine *mach,
4038                const struct tgsi_full_instruction *inst)
4039 {
4040    union tgsi_exec_channel r[3], sample_r;
4041    union tgsi_exec_channel value[4];
4042    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4043    struct tgsi_image_params params;
4044    int dim;
4045    int sample;
4046    int i, j;
4047    uint unit;
4048    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4049    unit = fetch_store_img_unit(mach, &inst->Dst[0]);
4050    dim = get_image_coord_dim(inst->Memory.Texture);
4051    sample = get_image_coord_sample(inst->Memory.Texture);
4052    assert(dim <= 3);
4053
4054    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4055    params.unit = unit;
4056    params.tgsi_tex_instr = inst->Memory.Texture;
4057    params.format = inst->Memory.Format;
4058
4059    for (i = 0; i < dim; i++) {
4060       IFETCH(&r[i], 0, TGSI_CHAN_X + i);
4061    }
4062
4063    for (i = 0; i < 4; i++) {
4064       FETCH(&value[i], 1, TGSI_CHAN_X + i);
4065    }
4066    if (sample)
4067       IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
4068
4069    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4070       rgba[0][j] = value[0].f[j];
4071       rgba[1][j] = value[1].f[j];
4072       rgba[2][j] = value[2].f[j];
4073       rgba[3][j] = value[3].f[j];
4074    }
4075
4076    mach->Image->store(mach->Image, &params,
4077                       r[0].i, r[1].i, r[2].i, sample_r.i,
4078                       rgba);
4079 }
4080
4081 static void
4082 exec_store_buf(struct tgsi_exec_machine *mach,
4083                const struct tgsi_full_instruction *inst)
4084 {
4085    union tgsi_exec_channel r[3];
4086    union tgsi_exec_channel value[4];
4087    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4088    struct tgsi_buffer_params params;
4089    int i, j;
4090    uint unit;
4091    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4092
4093    unit = fetch_store_img_unit(mach, &inst->Dst[0]);
4094
4095    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4096    params.unit = unit;
4097    params.writemask = inst->Dst[0].Register.WriteMask;
4098
4099    IFETCH(&r[0], 0, TGSI_CHAN_X);
4100    for (i = 0; i < 4; i++) {
4101       FETCH(&value[i], 1, TGSI_CHAN_X + i);
4102    }
4103
4104    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4105       rgba[0][j] = value[0].f[j];
4106       rgba[1][j] = value[1].f[j];
4107       rgba[2][j] = value[2].f[j];
4108       rgba[3][j] = value[3].f[j];
4109    }
4110
4111    mach->Buffer->store(mach->Buffer, &params,
4112                       r[0].i,
4113                       rgba);
4114 }
4115
4116 static void
4117 exec_store_mem(struct tgsi_exec_machine *mach,
4118                const struct tgsi_full_instruction *inst)
4119 {
4120    union tgsi_exec_channel r[3];
4121    union tgsi_exec_channel value[4];
4122    uint i, chan;
4123    char *ptr = mach->LocalMem;
4124    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4125    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4126
4127    IFETCH(&r[0], 0, TGSI_CHAN_X);
4128
4129    for (i = 0; i < 4; i++) {
4130       FETCH(&value[i], 1, TGSI_CHAN_X + i);
4131    }
4132
4133    if (r[0].u[0] >= mach->LocalMemSize)
4134       return;
4135    ptr += r[0].u[0];
4136
4137    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4138       if (execmask & (1 << i)) {
4139          for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4140             if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4141                memcpy(ptr + (chan * 4), &value[chan].u[0], 4);
4142             }
4143          }
4144       }
4145    }
4146 }
4147
4148 static void
4149 exec_store(struct tgsi_exec_machine *mach,
4150            const struct tgsi_full_instruction *inst)
4151 {
4152    if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
4153       exec_store_img(mach, inst);
4154    else if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
4155       exec_store_buf(mach, inst);
4156    else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
4157       exec_store_mem(mach, inst);
4158 }
4159
4160 static void
4161 exec_atomop_img(struct tgsi_exec_machine *mach,
4162                 const struct tgsi_full_instruction *inst)
4163 {
4164    union tgsi_exec_channel r[4], sample_r;
4165    union tgsi_exec_channel value[4], value2[4];
4166    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4167    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4168    struct tgsi_image_params params;
4169    int dim;
4170    int sample;
4171    int i, j;
4172    uint unit, chan;
4173    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4174    unit = fetch_sampler_unit(mach, inst, 0);
4175    dim = get_image_coord_dim(inst->Memory.Texture);
4176    sample = get_image_coord_sample(inst->Memory.Texture);
4177    assert(dim <= 3);
4178
4179    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4180    params.unit = unit;
4181    params.tgsi_tex_instr = inst->Memory.Texture;
4182    params.format = inst->Memory.Format;
4183
4184    for (i = 0; i < dim; i++) {
4185       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
4186    }
4187
4188    for (i = 0; i < 4; i++) {
4189       FETCH(&value[i], 2, TGSI_CHAN_X + i);
4190       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4191          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4192    }
4193    if (sample)
4194       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
4195
4196    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4197       rgba[0][j] = value[0].f[j];
4198       rgba[1][j] = value[1].f[j];
4199       rgba[2][j] = value[2].f[j];
4200       rgba[3][j] = value[3].f[j];
4201    }
4202    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4203       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4204          rgba2[0][j] = value2[0].f[j];
4205          rgba2[1][j] = value2[1].f[j];
4206          rgba2[2][j] = value2[2].f[j];
4207          rgba2[3][j] = value2[3].f[j];
4208       }
4209    }
4210
4211    mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
4212                    r[0].i, r[1].i, r[2].i, sample_r.i,
4213                    rgba, rgba2);
4214
4215    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4216       r[0].f[j] = rgba[0][j];
4217       r[1].f[j] = rgba[1][j];
4218       r[2].f[j] = rgba[2][j];
4219       r[3].f[j] = rgba[3][j];
4220    }
4221    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4222       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4223          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4224       }
4225    }
4226 }
4227
4228 static void
4229 exec_atomop_buf(struct tgsi_exec_machine *mach,
4230                 const struct tgsi_full_instruction *inst)
4231 {
4232    union tgsi_exec_channel r[4];
4233    union tgsi_exec_channel value[4], value2[4];
4234    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4235    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4236    struct tgsi_buffer_params params;
4237    int i, j;
4238    uint unit, chan;
4239    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4240
4241    unit = fetch_sampler_unit(mach, inst, 0);
4242
4243    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4244    params.unit = unit;
4245    params.writemask = inst->Dst[0].Register.WriteMask;
4246
4247    IFETCH(&r[0], 1, TGSI_CHAN_X);
4248
4249    for (i = 0; i < 4; i++) {
4250       FETCH(&value[i], 2, TGSI_CHAN_X + i);
4251       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4252          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4253    }
4254
4255    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4256       rgba[0][j] = value[0].f[j];
4257       rgba[1][j] = value[1].f[j];
4258       rgba[2][j] = value[2].f[j];
4259       rgba[3][j] = value[3].f[j];
4260    }
4261    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4262       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4263          rgba2[0][j] = value2[0].f[j];
4264          rgba2[1][j] = value2[1].f[j];
4265          rgba2[2][j] = value2[2].f[j];
4266          rgba2[3][j] = value2[3].f[j];
4267       }
4268    }
4269
4270    mach->Buffer->op(mach->Buffer, &params, inst->Instruction.Opcode,
4271                    r[0].i,
4272                    rgba, rgba2);
4273
4274    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4275       r[0].f[j] = rgba[0][j];
4276       r[1].f[j] = rgba[1][j];
4277       r[2].f[j] = rgba[2][j];
4278       r[3].f[j] = rgba[3][j];
4279    }
4280    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4281       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4282          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4283       }
4284    }
4285 }
4286
4287 static void
4288 exec_atomop_mem(struct tgsi_exec_machine *mach,
4289                 const struct tgsi_full_instruction *inst)
4290 {
4291    union tgsi_exec_channel r[4];
4292    union tgsi_exec_channel value[4], value2[4];
4293    char *ptr = mach->LocalMem;
4294    uint32_t val;
4295    uint chan, i;
4296    uint32_t offset;
4297    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4298    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4299    IFETCH(&r[0], 1, TGSI_CHAN_X);
4300
4301    if (r[0].u[0] >= mach->LocalMemSize)
4302       return;
4303
4304    offset = r[0].u[0];
4305    ptr += offset;
4306    for (i = 0; i < 4; i++) {
4307       FETCH(&value[i], 2, TGSI_CHAN_X + i);
4308       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4309          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4310    }
4311
4312    memcpy(&r[0].u[0], ptr, 4);
4313    val = r[0].u[0];
4314    switch (inst->Instruction.Opcode) {
4315    case TGSI_OPCODE_ATOMUADD:
4316       val += value[0].u[0];
4317       break;
4318    case TGSI_OPCODE_ATOMXOR:
4319       val ^= value[0].u[0];
4320       break;
4321    case TGSI_OPCODE_ATOMOR:
4322       val |= value[0].u[0];
4323       break;
4324    case TGSI_OPCODE_ATOMAND:
4325       val &= value[0].u[0];
4326       break;
4327    case TGSI_OPCODE_ATOMUMIN:
4328       val = MIN2(val, value[0].u[0]);
4329       break;
4330    case TGSI_OPCODE_ATOMUMAX:
4331       val = MAX2(val, value[0].u[0]);
4332       break;
4333    case TGSI_OPCODE_ATOMIMIN:
4334       val = MIN2(r[0].i[0], value[0].i[0]);
4335       break;
4336    case TGSI_OPCODE_ATOMIMAX:
4337       val = MAX2(r[0].i[0], value[0].i[0]);
4338       break;
4339    case TGSI_OPCODE_ATOMXCHG:
4340       val = value[0].i[0];
4341       break;
4342    case TGSI_OPCODE_ATOMCAS:
4343       if (val == value[0].u[0])
4344          val = value2[0].u[0];
4345       break;
4346    case TGSI_OPCODE_ATOMFADD:
4347       val = fui(r[0].f[0] + value[0].f[0]);
4348       break;
4349    default:
4350       break;
4351    }
4352    for (i = 0; i < TGSI_QUAD_SIZE; i++)
4353       if (execmask & (1 << i))
4354          memcpy(ptr, &val, 4);
4355
4356    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4357       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4358          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4359       }
4360    }
4361 }
4362
4363 static void
4364 exec_atomop(struct tgsi_exec_machine *mach,
4365             const struct tgsi_full_instruction *inst)
4366 {
4367    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4368       exec_atomop_img(mach, inst);
4369    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
4370       exec_atomop_buf(mach, inst);
4371    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
4372       exec_atomop_mem(mach, inst);
4373 }
4374
4375 static void
4376 exec_resq_img(struct tgsi_exec_machine *mach,
4377               const struct tgsi_full_instruction *inst)
4378 {
4379    int result[4];
4380    union tgsi_exec_channel r[4];
4381    uint unit;
4382    int i, chan, j;
4383    struct tgsi_image_params params;
4384    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4385
4386    unit = fetch_sampler_unit(mach, inst, 0);
4387
4388    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4389    params.unit = unit;
4390    params.tgsi_tex_instr = inst->Memory.Texture;
4391    params.format = inst->Memory.Format;
4392
4393    mach->Image->get_dims(mach->Image, &params, result);
4394
4395    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4396       for (j = 0; j < 4; j++) {
4397          r[j].i[i] = result[j];
4398       }
4399    }
4400
4401    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4402       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4403          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4404                     TGSI_EXEC_DATA_INT);
4405       }
4406    }
4407 }
4408
4409 static void
4410 exec_resq_buf(struct tgsi_exec_machine *mach,
4411               const struct tgsi_full_instruction *inst)
4412 {
4413    int result;
4414    union tgsi_exec_channel r[4];
4415    uint unit;
4416    int i, chan;
4417    struct tgsi_buffer_params params;
4418    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4419
4420    unit = fetch_sampler_unit(mach, inst, 0);
4421
4422    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4423    params.unit = unit;
4424
4425    mach->Buffer->get_dims(mach->Buffer, &params, &result);
4426
4427    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4428       r[0].i[i] = result;
4429    }
4430
4431    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4432       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4433          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4434                     TGSI_EXEC_DATA_INT);
4435       }
4436    }
4437 }
4438
4439 static void
4440 exec_resq(struct tgsi_exec_machine *mach,
4441           const struct tgsi_full_instruction *inst)
4442 {
4443    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4444       exec_resq_img(mach, inst);
4445    else
4446       exec_resq_buf(mach, inst);
4447 }
4448
4449 static void
4450 micro_f2u64(union tgsi_double_channel *dst,
4451             const union tgsi_exec_channel *src)
4452 {
4453    dst->u64[0] = (uint64_t)src->f[0];
4454    dst->u64[1] = (uint64_t)src->f[1];
4455    dst->u64[2] = (uint64_t)src->f[2];
4456    dst->u64[3] = (uint64_t)src->f[3];
4457 }
4458
4459 static void
4460 micro_f2i64(union tgsi_double_channel *dst,
4461             const union tgsi_exec_channel *src)
4462 {
4463    dst->i64[0] = (int64_t)src->f[0];
4464    dst->i64[1] = (int64_t)src->f[1];
4465    dst->i64[2] = (int64_t)src->f[2];
4466    dst->i64[3] = (int64_t)src->f[3];
4467 }
4468
4469 static void
4470 micro_u2i64(union tgsi_double_channel *dst,
4471             const union tgsi_exec_channel *src)
4472 {
4473    dst->u64[0] = (uint64_t)src->u[0];
4474    dst->u64[1] = (uint64_t)src->u[1];
4475    dst->u64[2] = (uint64_t)src->u[2];
4476    dst->u64[3] = (uint64_t)src->u[3];
4477 }
4478
4479 static void
4480 micro_i2i64(union tgsi_double_channel *dst,
4481             const union tgsi_exec_channel *src)
4482 {
4483    dst->i64[0] = (int64_t)src->i[0];
4484    dst->i64[1] = (int64_t)src->i[1];
4485    dst->i64[2] = (int64_t)src->i[2];
4486    dst->i64[3] = (int64_t)src->i[3];
4487 }
4488
4489 static void
4490 micro_d2u64(union tgsi_double_channel *dst,
4491            const union tgsi_double_channel *src)
4492 {
4493    dst->u64[0] = (uint64_t)src->d[0];
4494    dst->u64[1] = (uint64_t)src->d[1];
4495    dst->u64[2] = (uint64_t)src->d[2];
4496    dst->u64[3] = (uint64_t)src->d[3];
4497 }
4498
4499 static void
4500 micro_d2i64(union tgsi_double_channel *dst,
4501            const union tgsi_double_channel *src)
4502 {
4503    dst->i64[0] = (int64_t)src->d[0];
4504    dst->i64[1] = (int64_t)src->d[1];
4505    dst->i64[2] = (int64_t)src->d[2];
4506    dst->i64[3] = (int64_t)src->d[3];
4507 }
4508
4509 static void
4510 micro_u642d(union tgsi_double_channel *dst,
4511            const union tgsi_double_channel *src)
4512 {
4513    dst->d[0] = (double)src->u64[0];
4514    dst->d[1] = (double)src->u64[1];
4515    dst->d[2] = (double)src->u64[2];
4516    dst->d[3] = (double)src->u64[3];
4517 }
4518
4519 static void
4520 micro_i642d(union tgsi_double_channel *dst,
4521            const union tgsi_double_channel *src)
4522 {
4523    dst->d[0] = (double)src->i64[0];
4524    dst->d[1] = (double)src->i64[1];
4525    dst->d[2] = (double)src->i64[2];
4526    dst->d[3] = (double)src->i64[3];
4527 }
4528
4529 static void
4530 micro_u642f(union tgsi_exec_channel *dst,
4531             const union tgsi_double_channel *src)
4532 {
4533    dst->f[0] = (float)src->u64[0];
4534    dst->f[1] = (float)src->u64[1];
4535    dst->f[2] = (float)src->u64[2];
4536    dst->f[3] = (float)src->u64[3];
4537 }
4538
4539 static void
4540 micro_i642f(union tgsi_exec_channel *dst,
4541             const union tgsi_double_channel *src)
4542 {
4543    dst->f[0] = (float)src->i64[0];
4544    dst->f[1] = (float)src->i64[1];
4545    dst->f[2] = (float)src->i64[2];
4546    dst->f[3] = (float)src->i64[3];
4547 }
4548
4549 static void
4550 exec_t_2_64(struct tgsi_exec_machine *mach,
4551           const struct tgsi_full_instruction *inst,
4552           micro_dop_s op,
4553           enum tgsi_exec_datatype src_datatype)
4554 {
4555    union tgsi_exec_channel src;
4556    union tgsi_double_channel dst;
4557
4558    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
4559       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
4560       op(&dst, &src);
4561       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
4562    }
4563    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
4564       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, src_datatype);
4565       op(&dst, &src);
4566       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
4567    }
4568 }
4569
4570 static void
4571 exec_64_2_t(struct tgsi_exec_machine *mach,
4572             const struct tgsi_full_instruction *inst,
4573             micro_sop_d op,
4574             enum tgsi_exec_datatype dst_datatype)
4575 {
4576    union tgsi_double_channel src;
4577    union tgsi_exec_channel dst;
4578    int wm = inst->Dst[0].Register.WriteMask;
4579    int i;
4580    int bit;
4581    for (i = 0; i < 2; i++) {
4582       bit = ffs(wm);
4583       if (bit) {
4584          wm &= ~(1 << (bit - 1));
4585          if (i == 0)
4586             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
4587          else
4588             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
4589          op(&dst, &src);
4590          store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1, dst_datatype);
4591       }
4592    }
4593 }
4594
4595 static void
4596 micro_i2f(union tgsi_exec_channel *dst,
4597           const union tgsi_exec_channel *src)
4598 {
4599    dst->f[0] = (float)src->i[0];
4600    dst->f[1] = (float)src->i[1];
4601    dst->f[2] = (float)src->i[2];
4602    dst->f[3] = (float)src->i[3];
4603 }
4604
4605 static void
4606 micro_not(union tgsi_exec_channel *dst,
4607           const union tgsi_exec_channel *src)
4608 {
4609    dst->u[0] = ~src->u[0];
4610    dst->u[1] = ~src->u[1];
4611    dst->u[2] = ~src->u[2];
4612    dst->u[3] = ~src->u[3];
4613 }
4614
4615 static void
4616 micro_shl(union tgsi_exec_channel *dst,
4617           const union tgsi_exec_channel *src0,
4618           const union tgsi_exec_channel *src1)
4619 {
4620    unsigned masked_count;
4621    masked_count = src1->u[0] & 0x1f;
4622    dst->u[0] = src0->u[0] << masked_count;
4623    masked_count = src1->u[1] & 0x1f;
4624    dst->u[1] = src0->u[1] << masked_count;
4625    masked_count = src1->u[2] & 0x1f;
4626    dst->u[2] = src0->u[2] << masked_count;
4627    masked_count = src1->u[3] & 0x1f;
4628    dst->u[3] = src0->u[3] << masked_count;
4629 }
4630
4631 static void
4632 micro_and(union tgsi_exec_channel *dst,
4633           const union tgsi_exec_channel *src0,
4634           const union tgsi_exec_channel *src1)
4635 {
4636    dst->u[0] = src0->u[0] & src1->u[0];
4637    dst->u[1] = src0->u[1] & src1->u[1];
4638    dst->u[2] = src0->u[2] & src1->u[2];
4639    dst->u[3] = src0->u[3] & src1->u[3];
4640 }
4641
4642 static void
4643 micro_or(union tgsi_exec_channel *dst,
4644          const union tgsi_exec_channel *src0,
4645          const union tgsi_exec_channel *src1)
4646 {
4647    dst->u[0] = src0->u[0] | src1->u[0];
4648    dst->u[1] = src0->u[1] | src1->u[1];
4649    dst->u[2] = src0->u[2] | src1->u[2];
4650    dst->u[3] = src0->u[3] | src1->u[3];
4651 }
4652
4653 static void
4654 micro_xor(union tgsi_exec_channel *dst,
4655           const union tgsi_exec_channel *src0,
4656           const union tgsi_exec_channel *src1)
4657 {
4658    dst->u[0] = src0->u[0] ^ src1->u[0];
4659    dst->u[1] = src0->u[1] ^ src1->u[1];
4660    dst->u[2] = src0->u[2] ^ src1->u[2];
4661    dst->u[3] = src0->u[3] ^ src1->u[3];
4662 }
4663
4664 static void
4665 micro_mod(union tgsi_exec_channel *dst,
4666           const union tgsi_exec_channel *src0,
4667           const union tgsi_exec_channel *src1)
4668 {
4669    dst->i[0] = src1->i[0] ? src0->i[0] % src1->i[0] : ~0;
4670    dst->i[1] = src1->i[1] ? src0->i[1] % src1->i[1] : ~0;
4671    dst->i[2] = src1->i[2] ? src0->i[2] % src1->i[2] : ~0;
4672    dst->i[3] = src1->i[3] ? src0->i[3] % src1->i[3] : ~0;
4673 }
4674
4675 static void
4676 micro_f2i(union tgsi_exec_channel *dst,
4677           const union tgsi_exec_channel *src)
4678 {
4679    dst->i[0] = (int)src->f[0];
4680    dst->i[1] = (int)src->f[1];
4681    dst->i[2] = (int)src->f[2];
4682    dst->i[3] = (int)src->f[3];
4683 }
4684
4685 static void
4686 micro_fseq(union tgsi_exec_channel *dst,
4687            const union tgsi_exec_channel *src0,
4688            const union tgsi_exec_channel *src1)
4689 {
4690    dst->u[0] = src0->f[0] == src1->f[0] ? ~0 : 0;
4691    dst->u[1] = src0->f[1] == src1->f[1] ? ~0 : 0;
4692    dst->u[2] = src0->f[2] == src1->f[2] ? ~0 : 0;
4693    dst->u[3] = src0->f[3] == src1->f[3] ? ~0 : 0;
4694 }
4695
4696 static void
4697 micro_fsge(union tgsi_exec_channel *dst,
4698            const union tgsi_exec_channel *src0,
4699            const union tgsi_exec_channel *src1)
4700 {
4701    dst->u[0] = src0->f[0] >= src1->f[0] ? ~0 : 0;
4702    dst->u[1] = src0->f[1] >= src1->f[1] ? ~0 : 0;
4703    dst->u[2] = src0->f[2] >= src1->f[2] ? ~0 : 0;
4704    dst->u[3] = src0->f[3] >= src1->f[3] ? ~0 : 0;
4705 }
4706
4707 static void
4708 micro_fslt(union tgsi_exec_channel *dst,
4709            const union tgsi_exec_channel *src0,
4710            const union tgsi_exec_channel *src1)
4711 {
4712    dst->u[0] = src0->f[0] < src1->f[0] ? ~0 : 0;
4713    dst->u[1] = src0->f[1] < src1->f[1] ? ~0 : 0;
4714    dst->u[2] = src0->f[2] < src1->f[2] ? ~0 : 0;
4715    dst->u[3] = src0->f[3] < src1->f[3] ? ~0 : 0;
4716 }
4717
4718 static void
4719 micro_fsne(union tgsi_exec_channel *dst,
4720            const union tgsi_exec_channel *src0,
4721            const union tgsi_exec_channel *src1)
4722 {
4723    dst->u[0] = src0->f[0] != src1->f[0] ? ~0 : 0;
4724    dst->u[1] = src0->f[1] != src1->f[1] ? ~0 : 0;
4725    dst->u[2] = src0->f[2] != src1->f[2] ? ~0 : 0;
4726    dst->u[3] = src0->f[3] != src1->f[3] ? ~0 : 0;
4727 }
4728
4729 static void
4730 micro_idiv(union tgsi_exec_channel *dst,
4731            const union tgsi_exec_channel *src0,
4732            const union tgsi_exec_channel *src1)
4733 {
4734    dst->i[0] = src1->i[0] ? src0->i[0] / src1->i[0] : 0;
4735    dst->i[1] = src1->i[1] ? src0->i[1] / src1->i[1] : 0;
4736    dst->i[2] = src1->i[2] ? src0->i[2] / src1->i[2] : 0;
4737    dst->i[3] = src1->i[3] ? src0->i[3] / src1->i[3] : 0;
4738 }
4739
4740 static void
4741 micro_imax(union tgsi_exec_channel *dst,
4742            const union tgsi_exec_channel *src0,
4743            const union tgsi_exec_channel *src1)
4744 {
4745    dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
4746    dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
4747    dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
4748    dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
4749 }
4750
4751 static void
4752 micro_imin(union tgsi_exec_channel *dst,
4753            const union tgsi_exec_channel *src0,
4754            const union tgsi_exec_channel *src1)
4755 {
4756    dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
4757    dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
4758    dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
4759    dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
4760 }
4761
4762 static void
4763 micro_isge(union tgsi_exec_channel *dst,
4764            const union tgsi_exec_channel *src0,
4765            const union tgsi_exec_channel *src1)
4766 {
4767    dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
4768    dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
4769    dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
4770    dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
4771 }
4772
4773 static void
4774 micro_ishr(union tgsi_exec_channel *dst,
4775            const union tgsi_exec_channel *src0,
4776            const union tgsi_exec_channel *src1)
4777 {
4778    unsigned masked_count;
4779    masked_count = src1->i[0] & 0x1f;
4780    dst->i[0] = src0->i[0] >> masked_count;
4781    masked_count = src1->i[1] & 0x1f;
4782    dst->i[1] = src0->i[1] >> masked_count;
4783    masked_count = src1->i[2] & 0x1f;
4784    dst->i[2] = src0->i[2] >> masked_count;
4785    masked_count = src1->i[3] & 0x1f;
4786    dst->i[3] = src0->i[3] >> masked_count;
4787 }
4788
4789 static void
4790 micro_islt(union tgsi_exec_channel *dst,
4791            const union tgsi_exec_channel *src0,
4792            const union tgsi_exec_channel *src1)
4793 {
4794    dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
4795    dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
4796    dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
4797    dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
4798 }
4799
4800 static void
4801 micro_f2u(union tgsi_exec_channel *dst,
4802           const union tgsi_exec_channel *src)
4803 {
4804    dst->u[0] = (uint)src->f[0];
4805    dst->u[1] = (uint)src->f[1];
4806    dst->u[2] = (uint)src->f[2];
4807    dst->u[3] = (uint)src->f[3];
4808 }
4809
4810 static void
4811 micro_u2f(union tgsi_exec_channel *dst,
4812           const union tgsi_exec_channel *src)
4813 {
4814    dst->f[0] = (float)src->u[0];
4815    dst->f[1] = (float)src->u[1];
4816    dst->f[2] = (float)src->u[2];
4817    dst->f[3] = (float)src->u[3];
4818 }
4819
4820 static void
4821 micro_uadd(union tgsi_exec_channel *dst,
4822            const union tgsi_exec_channel *src0,
4823            const union tgsi_exec_channel *src1)
4824 {
4825    dst->u[0] = src0->u[0] + src1->u[0];
4826    dst->u[1] = src0->u[1] + src1->u[1];
4827    dst->u[2] = src0->u[2] + src1->u[2];
4828    dst->u[3] = src0->u[3] + src1->u[3];
4829 }
4830
4831 static void
4832 micro_udiv(union tgsi_exec_channel *dst,
4833            const union tgsi_exec_channel *src0,
4834            const union tgsi_exec_channel *src1)
4835 {
4836    dst->u[0] = src1->u[0] ? src0->u[0] / src1->u[0] : ~0u;
4837    dst->u[1] = src1->u[1] ? src0->u[1] / src1->u[1] : ~0u;
4838    dst->u[2] = src1->u[2] ? src0->u[2] / src1->u[2] : ~0u;
4839    dst->u[3] = src1->u[3] ? src0->u[3] / src1->u[3] : ~0u;
4840 }
4841
4842 static void
4843 micro_umad(union tgsi_exec_channel *dst,
4844            const union tgsi_exec_channel *src0,
4845            const union tgsi_exec_channel *src1,
4846            const union tgsi_exec_channel *src2)
4847 {
4848    dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
4849    dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
4850    dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
4851    dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
4852 }
4853
4854 static void
4855 micro_umax(union tgsi_exec_channel *dst,
4856            const union tgsi_exec_channel *src0,
4857            const union tgsi_exec_channel *src1)
4858 {
4859    dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
4860    dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
4861    dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
4862    dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
4863 }
4864
4865 static void
4866 micro_umin(union tgsi_exec_channel *dst,
4867            const union tgsi_exec_channel *src0,
4868            const union tgsi_exec_channel *src1)
4869 {
4870    dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
4871    dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
4872    dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
4873    dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
4874 }
4875
4876 static void
4877 micro_umod(union tgsi_exec_channel *dst,
4878            const union tgsi_exec_channel *src0,
4879            const union tgsi_exec_channel *src1)
4880 {
4881    dst->u[0] = src1->u[0] ? src0->u[0] % src1->u[0] : ~0u;
4882    dst->u[1] = src1->u[1] ? src0->u[1] % src1->u[1] : ~0u;
4883    dst->u[2] = src1->u[2] ? src0->u[2] % src1->u[2] : ~0u;
4884    dst->u[3] = src1->u[3] ? src0->u[3] % src1->u[3] : ~0u;
4885 }
4886
4887 static void
4888 micro_umul(union tgsi_exec_channel *dst,
4889            const union tgsi_exec_channel *src0,
4890            const union tgsi_exec_channel *src1)
4891 {
4892    dst->u[0] = src0->u[0] * src1->u[0];
4893    dst->u[1] = src0->u[1] * src1->u[1];
4894    dst->u[2] = src0->u[2] * src1->u[2];
4895    dst->u[3] = src0->u[3] * src1->u[3];
4896 }
4897
4898 static void
4899 micro_imul_hi(union tgsi_exec_channel *dst,
4900               const union tgsi_exec_channel *src0,
4901               const union tgsi_exec_channel *src1)
4902 {
4903 #define I64M(x, y) ((((int64_t)x) * ((int64_t)y)) >> 32)
4904    dst->i[0] = I64M(src0->i[0], src1->i[0]);
4905    dst->i[1] = I64M(src0->i[1], src1->i[1]);
4906    dst->i[2] = I64M(src0->i[2], src1->i[2]);
4907    dst->i[3] = I64M(src0->i[3], src1->i[3]);
4908 #undef I64M
4909 }
4910
4911 static void
4912 micro_umul_hi(union tgsi_exec_channel *dst,
4913               const union tgsi_exec_channel *src0,
4914               const union tgsi_exec_channel *src1)
4915 {
4916 #define U64M(x, y) ((((uint64_t)x) * ((uint64_t)y)) >> 32)
4917    dst->u[0] = U64M(src0->u[0], src1->u[0]);
4918    dst->u[1] = U64M(src0->u[1], src1->u[1]);
4919    dst->u[2] = U64M(src0->u[2], src1->u[2]);
4920    dst->u[3] = U64M(src0->u[3], src1->u[3]);
4921 #undef U64M
4922 }
4923
4924 static void
4925 micro_useq(union tgsi_exec_channel *dst,
4926            const union tgsi_exec_channel *src0,
4927            const union tgsi_exec_channel *src1)
4928 {
4929    dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
4930    dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
4931    dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
4932    dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
4933 }
4934
4935 static void
4936 micro_usge(union tgsi_exec_channel *dst,
4937            const union tgsi_exec_channel *src0,
4938            const union tgsi_exec_channel *src1)
4939 {
4940    dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
4941    dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
4942    dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
4943    dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
4944 }
4945
4946 static void
4947 micro_ushr(union tgsi_exec_channel *dst,
4948            const union tgsi_exec_channel *src0,
4949            const union tgsi_exec_channel *src1)
4950 {
4951    unsigned masked_count;
4952    masked_count = src1->u[0] & 0x1f;
4953    dst->u[0] = src0->u[0] >> masked_count;
4954    masked_count = src1->u[1] & 0x1f;
4955    dst->u[1] = src0->u[1] >> masked_count;
4956    masked_count = src1->u[2] & 0x1f;
4957    dst->u[2] = src0->u[2] >> masked_count;
4958    masked_count = src1->u[3] & 0x1f;
4959    dst->u[3] = src0->u[3] >> masked_count;
4960 }
4961
4962 static void
4963 micro_uslt(union tgsi_exec_channel *dst,
4964            const union tgsi_exec_channel *src0,
4965            const union tgsi_exec_channel *src1)
4966 {
4967    dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
4968    dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
4969    dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
4970    dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
4971 }
4972
4973 static void
4974 micro_usne(union tgsi_exec_channel *dst,
4975            const union tgsi_exec_channel *src0,
4976            const union tgsi_exec_channel *src1)
4977 {
4978    dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
4979    dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
4980    dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
4981    dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
4982 }
4983
4984 static void
4985 micro_uarl(union tgsi_exec_channel *dst,
4986            const union tgsi_exec_channel *src)
4987 {
4988    dst->i[0] = src->u[0];
4989    dst->i[1] = src->u[1];
4990    dst->i[2] = src->u[2];
4991    dst->i[3] = src->u[3];
4992 }
4993
4994 /**
4995  * Signed bitfield extract (i.e. sign-extend the extracted bits)
4996  */
4997 static void
4998 micro_ibfe(union tgsi_exec_channel *dst,
4999            const union tgsi_exec_channel *src0,
5000            const union tgsi_exec_channel *src1,
5001            const union tgsi_exec_channel *src2)
5002 {
5003    int i;
5004    for (i = 0; i < 4; i++) {
5005       int width = src2->i[i];
5006       int offset = src1->i[i] & 0x1f;
5007       if (width == 32 && offset == 0) {
5008          dst->i[i] = src0->i[i];
5009          continue;
5010       }
5011       width &= 0x1f;
5012       if (width == 0)
5013          dst->i[i] = 0;
5014       else if (width + offset < 32)
5015          dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
5016       else
5017          dst->i[i] = src0->i[i] >> offset;
5018    }
5019 }
5020
5021 /**
5022  * Unsigned bitfield extract
5023  */
5024 static void
5025 micro_ubfe(union tgsi_exec_channel *dst,
5026            const union tgsi_exec_channel *src0,
5027            const union tgsi_exec_channel *src1,
5028            const union tgsi_exec_channel *src2)
5029 {
5030    int i;
5031    for (i = 0; i < 4; i++) {
5032       int width = src2->u[i];
5033       int offset = src1->u[i] & 0x1f;
5034       if (width == 32 && offset == 0) {
5035          dst->u[i] = src0->u[i];
5036          continue;
5037       }
5038       width &= 0x1f;
5039       if (width == 0)
5040          dst->u[i] = 0;
5041       else if (width + offset < 32)
5042          dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
5043       else
5044          dst->u[i] = src0->u[i] >> offset;
5045    }
5046 }
5047
5048 /**
5049  * Bitfield insert: copy low bits from src1 into a region of src0.
5050  */
5051 static void
5052 micro_bfi(union tgsi_exec_channel *dst,
5053           const union tgsi_exec_channel *src0,
5054           const union tgsi_exec_channel *src1,
5055           const union tgsi_exec_channel *src2,
5056           const union tgsi_exec_channel *src3)
5057 {
5058    int i;
5059    for (i = 0; i < 4; i++) {
5060       int width = src3->u[i];
5061       int offset = src2->u[i] & 0x1f;
5062       if (width == 32) {
5063          dst->u[i] = src1->u[i];
5064       } else {
5065          int bitmask = ((1 << width) - 1) << offset;
5066          dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
5067       }
5068    }
5069 }
5070
5071 static void
5072 micro_brev(union tgsi_exec_channel *dst,
5073            const union tgsi_exec_channel *src)
5074 {
5075    dst->u[0] = util_bitreverse(src->u[0]);
5076    dst->u[1] = util_bitreverse(src->u[1]);
5077    dst->u[2] = util_bitreverse(src->u[2]);
5078    dst->u[3] = util_bitreverse(src->u[3]);
5079 }
5080
5081 static void
5082 micro_popc(union tgsi_exec_channel *dst,
5083            const union tgsi_exec_channel *src)
5084 {
5085    dst->u[0] = util_bitcount(src->u[0]);
5086    dst->u[1] = util_bitcount(src->u[1]);
5087    dst->u[2] = util_bitcount(src->u[2]);
5088    dst->u[3] = util_bitcount(src->u[3]);
5089 }
5090
5091 static void
5092 micro_lsb(union tgsi_exec_channel *dst,
5093           const union tgsi_exec_channel *src)
5094 {
5095    dst->i[0] = ffs(src->u[0]) - 1;
5096    dst->i[1] = ffs(src->u[1]) - 1;
5097    dst->i[2] = ffs(src->u[2]) - 1;
5098    dst->i[3] = ffs(src->u[3]) - 1;
5099 }
5100
5101 static void
5102 micro_imsb(union tgsi_exec_channel *dst,
5103            const union tgsi_exec_channel *src)
5104 {
5105    dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
5106    dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
5107    dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
5108    dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
5109 }
5110
5111 static void
5112 micro_umsb(union tgsi_exec_channel *dst,
5113            const union tgsi_exec_channel *src)
5114 {
5115    dst->i[0] = util_last_bit(src->u[0]) - 1;
5116    dst->i[1] = util_last_bit(src->u[1]) - 1;
5117    dst->i[2] = util_last_bit(src->u[2]) - 1;
5118    dst->i[3] = util_last_bit(src->u[3]) - 1;
5119 }
5120
5121
5122 static void
5123 exec_interp_at_sample(struct tgsi_exec_machine *mach,
5124                       const struct tgsi_full_instruction *inst)
5125 {
5126    union tgsi_exec_channel index;
5127    union tgsi_exec_channel index2D;
5128    union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
5129    const struct tgsi_full_src_register *reg = &inst->Src[0];
5130
5131    assert(reg->Register.File == TGSI_FILE_INPUT);
5132    assert(inst->Src[1].Register.File == TGSI_FILE_IMMEDIATE);
5133
5134    get_index_registers(mach, reg, &index, &index2D);
5135    float sample = mach->Imms[inst->Src[1].Register.Index][inst->Src[1].Register.SwizzleX];
5136
5137    /* Short cut: sample 0 is like a normal fetch */
5138    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
5139       if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
5140          continue;
5141
5142       fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D,
5143                              &result[chan]);
5144       if (sample != 0.0f) {
5145
5146       /* TODO: define the samples > 0, but so far we only do fake MSAA */
5147          float x = 0;
5148          float y = 0;
5149
5150          unsigned pos = index2D.i[chan] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index.i[chan];
5151          assert(pos >= 0);
5152          assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
5153          mach->InputSampleOffsetApply[pos](mach, pos, chan, x, y, &result[chan]);
5154       }
5155       store_dest(mach, &result[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
5156    }
5157 }
5158
5159
5160 static void
5161 exec_interp_at_offset(struct tgsi_exec_machine *mach,
5162                       const struct tgsi_full_instruction *inst)
5163 {
5164    union tgsi_exec_channel index;
5165    union tgsi_exec_channel index2D;
5166    union tgsi_exec_channel ofsx;
5167    union tgsi_exec_channel ofsy;
5168    const struct tgsi_full_src_register *reg = &inst->Src[0];
5169
5170    assert(reg->Register.File == TGSI_FILE_INPUT);
5171
5172    get_index_registers(mach, reg, &index, &index2D);
5173    unsigned pos = index2D.i[0] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index.i[0];
5174
5175    fetch_source(mach, &ofsx, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
5176    fetch_source(mach, &ofsy, &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
5177
5178    for (int chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
5179       if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
5180          continue;
5181       union tgsi_exec_channel result;
5182       fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D, &result);
5183       mach->InputSampleOffsetApply[pos](mach, pos, chan, ofsx.f[chan], ofsy.f[chan], &result);
5184       store_dest(mach, &result, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
5185    }
5186 }
5187
5188
5189 static void
5190 exec_interp_at_centroid(struct tgsi_exec_machine *mach,
5191                         const struct tgsi_full_instruction *inst)
5192 {
5193    union tgsi_exec_channel index;
5194    union tgsi_exec_channel index2D;
5195    union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
5196    const struct tgsi_full_src_register *reg = &inst->Src[0];
5197
5198    assert(reg->Register.File == TGSI_FILE_INPUT);
5199    get_index_registers(mach, reg, &index, &index2D);
5200
5201    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
5202       if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
5203          continue;
5204
5205       /* Here we should add the change to use a sample that lies within the
5206        * primitive (Section 15.2):
5207        *
5208        * "When interpolating variables declared using centroid in ,
5209        * the variable is sampled at a location within the pixel covered
5210        * by the primitive generating the fragment.
5211        * ...
5212        * The built-in functions interpolateAtCentroid ... will sample
5213        * variables as though they were declared with the centroid ...
5214        * qualifier[s]."
5215        *
5216        * Since we only support 1 sample currently, this is just a pass-through.
5217        */
5218       fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D,
5219                              &result[chan]);
5220       store_dest(mach, &result[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
5221    }
5222
5223 }
5224
5225
5226 /**
5227  * Execute a TGSI instruction.
5228  * Returns TRUE if a barrier instruction is hit,
5229  * otherwise FALSE.
5230  */
5231 static boolean
5232 exec_instruction(
5233    struct tgsi_exec_machine *mach,
5234    const struct tgsi_full_instruction *inst,
5235    int *pc )
5236 {
5237    union tgsi_exec_channel r[10];
5238
5239    (*pc)++;
5240
5241    switch (inst->Instruction.Opcode) {
5242    case TGSI_OPCODE_ARL:
5243       exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5244       break;
5245
5246    case TGSI_OPCODE_MOV:
5247       exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5248       break;
5249
5250    case TGSI_OPCODE_LIT:
5251       exec_lit(mach, inst);
5252       break;
5253
5254    case TGSI_OPCODE_RCP:
5255       exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5256       break;
5257
5258    case TGSI_OPCODE_RSQ:
5259       exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5260       break;
5261
5262    case TGSI_OPCODE_EXP:
5263       exec_exp(mach, inst);
5264       break;
5265
5266    case TGSI_OPCODE_LOG:
5267       exec_log(mach, inst);
5268       break;
5269
5270    case TGSI_OPCODE_MUL:
5271       exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5272       break;
5273
5274    case TGSI_OPCODE_ADD:
5275       exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5276       break;
5277
5278    case TGSI_OPCODE_DP3:
5279       exec_dp3(mach, inst);
5280       break;
5281
5282    case TGSI_OPCODE_DP4:
5283       exec_dp4(mach, inst);
5284       break;
5285
5286    case TGSI_OPCODE_DST:
5287       exec_dst(mach, inst);
5288       break;
5289
5290    case TGSI_OPCODE_MIN:
5291       exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5292       break;
5293
5294    case TGSI_OPCODE_MAX:
5295       exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5296       break;
5297
5298    case TGSI_OPCODE_SLT:
5299       exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5300       break;
5301
5302    case TGSI_OPCODE_SGE:
5303       exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5304       break;
5305
5306    case TGSI_OPCODE_MAD:
5307       exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5308       break;
5309
5310    case TGSI_OPCODE_LRP:
5311       exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5312       break;
5313
5314    case TGSI_OPCODE_SQRT:
5315       exec_scalar_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5316       break;
5317
5318    case TGSI_OPCODE_FRC:
5319       exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5320       break;
5321
5322    case TGSI_OPCODE_FLR:
5323       exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5324       break;
5325
5326    case TGSI_OPCODE_ROUND:
5327       exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5328       break;
5329
5330    case TGSI_OPCODE_EX2:
5331       exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5332       break;
5333
5334    case TGSI_OPCODE_LG2:
5335       exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5336       break;
5337
5338    case TGSI_OPCODE_POW:
5339       exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5340       break;
5341
5342    case TGSI_OPCODE_LDEXP:
5343       exec_vector_binary(mach, inst, micro_ldexp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5344       break;
5345
5346    case TGSI_OPCODE_COS:
5347       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5348       break;
5349
5350    case TGSI_OPCODE_DDX_FINE:
5351       exec_vector_unary(mach, inst, micro_ddx_fine, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5352       break;
5353
5354    case TGSI_OPCODE_DDX:
5355       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5356       break;
5357
5358    case TGSI_OPCODE_DDY_FINE:
5359       exec_vector_unary(mach, inst, micro_ddy_fine, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5360       break;
5361
5362    case TGSI_OPCODE_DDY:
5363       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5364       break;
5365
5366    case TGSI_OPCODE_KILL:
5367       exec_kill (mach);
5368       break;
5369
5370    case TGSI_OPCODE_KILL_IF:
5371       exec_kill_if (mach, inst);
5372       break;
5373
5374    case TGSI_OPCODE_PK2H:
5375       exec_pk2h(mach, inst);
5376       break;
5377
5378    case TGSI_OPCODE_PK2US:
5379       assert (0);
5380       break;
5381
5382    case TGSI_OPCODE_PK4B:
5383       assert (0);
5384       break;
5385
5386    case TGSI_OPCODE_PK4UB:
5387       assert (0);
5388       break;
5389
5390    case TGSI_OPCODE_SEQ:
5391       exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5392       break;
5393
5394    case TGSI_OPCODE_SGT:
5395       exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5396       break;
5397
5398    case TGSI_OPCODE_SIN:
5399       exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5400       break;
5401
5402    case TGSI_OPCODE_SLE:
5403       exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5404       break;
5405
5406    case TGSI_OPCODE_SNE:
5407       exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5408       break;
5409
5410    case TGSI_OPCODE_TEX:
5411       /* simple texture lookup */
5412       /* src[0] = texcoord */
5413       /* src[1] = sampler unit */
5414       exec_tex(mach, inst, TEX_MODIFIER_NONE, 1);
5415       break;
5416
5417    case TGSI_OPCODE_TXB:
5418       /* Texture lookup with lod bias */
5419       /* src[0] = texcoord (src[0].w = LOD bias) */
5420       /* src[1] = sampler unit */
5421       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 1);
5422       break;
5423
5424    case TGSI_OPCODE_TXD:
5425       /* Texture lookup with explicit partial derivatives */
5426       /* src[0] = texcoord */
5427       /* src[1] = d[strq]/dx */
5428       /* src[2] = d[strq]/dy */
5429       /* src[3] = sampler unit */
5430       exec_txd(mach, inst);
5431       break;
5432
5433    case TGSI_OPCODE_TXL:
5434       /* Texture lookup with explicit LOD */
5435       /* src[0] = texcoord (src[0].w = LOD) */
5436       /* src[1] = sampler unit */
5437       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 1);
5438       break;
5439
5440    case TGSI_OPCODE_TXP:
5441       /* Texture lookup with projection */
5442       /* src[0] = texcoord (src[0].w = projection) */
5443       /* src[1] = sampler unit */
5444       exec_tex(mach, inst, TEX_MODIFIER_PROJECTED, 1);
5445       break;
5446
5447    case TGSI_OPCODE_TG4:
5448       /* src[0] = texcoord */
5449       /* src[1] = component */
5450       /* src[2] = sampler unit */
5451       exec_tex(mach, inst, TEX_MODIFIER_GATHER, 2);
5452       break;
5453
5454    case TGSI_OPCODE_LODQ:
5455       /* src[0] = texcoord */
5456       /* src[1] = sampler unit */
5457       exec_lodq(mach, inst);
5458       break;
5459
5460    case TGSI_OPCODE_UP2H:
5461       exec_up2h(mach, inst);
5462       break;
5463
5464    case TGSI_OPCODE_UP2US:
5465       assert (0);
5466       break;
5467
5468    case TGSI_OPCODE_UP4B:
5469       assert (0);
5470       break;
5471
5472    case TGSI_OPCODE_UP4UB:
5473       assert (0);
5474       break;
5475
5476    case TGSI_OPCODE_ARR:
5477       exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5478       break;
5479
5480    case TGSI_OPCODE_CAL:
5481       /* skip the call if no execution channels are enabled */
5482       if (mach->ExecMask) {
5483          /* do the call */
5484
5485          /* First, record the depths of the execution stacks.
5486           * This is important for deeply nested/looped return statements.
5487           * We have to unwind the stacks by the correct amount.  For a
5488           * real code generator, we could determine the number of entries
5489           * to pop off each stack with simple static analysis and avoid
5490           * implementing this data structure at run time.
5491           */
5492          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
5493          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
5494          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
5495          mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
5496          mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
5497          /* note that PC was already incremented above */
5498          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
5499
5500          mach->CallStackTop++;
5501
5502          /* Second, push the Cond, Loop, Cont, Switch, Break, Func stacks */
5503          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5504          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5505          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5506          assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
5507          assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5508          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
5509
5510          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5511          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5512          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5513          mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
5514          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5515          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
5516
5517          /* Finally, jump to the subroutine.  The label is a pointer
5518           * (an instruction number) to the BGNSUB instruction.
5519           */
5520          *pc = inst->Label.Label;
5521          assert(mach->Instructions[*pc].Instruction.Opcode
5522                 == TGSI_OPCODE_BGNSUB);
5523       }
5524       break;
5525
5526    case TGSI_OPCODE_RET:
5527       mach->FuncMask &= ~mach->ExecMask;
5528       UPDATE_EXEC_MASK(mach);
5529
5530       if (mach->FuncMask == 0x0) {
5531          /* really return now (otherwise, keep executing) */
5532
5533          if (mach->CallStackTop == 0) {
5534             /* returning from main() */
5535             mach->CondStackTop = 0;
5536             mach->LoopStackTop = 0;
5537             mach->ContStackTop = 0;
5538             mach->LoopLabelStackTop = 0;
5539             mach->SwitchStackTop = 0;
5540             mach->BreakStackTop = 0;
5541             *pc = -1;
5542             return FALSE;
5543          }
5544
5545          assert(mach->CallStackTop > 0);
5546          mach->CallStackTop--;
5547
5548          mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5549          mach->CondMask = mach->CondStack[mach->CondStackTop];
5550
5551          mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5552          mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5553
5554          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5555          mach->ContMask = mach->ContStack[mach->ContStackTop];
5556
5557          mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5558          mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5559
5560          mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5561          mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5562
5563          assert(mach->FuncStackTop > 0);
5564          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5565
5566          *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5567
5568          UPDATE_EXEC_MASK(mach);
5569       }
5570       break;
5571
5572    case TGSI_OPCODE_SSG:
5573       exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5574       break;
5575
5576    case TGSI_OPCODE_CMP:
5577       exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5578       break;
5579
5580    case TGSI_OPCODE_DIV:
5581       exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5582       break;
5583
5584    case TGSI_OPCODE_DP2:
5585       exec_dp2(mach, inst);
5586       break;
5587
5588    case TGSI_OPCODE_IF:
5589       /* push CondMask */
5590       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5591       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5592       FETCH( &r[0], 0, TGSI_CHAN_X );
5593       /* update CondMask */
5594       if( ! r[0].f[0] ) {
5595          mach->CondMask &= ~0x1;
5596       }
5597       if( ! r[0].f[1] ) {
5598          mach->CondMask &= ~0x2;
5599       }
5600       if( ! r[0].f[2] ) {
5601          mach->CondMask &= ~0x4;
5602       }
5603       if( ! r[0].f[3] ) {
5604          mach->CondMask &= ~0x8;
5605       }
5606       UPDATE_EXEC_MASK(mach);
5607       /* Todo: If CondMask==0, jump to ELSE */
5608       break;
5609
5610    case TGSI_OPCODE_UIF:
5611       /* push CondMask */
5612       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5613       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5614       IFETCH( &r[0], 0, TGSI_CHAN_X );
5615       /* update CondMask */
5616       if( ! r[0].u[0] ) {
5617          mach->CondMask &= ~0x1;
5618       }
5619       if( ! r[0].u[1] ) {
5620          mach->CondMask &= ~0x2;
5621       }
5622       if( ! r[0].u[2] ) {
5623          mach->CondMask &= ~0x4;
5624       }
5625       if( ! r[0].u[3] ) {
5626          mach->CondMask &= ~0x8;
5627       }
5628       UPDATE_EXEC_MASK(mach);
5629       /* Todo: If CondMask==0, jump to ELSE */
5630       break;
5631
5632    case TGSI_OPCODE_ELSE:
5633       /* invert CondMask wrt previous mask */
5634       {
5635          uint prevMask;
5636          assert(mach->CondStackTop > 0);
5637          prevMask = mach->CondStack[mach->CondStackTop - 1];
5638          mach->CondMask = ~mach->CondMask & prevMask;
5639          UPDATE_EXEC_MASK(mach);
5640          /* Todo: If CondMask==0, jump to ENDIF */
5641       }
5642       break;
5643
5644    case TGSI_OPCODE_ENDIF:
5645       /* pop CondMask */
5646       assert(mach->CondStackTop > 0);
5647       mach->CondMask = mach->CondStack[--mach->CondStackTop];
5648       UPDATE_EXEC_MASK(mach);
5649       break;
5650
5651    case TGSI_OPCODE_END:
5652       /* make sure we end primitives which haven't
5653        * been explicitly emitted */
5654       conditional_emit_primitive(mach);
5655       /* halt execution */
5656       *pc = -1;
5657       break;
5658
5659    case TGSI_OPCODE_CEIL:
5660       exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5661       break;
5662
5663    case TGSI_OPCODE_I2F:
5664       exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
5665       break;
5666
5667    case TGSI_OPCODE_NOT:
5668       exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5669       break;
5670
5671    case TGSI_OPCODE_TRUNC:
5672       exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5673       break;
5674
5675    case TGSI_OPCODE_SHL:
5676       exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5677       break;
5678
5679    case TGSI_OPCODE_AND:
5680       exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5681       break;
5682
5683    case TGSI_OPCODE_OR:
5684       exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5685       break;
5686
5687    case TGSI_OPCODE_MOD:
5688       exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5689       break;
5690
5691    case TGSI_OPCODE_XOR:
5692       exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5693       break;
5694
5695    case TGSI_OPCODE_TXF:
5696       exec_txf(mach, inst);
5697       break;
5698
5699    case TGSI_OPCODE_TXQ:
5700       exec_txq(mach, inst);
5701       break;
5702
5703    case TGSI_OPCODE_EMIT:
5704       emit_vertex(mach, inst);
5705       break;
5706
5707    case TGSI_OPCODE_ENDPRIM:
5708       emit_primitive(mach, inst);
5709       break;
5710
5711    case TGSI_OPCODE_BGNLOOP:
5712       /* push LoopMask and ContMasks */
5713       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5714       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5715       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5716       assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5717
5718       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5719       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5720       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
5721       mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5722       mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
5723       break;
5724
5725    case TGSI_OPCODE_ENDLOOP:
5726       /* Restore ContMask, but don't pop */
5727       assert(mach->ContStackTop > 0);
5728       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
5729       UPDATE_EXEC_MASK(mach);
5730       if (mach->ExecMask) {
5731          /* repeat loop: jump to instruction just past BGNLOOP */
5732          assert(mach->LoopLabelStackTop > 0);
5733          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
5734       }
5735       else {
5736          /* exit loop: pop LoopMask */
5737          assert(mach->LoopStackTop > 0);
5738          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
5739          /* pop ContMask */
5740          assert(mach->ContStackTop > 0);
5741          mach->ContMask = mach->ContStack[--mach->ContStackTop];
5742          assert(mach->LoopLabelStackTop > 0);
5743          --mach->LoopLabelStackTop;
5744
5745          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
5746       }
5747       UPDATE_EXEC_MASK(mach);
5748       break;
5749
5750    case TGSI_OPCODE_BRK:
5751       exec_break(mach);
5752       break;
5753
5754    case TGSI_OPCODE_CONT:
5755       /* turn off cont channels for each enabled exec channel */
5756       mach->ContMask &= ~mach->ExecMask;
5757       /* Todo: if mach->LoopMask == 0, jump to end of loop */
5758       UPDATE_EXEC_MASK(mach);
5759       break;
5760
5761    case TGSI_OPCODE_BGNSUB:
5762       /* no-op */
5763       break;
5764
5765    case TGSI_OPCODE_ENDSUB:
5766       /*
5767        * XXX: This really should be a no-op. We should never reach this opcode.
5768        */
5769
5770       assert(mach->CallStackTop > 0);
5771       mach->CallStackTop--;
5772
5773       mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5774       mach->CondMask = mach->CondStack[mach->CondStackTop];
5775
5776       mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5777       mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5778
5779       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5780       mach->ContMask = mach->ContStack[mach->ContStackTop];
5781
5782       mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5783       mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5784
5785       mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5786       mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5787
5788       assert(mach->FuncStackTop > 0);
5789       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5790
5791       *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5792
5793       UPDATE_EXEC_MASK(mach);
5794       break;
5795
5796    case TGSI_OPCODE_NOP:
5797       break;
5798
5799    case TGSI_OPCODE_F2I:
5800       exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5801       break;
5802
5803    case TGSI_OPCODE_FSEQ:
5804       exec_vector_binary(mach, inst, micro_fseq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5805       break;
5806
5807    case TGSI_OPCODE_FSGE:
5808       exec_vector_binary(mach, inst, micro_fsge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5809       break;
5810
5811    case TGSI_OPCODE_FSLT:
5812       exec_vector_binary(mach, inst, micro_fslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5813       break;
5814
5815    case TGSI_OPCODE_FSNE:
5816       exec_vector_binary(mach, inst, micro_fsne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5817       break;
5818
5819    case TGSI_OPCODE_IDIV:
5820       exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5821       break;
5822
5823    case TGSI_OPCODE_IMAX:
5824       exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5825       break;
5826
5827    case TGSI_OPCODE_IMIN:
5828       exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5829       break;
5830
5831    case TGSI_OPCODE_INEG:
5832       exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5833       break;
5834
5835    case TGSI_OPCODE_ISGE:
5836       exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5837       break;
5838
5839    case TGSI_OPCODE_ISHR:
5840       exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5841       break;
5842
5843    case TGSI_OPCODE_ISLT:
5844       exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5845       break;
5846
5847    case TGSI_OPCODE_F2U:
5848       exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5849       break;
5850
5851    case TGSI_OPCODE_U2F:
5852       exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
5853       break;
5854
5855    case TGSI_OPCODE_UADD:
5856       exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5857       break;
5858
5859    case TGSI_OPCODE_UDIV:
5860       exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5861       break;
5862
5863    case TGSI_OPCODE_UMAD:
5864       exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5865       break;
5866
5867    case TGSI_OPCODE_UMAX:
5868       exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5869       break;
5870
5871    case TGSI_OPCODE_UMIN:
5872       exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5873       break;
5874
5875    case TGSI_OPCODE_UMOD:
5876       exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5877       break;
5878
5879    case TGSI_OPCODE_UMUL:
5880       exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5881       break;
5882
5883    case TGSI_OPCODE_IMUL_HI:
5884       exec_vector_binary(mach, inst, micro_imul_hi, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5885       break;
5886
5887    case TGSI_OPCODE_UMUL_HI:
5888       exec_vector_binary(mach, inst, micro_umul_hi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5889       break;
5890
5891    case TGSI_OPCODE_USEQ:
5892       exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5893       break;
5894
5895    case TGSI_OPCODE_USGE:
5896       exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5897       break;
5898
5899    case TGSI_OPCODE_USHR:
5900       exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5901       break;
5902
5903    case TGSI_OPCODE_USLT:
5904       exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5905       break;
5906
5907    case TGSI_OPCODE_USNE:
5908       exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5909       break;
5910
5911    case TGSI_OPCODE_SWITCH:
5912       exec_switch(mach, inst);
5913       break;
5914
5915    case TGSI_OPCODE_CASE:
5916       exec_case(mach, inst);
5917       break;
5918
5919    case TGSI_OPCODE_DEFAULT:
5920       exec_default(mach);
5921       break;
5922
5923    case TGSI_OPCODE_ENDSWITCH:
5924       exec_endswitch(mach);
5925       break;
5926
5927    case TGSI_OPCODE_SAMPLE_I:
5928       exec_txf(mach, inst);
5929       break;
5930
5931    case TGSI_OPCODE_SAMPLE_I_MS:
5932       exec_txf(mach, inst);
5933       break;
5934
5935    case TGSI_OPCODE_SAMPLE:
5936       exec_sample(mach, inst, TEX_MODIFIER_NONE, FALSE);
5937       break;
5938
5939    case TGSI_OPCODE_SAMPLE_B:
5940       exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS, FALSE);
5941       break;
5942
5943    case TGSI_OPCODE_SAMPLE_C:
5944       exec_sample(mach, inst, TEX_MODIFIER_NONE, TRUE);
5945       break;
5946
5947    case TGSI_OPCODE_SAMPLE_C_LZ:
5948       exec_sample(mach, inst, TEX_MODIFIER_LEVEL_ZERO, TRUE);
5949       break;
5950
5951    case TGSI_OPCODE_SAMPLE_D:
5952       exec_sample_d(mach, inst);
5953       break;
5954
5955    case TGSI_OPCODE_SAMPLE_L:
5956       exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, FALSE);
5957       break;
5958
5959    case TGSI_OPCODE_GATHER4:
5960       exec_sample(mach, inst, TEX_MODIFIER_GATHER, FALSE);
5961       break;
5962
5963    case TGSI_OPCODE_SVIEWINFO:
5964       exec_txq(mach, inst);
5965       break;
5966
5967    case TGSI_OPCODE_SAMPLE_POS:
5968       assert(0);
5969       break;
5970
5971    case TGSI_OPCODE_SAMPLE_INFO:
5972       assert(0);
5973       break;
5974
5975    case TGSI_OPCODE_LOD:
5976       exec_lodq(mach, inst);
5977       break;
5978
5979    case TGSI_OPCODE_UARL:
5980       exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
5981       break;
5982
5983    case TGSI_OPCODE_UCMP:
5984       exec_ucmp(mach, inst);
5985       break;
5986
5987    case TGSI_OPCODE_IABS:
5988       exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5989       break;
5990
5991    case TGSI_OPCODE_ISSG:
5992       exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5993       break;
5994
5995    case TGSI_OPCODE_TEX2:
5996       /* simple texture lookup */
5997       /* src[0] = texcoord */
5998       /* src[1] = compare */
5999       /* src[2] = sampler unit */
6000       exec_tex(mach, inst, TEX_MODIFIER_NONE, 2);
6001       break;
6002    case TGSI_OPCODE_TXB2:
6003       /* Texture lookup with lod bias */
6004       /* src[0] = texcoord */
6005       /* src[1] = bias */
6006       /* src[2] = sampler unit */
6007       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 2);
6008       break;
6009    case TGSI_OPCODE_TXL2:
6010       /* Texture lookup with explicit LOD */
6011       /* src[0] = texcoord */
6012       /* src[1] = lod */
6013       /* src[2] = sampler unit */
6014       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
6015       break;
6016
6017    case TGSI_OPCODE_IBFE:
6018       exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
6019       break;
6020    case TGSI_OPCODE_UBFE:
6021       exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6022       break;
6023    case TGSI_OPCODE_BFI:
6024       exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6025       break;
6026    case TGSI_OPCODE_BREV:
6027       exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6028       break;
6029    case TGSI_OPCODE_POPC:
6030       exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6031       break;
6032    case TGSI_OPCODE_LSB:
6033       exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
6034       break;
6035    case TGSI_OPCODE_IMSB:
6036       exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
6037       break;
6038    case TGSI_OPCODE_UMSB:
6039       exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
6040       break;
6041
6042    case TGSI_OPCODE_F2D:
6043       exec_t_2_64(mach, inst, micro_f2d, TGSI_EXEC_DATA_FLOAT);
6044       break;
6045
6046    case TGSI_OPCODE_D2F:
6047       exec_64_2_t(mach, inst, micro_d2f, TGSI_EXEC_DATA_FLOAT);
6048       break;
6049
6050    case TGSI_OPCODE_DABS:
6051       exec_double_unary(mach, inst, micro_dabs);
6052       break;
6053
6054    case TGSI_OPCODE_DNEG:
6055       exec_double_unary(mach, inst, micro_dneg);
6056       break;
6057
6058    case TGSI_OPCODE_DADD:
6059       exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
6060       break;
6061
6062    case TGSI_OPCODE_DDIV:
6063       exec_double_binary(mach, inst, micro_ddiv, TGSI_EXEC_DATA_DOUBLE);
6064       break;
6065
6066    case TGSI_OPCODE_DMUL:
6067       exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
6068       break;
6069
6070    case TGSI_OPCODE_DMAX:
6071       exec_double_binary(mach, inst, micro_dmax, TGSI_EXEC_DATA_DOUBLE);
6072       break;
6073
6074    case TGSI_OPCODE_DMIN:
6075       exec_double_binary(mach, inst, micro_dmin, TGSI_EXEC_DATA_DOUBLE);
6076       break;
6077
6078    case TGSI_OPCODE_DSLT:
6079       exec_double_binary(mach, inst, micro_dslt, TGSI_EXEC_DATA_UINT);
6080       break;
6081
6082    case TGSI_OPCODE_DSGE:
6083       exec_double_binary(mach, inst, micro_dsge, TGSI_EXEC_DATA_UINT);
6084       break;
6085
6086    case TGSI_OPCODE_DSEQ:
6087       exec_double_binary(mach, inst, micro_dseq, TGSI_EXEC_DATA_UINT);
6088       break;
6089
6090    case TGSI_OPCODE_DSNE:
6091       exec_double_binary(mach, inst, micro_dsne, TGSI_EXEC_DATA_UINT);
6092       break;
6093
6094    case TGSI_OPCODE_DRCP:
6095       exec_double_unary(mach, inst, micro_drcp);
6096       break;
6097
6098    case TGSI_OPCODE_DSQRT:
6099       exec_double_unary(mach, inst, micro_dsqrt);
6100       break;
6101
6102    case TGSI_OPCODE_DRSQ:
6103       exec_double_unary(mach, inst, micro_drsq);
6104       break;
6105
6106    case TGSI_OPCODE_DMAD:
6107       exec_double_trinary(mach, inst, micro_dmad);
6108       break;
6109
6110    case TGSI_OPCODE_DFRAC:
6111       exec_double_unary(mach, inst, micro_dfrac);
6112       break;
6113
6114    case TGSI_OPCODE_DLDEXP:
6115       exec_dldexp(mach, inst);
6116       break;
6117
6118    case TGSI_OPCODE_DFRACEXP:
6119       exec_dfracexp(mach, inst);
6120       break;
6121
6122    case TGSI_OPCODE_I2D:
6123       exec_t_2_64(mach, inst, micro_i2d, TGSI_EXEC_DATA_INT);
6124       break;
6125
6126    case TGSI_OPCODE_D2I:
6127       exec_64_2_t(mach, inst, micro_d2i, TGSI_EXEC_DATA_INT);
6128       break;
6129
6130    case TGSI_OPCODE_U2D:
6131       exec_t_2_64(mach, inst, micro_u2d, TGSI_EXEC_DATA_UINT);
6132       break;
6133
6134    case TGSI_OPCODE_D2U:
6135       exec_64_2_t(mach, inst, micro_d2u, TGSI_EXEC_DATA_INT);
6136       break;
6137
6138    case TGSI_OPCODE_LOAD:
6139       exec_load(mach, inst);
6140       break;
6141
6142    case TGSI_OPCODE_STORE:
6143       exec_store(mach, inst);
6144       break;
6145
6146    case TGSI_OPCODE_ATOMUADD:
6147    case TGSI_OPCODE_ATOMXCHG:
6148    case TGSI_OPCODE_ATOMCAS:
6149    case TGSI_OPCODE_ATOMAND:
6150    case TGSI_OPCODE_ATOMOR:
6151    case TGSI_OPCODE_ATOMXOR:
6152    case TGSI_OPCODE_ATOMUMIN:
6153    case TGSI_OPCODE_ATOMUMAX:
6154    case TGSI_OPCODE_ATOMIMIN:
6155    case TGSI_OPCODE_ATOMIMAX:
6156    case TGSI_OPCODE_ATOMFADD:
6157       exec_atomop(mach, inst);
6158       break;
6159
6160    case TGSI_OPCODE_RESQ:
6161       exec_resq(mach, inst);
6162       break;
6163    case TGSI_OPCODE_BARRIER:
6164    case TGSI_OPCODE_MEMBAR:
6165       return TRUE;
6166       break;
6167
6168    case TGSI_OPCODE_I64ABS:
6169       exec_double_unary(mach, inst, micro_i64abs);
6170       break;
6171
6172    case TGSI_OPCODE_I64SSG:
6173       exec_double_unary(mach, inst, micro_i64sgn);
6174       break;
6175
6176    case TGSI_OPCODE_I64NEG:
6177       exec_double_unary(mach, inst, micro_i64neg);
6178       break;
6179
6180    case TGSI_OPCODE_U64SEQ:
6181       exec_double_binary(mach, inst, micro_u64seq, TGSI_EXEC_DATA_UINT);
6182       break;
6183
6184    case TGSI_OPCODE_U64SNE:
6185       exec_double_binary(mach, inst, micro_u64sne, TGSI_EXEC_DATA_UINT);
6186       break;
6187
6188    case TGSI_OPCODE_I64SLT:
6189       exec_double_binary(mach, inst, micro_i64slt, TGSI_EXEC_DATA_UINT);
6190       break;
6191    case TGSI_OPCODE_U64SLT:
6192       exec_double_binary(mach, inst, micro_u64slt, TGSI_EXEC_DATA_UINT);
6193       break;
6194
6195    case TGSI_OPCODE_I64SGE:
6196       exec_double_binary(mach, inst, micro_i64sge, TGSI_EXEC_DATA_UINT);
6197       break;
6198    case TGSI_OPCODE_U64SGE:
6199       exec_double_binary(mach, inst, micro_u64sge, TGSI_EXEC_DATA_UINT);
6200       break;
6201
6202    case TGSI_OPCODE_I64MIN:
6203       exec_double_binary(mach, inst, micro_i64min, TGSI_EXEC_DATA_INT64);
6204       break;
6205    case TGSI_OPCODE_U64MIN:
6206       exec_double_binary(mach, inst, micro_u64min, TGSI_EXEC_DATA_UINT64);
6207       break;
6208    case TGSI_OPCODE_I64MAX:
6209       exec_double_binary(mach, inst, micro_i64max, TGSI_EXEC_DATA_INT64);
6210       break;
6211    case TGSI_OPCODE_U64MAX:
6212       exec_double_binary(mach, inst, micro_u64max, TGSI_EXEC_DATA_UINT64);
6213       break;
6214    case TGSI_OPCODE_U64ADD:
6215       exec_double_binary(mach, inst, micro_u64add, TGSI_EXEC_DATA_UINT64);
6216       break;
6217    case TGSI_OPCODE_U64MUL:
6218       exec_double_binary(mach, inst, micro_u64mul, TGSI_EXEC_DATA_UINT64);
6219       break;
6220    case TGSI_OPCODE_U64SHL:
6221       exec_arg0_64_arg1_32(mach, inst, micro_u64shl);
6222       break;
6223    case TGSI_OPCODE_I64SHR:
6224       exec_arg0_64_arg1_32(mach, inst, micro_i64shr);
6225       break;
6226    case TGSI_OPCODE_U64SHR:
6227       exec_arg0_64_arg1_32(mach, inst, micro_u64shr);
6228       break;
6229    case TGSI_OPCODE_U64DIV:
6230       exec_double_binary(mach, inst, micro_u64div, TGSI_EXEC_DATA_UINT64);
6231       break;
6232    case TGSI_OPCODE_I64DIV:
6233       exec_double_binary(mach, inst, micro_i64div, TGSI_EXEC_DATA_INT64);
6234       break;
6235    case TGSI_OPCODE_U64MOD:
6236       exec_double_binary(mach, inst, micro_u64mod, TGSI_EXEC_DATA_UINT64);
6237       break;
6238    case TGSI_OPCODE_I64MOD:
6239       exec_double_binary(mach, inst, micro_i64mod, TGSI_EXEC_DATA_INT64);
6240       break;
6241
6242    case TGSI_OPCODE_F2U64:
6243       exec_t_2_64(mach, inst, micro_f2u64, TGSI_EXEC_DATA_FLOAT);
6244       break;
6245
6246    case TGSI_OPCODE_F2I64:
6247       exec_t_2_64(mach, inst, micro_f2i64, TGSI_EXEC_DATA_FLOAT);
6248       break;
6249
6250    case TGSI_OPCODE_U2I64:
6251       exec_t_2_64(mach, inst, micro_u2i64, TGSI_EXEC_DATA_INT);
6252       break;
6253    case TGSI_OPCODE_I2I64:
6254       exec_t_2_64(mach, inst, micro_i2i64, TGSI_EXEC_DATA_INT);
6255       break;
6256
6257    case TGSI_OPCODE_D2U64:
6258       exec_double_unary(mach, inst, micro_d2u64);
6259       break;
6260
6261    case TGSI_OPCODE_D2I64:
6262       exec_double_unary(mach, inst, micro_d2i64);
6263       break;
6264
6265    case TGSI_OPCODE_U642F:
6266       exec_64_2_t(mach, inst, micro_u642f, TGSI_EXEC_DATA_FLOAT);
6267       break;
6268    case TGSI_OPCODE_I642F:
6269       exec_64_2_t(mach, inst, micro_i642f, TGSI_EXEC_DATA_FLOAT);
6270       break;
6271
6272    case TGSI_OPCODE_U642D:
6273       exec_double_unary(mach, inst, micro_u642d);
6274       break;
6275    case TGSI_OPCODE_I642D:
6276       exec_double_unary(mach, inst, micro_i642d);
6277       break;
6278    case TGSI_OPCODE_INTERP_SAMPLE:
6279       exec_interp_at_sample(mach, inst);
6280       break;
6281    case TGSI_OPCODE_INTERP_OFFSET:
6282       exec_interp_at_offset(mach, inst);
6283       break;
6284    case TGSI_OPCODE_INTERP_CENTROID:
6285       exec_interp_at_centroid(mach, inst);
6286       break;
6287    default:
6288       assert( 0 );
6289    }
6290    return FALSE;
6291 }
6292
6293 static void
6294 tgsi_exec_machine_setup_masks(struct tgsi_exec_machine *mach)
6295 {
6296    uint default_mask = 0xf;
6297
6298    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
6299    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
6300
6301    if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
6302       for (unsigned i = 0; i < TGSI_MAX_VERTEX_STREAMS; i++) {
6303          mach->Temps[temp_prim_idxs[i].idx].xyzw[temp_prim_idxs[i].chan].u[0] = 0;
6304          mach->Primitives[i][0] = 0;
6305       }
6306       /* GS runs on a single primitive for now */
6307       default_mask = 0x1;
6308    }
6309
6310    if (mach->NonHelperMask == 0)
6311       mach->NonHelperMask = default_mask;
6312    mach->CondMask = default_mask;
6313    mach->LoopMask = default_mask;
6314    mach->ContMask = default_mask;
6315    mach->FuncMask = default_mask;
6316    mach->ExecMask = default_mask;
6317
6318    mach->Switch.mask = default_mask;
6319
6320    assert(mach->CondStackTop == 0);
6321    assert(mach->LoopStackTop == 0);
6322    assert(mach->ContStackTop == 0);
6323    assert(mach->SwitchStackTop == 0);
6324    assert(mach->BreakStackTop == 0);
6325    assert(mach->CallStackTop == 0);
6326 }
6327
6328 /**
6329  * Run TGSI interpreter.
6330  * \return bitmask of "alive" quad components
6331  */
6332 uint
6333 tgsi_exec_machine_run( struct tgsi_exec_machine *mach, int start_pc )
6334 {
6335    uint i;
6336
6337    mach->pc = start_pc;
6338
6339    if (!start_pc) {
6340       tgsi_exec_machine_setup_masks(mach);
6341
6342       /* execute declarations (interpolants) */
6343       for (i = 0; i < mach->NumDeclarations; i++) {
6344          exec_declaration( mach, mach->Declarations+i );
6345       }
6346    }
6347
6348    {
6349 #if DEBUG_EXECUTION
6350       struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
6351       struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
6352       uint inst = 1;
6353
6354       if (!start_pc) {
6355          memset(mach->Temps, 0, sizeof(temps));
6356          if (mach->Outputs)
6357             memset(mach->Outputs, 0, sizeof(outputs));
6358          memset(temps, 0, sizeof(temps));
6359          memset(outputs, 0, sizeof(outputs));
6360       }
6361 #endif
6362
6363       /* execute instructions, until pc is set to -1 */
6364       while (mach->pc != -1) {
6365          boolean barrier_hit;
6366 #if DEBUG_EXECUTION
6367          uint i;
6368
6369          tgsi_dump_instruction(&mach->Instructions[mach->pc], inst++);
6370 #endif
6371
6372          assert(mach->pc < (int) mach->NumInstructions);
6373          barrier_hit = exec_instruction(mach, mach->Instructions + mach->pc, &mach->pc);
6374
6375          /* for compute shaders if we hit a barrier return now for later rescheduling */
6376          if (barrier_hit && mach->ShaderType == PIPE_SHADER_COMPUTE)
6377             return 0;
6378
6379 #if DEBUG_EXECUTION
6380          for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
6381             if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
6382                uint j;
6383
6384                memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
6385                debug_printf("TEMP[%2u] = ", i);
6386                for (j = 0; j < 4; j++) {
6387                   if (j > 0) {
6388                      debug_printf("           ");
6389                   }
6390                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6391                                temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
6392                                temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
6393                                temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
6394                                temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
6395                }
6396             }
6397          }
6398          if (mach->Outputs) {
6399             for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
6400                if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
6401                   uint j;
6402
6403                   memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
6404                   debug_printf("OUT[%2u] =  ", i);
6405                   for (j = 0; j < 4; j++) {
6406                      if (j > 0) {
6407                         debug_printf("           ");
6408                      }
6409                      debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6410                                   outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
6411                                   outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
6412                                   outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
6413                                   outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
6414                   }
6415                }
6416             }
6417          }
6418 #endif
6419       }
6420    }
6421
6422 #if 0
6423    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
6424    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
6425       /*
6426        * Scale back depth component.
6427        */
6428       for (i = 0; i < 4; i++)
6429          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
6430    }
6431 #endif
6432
6433    /* Strictly speaking, these assertions aren't really needed but they
6434     * can potentially catch some bugs in the control flow code.
6435     */
6436    assert(mach->CondStackTop == 0);
6437    assert(mach->LoopStackTop == 0);
6438    assert(mach->ContStackTop == 0);
6439    assert(mach->SwitchStackTop == 0);
6440    assert(mach->BreakStackTop == 0);
6441    assert(mach->CallStackTop == 0);
6442
6443    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
6444 }