src/gallium/auxiliary/tgsi/tgsi_exec.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 VMware, Inc.
   4  * All Rights Reserved.
   5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the
   9  * "Software"), to deal in the Software without restriction, including
  10  * without limitation the rights to use, copy, modify, merge, publish,
  11  * distribute, sub license, and/or sell copies of the Software, and to
  12  * permit persons to whom the Software is furnished to do so, subject to
  13  * the following conditions:
  14  *
  15  * The above copyright notice and this permission notice (including the
  16  * next paragraph) shall be included in all copies or substantial portions
  17  * of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  22  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  26  *
  27  **************************************************************************/
  28
  29 /**
  30  * TGSI interpreter/executor.
  31  *
  32  * Flow control information:
  33  *
  34  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
  35  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
  36  * care since a condition may be true for some quad components but false
  37  * for other components.
  38  *
  39  * We basically execute all statements (even if they're in the part of
  40  * an IF/ELSE clause that's "not taken") and use a special mask to
  41  * control writing to destination registers.  This is the ExecMask.
  42  * See store_dest().
  43  *
 * The ExecMask is computed from three other masks (CondMask, LoopMask and
 * ContMask) which are controlled by the flow control instructions (namely:
 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
  47  *
  48  *
  49  * Authors:
  50  *   Michal Krol
  51  *   Brian Paul
  52  */
  53
  54 #include "pipe/p_compiler.h"
  55 #include "pipe/p_state.h"
  56 #include "pipe/p_shader_tokens.h"
  57 #include "tgsi/tgsi_dump.h"
  58 #include "tgsi/tgsi_parse.h"
  59 #include "tgsi/tgsi_util.h"
  60 #include "tgsi_exec.h"
  61 #include "util/u_half.h"
  62 #include "util/u_memory.h"
  63 #include "util/u_math.h"
  64 #include "util/rounding.h"
  65
  66
/* Debug switch.  Not referenced in the part of the file reviewed here;
 * presumably enables execution tracing -- TODO confirm against its users.
 */
#define DEBUG_EXECUTION 0


/* When non-zero, micro_exp2() and micro_lg2() call util_fast_exp2() /
 * util_fast_log2() instead of the libm routines.
 */
#define FAST_MATH 0

/* Channel indices naming the four positions of a quad.  The derivative
 * helpers (micro_ddx*, micro_ddy*) use them to pick neighbouring channels.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
  76
/**
 * One 64-bit value per quad channel.  The same storage can be viewed as
 * doubles, as pairs of unsigned words, or as unsigned/signed 64-bit
 * integers.
 */
union tgsi_double_channel {
   double d[TGSI_QUAD_SIZE];
   /* Two unsigned words per channel; the 64-bit comparison helpers store
    * their boolean result in word [chan][0].
    */
   unsigned u[TGSI_QUAD_SIZE][2];
   uint64_t u64[TGSI_QUAD_SIZE];
   int64_t i64[TGSI_QUAD_SIZE];
};
  83
/**
 * A pair of 64-bit channels.
 * NOTE(review): presumably the xy and zw halves of a four-component
 * register holding two 64-bit values -- confirm against the users.
 */
struct tgsi_double_vector {
   union tgsi_double_channel xy;
   union tgsi_double_channel zw;
};
  88
  89 static void
  90 micro_abs(union tgsi_exec_channel *dst,
  91           const union tgsi_exec_channel *src)
  92 {
  93    dst->f[0] = fabsf(src->f[0]);
  94    dst->f[1] = fabsf(src->f[1]);
  95    dst->f[2] = fabsf(src->f[2]);
  96    dst->f[3] = fabsf(src->f[3]);
  97 }
  98
  99 static void
 100 micro_arl(union tgsi_exec_channel *dst,
 101           const union tgsi_exec_channel *src)
 102 {
 103    dst->i[0] = (int)floorf(src->f[0]);
 104    dst->i[1] = (int)floorf(src->f[1]);
 105    dst->i[2] = (int)floorf(src->f[2]);
 106    dst->i[3] = (int)floorf(src->f[3]);
 107 }
 108
 109 static void
 110 micro_arr(union tgsi_exec_channel *dst,
 111           const union tgsi_exec_channel *src)
 112 {
 113    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
 114    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
 115    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
 116    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
 117 }
 118
 119 static void
 120 micro_ceil(union tgsi_exec_channel *dst,
 121            const union tgsi_exec_channel *src)
 122 {
 123    dst->f[0] = ceilf(src->f[0]);
 124    dst->f[1] = ceilf(src->f[1]);
 125    dst->f[2] = ceilf(src->f[2]);
 126    dst->f[3] = ceilf(src->f[3]);
 127 }
 128
 129 static void
 130 micro_cmp(union tgsi_exec_channel *dst,
 131           const union tgsi_exec_channel *src0,
 132           const union tgsi_exec_channel *src1,
 133           const union tgsi_exec_channel *src2)
 134 {
 135    dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
 136    dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
 137    dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
 138    dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
 139 }
 140
 141 static void
 142 micro_cos(union tgsi_exec_channel *dst,
 143           const union tgsi_exec_channel *src)
 144 {
 145    dst->f[0] = cosf(src->f[0]);
 146    dst->f[1] = cosf(src->f[1]);
 147    dst->f[2] = cosf(src->f[2]);
 148    dst->f[3] = cosf(src->f[3]);
 149 }
 150
 151 static void
 152 micro_d2f(union tgsi_exec_channel *dst,
 153           const union tgsi_double_channel *src)
 154 {
 155    dst->f[0] = (float)src->d[0];
 156    dst->f[1] = (float)src->d[1];
 157    dst->f[2] = (float)src->d[2];
 158    dst->f[3] = (float)src->d[3];
 159 }
 160
 161 static void
 162 micro_d2i(union tgsi_exec_channel *dst,
 163           const union tgsi_double_channel *src)
 164 {
 165    dst->i[0] = (int)src->d[0];
 166    dst->i[1] = (int)src->d[1];
 167    dst->i[2] = (int)src->d[2];
 168    dst->i[3] = (int)src->d[3];
 169 }
 170
 171 static void
 172 micro_d2u(union tgsi_exec_channel *dst,
 173           const union tgsi_double_channel *src)
 174 {
 175    dst->u[0] = (unsigned)src->d[0];
 176    dst->u[1] = (unsigned)src->d[1];
 177    dst->u[2] = (unsigned)src->d[2];
 178    dst->u[3] = (unsigned)src->d[3];
 179 }
 180 static void
 181 micro_dabs(union tgsi_double_channel *dst,
 182            const union tgsi_double_channel *src)
 183 {
 184    dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
 185    dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
 186    dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
 187    dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
 188 }
 189
 190 static void
 191 micro_dadd(union tgsi_double_channel *dst,
 192           const union tgsi_double_channel *src)
 193 {
 194    dst->d[0] = src[0].d[0] + src[1].d[0];
 195    dst->d[1] = src[0].d[1] + src[1].d[1];
 196    dst->d[2] = src[0].d[2] + src[1].d[2];
 197    dst->d[3] = src[0].d[3] + src[1].d[3];
 198 }
 199
 200 static void
 201 micro_ddiv(union tgsi_double_channel *dst,
 202           const union tgsi_double_channel *src)
 203 {
 204    dst->d[0] = src[0].d[0] / src[1].d[0];
 205    dst->d[1] = src[0].d[1] / src[1].d[1];
 206    dst->d[2] = src[0].d[2] / src[1].d[2];
 207    dst->d[3] = src[0].d[3] / src[1].d[3];
 208 }
 209
 210 static void
 211 micro_ddx(union tgsi_exec_channel *dst,
 212           const union tgsi_exec_channel *src)
 213 {
 214    dst->f[0] =
 215    dst->f[1] =
 216    dst->f[2] =
 217    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 218 }
 219
 220 static void
 221 micro_ddx_fine(union tgsi_exec_channel *dst,
 222           const union tgsi_exec_channel *src)
 223 {
 224    dst->f[0] =
 225    dst->f[1] = src->f[TILE_TOP_RIGHT] - src->f[TILE_TOP_LEFT];
 226    dst->f[2] =
 227    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 228 }
 229
 230
 231 static void
 232 micro_ddy(union tgsi_exec_channel *dst,
 233           const union tgsi_exec_channel *src)
 234 {
 235    dst->f[0] =
 236    dst->f[1] =
 237    dst->f[2] =
 238    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
 239 }
 240
 241 static void
 242 micro_ddy_fine(union tgsi_exec_channel *dst,
 243           const union tgsi_exec_channel *src)
 244 {
 245    dst->f[0] =
 246    dst->f[2] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
 247    dst->f[1] =
 248    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_TOP_RIGHT];
 249 }
 250
 251 static void
 252 micro_dmul(union tgsi_double_channel *dst,
 253            const union tgsi_double_channel *src)
 254 {
 255    dst->d[0] = src[0].d[0] * src[1].d[0];
 256    dst->d[1] = src[0].d[1] * src[1].d[1];
 257    dst->d[2] = src[0].d[2] * src[1].d[2];
 258    dst->d[3] = src[0].d[3] * src[1].d[3];
 259 }
 260
 261 static void
 262 micro_dmax(union tgsi_double_channel *dst,
 263            const union tgsi_double_channel *src)
 264 {
 265    dst->d[0] = src[0].d[0] > src[1].d[0] ? src[0].d[0] : src[1].d[0];
 266    dst->d[1] = src[0].d[1] > src[1].d[1] ? src[0].d[1] : src[1].d[1];
 267    dst->d[2] = src[0].d[2] > src[1].d[2] ? src[0].d[2] : src[1].d[2];
 268    dst->d[3] = src[0].d[3] > src[1].d[3] ? src[0].d[3] : src[1].d[3];
 269 }
 270
 271 static void
 272 micro_dmin(union tgsi_double_channel *dst,
 273            const union tgsi_double_channel *src)
 274 {
 275    dst->d[0] = src[0].d[0] < src[1].d[0] ? src[0].d[0] : src[1].d[0];
 276    dst->d[1] = src[0].d[1] < src[1].d[1] ? src[0].d[1] : src[1].d[1];
 277    dst->d[2] = src[0].d[2] < src[1].d[2] ? src[0].d[2] : src[1].d[2];
 278    dst->d[3] = src[0].d[3] < src[1].d[3] ? src[0].d[3] : src[1].d[3];
 279 }
 280
 281 static void
 282 micro_dneg(union tgsi_double_channel *dst,
 283            const union tgsi_double_channel *src)
 284 {
 285    dst->d[0] = -src->d[0];
 286    dst->d[1] = -src->d[1];
 287    dst->d[2] = -src->d[2];
 288    dst->d[3] = -src->d[3];
 289 }
 290
 291 static void
 292 micro_dslt(union tgsi_double_channel *dst,
 293            const union tgsi_double_channel *src)
 294 {
 295    dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
 296    dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
 297    dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
 298    dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
 299 }
 300
 301 static void
 302 micro_dsne(union tgsi_double_channel *dst,
 303            const union tgsi_double_channel *src)
 304 {
 305    dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
 306    dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
 307    dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
 308    dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
 309 }
 310
 311 static void
 312 micro_dsge(union tgsi_double_channel *dst,
 313            const union tgsi_double_channel *src)
 314 {
 315    dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
 316    dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
 317    dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
 318    dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
 319 }
 320
 321 static void
 322 micro_dseq(union tgsi_double_channel *dst,
 323            const union tgsi_double_channel *src)
 324 {
 325    dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
 326    dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
 327    dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
 328    dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
 329 }
 330
 331 static void
 332 micro_drcp(union tgsi_double_channel *dst,
 333            const union tgsi_double_channel *src)
 334 {
 335    dst->d[0] = 1.0 / src->d[0];
 336    dst->d[1] = 1.0 / src->d[1];
 337    dst->d[2] = 1.0 / src->d[2];
 338    dst->d[3] = 1.0 / src->d[3];
 339 }
 340
 341 static void
 342 micro_dsqrt(union tgsi_double_channel *dst,
 343             const union tgsi_double_channel *src)
 344 {
 345    dst->d[0] = sqrt(src->d[0]);
 346    dst->d[1] = sqrt(src->d[1]);
 347    dst->d[2] = sqrt(src->d[2]);
 348    dst->d[3] = sqrt(src->d[3]);
 349 }
 350
 351 static void
 352 micro_drsq(union tgsi_double_channel *dst,
 353           const union tgsi_double_channel *src)
 354 {
 355    dst->d[0] = 1.0 / sqrt(src->d[0]);
 356    dst->d[1] = 1.0 / sqrt(src->d[1]);
 357    dst->d[2] = 1.0 / sqrt(src->d[2]);
 358    dst->d[3] = 1.0 / sqrt(src->d[3]);
 359 }
 360
 361 static void
 362 micro_dmad(union tgsi_double_channel *dst,
 363            const union tgsi_double_channel *src)
 364 {
 365    dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
 366    dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
 367    dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
 368    dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
 369 }
 370
 371 static void
 372 micro_dfrac(union tgsi_double_channel *dst,
 373             const union tgsi_double_channel *src)
 374 {
 375    dst->d[0] = src->d[0] - floor(src->d[0]);
 376    dst->d[1] = src->d[1] - floor(src->d[1]);
 377    dst->d[2] = src->d[2] - floor(src->d[2]);
 378    dst->d[3] = src->d[3] - floor(src->d[3]);
 379 }
 380
 381 static void
 382 micro_dldexp(union tgsi_double_channel *dst,
 383              const union tgsi_double_channel *src0,
 384              union tgsi_exec_channel *src1)
 385 {
 386    dst->d[0] = ldexp(src0->d[0], src1->i[0]);
 387    dst->d[1] = ldexp(src0->d[1], src1->i[1]);
 388    dst->d[2] = ldexp(src0->d[2], src1->i[2]);
 389    dst->d[3] = ldexp(src0->d[3], src1->i[3]);
 390 }
 391
 392 static void
 393 micro_dfracexp(union tgsi_double_channel *dst,
 394                union tgsi_exec_channel *dst_exp,
 395                const union tgsi_double_channel *src)
 396 {
 397    dst->d[0] = frexp(src->d[0], &dst_exp->i[0]);
 398    dst->d[1] = frexp(src->d[1], &dst_exp->i[1]);
 399    dst->d[2] = frexp(src->d[2], &dst_exp->i[2]);
 400    dst->d[3] = frexp(src->d[3], &dst_exp->i[3]);
 401 }
 402
 403 static void
 404 micro_exp2(union tgsi_exec_channel *dst,
 405            const union tgsi_exec_channel *src)
 406 {
 407 #if FAST_MATH
 408    dst->f[0] = util_fast_exp2(src->f[0]);
 409    dst->f[1] = util_fast_exp2(src->f[1]);
 410    dst->f[2] = util_fast_exp2(src->f[2]);
 411    dst->f[3] = util_fast_exp2(src->f[3]);
 412 #else
 413 #if DEBUG
 414    /* Inf is okay for this instruction, so clamp it to silence assertions. */
 415    uint i;
 416    union tgsi_exec_channel clamped;
 417
 418    for (i = 0; i < 4; i++) {
 419       if (src->f[i] > 127.99999f) {
 420          clamped.f[i] = 127.99999f;
 421       } else if (src->f[i] < -126.99999f) {
 422          clamped.f[i] = -126.99999f;
 423       } else {
 424          clamped.f[i] = src->f[i];
 425       }
 426    }
 427    src = &clamped;
 428 #endif /* DEBUG */
 429
 430    dst->f[0] = powf(2.0f, src->f[0]);
 431    dst->f[1] = powf(2.0f, src->f[1]);
 432    dst->f[2] = powf(2.0f, src->f[2]);
 433    dst->f[3] = powf(2.0f, src->f[3]);
 434 #endif /* FAST_MATH */
 435 }
 436
 437 static void
 438 micro_f2d(union tgsi_double_channel *dst,
 439           const union tgsi_exec_channel *src)
 440 {
 441    dst->d[0] = (double)src->f[0];
 442    dst->d[1] = (double)src->f[1];
 443    dst->d[2] = (double)src->f[2];
 444    dst->d[3] = (double)src->f[3];
 445 }
 446
 447 static void
 448 micro_flr(union tgsi_exec_channel *dst,
 449           const union tgsi_exec_channel *src)
 450 {
 451    dst->f[0] = floorf(src->f[0]);
 452    dst->f[1] = floorf(src->f[1]);
 453    dst->f[2] = floorf(src->f[2]);
 454    dst->f[3] = floorf(src->f[3]);
 455 }
 456
 457 static void
 458 micro_frc(union tgsi_exec_channel *dst,
 459           const union tgsi_exec_channel *src)
 460 {
 461    dst->f[0] = src->f[0] - floorf(src->f[0]);
 462    dst->f[1] = src->f[1] - floorf(src->f[1]);
 463    dst->f[2] = src->f[2] - floorf(src->f[2]);
 464    dst->f[3] = src->f[3] - floorf(src->f[3]);
 465 }
 466
 467 static void
 468 micro_i2d(union tgsi_double_channel *dst,
 469           const union tgsi_exec_channel *src)
 470 {
 471    dst->d[0] = (double)src->i[0];
 472    dst->d[1] = (double)src->i[1];
 473    dst->d[2] = (double)src->i[2];
 474    dst->d[3] = (double)src->i[3];
 475 }
 476
 477 static void
 478 micro_iabs(union tgsi_exec_channel *dst,
 479            const union tgsi_exec_channel *src)
 480 {
 481    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
 482    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
 483    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
 484    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
 485 }
 486
 487 static void
 488 micro_ineg(union tgsi_exec_channel *dst,
 489            const union tgsi_exec_channel *src)
 490 {
 491    dst->i[0] = -src->i[0];
 492    dst->i[1] = -src->i[1];
 493    dst->i[2] = -src->i[2];
 494    dst->i[3] = -src->i[3];
 495 }
 496
 497 static void
 498 micro_lg2(union tgsi_exec_channel *dst,
 499           const union tgsi_exec_channel *src)
 500 {
 501 #if FAST_MATH
 502    dst->f[0] = util_fast_log2(src->f[0]);
 503    dst->f[1] = util_fast_log2(src->f[1]);
 504    dst->f[2] = util_fast_log2(src->f[2]);
 505    dst->f[3] = util_fast_log2(src->f[3]);
 506 #else
 507    dst->f[0] = logf(src->f[0]) * 1.442695f;
 508    dst->f[1] = logf(src->f[1]) * 1.442695f;
 509    dst->f[2] = logf(src->f[2]) * 1.442695f;
 510    dst->f[3] = logf(src->f[3]) * 1.442695f;
 511 #endif
 512 }
 513
 514 static void
 515 micro_lrp(union tgsi_exec_channel *dst,
 516           const union tgsi_exec_channel *src0,
 517           const union tgsi_exec_channel *src1,
 518           const union tgsi_exec_channel *src2)
 519 {
 520    dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
 521    dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
 522    dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
 523    dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
 524 }
 525
 526 static void
 527 micro_mad(union tgsi_exec_channel *dst,
 528           const union tgsi_exec_channel *src0,
 529           const union tgsi_exec_channel *src1,
 530           const union tgsi_exec_channel *src2)
 531 {
 532    dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
 533    dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
 534    dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
 535    dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
 536 }
 537
 538 static void
 539 micro_mov(union tgsi_exec_channel *dst,
 540           const union tgsi_exec_channel *src)
 541 {
 542    dst->u[0] = src->u[0];
 543    dst->u[1] = src->u[1];
 544    dst->u[2] = src->u[2];
 545    dst->u[3] = src->u[3];
 546 }
 547
 548 static void
 549 micro_rcp(union tgsi_exec_channel *dst,
 550           const union tgsi_exec_channel *src)
 551 {
 552 #if 0 /* for debugging */
 553    assert(src->f[0] != 0.0f);
 554    assert(src->f[1] != 0.0f);
 555    assert(src->f[2] != 0.0f);
 556    assert(src->f[3] != 0.0f);
 557 #endif
 558    dst->f[0] = 1.0f / src->f[0];
 559    dst->f[1] = 1.0f / src->f[1];
 560    dst->f[2] = 1.0f / src->f[2];
 561    dst->f[3] = 1.0f / src->f[3];
 562 }
 563
 564 static void
 565 micro_rnd(union tgsi_exec_channel *dst,
 566           const union tgsi_exec_channel *src)
 567 {
 568    dst->f[0] = _mesa_roundevenf(src->f[0]);
 569    dst->f[1] = _mesa_roundevenf(src->f[1]);
 570    dst->f[2] = _mesa_roundevenf(src->f[2]);
 571    dst->f[3] = _mesa_roundevenf(src->f[3]);
 572 }
 573
 574 static void
 575 micro_rsq(union tgsi_exec_channel *dst,
 576           const union tgsi_exec_channel *src)
 577 {
 578 #if 0 /* for debugging */
 579    assert(src->f[0] != 0.0f);
 580    assert(src->f[1] != 0.0f);
 581    assert(src->f[2] != 0.0f);
 582    assert(src->f[3] != 0.0f);
 583 #endif
 584    dst->f[0] = 1.0f / sqrtf(src->f[0]);
 585    dst->f[1] = 1.0f / sqrtf(src->f[1]);
 586    dst->f[2] = 1.0f / sqrtf(src->f[2]);
 587    dst->f[3] = 1.0f / sqrtf(src->f[3]);
 588 }
 589
 590 static void
 591 micro_sqrt(union tgsi_exec_channel *dst,
 592            const union tgsi_exec_channel *src)
 593 {
 594    dst->f[0] = sqrtf(src->f[0]);
 595    dst->f[1] = sqrtf(src->f[1]);
 596    dst->f[2] = sqrtf(src->f[2]);
 597    dst->f[3] = sqrtf(src->f[3]);
 598 }
 599
 600 static void
 601 micro_seq(union tgsi_exec_channel *dst,
 602           const union tgsi_exec_channel *src0,
 603           const union tgsi_exec_channel *src1)
 604 {
 605    dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
 606    dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
 607    dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
 608    dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
 609 }
 610
 611 static void
 612 micro_sge(union tgsi_exec_channel *dst,
 613           const union tgsi_exec_channel *src0,
 614           const union tgsi_exec_channel *src1)
 615 {
 616    dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
 617    dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
 618    dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
 619    dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
 620 }
 621
 622 static void
 623 micro_sgn(union tgsi_exec_channel *dst,
 624           const union tgsi_exec_channel *src)
 625 {
 626    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
 627    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
 628    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
 629    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
 630 }
 631
 632 static void
 633 micro_isgn(union tgsi_exec_channel *dst,
 634           const union tgsi_exec_channel *src)
 635 {
 636    dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
 637    dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
 638    dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
 639    dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
 640 }
 641
 642 static void
 643 micro_sgt(union tgsi_exec_channel *dst,
 644           const union tgsi_exec_channel *src0,
 645           const union tgsi_exec_channel *src1)
 646 {
 647    dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
 648    dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
 649    dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
 650    dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
 651 }
 652
 653 static void
 654 micro_sin(union tgsi_exec_channel *dst,
 655           const union tgsi_exec_channel *src)
 656 {
 657    dst->f[0] = sinf(src->f[0]);
 658    dst->f[1] = sinf(src->f[1]);
 659    dst->f[2] = sinf(src->f[2]);
 660    dst->f[3] = sinf(src->f[3]);
 661 }
 662
 663 static void
 664 micro_sle(union tgsi_exec_channel *dst,
 665           const union tgsi_exec_channel *src0,
 666           const union tgsi_exec_channel *src1)
 667 {
 668    dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
 669    dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
 670    dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
 671    dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
 672 }
 673
 674 static void
 675 micro_slt(union tgsi_exec_channel *dst,
 676           const union tgsi_exec_channel *src0,
 677           const union tgsi_exec_channel *src1)
 678 {
 679    dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
 680    dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
 681    dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
 682    dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
 683 }
 684
 685 static void
 686 micro_sne(union tgsi_exec_channel *dst,
 687           const union tgsi_exec_channel *src0,
 688           const union tgsi_exec_channel *src1)
 689 {
 690    dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
 691    dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
 692    dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
 693    dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
 694 }
 695
 696 static void
 697 micro_trunc(union tgsi_exec_channel *dst,
 698             const union tgsi_exec_channel *src)
 699 {
 700    dst->f[0] = truncf(src->f[0]);
 701    dst->f[1] = truncf(src->f[1]);
 702    dst->f[2] = truncf(src->f[2]);
 703    dst->f[3] = truncf(src->f[3]);
 704 }
 705
 706 static void
 707 micro_u2d(union tgsi_double_channel *dst,
 708           const union tgsi_exec_channel *src)
 709 {
 710    dst->d[0] = (double)src->u[0];
 711    dst->d[1] = (double)src->u[1];
 712    dst->d[2] = (double)src->u[2];
 713    dst->d[3] = (double)src->u[3];
 714 }
 715
 716 static void
 717 micro_i64abs(union tgsi_double_channel *dst,
 718              const union tgsi_double_channel *src)
 719 {
 720    dst->i64[0] = src->i64[0] >= 0.0 ? src->i64[0] : -src->i64[0];
 721    dst->i64[1] = src->i64[1] >= 0.0 ? src->i64[1] : -src->i64[1];
 722    dst->i64[2] = src->i64[2] >= 0.0 ? src->i64[2] : -src->i64[2];
 723    dst->i64[3] = src->i64[3] >= 0.0 ? src->i64[3] : -src->i64[3];
 724 }
 725
 726 static void
 727 micro_i64sgn(union tgsi_double_channel *dst,
 728              const union tgsi_double_channel *src)
 729 {
 730    dst->i64[0] = src->i64[0] < 0 ? -1 : src->i64[0] > 0 ? 1 : 0;
 731    dst->i64[1] = src->i64[1] < 0 ? -1 : src->i64[1] > 0 ? 1 : 0;
 732    dst->i64[2] = src->i64[2] < 0 ? -1 : src->i64[2] > 0 ? 1 : 0;
 733    dst->i64[3] = src->i64[3] < 0 ? -1 : src->i64[3] > 0 ? 1 : 0;
 734 }
 735
 736 static void
 737 micro_i64neg(union tgsi_double_channel *dst,
 738              const union tgsi_double_channel *src)
 739 {
 740    dst->i64[0] = -src->i64[0];
 741    dst->i64[1] = -src->i64[1];
 742    dst->i64[2] = -src->i64[2];
 743    dst->i64[3] = -src->i64[3];
 744 }
 745
 746 static void
 747 micro_u64seq(union tgsi_double_channel *dst,
 748            const union tgsi_double_channel *src)
 749 {
 750    dst->u[0][0] = src[0].u64[0] == src[1].u64[0] ? ~0U : 0U;
 751    dst->u[1][0] = src[0].u64[1] == src[1].u64[1] ? ~0U : 0U;
 752    dst->u[2][0] = src[0].u64[2] == src[1].u64[2] ? ~0U : 0U;
 753    dst->u[3][0] = src[0].u64[3] == src[1].u64[3] ? ~0U : 0U;
 754 }
 755
 756 static void
 757 micro_u64sne(union tgsi_double_channel *dst,
 758              const union tgsi_double_channel *src)
 759 {
 760    dst->u[0][0] = src[0].u64[0] != src[1].u64[0] ? ~0U : 0U;
 761    dst->u[1][0] = src[0].u64[1] != src[1].u64[1] ? ~0U : 0U;
 762    dst->u[2][0] = src[0].u64[2] != src[1].u64[2] ? ~0U : 0U;
 763    dst->u[3][0] = src[0].u64[3] != src[1].u64[3] ? ~0U : 0U;
 764 }
 765
 766 static void
 767 micro_i64slt(union tgsi_double_channel *dst,
 768              const union tgsi_double_channel *src)
 769 {
 770    dst->u[0][0] = src[0].i64[0] < src[1].i64[0] ? ~0U : 0U;
 771    dst->u[1][0] = src[0].i64[1] < src[1].i64[1] ? ~0U : 0U;
 772    dst->u[2][0] = src[0].i64[2] < src[1].i64[2] ? ~0U : 0U;
 773    dst->u[3][0] = src[0].i64[3] < src[1].i64[3] ? ~0U : 0U;
 774 }
 775
 776 static void
 777 micro_u64slt(union tgsi_double_channel *dst,
 778              const union tgsi_double_channel *src)
 779 {
 780    dst->u[0][0] = src[0].u64[0] < src[1].u64[0] ? ~0U : 0U;
 781    dst->u[1][0] = src[0].u64[1] < src[1].u64[1] ? ~0U : 0U;
 782    dst->u[2][0] = src[0].u64[2] < src[1].u64[2] ? ~0U : 0U;
 783    dst->u[3][0] = src[0].u64[3] < src[1].u64[3] ? ~0U : 0U;
 784 }
 785
 786 static void
 787 micro_i64sge(union tgsi_double_channel *dst,
 788            const union tgsi_double_channel *src)
 789 {
 790    dst->u[0][0] = src[0].i64[0] >= src[1].i64[0] ? ~0U : 0U;
 791    dst->u[1][0] = src[0].i64[1] >= src[1].i64[1] ? ~0U : 0U;
 792    dst->u[2][0] = src[0].i64[2] >= src[1].i64[2] ? ~0U : 0U;
 793    dst->u[3][0] = src[0].i64[3] >= src[1].i64[3] ? ~0U : 0U;
 794 }
 795
 796 static void
 797 micro_u64sge(union tgsi_double_channel *dst,
 798              const union tgsi_double_channel *src)
 799 {
 800    dst->u[0][0] = src[0].u64[0] >= src[1].u64[0] ? ~0U : 0U;
 801    dst->u[1][0] = src[0].u64[1] >= src[1].u64[1] ? ~0U : 0U;
 802    dst->u[2][0] = src[0].u64[2] >= src[1].u64[2] ? ~0U : 0U;
 803    dst->u[3][0] = src[0].u64[3] >= src[1].u64[3] ? ~0U : 0U;
 804 }
 805
 806 static void
 807 micro_u64max(union tgsi_double_channel *dst,
 808              const union tgsi_double_channel *src)
 809 {
 810    dst->u64[0] = src[0].u64[0] > src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
 811    dst->u64[1] = src[0].u64[1] > src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
 812    dst->u64[2] = src[0].u64[2] > src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
 813    dst->u64[3] = src[0].u64[3] > src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
 814 }
 815
 816 static void
 817 micro_i64max(union tgsi_double_channel *dst,
 818              const union tgsi_double_channel *src)
 819 {
 820    dst->i64[0] = src[0].i64[0] > src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
 821    dst->i64[1] = src[0].i64[1] > src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
 822    dst->i64[2] = src[0].i64[2] > src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
 823    dst->i64[3] = src[0].i64[3] > src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
 824 }
 825
 826 static void
 827 micro_u64min(union tgsi_double_channel *dst,
 828              const union tgsi_double_channel *src)
 829 {
 830    dst->u64[0] = src[0].u64[0] < src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
 831    dst->u64[1] = src[0].u64[1] < src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
 832    dst->u64[2] = src[0].u64[2] < src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
 833    dst->u64[3] = src[0].u64[3] < src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
 834 }
 835
 836 static void
 837 micro_i64min(union tgsi_double_channel *dst,
 838              const union tgsi_double_channel *src)
 839 {
 840    dst->i64[0] = src[0].i64[0] < src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
 841    dst->i64[1] = src[0].i64[1] < src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
 842    dst->i64[2] = src[0].i64[2] < src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
 843    dst->i64[3] = src[0].i64[3] < src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
 844 }
 845
 846 static void
 847 micro_u64add(union tgsi_double_channel *dst,
 848              const union tgsi_double_channel *src)
 849 {
 850    dst->u64[0] = src[0].u64[0] + src[1].u64[0];
 851    dst->u64[1] = src[0].u64[1] + src[1].u64[1];
 852    dst->u64[2] = src[0].u64[2] + src[1].u64[2];
 853    dst->u64[3] = src[0].u64[3] + src[1].u64[3];
 854 }
 855
 856 static void
 857 micro_u64mul(union tgsi_double_channel *dst,
 858              const union tgsi_double_channel *src)
 859 {
 860    dst->u64[0] = src[0].u64[0] * src[1].u64[0];
 861    dst->u64[1] = src[0].u64[1] * src[1].u64[1];
 862    dst->u64[2] = src[0].u64[2] * src[1].u64[2];
 863    dst->u64[3] = src[0].u64[3] * src[1].u64[3];
 864 }
 865
 866 static void
 867 micro_u64div(union tgsi_double_channel *dst,
 868              const union tgsi_double_channel *src)
 869 {
 870    dst->u64[0] = src[1].u64[0] ? src[0].u64[0] / src[1].u64[0] : ~0ull;
 871    dst->u64[1] = src[1].u64[1] ? src[0].u64[1] / src[1].u64[1] : ~0ull;
 872    dst->u64[2] = src[1].u64[2] ? src[0].u64[2] / src[1].u64[2] : ~0ull;
 873    dst->u64[3] = src[1].u64[3] ? src[0].u64[3] / src[1].u64[3] : ~0ull;
 874 }
 875
 876 static void
 877 micro_i64div(union tgsi_double_channel *dst,
 878              const union tgsi_double_channel *src)
 879 {
 880    dst->i64[0] = src[1].i64[0] ? src[0].i64[0] / src[1].i64[0] : 0;
 881    dst->i64[1] = src[1].i64[1] ? src[0].i64[1] / src[1].i64[1] : 0;
 882    dst->i64[2] = src[1].i64[2] ? src[0].i64[2] / src[1].i64[2] : 0;
 883    dst->i64[3] = src[1].i64[3] ? src[0].i64[3] / src[1].i64[3] : 0;
 884 }
 885
 886 static void
 887 micro_u64mod(union tgsi_double_channel *dst,
 888              const union tgsi_double_channel *src)
 889 {
 890    dst->u64[0] = src[1].u64[0] ? src[0].u64[0] % src[1].u64[0] : ~0ull;
 891    dst->u64[1] = src[1].u64[1] ? src[0].u64[1] % src[1].u64[1] : ~0ull;
 892    dst->u64[2] = src[1].u64[2] ? src[0].u64[2] % src[1].u64[2] : ~0ull;
 893    dst->u64[3] = src[1].u64[3] ? src[0].u64[3] % src[1].u64[3] : ~0ull;
 894 }
 895
 896 static void
 897 micro_i64mod(union tgsi_double_channel *dst,
 898              const union tgsi_double_channel *src)
 899 {
 900    dst->i64[0] = src[1].i64[0] ? src[0].i64[0] % src[1].i64[0] : ~0ll;
 901    dst->i64[1] = src[1].i64[1] ? src[0].i64[1] % src[1].i64[1] : ~0ll;
 902    dst->i64[2] = src[1].i64[2] ? src[0].i64[2] % src[1].i64[2] : ~0ll;
 903    dst->i64[3] = src[1].i64[3] ? src[0].i64[3] % src[1].i64[3] : ~0ll;
 904 }
 905
 906 static void
 907 micro_u64shl(union tgsi_double_channel *dst,
 908              const union tgsi_double_channel *src0,
 909              union tgsi_exec_channel *src1)
 910 {
 911    unsigned masked_count;
 912    masked_count = src1->u[0] & 0x3f;
 913    dst->u64[0] = src0->u64[0] << masked_count;
 914    masked_count = src1->u[1] & 0x3f;
 915    dst->u64[1] = src0->u64[1] << masked_count;
 916    masked_count = src1->u[2] & 0x3f;
 917    dst->u64[2] = src0->u64[2] << masked_count;
 918    masked_count = src1->u[3] & 0x3f;
 919    dst->u64[3] = src0->u64[3] << masked_count;
 920 }
 921
 922 static void
 923 micro_i64shr(union tgsi_double_channel *dst,
 924              const union tgsi_double_channel *src0,
 925              union tgsi_exec_channel *src1)
 926 {
 927    unsigned masked_count;
 928    masked_count = src1->u[0] & 0x3f;
 929    dst->i64[0] = src0->i64[0] >> masked_count;
 930    masked_count = src1->u[1] & 0x3f;
 931    dst->i64[1] = src0->i64[1] >> masked_count;
 932    masked_count = src1->u[2] & 0x3f;
 933    dst->i64[2] = src0->i64[2] >> masked_count;
 934    masked_count = src1->u[3] & 0x3f;
 935    dst->i64[3] = src0->i64[3] >> masked_count;
 936 }
 937
 938 static void
 939 micro_u64shr(union tgsi_double_channel *dst,
 940              const union tgsi_double_channel *src0,
 941              union tgsi_exec_channel *src1)
 942 {
 943    unsigned masked_count;
 944    masked_count = src1->u[0] & 0x3f;
 945    dst->u64[0] = src0->u64[0] >> masked_count;
 946    masked_count = src1->u[1] & 0x3f;
 947    dst->u64[1] = src0->u64[1] >> masked_count;
 948    masked_count = src1->u[2] & 0x3f;
 949    dst->u64[2] = src0->u64[2] >> masked_count;
 950    masked_count = src1->u[3] & 0x3f;
 951    dst->u64[3] = src0->u64[3] >> masked_count;
 952 }
 953
/**
 * Tag telling the fetch/store helpers how the bits of a channel are to be
 * interpreted.
 *
 * For example, fetch_source() applies the float flavour of the
 * absolute-value and negate source modifiers (micro_abs / micro_neg) only
 * for TGSI_EXEC_DATA_FLOAT and the integer flavour (micro_iabs / micro_ineg)
 * for every other value.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,   /* float data; selects the float modifiers */
   TGSI_EXEC_DATA_INT,     /* presumably signed 32-bit integer data */
   TGSI_EXEC_DATA_UINT,    /* presumably unsigned 32-bit integer data */
   TGSI_EXEC_DATA_DOUBLE,  /* presumably 64-bit floating point data */
   TGSI_EXEC_DATA_INT64,   /* presumably signed 64-bit integer data */
   TGSI_EXEC_DATA_UINT64,  /* presumably unsigned 64-bit integer data */
};
 962
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 *
 * Each name below is a file-local alias for the TGSI_EXEC_TEMP_* constant of
 * the same name; an _I/_C pair together identifies one register and one
 * channel inside it.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_PRIMITIVE_S1_I   TGSI_EXEC_TEMP_PRIMITIVE_S1_I
#define TEMP_PRIMITIVE_S1_C   TGSI_EXEC_TEMP_PRIMITIVE_S1_C
#define TEMP_PRIMITIVE_S2_I   TGSI_EXEC_TEMP_PRIMITIVE_S2_I
#define TEMP_PRIMITIVE_S2_C   TGSI_EXEC_TEMP_PRIMITIVE_S2_C
#define TEMP_PRIMITIVE_S3_I   TGSI_EXEC_TEMP_PRIMITIVE_S3_I
#define TEMP_PRIMITIVE_S3_C   TGSI_EXEC_TEMP_PRIMITIVE_S3_C
 978
/*
 * Lookup table of the (register index, channel) pairs of the four
 * TEMP_PRIMITIVE* utility registers, so that they can be selected with a
 * small integer: entry 0 is TEMP_PRIMITIVE, entries 1..3 are the _S1.._S3
 * variants.
 * NOTE(review): the table index is presumably a vertex stream number --
 * confirm against the users of this table.
 */
static const struct {
   int idx;    /* register index (one of the *_I values) */
   int chan;   /* channel within that register (the matching *_C value) */
} temp_prim_idxs[] = {
   { TEMP_PRIMITIVE_I, TEMP_PRIMITIVE_C },
   { TEMP_PRIMITIVE_S1_I, TEMP_PRIMITIVE_S1_C },
   { TEMP_PRIMITIVE_S2_I, TEMP_PRIMITIVE_S2_C },
   { TEMP_PRIMITIVE_S3_I, TEMP_PRIMITIVE_S3_C },
};
 988
 989 /** The execution mask depends on the conditional mask and the loop mask */
 990 #define UPDATE_EXEC_MASK(MACH) \
 991       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
 992
 993
 994 static const union tgsi_exec_channel ZeroVec =
 995    { { 0.0, 0.0, 0.0, 0.0 } };
 996
 997 static const union tgsi_exec_channel OneVec = {
 998    {1.0f, 1.0f, 1.0f, 1.0f}
 999 };
1000
1001 static const union tgsi_exec_channel P128Vec = {
1002    {128.0f, 128.0f, 128.0f, 128.0f}
1003 };
1004
1005 static const union tgsi_exec_channel M128Vec = {
1006    {-128.0f, -128.0f, -128.0f, -128.0f}
1007 };
1008
1009
/**
 * Assert that none of the float values in 'chan' are infinite or NaN.
 * NaN and Inf may occur normally during program execution and should
 * not lead to crashes, etc.  But when debugging, it's helpful to catch
 * them.
 *
 * The check is made of assert() calls only, so it has no effect in builds
 * where assertions are compiled out.
 *
 * \param chan  channel whose four float lanes (f[0]..f[3]) are checked
 */
static inline void
check_inf_or_nan(const union tgsi_exec_channel *chan)
{
   /* one assert per lane so a failure message identifies the lane */
   assert(!util_is_inf_or_nan((chan)->f[0]));
   assert(!util_is_inf_or_nan((chan)->f[1]));
   assert(!util_is_inf_or_nan((chan)->f[2]));
   assert(!util_is_inf_or_nan((chan)->f[3]));
}
1024
1025
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of a channel on one line.
 *
 * \param msg   label printed in front of the values
 * \param chan  channel to dump
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   /* the values are promoted to double for the variadic call anyway */
   const double lane0 = chan->f[0];
   const double lane1 = chan->f[1];
   const double lane2 = chan->f[2];
   const double lane3 = chan->f[3];

   debug_printf("%s = {%f, %f, %f, %f}\n", msg, lane0, lane1, lane2, lane3);
}
#endif
1034
1035
#ifdef DEBUG
/**
 * Debug helper: dump one temporary register of the machine, one line per
 * channel (X, Y, Z, W), each line showing the four float lanes.
 *
 * \param mach   machine whose Temps[] array is read
 * \param index  index of the temporary register to print
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char chan_names[] = "XYZW";
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int c;

   debug_printf("Temp[%u] =\n", index);

   for (c = 0; c < 4; c++) {
      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   chan_names[c],
                   vec->xyzw[c].f[0],
                   vec->xyzw[c].f[1],
                   vec->xyzw[c].f[2],
                   vec->xyzw[c].f[3]);
   }
}
#endif
1053
1054
1055 void
1056 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
1057                                unsigned num_bufs,
1058                                const void **bufs,
1059                                const unsigned *buf_sizes)
1060 {
1061    unsigned i;
1062
1063    for (i = 0; i < num_bufs; i++) {
1064       mach->Consts[i] = bufs[i];
1065       mach->ConstsSize[i] = buf_sizes[i];
1066    }
1067 }
1068
1069 /**
1070  * Initialize machine state by expanding tokens to full instructions,
1071  * allocating temporary storage, setting up constants, etc.
1072  * After this, we can call tgsi_exec_machine_run() many times.
1073  */
1074 void
1075 tgsi_exec_machine_bind_shader(
1076    struct tgsi_exec_machine *mach,
1077    const struct tgsi_token *tokens,
1078    struct tgsi_sampler *sampler,
1079    struct tgsi_image *image,
1080    struct tgsi_buffer *buffer)
1081 {
1082    uint k;
1083    struct tgsi_parse_context parse;
1084    struct tgsi_full_instruction *instructions;
1085    struct tgsi_full_declaration *declarations;
1086    uint maxInstructions = 10, numInstructions = 0;
1087    uint maxDeclarations = 10, numDeclarations = 0;
1088
1089 #if 0
1090    tgsi_dump(tokens, 0);
1091 #endif
1092
1093    util_init_math();
1094
1095
1096    mach->Tokens = tokens;
1097    mach->Sampler = sampler;
1098    mach->Image = image;
1099    mach->Buffer = buffer;
1100
1101    if (!tokens) {
1102       /* unbind and free all */
1103       FREE(mach->Declarations);
1104       mach->Declarations = NULL;
1105       mach->NumDeclarations = 0;
1106
1107       FREE(mach->Instructions);
1108       mach->Instructions = NULL;
1109       mach->NumInstructions = 0;
1110
1111       return;
1112    }
1113
1114    k = tgsi_parse_init (&parse, mach->Tokens);
1115    if (k != TGSI_PARSE_OK) {
1116       debug_printf( "Problem parsing!\n" );
1117       return;
1118    }
1119
1120    mach->ImmLimit = 0;
1121    mach->NumOutputs = 0;
1122
1123    for (k = 0; k < TGSI_SEMANTIC_COUNT; k++)
1124       mach->SysSemanticToIndex[k] = -1;
1125
1126    if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
1127        !mach->UsedGeometryShader) {
1128       struct tgsi_exec_vector *inputs;
1129       struct tgsi_exec_vector *outputs;
1130
1131       inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1132                             TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
1133                             16);
1134
1135       if (!inputs)
1136          return;
1137
1138       outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1139                              TGSI_MAX_TOTAL_VERTICES, 16);
1140
1141       if (!outputs) {
1142          align_free(inputs);
1143          return;
1144       }
1145
1146       align_free(mach->Inputs);
1147       align_free(mach->Outputs);
1148
1149       mach->Inputs = inputs;
1150       mach->Outputs = outputs;
1151       mach->UsedGeometryShader = TRUE;
1152    }
1153
1154    declarations = (struct tgsi_full_declaration *)
1155       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
1156
1157    if (!declarations) {
1158       return;
1159    }
1160
1161    instructions = (struct tgsi_full_instruction *)
1162       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
1163
1164    if (!instructions) {
1165       FREE( declarations );
1166       return;
1167    }
1168
1169    while( !tgsi_parse_end_of_tokens( &parse ) ) {
1170       uint i;
1171
1172       tgsi_parse_token( &parse );
1173       switch( parse.FullToken.Token.Type ) {
1174       case TGSI_TOKEN_TYPE_DECLARATION:
1175          /* save expanded declaration */
1176          if (numDeclarations == maxDeclarations) {
1177             declarations = REALLOC(declarations,
1178                                    maxDeclarations
1179                                    * sizeof(struct tgsi_full_declaration),
1180                                    (maxDeclarations + 10)
1181                                    * sizeof(struct tgsi_full_declaration));
1182             maxDeclarations += 10;
1183          }
1184          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
1185             unsigned reg;
1186             for (reg = parse.FullToken.FullDeclaration.Range.First;
1187                  reg <= parse.FullToken.FullDeclaration.Range.Last;
1188                  ++reg) {
1189                ++mach->NumOutputs;
1190             }
1191          }
1192          else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1193             const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
1194             mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
1195          }
1196
1197          memcpy(declarations + numDeclarations,
1198                 &parse.FullToken.FullDeclaration,
1199                 sizeof(declarations[0]));
1200          numDeclarations++;
1201          break;
1202
1203       case TGSI_TOKEN_TYPE_IMMEDIATE:
1204          {
1205             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
1206             assert( size <= 4 );
1207             if (mach->ImmLimit >= mach->ImmsReserved) {
1208                unsigned newReserved = mach->ImmsReserved ? 2 * mach->ImmsReserved : 128;
1209                float4 *imms = REALLOC(mach->Imms, mach->ImmsReserved, newReserved * sizeof(float4));
1210                if (imms) {
1211                   mach->ImmsReserved = newReserved;
1212                   mach->Imms = imms;
1213                } else {
1214                   debug_printf("Unable to (re)allocate space for immidiate constants\n");
1215                   break;
1216                }
1217             }
1218
1219             for( i = 0; i < size; i++ ) {
1220                mach->Imms[mach->ImmLimit][i] =
1221                   parse.FullToken.FullImmediate.u[i].Float;
1222             }
1223             mach->ImmLimit += 1;
1224          }
1225          break;
1226
1227       case TGSI_TOKEN_TYPE_INSTRUCTION:
1228
1229          /* save expanded instruction */
1230          if (numInstructions == maxInstructions) {
1231             instructions = REALLOC(instructions,
1232                                    maxInstructions
1233                                    * sizeof(struct tgsi_full_instruction),
1234                                    (maxInstructions + 10)
1235                                    * sizeof(struct tgsi_full_instruction));
1236             maxInstructions += 10;
1237          }
1238
1239          memcpy(instructions + numInstructions,
1240                 &parse.FullToken.FullInstruction,
1241                 sizeof(instructions[0]));
1242
1243          numInstructions++;
1244          break;
1245
1246       case TGSI_TOKEN_TYPE_PROPERTY:
1247          if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
1248             if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
1249                mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
1250             }
1251          }
1252          break;
1253
1254       default:
1255          assert( 0 );
1256       }
1257    }
1258    tgsi_parse_free (&parse);
1259
1260    FREE(mach->Declarations);
1261    mach->Declarations = declarations;
1262    mach->NumDeclarations = numDeclarations;
1263
1264    FREE(mach->Instructions);
1265    mach->Instructions = instructions;
1266    mach->NumInstructions = numInstructions;
1267 }
1268
1269
1270 struct tgsi_exec_machine *
1271 tgsi_exec_machine_create(enum pipe_shader_type shader_type)
1272 {
1273    struct tgsi_exec_machine *mach;
1274
1275    mach = align_malloc( sizeof *mach, 16 );
1276    if (!mach)
1277       goto fail;
1278
1279    memset(mach, 0, sizeof(*mach));
1280
1281    mach->ShaderType = shader_type;
1282    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
1283    mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
1284
1285    if (shader_type != PIPE_SHADER_COMPUTE) {
1286       mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
1287       mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
1288       if (!mach->Inputs || !mach->Outputs)
1289          goto fail;
1290    }
1291
1292    if (shader_type == PIPE_SHADER_FRAGMENT) {
1293       mach->InputSampleOffsetApply = align_malloc(sizeof(apply_sample_offset_func) * PIPE_MAX_SHADER_INPUTS, 16);
1294       if (!mach->InputSampleOffsetApply)
1295          goto fail;
1296    }
1297
1298 #ifdef DEBUG
1299    /* silence warnings */
1300    (void) print_chan;
1301    (void) print_temp;
1302 #endif
1303
1304    return mach;
1305
1306 fail:
1307    if (mach) {
1308       align_free(mach->InputSampleOffsetApply);
1309       align_free(mach->Inputs);
1310       align_free(mach->Outputs);
1311       align_free(mach);
1312    }
1313    return NULL;
1314 }
1315
1316
1317 void
1318 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
1319 {
1320    if (mach) {
1321       FREE(mach->Instructions);
1322       FREE(mach->Declarations);
1323       FREE(mach->Imms);
1324
1325       align_free(mach->InputSampleOffsetApply);
1326       align_free(mach->Inputs);
1327       align_free(mach->Outputs);
1328
1329       align_free(mach);
1330    }
1331 }
1332
1333 static void
1334 micro_add(union tgsi_exec_channel *dst,
1335           const union tgsi_exec_channel *src0,
1336           const union tgsi_exec_channel *src1)
1337 {
1338    dst->f[0] = src0->f[0] + src1->f[0];
1339    dst->f[1] = src0->f[1] + src1->f[1];
1340    dst->f[2] = src0->f[2] + src1->f[2];
1341    dst->f[3] = src0->f[3] + src1->f[3];
1342 }
1343
1344 static void
1345 micro_div(
1346    union tgsi_exec_channel *dst,
1347    const union tgsi_exec_channel *src0,
1348    const union tgsi_exec_channel *src1 )
1349 {
1350    if (src1->f[0] != 0) {
1351       dst->f[0] = src0->f[0] / src1->f[0];
1352    }
1353    if (src1->f[1] != 0) {
1354       dst->f[1] = src0->f[1] / src1->f[1];
1355    }
1356    if (src1->f[2] != 0) {
1357       dst->f[2] = src0->f[2] / src1->f[2];
1358    }
1359    if (src1->f[3] != 0) {
1360       dst->f[3] = src0->f[3] / src1->f[3];
1361    }
1362 }
1363
1364 static void
1365 micro_lt(
1366    union tgsi_exec_channel *dst,
1367    const union tgsi_exec_channel *src0,
1368    const union tgsi_exec_channel *src1,
1369    const union tgsi_exec_channel *src2,
1370    const union tgsi_exec_channel *src3 )
1371 {
1372    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
1373    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
1374    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
1375    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
1376 }
1377
1378 static void
1379 micro_max(union tgsi_exec_channel *dst,
1380           const union tgsi_exec_channel *src0,
1381           const union tgsi_exec_channel *src1)
1382 {
1383    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
1384    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
1385    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
1386    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
1387 }
1388
1389 static void
1390 micro_min(union tgsi_exec_channel *dst,
1391           const union tgsi_exec_channel *src0,
1392           const union tgsi_exec_channel *src1)
1393 {
1394    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
1395    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
1396    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
1397    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
1398 }
1399
1400 static void
1401 micro_mul(union tgsi_exec_channel *dst,
1402           const union tgsi_exec_channel *src0,
1403           const union tgsi_exec_channel *src1)
1404 {
1405    dst->f[0] = src0->f[0] * src1->f[0];
1406    dst->f[1] = src0->f[1] * src1->f[1];
1407    dst->f[2] = src0->f[2] * src1->f[2];
1408    dst->f[3] = src0->f[3] * src1->f[3];
1409 }
1410
1411 static void
1412 micro_neg(
1413    union tgsi_exec_channel *dst,
1414    const union tgsi_exec_channel *src )
1415 {
1416    dst->f[0] = -src->f[0];
1417    dst->f[1] = -src->f[1];
1418    dst->f[2] = -src->f[2];
1419    dst->f[3] = -src->f[3];
1420 }
1421
1422 static void
1423 micro_pow(
1424    union tgsi_exec_channel *dst,
1425    const union tgsi_exec_channel *src0,
1426    const union tgsi_exec_channel *src1 )
1427 {
1428 #if FAST_MATH
1429    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1430    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1431    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1432    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1433 #else
1434    dst->f[0] = powf( src0->f[0], src1->f[0] );
1435    dst->f[1] = powf( src0->f[1], src1->f[1] );
1436    dst->f[2] = powf( src0->f[2], src1->f[2] );
1437    dst->f[3] = powf( src0->f[3], src1->f[3] );
1438 #endif
1439 }
1440
1441 static void
1442 micro_ldexp(union tgsi_exec_channel *dst,
1443             const union tgsi_exec_channel *src0,
1444             const union tgsi_exec_channel *src1)
1445 {
1446    dst->f[0] = ldexpf(src0->f[0], src1->i[0]);
1447    dst->f[1] = ldexpf(src0->f[1], src1->i[1]);
1448    dst->f[2] = ldexpf(src0->f[2], src1->i[2]);
1449    dst->f[3] = ldexpf(src0->f[3], src1->i[3]);
1450 }
1451
1452 static void
1453 micro_sub(union tgsi_exec_channel *dst,
1454           const union tgsi_exec_channel *src0,
1455           const union tgsi_exec_channel *src1)
1456 {
1457    dst->f[0] = src0->f[0] - src1->f[0];
1458    dst->f[1] = src0->f[1] - src1->f[1];
1459    dst->f[2] = src0->f[2] - src1->f[2];
1460    dst->f[3] = src0->f[3] - src1->f[3];
1461 }
1462
/**
 * Fetch one channel (four lanes) from a register file.
 *
 * Each lane i is looked up independently using index->i[i] (and, for the
 * files that use it, index2D->i[i]) and receives lane i of the selected
 * register's 'swizzle' channel.
 *
 * \param mach     machine whose register files are read
 * \param file     TGSI_FILE_x register file to read from
 * \param swizzle  channel within the register, must be < 4
 * \param index    per-lane register index
 * \param index2D  per-lane second-dimension index: the constant buffer
 *                 number for TGSI_FILE_CONSTANT, the multiplier of
 *                 TGSI_EXEC_MAX_INPUT_ATTRIBS for TGSI_FILE_INPUT, and
 *                 asserted to be zero for the temporary, immediate,
 *                 address and output files
 * \param chan     receives the fetched values
 *
 * Most range checks are assert()s only.  The exception is the constant
 * file, where a negative or out-of-range position yields 0.  An unknown
 * file asserts and yields 0 in all lanes.
 */
static void
fetch_src_file_channel(const struct tgsi_exec_machine *mach,
                       const uint file,
                       const uint swizzle,
                       const union tgsi_exec_channel *index,
                       const union tgsi_exec_channel *index2D,
                       union tgsi_exec_channel *chan)
{
   uint i;

   assert(swizzle < 4);

   switch (file) {
   case TGSI_FILE_CONSTANT:
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
         assert(mach->Consts[index2D->i[i]]);

         if (index->i[i] < 0) {
            /* negative register index: read as zero */
            chan->u[i] = 0;
         } else {
            /* NOTE: copying the const value as a uint instead of float */
            const uint constbuf = index2D->i[i];
            const uint *buf = (const uint *)mach->Consts[constbuf];
            /* position in units of uints: four per register */
            const int pos = index->i[i] * 4 + swizzle;
            /* const buffer bounds check */
            /* NOTE(review): this compares a uint position with ConstsSize
             * directly; confirm that ConstsSize is expressed in the same
             * unit and not in bytes.
             */
            if (pos < 0 || pos >= (int) mach->ConstsSize[constbuf]) {
               if (0) {
                  /* Debug: print warning */
                  static int count = 0;
                  if (count++ < 100)
                     debug_printf("TGSI Exec: const buffer index %d"
                                  " out of bounds\n", pos);
               }
               chan->u[i] = 0;
            }
            else
               chan->u[i] = buf[pos];
         }
      }
      break;

   case TGSI_FILE_INPUT:
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         /*
         if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
                         index2D->i[i], index->i[i]);
                         }*/
         /* inputs form a flattened 2D array: index2D selects the row */
         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
         assert(pos >= 0);
         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_SYSTEM_VALUE:
      /* note: the index is used as-is, without any range check */
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_TEMPORARY:
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
         assert(index2D->i[i] == 0);

         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_IMMEDIATE:
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
         assert(index2D->i[i] == 0);

         /* immediates are stored as floats, so copy through the f[] view */
         chan->f[i] = mach->Imms[index->i[i]][swizzle];
      }
      break;

   case TGSI_FILE_ADDRESS:
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         assert(index->i[i] >= 0);
         assert(index2D->i[i] == 0);

         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_OUTPUT:
      /* vertex/fragment output vars can be read too */
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         assert(index->i[i] >= 0);
         assert(index2D->i[i] == 0);

         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
      }
      break;

   default:
      /* unknown register file: complain in debug builds, read as zero */
      assert(0);
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         chan->u[i] = 0;
      }
   }
}
1570
1571 static void
1572 get_index_registers(const struct tgsi_exec_machine *mach,
1573                     const struct tgsi_full_src_register *reg,
1574                     union tgsi_exec_channel *index,
1575                     union tgsi_exec_channel *index2D)
1576 {
1577    uint swizzle;
1578
1579    /* We start with a direct index into a register file.
1580     *
1581     *    file[1],
1582     *    where:
1583     *       file = Register.File
1584     *       [1] = Register.Index
1585     */
1586    index->i[0] =
1587    index->i[1] =
1588    index->i[2] =
1589    index->i[3] = reg->Register.Index;
1590
1591    /* There is an extra source register that indirectly subscripts
1592     * a register file. The direct index now becomes an offset
1593     * that is being added to the indirect register.
1594     *
1595     *    file[ind[2].x+1],
1596     *    where:
1597     *       ind = Indirect.File
1598     *       [2] = Indirect.Index
1599     *       .x = Indirect.SwizzleX
1600     */
1601    if (reg->Register.Indirect) {
1602       union tgsi_exec_channel index2;
1603       union tgsi_exec_channel indir_index;
1604       const uint execmask = mach->ExecMask;
1605       uint i;
1606
1607       /* which address register (always zero now) */
1608       index2.i[0] =
1609       index2.i[1] =
1610       index2.i[2] =
1611       index2.i[3] = reg->Indirect.Index;
1612       /* get current value of address register[swizzle] */
1613       swizzle = reg->Indirect.Swizzle;
1614       fetch_src_file_channel(mach,
1615                              reg->Indirect.File,
1616                              swizzle,
1617                              &index2,
1618                              &ZeroVec,
1619                              &indir_index);
1620
1621       /* add value of address register to the offset */
1622       index->i[0] += indir_index.i[0];
1623       index->i[1] += indir_index.i[1];
1624       index->i[2] += indir_index.i[2];
1625       index->i[3] += indir_index.i[3];
1626
1627       /* for disabled execution channels, zero-out the index to
1628        * avoid using a potential garbage value.
1629        */
1630       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1631          if ((execmask & (1 << i)) == 0)
1632             index->i[i] = 0;
1633       }
1634    }
1635
1636    /* There is an extra source register that is a second
1637     * subscript to a register file. Effectively it means that
1638     * the register file is actually a 2D array of registers.
1639     *
1640     *    file[3][1],
1641     *    where:
1642     *       [3] = Dimension.Index
1643     */
1644    if (reg->Register.Dimension) {
1645       index2D->i[0] =
1646       index2D->i[1] =
1647       index2D->i[2] =
1648       index2D->i[3] = reg->Dimension.Index;
1649
1650       /* Again, the second subscript index can be addressed indirectly
1651        * identically to the first one.
1652        * Nothing stops us from indirectly addressing the indirect register,
1653        * but there is no need for that, so we won't exercise it.
1654        *
1655        *    file[ind[4].y+3][1],
1656        *    where:
1657        *       ind = DimIndirect.File
1658        *       [4] = DimIndirect.Index
1659        *       .y = DimIndirect.SwizzleX
1660        */
1661       if (reg->Dimension.Indirect) {
1662          union tgsi_exec_channel index2;
1663          union tgsi_exec_channel indir_index;
1664          const uint execmask = mach->ExecMask;
1665          uint i;
1666
1667          index2.i[0] =
1668          index2.i[1] =
1669          index2.i[2] =
1670          index2.i[3] = reg->DimIndirect.Index;
1671
1672          swizzle = reg->DimIndirect.Swizzle;
1673          fetch_src_file_channel(mach,
1674                                 reg->DimIndirect.File,
1675                                 swizzle,
1676                                 &index2,
1677                                 &ZeroVec,
1678                                 &indir_index);
1679
1680          index2D->i[0] += indir_index.i[0];
1681          index2D->i[1] += indir_index.i[1];
1682          index2D->i[2] += indir_index.i[2];
1683          index2D->i[3] += indir_index.i[3];
1684
1685          /* for disabled execution channels, zero-out the index to
1686           * avoid using a potential garbage value.
1687           */
1688          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1689             if ((execmask & (1 << i)) == 0) {
1690                index2D->i[i] = 0;
1691             }
1692          }
1693       }
1694
1695       /* If by any chance there was a need for a 3D array of register
1696        * files, we would have to check whether Dimension is followed
1697        * by a dimension register and continue the saga.
1698        */
1699    } else {
1700       index2D->i[0] =
1701       index2D->i[1] =
1702       index2D->i[2] =
1703       index2D->i[3] = 0;
1704    }
1705 }
1706
1707
1708 static void
1709 fetch_source_d(const struct tgsi_exec_machine *mach,
1710                union tgsi_exec_channel *chan,
1711                const struct tgsi_full_src_register *reg,
1712                const uint chan_index)
1713 {
1714    union tgsi_exec_channel index;
1715    union tgsi_exec_channel index2D;
1716    uint swizzle;
1717
1718    get_index_registers(mach, reg, &index, &index2D);
1719
1720
1721    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1722    fetch_src_file_channel(mach,
1723                           reg->Register.File,
1724                           swizzle,
1725                           &index,
1726                           &index2D,
1727                           chan);
1728 }
1729
1730 static void
1731 fetch_source(const struct tgsi_exec_machine *mach,
1732              union tgsi_exec_channel *chan,
1733              const struct tgsi_full_src_register *reg,
1734              const uint chan_index,
1735              enum tgsi_exec_datatype src_datatype)
1736 {
1737    fetch_source_d(mach, chan, reg, chan_index);
1738
1739    if (reg->Register.Absolute) {
1740       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1741          micro_abs(chan, chan);
1742       } else {
1743          micro_iabs(chan, chan);
1744       }
1745    }
1746
1747    if (reg->Register.Negate) {
1748       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1749          micro_neg(chan, chan);
1750       } else {
1751          micro_ineg(chan, chan);
1752       }
1753    }
1754 }
1755
1756 static union tgsi_exec_channel *
1757 store_dest_dstret(struct tgsi_exec_machine *mach,
1758                  const union tgsi_exec_channel *chan,
1759                  const struct tgsi_full_dst_register *reg,
1760                  uint chan_index,
1761                  enum tgsi_exec_datatype dst_datatype)
1762 {
1763    static union tgsi_exec_channel null;
1764    union tgsi_exec_channel *dst;
1765    union tgsi_exec_channel index2D;
1766    int offset = 0;  /* indirection offset */
1767    int index;
1768
1769    /* for debugging */
1770    if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1771       check_inf_or_nan(chan);
1772    }
1773
1774    /* There is an extra source register that indirectly subscripts
1775     * a register file. The direct index now becomes an offset
1776     * that is being added to the indirect register.
1777     *
1778     *    file[ind[2].x+1],
1779     *    where:
1780     *       ind = Indirect.File
1781     *       [2] = Indirect.Index
1782     *       .x = Indirect.SwizzleX
1783     */
1784    if (reg->Register.Indirect) {
1785       union tgsi_exec_channel index;
1786       union tgsi_exec_channel indir_index;
1787       uint swizzle;
1788
1789       /* which address register (always zero for now) */
1790       index.i[0] =
1791       index.i[1] =
1792       index.i[2] =
1793       index.i[3] = reg->Indirect.Index;
1794
1795       /* get current value of address register[swizzle] */
1796       swizzle = reg->Indirect.Swizzle;
1797
1798       /* fetch values from the address/indirection register */
1799       fetch_src_file_channel(mach,
1800                              reg->Indirect.File,
1801                              swizzle,
1802                              &index,
1803                              &ZeroVec,
1804                              &indir_index);
1805
1806       /* save indirection offset */
1807       offset = indir_index.i[0];
1808    }
1809
1810    /* There is an extra source register that is a second
1811     * subscript to a register file. Effectively it means that
1812     * the register file is actually a 2D array of registers.
1813     *
1814     *    file[3][1],
1815     *    where:
1816     *       [3] = Dimension.Index
1817     */
1818    if (reg->Register.Dimension) {
1819       index2D.i[0] =
1820       index2D.i[1] =
1821       index2D.i[2] =
1822       index2D.i[3] = reg->Dimension.Index;
1823
1824       /* Again, the second subscript index can be addressed indirectly
1825        * identically to the first one.
1826        * Nothing stops us from indirectly addressing the indirect register,
1827        * but there is no need for that, so we won't exercise it.
1828        *
1829        *    file[ind[4].y+3][1],
1830        *    where:
1831        *       ind = DimIndirect.File
1832        *       [4] = DimIndirect.Index
1833        *       .y = DimIndirect.SwizzleX
1834        */
1835       if (reg->Dimension.Indirect) {
1836          union tgsi_exec_channel index2;
1837          union tgsi_exec_channel indir_index;
1838          const uint execmask = mach->ExecMask;
1839          unsigned swizzle;
1840          uint i;
1841
1842          index2.i[0] =
1843          index2.i[1] =
1844          index2.i[2] =
1845          index2.i[3] = reg->DimIndirect.Index;
1846
1847          swizzle = reg->DimIndirect.Swizzle;
1848          fetch_src_file_channel(mach,
1849                                 reg->DimIndirect.File,
1850                                 swizzle,
1851                                 &index2,
1852                                 &ZeroVec,
1853                                 &indir_index);
1854
1855          index2D.i[0] += indir_index.i[0];
1856          index2D.i[1] += indir_index.i[1];
1857          index2D.i[2] += indir_index.i[2];
1858          index2D.i[3] += indir_index.i[3];
1859
1860          /* for disabled execution channels, zero-out the index to
1861           * avoid using a potential garbage value.
1862           */
1863          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1864             if ((execmask & (1 << i)) == 0) {
1865                index2D.i[i] = 0;
1866             }
1867          }
1868       }
1869
1870       /* If by any chance there was a need for a 3D array of register
1871        * files, we would have to check whether Dimension is followed
1872        * by a dimension register and continue the saga.
1873        */
1874    } else {
1875       index2D.i[0] =
1876       index2D.i[1] =
1877       index2D.i[2] =
1878       index2D.i[3] = 0;
1879    }
1880
1881    switch (reg->Register.File) {
1882    case TGSI_FILE_NULL:
1883       dst = &null;
1884       break;
1885
1886    case TGSI_FILE_OUTPUT:
1887       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1888          + reg->Register.Index;
1889       dst = &mach->Outputs[offset + index].xyzw[chan_index];
1890 #if 0
1891       debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
1892                    mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
1893                    reg->Register.Index);
1894       if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1895          debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1896          for (i = 0; i < TGSI_QUAD_SIZE; i++)
1897             if (execmask & (1 << i))
1898                debug_printf("%f, ", chan->f[i]);
1899          debug_printf(")\n");
1900       }
1901 #endif
1902       break;
1903
1904    case TGSI_FILE_TEMPORARY:
1905       index = reg->Register.Index;
1906       assert( index < TGSI_EXEC_NUM_TEMPS );
1907       dst = &mach->Temps[offset + index].xyzw[chan_index];
1908       break;
1909
1910    case TGSI_FILE_ADDRESS:
1911       index = reg->Register.Index;
1912       dst = &mach->Addrs[index].xyzw[chan_index];
1913       break;
1914
1915    default:
1916       assert( 0 );
1917       return NULL;
1918    }
1919
1920    return dst;
1921 }
1922
1923 static void
1924 store_dest_double(struct tgsi_exec_machine *mach,
1925                  const union tgsi_exec_channel *chan,
1926                  const struct tgsi_full_dst_register *reg,
1927                  uint chan_index,
1928                  enum tgsi_exec_datatype dst_datatype)
1929 {
1930    union tgsi_exec_channel *dst;
1931    const uint execmask = mach->ExecMask;
1932    int i;
1933
1934    dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
1935    if (!dst)
1936       return;
1937
1938    /* doubles path */
1939    for (i = 0; i < TGSI_QUAD_SIZE; i++)
1940       if (execmask & (1 << i))
1941          dst->i[i] = chan->i[i];
1942 }
1943
1944 static void
1945 store_dest(struct tgsi_exec_machine *mach,
1946            const union tgsi_exec_channel *chan,
1947            const struct tgsi_full_dst_register *reg,
1948            const struct tgsi_full_instruction *inst,
1949            uint chan_index,
1950            enum tgsi_exec_datatype dst_datatype)
1951 {
1952    union tgsi_exec_channel *dst;
1953    const uint execmask = mach->ExecMask;
1954    int i;
1955
1956    dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
1957    if (!dst)
1958       return;
1959
1960    if (!inst->Instruction.Saturate) {
1961       for (i = 0; i < TGSI_QUAD_SIZE; i++)
1962          if (execmask & (1 << i))
1963             dst->i[i] = chan->i[i];
1964    }
1965    else {
1966       for (i = 0; i < TGSI_QUAD_SIZE; i++)
1967          if (execmask & (1 << i)) {
1968             if (chan->f[i] < 0.0f)
1969                dst->f[i] = 0.0f;
1970             else if (chan->f[i] > 1.0f)
1971                dst->f[i] = 1.0f;
1972             else
1973                dst->i[i] = chan->i[i];
1974          }
1975    }
1976 }
1977
/*
 * Convenience wrappers around fetch_source() for use inside instruction
 * handlers.  Both expect variables named `mach` and `inst` to be in scope
 * at the point of use.
 *
 * FETCH reads channel CHAN of source operand INDEX into VAL using the
 * float data type.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/* IFETCH is the same as FETCH but requests the integer data type. */
#define IFETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
1983
1984
1985 /**
1986  * Execute ARB-style KIL which is predicated by a src register.
1987  * Kill fragment if any of the four values is less than zero.
1988  */
1989 static void
1990 exec_kill_if(struct tgsi_exec_machine *mach,
1991              const struct tgsi_full_instruction *inst)
1992 {
1993    uint uniquemask;
1994    uint chan_index;
1995    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1996    union tgsi_exec_channel r[1];
1997
1998    /* This mask stores component bits that were already tested. */
1999    uniquemask = 0;
2000
2001    for (chan_index = 0; chan_index < 4; chan_index++)
2002    {
2003       uint swizzle;
2004       uint i;
2005
2006       /* unswizzle channel */
2007       swizzle = tgsi_util_get_full_src_register_swizzle (
2008                         &inst->Src[0],
2009                         chan_index);
2010
2011       /* check if the component has not been already tested */
2012       if (uniquemask & (1 << swizzle))
2013          continue;
2014       uniquemask |= 1 << swizzle;
2015
2016       FETCH(&r[0], 0, chan_index);
2017       for (i = 0; i < 4; i++)
2018          if (r[0].f[i] < 0.0f)
2019             kilmask |= 1 << i;
2020    }
2021
2022    /* restrict to fragments currently executing */
2023    kilmask &= mach->ExecMask;
2024
2025    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2026 }
2027
2028 /**
2029  * Unconditional fragment kill/discard.
2030  */
2031 static void
2032 exec_kill(struct tgsi_exec_machine *mach)
2033 {
2034    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
2035
2036    /* kill fragment for all fragments currently executing */
2037    kilmask = mach->ExecMask;
2038    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2039 }
2040
2041 static void
2042 emit_vertex(struct tgsi_exec_machine *mach,
2043             const struct tgsi_full_instruction *inst)
2044 {
2045    union tgsi_exec_channel r[1];
2046    unsigned stream_id;
2047    unsigned *prim_count;
2048    /* FIXME: check for exec mask correctly
2049    unsigned i;
2050    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2051          if ((mach->ExecMask & (1 << i)))
2052    */
2053    IFETCH(&r[0], 0, TGSI_CHAN_X);
2054    stream_id = r[0].u[0];
2055    prim_count = &mach->Temps[temp_prim_idxs[stream_id].idx].xyzw[temp_prim_idxs[stream_id].chan].u[0];
2056    if (mach->ExecMask) {
2057       if (mach->Primitives[stream_id][*prim_count] >= mach->MaxOutputVertices)
2058          return;
2059
2060       if (mach->Primitives[stream_id][*prim_count] == 0)
2061          mach->PrimitiveOffsets[stream_id][*prim_count] = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0];
2062       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
2063       mach->Primitives[stream_id][*prim_count]++;
2064    }
2065 }
2066
2067 static void
2068 emit_primitive(struct tgsi_exec_machine *mach,
2069                const struct tgsi_full_instruction *inst)
2070 {
2071    unsigned *prim_count;
2072    union tgsi_exec_channel r[1];
2073    unsigned stream_id = 0;
2074    /* FIXME: check for exec mask correctly
2075    unsigned i;
2076    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2077          if ((mach->ExecMask & (1 << i)))
2078    */
2079    if (inst) {
2080       IFETCH(&r[0], 0, TGSI_CHAN_X);
2081       stream_id = r[0].u[0];
2082    }
2083    prim_count = &mach->Temps[temp_prim_idxs[stream_id].idx].xyzw[temp_prim_idxs[stream_id].chan].u[0];
2084    if (mach->ExecMask) {
2085       ++(*prim_count);
2086       debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
2087       mach->Primitives[stream_id][*prim_count] = 0;
2088    }
2089 }
2090
2091 static void
2092 conditional_emit_primitive(struct tgsi_exec_machine *mach)
2093 {
2094    if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
2095       int emitted_verts =
2096          mach->Primitives[0][mach->Temps[temp_prim_idxs[0].idx].xyzw[temp_prim_idxs[0].chan].u[0]];
2097       if (emitted_verts) {
2098          emit_primitive(mach, NULL);
2099       }
2100    }
2101 }
2102
2103
2104 /*
2105  * Fetch four texture samples using STR texture coordinates.
2106  */
2107 static void
2108 fetch_texel( struct tgsi_sampler *sampler,
2109              const unsigned sview_idx,
2110              const unsigned sampler_idx,
2111              const union tgsi_exec_channel *s,
2112              const union tgsi_exec_channel *t,
2113              const union tgsi_exec_channel *p,
2114              const union tgsi_exec_channel *c0,
2115              const union tgsi_exec_channel *c1,
2116              float derivs[3][2][TGSI_QUAD_SIZE],
2117              const int8_t offset[3],
2118              enum tgsi_sampler_control control,
2119              union tgsi_exec_channel *r,
2120              union tgsi_exec_channel *g,
2121              union tgsi_exec_channel *b,
2122              union tgsi_exec_channel *a )
2123 {
2124    uint j;
2125    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2126
2127    /* FIXME: handle explicit derivs, offsets */
2128    sampler->get_samples(sampler, sview_idx, sampler_idx,
2129                         s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
2130
2131    for (j = 0; j < 4; j++) {
2132       r->f[j] = rgba[0][j];
2133       g->f[j] = rgba[1][j];
2134       b->f[j] = rgba[2][j];
2135       a->f[j] = rgba[3][j];
2136    }
2137 }
2138
2139
/*
 * Variants of the texture instructions, passed as the `modifier` argument
 * of exec_tex() and exec_sample().
 */
#define TEX_MODIFIER_NONE           0  /* no modifier operand is fetched */
#define TEX_MODIFIER_PROJECTED      1  /* coordinates are divided by the modifier */
#define TEX_MODIFIER_LOD_BIAS       2  /* selects TGSI_SAMPLER_LOD_BIAS */
#define TEX_MODIFIER_EXPLICIT_LOD   3  /* selects TGSI_SAMPLER_LOD_EXPLICIT */
#define TEX_MODIFIER_LEVEL_ZERO     4  /* selects TGSI_SAMPLER_LOD_ZERO (exec_sample only) */
#define TEX_MODIFIER_GATHER         5  /* selects TGSI_SAMPLER_GATHER */
2146
2147 /*
2148  * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
2149  */
2150 static void
2151 fetch_texel_offsets(struct tgsi_exec_machine *mach,
2152                     const struct tgsi_full_instruction *inst,
2153                     int8_t offsets[3])
2154 {
2155    if (inst->Texture.NumOffsets == 1) {
2156       union tgsi_exec_channel index;
2157       union tgsi_exec_channel offset[3];
2158       index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
2159       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2160                              inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2161       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2162                              inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2163       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2164                              inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2165      offsets[0] = offset[0].i[0];
2166      offsets[1] = offset[1].i[0];
2167      offsets[2] = offset[2].i[0];
2168    } else {
2169      assert(inst->Texture.NumOffsets == 0);
2170      offsets[0] = offsets[1] = offsets[2] = 0;
2171    }
2172 }
2173
2174
2175 /*
2176  * Fetch dx and dy values for one channel (s, t or r).
2177  * Put dx values into one float array, dy values into another.
2178  */
2179 static void
2180 fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
2181                            const struct tgsi_full_instruction *inst,
2182                            unsigned regdsrcx,
2183                            unsigned chan,
2184                            float derivs[2][TGSI_QUAD_SIZE])
2185 {
2186    union tgsi_exec_channel d;
2187    FETCH(&d, regdsrcx, chan);
2188    derivs[0][0] = d.f[0];
2189    derivs[0][1] = d.f[1];
2190    derivs[0][2] = d.f[2];
2191    derivs[0][3] = d.f[3];
2192    FETCH(&d, regdsrcx + 1, chan);
2193    derivs[1][0] = d.f[0];
2194    derivs[1][1] = d.f[1];
2195    derivs[1][2] = d.f[2];
2196    derivs[1][3] = d.f[3];
2197 }
2198
2199 static uint
2200 fetch_sampler_unit(struct tgsi_exec_machine *mach,
2201                    const struct tgsi_full_instruction *inst,
2202                    uint sampler)
2203 {
2204    uint unit = 0;
2205    int i;
2206    if (inst->Src[sampler].Register.Indirect) {
2207       const struct tgsi_full_src_register *reg = &inst->Src[sampler];
2208       union tgsi_exec_channel indir_index, index2;
2209       const uint execmask = mach->ExecMask;
2210       index2.i[0] =
2211       index2.i[1] =
2212       index2.i[2] =
2213       index2.i[3] = reg->Indirect.Index;
2214
2215       fetch_src_file_channel(mach,
2216                              reg->Indirect.File,
2217                              reg->Indirect.Swizzle,
2218                              &index2,
2219                              &ZeroVec,
2220                              &indir_index);
2221       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2222          if (execmask & (1 << i)) {
2223             unit = inst->Src[sampler].Register.Index + indir_index.i[i];
2224             break;
2225          }
2226       }
2227
2228    } else {
2229       unit = inst->Src[sampler].Register.Index;
2230    }
2231    return unit;
2232 }
2233
2234 /*
2235  * execute a texture instruction.
2236  *
2237  * modifier is used to control the channel routing for the
2238  * instruction variants like proj, lod, and texture with lod bias.
2239  * sampler indicates which src register the sampler is contained in.
2240  */
2241 static void
2242 exec_tex(struct tgsi_exec_machine *mach,
2243          const struct tgsi_full_instruction *inst,
2244          uint modifier, uint sampler)
2245 {
2246    const union tgsi_exec_channel *args[5], *proj = NULL;
2247    union tgsi_exec_channel r[5];
2248    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2249    uint chan;
2250    uint unit;
2251    int8_t offsets[3];
2252    int dim, shadow_ref, i;
2253
2254    unit = fetch_sampler_unit(mach, inst, sampler);
2255    /* always fetch all 3 offsets, overkill but keeps code simple */
2256    fetch_texel_offsets(mach, inst, offsets);
2257
2258    assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
2259    assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
2260
2261    dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2262    shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
2263
2264    assert(dim <= 4);
2265    if (shadow_ref >= 0)
2266       assert(shadow_ref >= dim && shadow_ref < (int)ARRAY_SIZE(args));
2267
2268    /* fetch modifier to the last argument */
2269    if (modifier != TEX_MODIFIER_NONE) {
2270       const int last = ARRAY_SIZE(args) - 1;
2271
2272       /* fetch modifier from src0.w or src1.x */
2273       if (sampler == 1) {
2274          assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
2275          FETCH(&r[last], 0, TGSI_CHAN_W);
2276       }
2277       else {
2278          FETCH(&r[last], 1, TGSI_CHAN_X);
2279       }
2280
2281       if (modifier != TEX_MODIFIER_PROJECTED) {
2282          args[last] = &r[last];
2283       }
2284       else {
2285          proj = &r[last];
2286          args[last] = &ZeroVec;
2287       }
2288
2289       /* point unused arguments to zero vector */
2290       for (i = dim; i < last; i++)
2291          args[i] = &ZeroVec;
2292
2293       if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
2294          control = TGSI_SAMPLER_LOD_EXPLICIT;
2295       else if (modifier == TEX_MODIFIER_LOD_BIAS)
2296          control = TGSI_SAMPLER_LOD_BIAS;
2297       else if (modifier == TEX_MODIFIER_GATHER)
2298          control = TGSI_SAMPLER_GATHER;
2299    }
2300    else {
2301       for (i = dim; i < (int)ARRAY_SIZE(args); i++)
2302          args[i] = &ZeroVec;
2303    }
2304
2305    /* fetch coordinates */
2306    for (i = 0; i < dim; i++) {
2307       FETCH(&r[i], 0, TGSI_CHAN_X + i);
2308
2309       if (proj)
2310          micro_div(&r[i], &r[i], proj);
2311
2312       args[i] = &r[i];
2313    }
2314
2315    /* fetch reference value */
2316    if (shadow_ref >= 0) {
2317       FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));
2318
2319       if (proj)
2320          micro_div(&r[shadow_ref], &r[shadow_ref], proj);
2321
2322       args[shadow_ref] = &r[shadow_ref];
2323    }
2324
2325    fetch_texel(mach->Sampler, unit, unit,
2326          args[0], args[1], args[2], args[3], args[4],
2327          NULL, offsets, control,
2328          &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2329
2330 #if 0
2331    debug_printf("fetch r: %g %g %g %g\n",
2332          r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
2333    debug_printf("fetch g: %g %g %g %g\n",
2334          r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
2335    debug_printf("fetch b: %g %g %g %g\n",
2336          r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
2337    debug_printf("fetch a: %g %g %g %g\n",
2338          r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
2339 #endif
2340
2341    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2342       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2343          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2344       }
2345    }
2346 }
2347
2348 static void
2349 exec_lodq(struct tgsi_exec_machine *mach,
2350           const struct tgsi_full_instruction *inst)
2351 {
2352    uint resource_unit, sampler_unit;
2353    unsigned dim;
2354    unsigned i;
2355    union tgsi_exec_channel coords[4];
2356    const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
2357    union tgsi_exec_channel r[2];
2358
2359    resource_unit = fetch_sampler_unit(mach, inst, 1);
2360    if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2361       uint target = mach->SamplerViews[resource_unit].Resource;
2362       dim = tgsi_util_get_texture_coord_dim(target);
2363       sampler_unit = fetch_sampler_unit(mach, inst, 2);
2364    } else {
2365       dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2366       sampler_unit = resource_unit;
2367    }
2368    assert(dim <= ARRAY_SIZE(coords));
2369    /* fetch coordinates */
2370    for (i = 0; i < dim; i++) {
2371       FETCH(&coords[i], 0, TGSI_CHAN_X + i);
2372       args[i] = &coords[i];
2373    }
2374    for (i = dim; i < ARRAY_SIZE(coords); i++) {
2375       args[i] = &ZeroVec;
2376    }
2377    mach->Sampler->query_lod(mach->Sampler, resource_unit, sampler_unit,
2378                             args[0]->f,
2379                             args[1]->f,
2380                             args[2]->f,
2381                             args[3]->f,
2382                             TGSI_SAMPLER_LOD_NONE,
2383                             r[0].f,
2384                             r[1].f);
2385
2386    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2387       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2388                  TGSI_EXEC_DATA_FLOAT);
2389    }
2390    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2391       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2392                  TGSI_EXEC_DATA_FLOAT);
2393    }
2394    if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2395       unsigned char swizzles[4];
2396       unsigned chan;
2397       swizzles[0] = inst->Src[1].Register.SwizzleX;
2398       swizzles[1] = inst->Src[1].Register.SwizzleY;
2399       swizzles[2] = inst->Src[1].Register.SwizzleZ;
2400       swizzles[3] = inst->Src[1].Register.SwizzleW;
2401
2402       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2403          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2404             if (swizzles[chan] >= 2) {
2405                store_dest(mach, &ZeroVec,
2406                           &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2407             } else {
2408                store_dest(mach, &r[swizzles[chan]],
2409                           &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2410             }
2411          }
2412       }
2413    } else {
2414       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2415          store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2416                     TGSI_EXEC_DATA_FLOAT);
2417       }
2418       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2419          store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2420                     TGSI_EXEC_DATA_FLOAT);
2421       }
2422    }
2423 }
2424
2425 static void
2426 exec_txd(struct tgsi_exec_machine *mach,
2427          const struct tgsi_full_instruction *inst)
2428 {
2429    union tgsi_exec_channel r[4];
2430    float derivs[3][2][TGSI_QUAD_SIZE];
2431    uint chan;
2432    uint unit;
2433    int8_t offsets[3];
2434
2435    unit = fetch_sampler_unit(mach, inst, 3);
2436    /* always fetch all 3 offsets, overkill but keeps code simple */
2437    fetch_texel_offsets(mach, inst, offsets);
2438
2439    switch (inst->Texture.Texture) {
2440    case TGSI_TEXTURE_1D:
2441       FETCH(&r[0], 0, TGSI_CHAN_X);
2442
2443       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2444
2445       fetch_texel(mach->Sampler, unit, unit,
2446                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2447                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2448                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2449       break;
2450
2451    case TGSI_TEXTURE_SHADOW1D:
2452    case TGSI_TEXTURE_1D_ARRAY:
2453    case TGSI_TEXTURE_SHADOW1D_ARRAY:
2454       /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
2455       FETCH(&r[0], 0, TGSI_CHAN_X);
2456       FETCH(&r[1], 0, TGSI_CHAN_Y);
2457       FETCH(&r[2], 0, TGSI_CHAN_Z);
2458
2459       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2460
2461       fetch_texel(mach->Sampler, unit, unit,
2462                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2463                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2464                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2465       break;
2466
2467    case TGSI_TEXTURE_2D:
2468    case TGSI_TEXTURE_RECT:
2469       FETCH(&r[0], 0, TGSI_CHAN_X);
2470       FETCH(&r[1], 0, TGSI_CHAN_Y);
2471
2472       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2473       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2474
2475       fetch_texel(mach->Sampler, unit, unit,
2476                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2477                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2478                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2479       break;
2480
2481
2482    case TGSI_TEXTURE_SHADOW2D:
2483    case TGSI_TEXTURE_SHADOWRECT:
2484    case TGSI_TEXTURE_2D_ARRAY:
2485    case TGSI_TEXTURE_SHADOW2D_ARRAY:
2486       /* only SHADOW2D_ARRAY actually needs W */
2487       FETCH(&r[0], 0, TGSI_CHAN_X);
2488       FETCH(&r[1], 0, TGSI_CHAN_Y);
2489       FETCH(&r[2], 0, TGSI_CHAN_Z);
2490       FETCH(&r[3], 0, TGSI_CHAN_W);
2491
2492       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2493       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2494
2495       fetch_texel(mach->Sampler, unit, unit,
2496                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2497                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2498                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2499       break;
2500
2501    case TGSI_TEXTURE_3D:
2502    case TGSI_TEXTURE_CUBE:
2503    case TGSI_TEXTURE_CUBE_ARRAY:
2504    case TGSI_TEXTURE_SHADOWCUBE:
2505       /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
2506       FETCH(&r[0], 0, TGSI_CHAN_X);
2507       FETCH(&r[1], 0, TGSI_CHAN_Y);
2508       FETCH(&r[2], 0, TGSI_CHAN_Z);
2509       FETCH(&r[3], 0, TGSI_CHAN_W);
2510
2511       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2512       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2513       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
2514
2515       fetch_texel(mach->Sampler, unit, unit,
2516                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2517                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2518                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2519       break;
2520
2521    default:
2522       assert(0);
2523    }
2524
2525    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2526       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2527          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2528       }
2529    }
2530 }
2531
2532
2533 static void
2534 exec_txf(struct tgsi_exec_machine *mach,
2535          const struct tgsi_full_instruction *inst)
2536 {
2537    union tgsi_exec_channel r[4];
2538    uint chan;
2539    uint unit;
2540    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2541    int j;
2542    int8_t offsets[3];
2543    unsigned target;
2544
2545    unit = fetch_sampler_unit(mach, inst, 1);
2546    /* always fetch all 3 offsets, overkill but keeps code simple */
2547    fetch_texel_offsets(mach, inst, offsets);
2548
2549    IFETCH(&r[3], 0, TGSI_CHAN_W);
2550
2551    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2552        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2553       target = mach->SamplerViews[unit].Resource;
2554    }
2555    else {
2556       target = inst->Texture.Texture;
2557    }
2558    switch(target) {
2559    case TGSI_TEXTURE_3D:
2560    case TGSI_TEXTURE_2D_ARRAY:
2561    case TGSI_TEXTURE_SHADOW2D_ARRAY:
2562    case TGSI_TEXTURE_2D_ARRAY_MSAA:
2563       IFETCH(&r[2], 0, TGSI_CHAN_Z);
2564       /* fallthrough */
2565    case TGSI_TEXTURE_2D:
2566    case TGSI_TEXTURE_RECT:
2567    case TGSI_TEXTURE_SHADOW1D_ARRAY:
2568    case TGSI_TEXTURE_SHADOW2D:
2569    case TGSI_TEXTURE_SHADOWRECT:
2570    case TGSI_TEXTURE_1D_ARRAY:
2571    case TGSI_TEXTURE_2D_MSAA:
2572       IFETCH(&r[1], 0, TGSI_CHAN_Y);
2573       /* fallthrough */
2574    case TGSI_TEXTURE_BUFFER:
2575    case TGSI_TEXTURE_1D:
2576    case TGSI_TEXTURE_SHADOW1D:
2577       IFETCH(&r[0], 0, TGSI_CHAN_X);
2578       break;
2579    default:
2580       assert(0);
2581       break;
2582    }
2583
2584    mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
2585                             offsets, rgba);
2586
2587    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
2588       r[0].f[j] = rgba[0][j];
2589       r[1].f[j] = rgba[1][j];
2590       r[2].f[j] = rgba[2][j];
2591       r[3].f[j] = rgba[3][j];
2592    }
2593
2594    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2595        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2596       unsigned char swizzles[4];
2597       swizzles[0] = inst->Src[1].Register.SwizzleX;
2598       swizzles[1] = inst->Src[1].Register.SwizzleY;
2599       swizzles[2] = inst->Src[1].Register.SwizzleZ;
2600       swizzles[3] = inst->Src[1].Register.SwizzleW;
2601
2602       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2603          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2604             store_dest(mach, &r[swizzles[chan]],
2605                        &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2606          }
2607       }
2608    }
2609    else {
2610       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2611          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2612             store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2613          }
2614       }
2615    }
2616 }
2617
2618 static void
2619 exec_txq(struct tgsi_exec_machine *mach,
2620          const struct tgsi_full_instruction *inst)
2621 {
2622    int result[4];
2623    union tgsi_exec_channel r[4], src;
2624    uint chan;
2625    uint unit;
2626    int i,j;
2627
2628    unit = fetch_sampler_unit(mach, inst, 1);
2629
2630    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
2631
2632    /* XXX: This interface can't return per-pixel values */
2633    mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
2634
2635    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2636       for (j = 0; j < 4; j++) {
2637          r[j].i[i] = result[j];
2638       }
2639    }
2640
2641    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2642       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2643          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2644                     TGSI_EXEC_DATA_INT);
2645       }
2646    }
2647 }
2648
2649 static void
2650 exec_sample(struct tgsi_exec_machine *mach,
2651             const struct tgsi_full_instruction *inst,
2652             uint modifier, boolean compare)
2653 {
2654    const uint resource_unit = inst->Src[1].Register.Index;
2655    const uint sampler_unit = inst->Src[2].Register.Index;
2656    union tgsi_exec_channel r[5], c1;
2657    const union tgsi_exec_channel *lod = &ZeroVec;
2658    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2659    uint chan;
2660    unsigned char swizzles[4];
2661    int8_t offsets[3];
2662
2663    /* always fetch all 3 offsets, overkill but keeps code simple */
2664    fetch_texel_offsets(mach, inst, offsets);
2665
2666    assert(modifier != TEX_MODIFIER_PROJECTED);
2667
2668    if (modifier != TEX_MODIFIER_NONE) {
2669       if (modifier == TEX_MODIFIER_LOD_BIAS) {
2670          FETCH(&c1, 3, TGSI_CHAN_X);
2671          lod = &c1;
2672          control = TGSI_SAMPLER_LOD_BIAS;
2673       }
2674       else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2675          FETCH(&c1, 3, TGSI_CHAN_X);
2676          lod = &c1;
2677          control = TGSI_SAMPLER_LOD_EXPLICIT;
2678       }
2679       else if (modifier == TEX_MODIFIER_GATHER) {
2680          control = TGSI_SAMPLER_GATHER;
2681       }
2682       else {
2683          assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
2684          control = TGSI_SAMPLER_LOD_ZERO;
2685       }
2686    }
2687
2688    FETCH(&r[0], 0, TGSI_CHAN_X);
2689
2690    switch (mach->SamplerViews[resource_unit].Resource) {
2691    case TGSI_TEXTURE_1D:
2692       if (compare) {
2693          FETCH(&r[2], 3, TGSI_CHAN_X);
2694          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2695                      &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
2696                      NULL, offsets, control,
2697                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2698       }
2699       else {
2700          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2701                      &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
2702                      NULL, offsets, control,
2703                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2704       }
2705       break;
2706
2707    case TGSI_TEXTURE_1D_ARRAY:
2708    case TGSI_TEXTURE_2D:
2709    case TGSI_TEXTURE_RECT:
2710       FETCH(&r[1], 0, TGSI_CHAN_Y);
2711       if (compare) {
2712          FETCH(&r[2], 3, TGSI_CHAN_X);
2713          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2714                      &r[0], &r[1], &r[2], &ZeroVec, lod,    /* S, T, P, C, LOD */
2715                      NULL, offsets, control,
2716                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2717       }
2718       else {
2719          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2720                      &r[0], &r[1], &ZeroVec, &ZeroVec, lod,    /* S, T, P, C, LOD */
2721                      NULL, offsets, control,
2722                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2723       }
2724       break;
2725
2726    case TGSI_TEXTURE_2D_ARRAY:
2727    case TGSI_TEXTURE_3D:
2728    case TGSI_TEXTURE_CUBE:
2729       FETCH(&r[1], 0, TGSI_CHAN_Y);
2730       FETCH(&r[2], 0, TGSI_CHAN_Z);
2731       if(compare) {
2732          FETCH(&r[3], 3, TGSI_CHAN_X);
2733          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2734                      &r[0], &r[1], &r[2], &r[3], lod,
2735                      NULL, offsets, control,
2736                      &r[0], &r[1], &r[2], &r[3]);
2737       }
2738       else {
2739          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2740                      &r[0], &r[1], &r[2], &ZeroVec, lod,
2741                      NULL, offsets, control,
2742                      &r[0], &r[1], &r[2], &r[3]);
2743       }
2744       break;
2745
2746    case TGSI_TEXTURE_CUBE_ARRAY:
2747       FETCH(&r[1], 0, TGSI_CHAN_Y);
2748       FETCH(&r[2], 0, TGSI_CHAN_Z);
2749       FETCH(&r[3], 0, TGSI_CHAN_W);
2750       if(compare) {
2751          FETCH(&r[4], 3, TGSI_CHAN_X);
2752          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2753                      &r[0], &r[1], &r[2], &r[3], &r[4],
2754                      NULL, offsets, control,
2755                      &r[0], &r[1], &r[2], &r[3]);
2756       }
2757       else {
2758          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2759                      &r[0], &r[1], &r[2], &r[3], lod,
2760                      NULL, offsets, control,
2761                      &r[0], &r[1], &r[2], &r[3]);
2762       }
2763       break;
2764
2765
2766    default:
2767       assert(0);
2768    }
2769
2770    swizzles[0] = inst->Src[1].Register.SwizzleX;
2771    swizzles[1] = inst->Src[1].Register.SwizzleY;
2772    swizzles[2] = inst->Src[1].Register.SwizzleZ;
2773    swizzles[3] = inst->Src[1].Register.SwizzleW;
2774
2775    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2776       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2777          store_dest(mach, &r[swizzles[chan]],
2778                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2779       }
2780    }
2781 }
2782
2783 static void
2784 exec_sample_d(struct tgsi_exec_machine *mach,
2785               const struct tgsi_full_instruction *inst)
2786 {
2787    const uint resource_unit = inst->Src[1].Register.Index;
2788    const uint sampler_unit = inst->Src[2].Register.Index;
2789    union tgsi_exec_channel r[4];
2790    float derivs[3][2][TGSI_QUAD_SIZE];
2791    uint chan;
2792    unsigned char swizzles[4];
2793    int8_t offsets[3];
2794
2795    /* always fetch all 3 offsets, overkill but keeps code simple */
2796    fetch_texel_offsets(mach, inst, offsets);
2797
2798    FETCH(&r[0], 0, TGSI_CHAN_X);
2799
2800    switch (mach->SamplerViews[resource_unit].Resource) {
2801    case TGSI_TEXTURE_1D:
2802    case TGSI_TEXTURE_1D_ARRAY:
2803       /* only 1D array actually needs Y */
2804       FETCH(&r[1], 0, TGSI_CHAN_Y);
2805
2806       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2807
2808       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2809                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2810                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2811                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2812       break;
2813
2814    case TGSI_TEXTURE_2D:
2815    case TGSI_TEXTURE_RECT:
2816    case TGSI_TEXTURE_2D_ARRAY:
2817       /* only 2D array actually needs Z */
2818       FETCH(&r[1], 0, TGSI_CHAN_Y);
2819       FETCH(&r[2], 0, TGSI_CHAN_Z);
2820
2821       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2822       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2823
2824       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2825                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* inputs */
2826                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2827                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2828       break;
2829
2830    case TGSI_TEXTURE_3D:
2831    case TGSI_TEXTURE_CUBE:
2832    case TGSI_TEXTURE_CUBE_ARRAY:
2833       /* only cube array actually needs W */
2834       FETCH(&r[1], 0, TGSI_CHAN_Y);
2835       FETCH(&r[2], 0, TGSI_CHAN_Z);
2836       FETCH(&r[3], 0, TGSI_CHAN_W);
2837
2838       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2839       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2840       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
2841
2842       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2843                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,
2844                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2845                   &r[0], &r[1], &r[2], &r[3]);
2846       break;
2847
2848    default:
2849       assert(0);
2850    }
2851
2852    swizzles[0] = inst->Src[1].Register.SwizzleX;
2853    swizzles[1] = inst->Src[1].Register.SwizzleY;
2854    swizzles[2] = inst->Src[1].Register.SwizzleZ;
2855    swizzles[3] = inst->Src[1].Register.SwizzleW;
2856
2857    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2858       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2859          store_dest(mach, &r[swizzles[chan]],
2860                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2861       }
2862    }
2863 }
2864
2865
2866 /**
2867  * Evaluate a constant-valued coefficient at the position of the
2868  * current quad.
2869  */
2870 static void
2871 eval_constant_coef(
2872    struct tgsi_exec_machine *mach,
2873    unsigned attrib,
2874    unsigned chan )
2875 {
2876    unsigned i;
2877
2878    for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
2879       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2880    }
2881 }
2882
2883 static void
2884 interp_constant_offset(
2885       UNUSED const struct tgsi_exec_machine *mach,
2886       UNUSED unsigned attrib,
2887       UNUSED unsigned chan,
2888       UNUSED float ofs_x,
2889       UNUSED float ofs_y,
2890       UNUSED union tgsi_exec_channel *out_chan)
2891 {
2892 }
2893
2894 /**
2895  * Evaluate a linear-valued coefficient at the position of the
2896  * current quad.
2897  */
2898 static void
2899 interp_linear_offset(
2900       const struct tgsi_exec_machine *mach,
2901       unsigned attrib,
2902       unsigned chan,
2903       float ofs_x,
2904       float ofs_y,
2905       union tgsi_exec_channel *out_chan)
2906 {
2907    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2908    const float dady = mach->InterpCoefs[attrib].dady[chan];
2909    const float delta = ofs_x * dadx + ofs_y * dady;
2910    out_chan->f[0] += delta;
2911    out_chan->f[1] += delta;
2912    out_chan->f[2] += delta;
2913    out_chan->f[3] += delta;
2914 }
2915
2916 static void
2917 eval_linear_coef(struct tgsi_exec_machine *mach,
2918                  unsigned attrib,
2919                  unsigned chan)
2920 {
2921    const float x = mach->QuadPos.xyzw[0].f[0];
2922    const float y = mach->QuadPos.xyzw[1].f[0];
2923    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2924    const float dady = mach->InterpCoefs[attrib].dady[chan];
2925    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2926
2927    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2928    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2929    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2930    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2931 }
2932
2933 /**
2934  * Evaluate a perspective-valued coefficient at the position of the
2935  * current quad.
2936  */
2937
2938 static void
2939 interp_perspective_offset(
2940    const struct tgsi_exec_machine *mach,
2941    unsigned attrib,
2942    unsigned chan,
2943    float ofs_x,
2944    float ofs_y,
2945    union tgsi_exec_channel *out_chan)
2946 {
2947    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2948    const float dady = mach->InterpCoefs[attrib].dady[chan];
2949    const float *w = mach->QuadPos.xyzw[3].f;
2950    const float delta = ofs_x * dadx + ofs_y * dady;
2951    out_chan->f[0] += delta / w[0];
2952    out_chan->f[1] += delta / w[1];
2953    out_chan->f[2] += delta / w[2];
2954    out_chan->f[3] += delta / w[3];
2955 }
2956
2957 static void
2958 eval_perspective_coef(
2959    struct tgsi_exec_machine *mach,
2960    unsigned attrib,
2961    unsigned chan )
2962 {
2963    const float x = mach->QuadPos.xyzw[0].f[0];
2964    const float y = mach->QuadPos.xyzw[1].f[0];
2965    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2966    const float dady = mach->InterpCoefs[attrib].dady[chan];
2967    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2968    const float *w = mach->QuadPos.xyzw[3].f;
2969    /* divide by W here */
2970    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2971    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2972    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2973    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2974 }
2975
2976
/**
 * Signature of the eval_*_coef() functions: evaluate one channel of one
 * input attribute for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
2981
2982 static void
2983 exec_declaration(struct tgsi_exec_machine *mach,
2984                  const struct tgsi_full_declaration *decl)
2985 {
2986    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
2987       mach->SamplerViews[decl->Range.First] = decl->SamplerView;
2988       return;
2989    }
2990
2991    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
2992       if (decl->Declaration.File == TGSI_FILE_INPUT) {
2993          uint first, last, mask;
2994
2995          first = decl->Range.First;
2996          last = decl->Range.Last;
2997          mask = decl->Declaration.UsageMask;
2998
2999          /* XXX we could remove this special-case code since
3000           * mach->InterpCoefs[first].a0 should already have the
3001           * front/back-face value.  But we should first update the
3002           * ureg code to emit the right UsageMask value (WRITEMASK_X).
3003           * Then, we could remove the tgsi_exec_machine::Face field.
3004           */
3005          /* XXX make FACE a system value */
3006          if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
3007             uint i;
3008
3009             assert(decl->Semantic.Index == 0);
3010             assert(first == last);
3011
3012             for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3013                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
3014             }
3015          } else {
3016             eval_coef_func eval;
3017             apply_sample_offset_func interp;
3018             uint i, j;
3019
3020             switch (decl->Interp.Interpolate) {
3021             case TGSI_INTERPOLATE_CONSTANT:
3022                eval = eval_constant_coef;
3023                interp = interp_constant_offset;
3024                break;
3025
3026             case TGSI_INTERPOLATE_LINEAR:
3027                eval = eval_linear_coef;
3028                interp = interp_linear_offset;
3029                break;
3030
3031             case TGSI_INTERPOLATE_PERSPECTIVE:
3032                eval = eval_perspective_coef;
3033                interp = interp_perspective_offset;
3034                break;
3035
3036             case TGSI_INTERPOLATE_COLOR:
3037                eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
3038                interp = mach->flatshade_color ? interp_constant_offset : interp_perspective_offset;
3039                break;
3040
3041             default:
3042                assert(0);
3043                return;
3044             }
3045
3046             for (i = first; i <= last; i++)
3047                mach->InputSampleOffsetApply[i] = interp;
3048
3049             for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
3050                if (mask & (1 << j)) {
3051                   for (i = first; i <= last; i++) {
3052                      eval(mach, i, j);
3053                   }
3054                }
3055             }
3056          }
3057
3058          if (DEBUG_EXECUTION) {
3059             uint i, j;
3060             for (i = first; i <= last; ++i) {
3061                debug_printf("IN[%2u] = ", i);
3062                for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
3063                   if (j > 0) {
3064                      debug_printf("         ");
3065                   }
3066                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3067                                mach->Inputs[i].xyzw[0].f[j], mach->Inputs[i].xyzw[0].u[j],
3068                                mach->Inputs[i].xyzw[1].f[j], mach->Inputs[i].xyzw[1].u[j],
3069                                mach->Inputs[i].xyzw[2].f[j], mach->Inputs[i].xyzw[2].u[j],
3070                                mach->Inputs[i].xyzw[3].f[j], mach->Inputs[i].xyzw[3].u[j]);
3071                }
3072             }
3073          }
3074       }
3075    }
3076
3077 }
3078
/** Channel operation taking one source channel and writing one result. */
typedef void (*micro_unary_op)(union tgsi_exec_channel *dst,
                               const union tgsi_exec_channel *src);
3081
3082 static void
3083 exec_scalar_unary(struct tgsi_exec_machine *mach,
3084                   const struct tgsi_full_instruction *inst,
3085                   micro_unary_op op,
3086                   enum tgsi_exec_datatype dst_datatype,
3087                   enum tgsi_exec_datatype src_datatype)
3088 {
3089    unsigned int chan;
3090    union tgsi_exec_channel src;
3091    union tgsi_exec_channel dst;
3092
3093    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
3094    op(&dst, &src);
3095    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3096       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3097          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
3098       }
3099    }
3100 }
3101
3102 static void
3103 exec_vector_unary(struct tgsi_exec_machine *mach,
3104                   const struct tgsi_full_instruction *inst,
3105                   micro_unary_op op,
3106                   enum tgsi_exec_datatype dst_datatype,
3107                   enum tgsi_exec_datatype src_datatype)
3108 {
3109    unsigned int chan;
3110    struct tgsi_exec_vector dst;
3111
3112    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3113       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3114          union tgsi_exec_channel src;
3115
3116          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
3117          op(&dst.xyzw[chan], &src);
3118       }
3119    }
3120    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3121       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3122          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3123       }
3124    }
3125 }
3126
/** Channel operation taking two source channels and writing one result. */
typedef void (*micro_binary_op)(union tgsi_exec_channel *dst,
                                const union tgsi_exec_channel *src0,
                                const union tgsi_exec_channel *src1);
3130
3131 static void
3132 exec_scalar_binary(struct tgsi_exec_machine *mach,
3133                    const struct tgsi_full_instruction *inst,
3134                    micro_binary_op op,
3135                    enum tgsi_exec_datatype dst_datatype,
3136                    enum tgsi_exec_datatype src_datatype)
3137 {
3138    unsigned int chan;
3139    union tgsi_exec_channel src[2];
3140    union tgsi_exec_channel dst;
3141
3142    fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
3143    fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, src_datatype);
3144    op(&dst, &src[0], &src[1]);
3145    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3146       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3147          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
3148       }
3149    }
3150 }
3151
3152 static void
3153 exec_vector_binary(struct tgsi_exec_machine *mach,
3154                    const struct tgsi_full_instruction *inst,
3155                    micro_binary_op op,
3156                    enum tgsi_exec_datatype dst_datatype,
3157                    enum tgsi_exec_datatype src_datatype)
3158 {
3159    unsigned int chan;
3160    struct tgsi_exec_vector dst;
3161
3162    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3163       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3164          union tgsi_exec_channel src[2];
3165
3166          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3167          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3168          op(&dst.xyzw[chan], &src[0], &src[1]);
3169       }
3170    }
3171    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3172       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3173          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3174       }
3175    }
3176 }
3177
/** Channel operation taking three source channels and writing one result. */
typedef void (*micro_trinary_op)(union tgsi_exec_channel *dst,
                                 const union tgsi_exec_channel *src0,
                                 const union tgsi_exec_channel *src1,
                                 const union tgsi_exec_channel *src2);
3182
3183 static void
3184 exec_vector_trinary(struct tgsi_exec_machine *mach,
3185                     const struct tgsi_full_instruction *inst,
3186                     micro_trinary_op op,
3187                     enum tgsi_exec_datatype dst_datatype,
3188                     enum tgsi_exec_datatype src_datatype)
3189 {
3190    unsigned int chan;
3191    struct tgsi_exec_vector dst;
3192
3193    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3194       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3195          union tgsi_exec_channel src[3];
3196
3197          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3198          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3199          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3200          op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3201       }
3202    }
3203    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3204       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3205          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3206       }
3207    }
3208 }
3209
/** Channel operation taking four source channels and writing one result. */
typedef void (*micro_quaternary_op)(union tgsi_exec_channel *dst,
                                    const union tgsi_exec_channel *src0,
                                    const union tgsi_exec_channel *src1,
                                    const union tgsi_exec_channel *src2,
                                    const union tgsi_exec_channel *src3);
3215
3216 static void
3217 exec_vector_quaternary(struct tgsi_exec_machine *mach,
3218                        const struct tgsi_full_instruction *inst,
3219                        micro_quaternary_op op,
3220                        enum tgsi_exec_datatype dst_datatype,
3221                        enum tgsi_exec_datatype src_datatype)
3222 {
3223    unsigned int chan;
3224    struct tgsi_exec_vector dst;
3225
3226    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3227       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3228          union tgsi_exec_channel src[4];
3229
3230          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3231          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3232          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3233          fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
3234          op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
3235       }
3236    }
3237    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3238       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3239          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3240       }
3241    }
3242 }
3243
3244 static void
3245 exec_dp3(struct tgsi_exec_machine *mach,
3246          const struct tgsi_full_instruction *inst)
3247 {
3248    unsigned int chan;
3249    union tgsi_exec_channel arg[3];
3250
3251    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3252    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3253    micro_mul(&arg[2], &arg[0], &arg[1]);
3254
3255    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
3256       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3257       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3258       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3259    }
3260
3261    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3262       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3263          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3264       }
3265    }
3266 }
3267
3268 static void
3269 exec_dp4(struct tgsi_exec_machine *mach,
3270          const struct tgsi_full_instruction *inst)
3271 {
3272    unsigned int chan;
3273    union tgsi_exec_channel arg[3];
3274
3275    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3276    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3277    micro_mul(&arg[2], &arg[0], &arg[1]);
3278
3279    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
3280       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3281       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3282       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3283    }
3284
3285    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3286       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3287          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3288       }
3289    }
3290 }
3291
3292 static void
3293 exec_dp2(struct tgsi_exec_machine *mach,
3294          const struct tgsi_full_instruction *inst)
3295 {
3296    unsigned int chan;
3297    union tgsi_exec_channel arg[3];
3298
3299    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3300    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3301    micro_mul(&arg[2], &arg[0], &arg[1]);
3302
3303    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3304    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3305    micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3306
3307    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3308       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3309          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3310       }
3311    }
3312 }
3313
3314 static void
3315 exec_pk2h(struct tgsi_exec_machine *mach,
3316           const struct tgsi_full_instruction *inst)
3317 {
3318    unsigned chan;
3319    union tgsi_exec_channel arg[2], dst;
3320
3321    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3322    fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3323    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3324       dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
3325          (util_float_to_half(arg[1].f[chan]) << 16);
3326    }
3327    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3328       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3329          store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
3330       }
3331    }
3332 }
3333
3334 static void
3335 exec_up2h(struct tgsi_exec_machine *mach,
3336           const struct tgsi_full_instruction *inst)
3337 {
3338    unsigned chan;
3339    union tgsi_exec_channel arg, dst[2];
3340
3341    fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3342    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3343       dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
3344       dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
3345    }
3346    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3347       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3348          store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3349       }
3350    }
3351 }
3352
3353 static void
3354 micro_ucmp(union tgsi_exec_channel *dst,
3355            const union tgsi_exec_channel *src0,
3356            const union tgsi_exec_channel *src1,
3357            const union tgsi_exec_channel *src2)
3358 {
3359    dst->f[0] = src0->u[0] ? src1->f[0] : src2->f[0];
3360    dst->f[1] = src0->u[1] ? src1->f[1] : src2->f[1];
3361    dst->f[2] = src0->u[2] ? src1->f[2] : src2->f[2];
3362    dst->f[3] = src0->u[3] ? src1->f[3] : src2->f[3];
3363 }
3364
3365 static void
3366 exec_ucmp(struct tgsi_exec_machine *mach,
3367           const struct tgsi_full_instruction *inst)
3368 {
3369    unsigned int chan;
3370    struct tgsi_exec_vector dst;
3371
3372    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3373       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3374          union tgsi_exec_channel src[3];
3375
3376          fetch_source(mach, &src[0], &inst->Src[0], chan,
3377                       TGSI_EXEC_DATA_UINT);
3378          fetch_source(mach, &src[1], &inst->Src[1], chan,
3379                       TGSI_EXEC_DATA_FLOAT);
3380          fetch_source(mach, &src[2], &inst->Src[2], chan,
3381                       TGSI_EXEC_DATA_FLOAT);
3382          micro_ucmp(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3383       }
3384    }
3385    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3386       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3387          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan,
3388                     TGSI_EXEC_DATA_FLOAT);
3389       }
3390    }
3391 }
3392
3393 static void
3394 exec_dst(struct tgsi_exec_machine *mach,
3395          const struct tgsi_full_instruction *inst)
3396 {
3397    union tgsi_exec_channel r[2];
3398    union tgsi_exec_channel d[4];
3399
3400    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3401       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3402       fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3403       micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
3404    }
3405    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3406       fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3407    }
3408    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3409       fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3410    }
3411
3412    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3413       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3414    }
3415    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3416       store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3417    }
3418    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3419       store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3420    }
3421    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3422       store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3423    }
3424 }
3425
3426 static void
3427 exec_log(struct tgsi_exec_machine *mach,
3428          const struct tgsi_full_instruction *inst)
3429 {
3430    union tgsi_exec_channel r[3];
3431
3432    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3433    micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
3434    micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
3435    micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
3436    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3437       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3438    }
3439    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3440       micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
3441       micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
3442       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3443    }
3444    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3445       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3446    }
3447    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3448       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3449    }
3450 }
3451
3452 static void
3453 exec_exp(struct tgsi_exec_machine *mach,
3454          const struct tgsi_full_instruction *inst)
3455 {
3456    union tgsi_exec_channel r[3];
3457
3458    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3459    micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
3460    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3461       micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
3462       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3463    }
3464    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3465       micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
3466       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3467    }
3468    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3469       micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
3470       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3471    }
3472    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3473       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3474    }
3475 }
3476
3477 static void
3478 exec_lit(struct tgsi_exec_machine *mach,
3479          const struct tgsi_full_instruction *inst)
3480 {
3481    union tgsi_exec_channel r[3];
3482    union tgsi_exec_channel d[3];
3483
3484    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3485       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3486       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3487          fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3488          micro_max(&r[1], &r[1], &ZeroVec);
3489
3490          fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3491          micro_min(&r[2], &r[2], &P128Vec);
3492          micro_max(&r[2], &r[2], &M128Vec);
3493          micro_pow(&r[1], &r[1], &r[2]);
3494          micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3495          store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3496       }
3497       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3498          micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
3499          store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3500       }
3501    }
3502    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3503       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3504    }
3505
3506    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3507       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3508    }
3509 }
3510
3511 static void
3512 exec_break(struct tgsi_exec_machine *mach)
3513 {
3514    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3515       /* turn off loop channels for each enabled exec channel */
3516       mach->LoopMask &= ~mach->ExecMask;
3517       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3518       UPDATE_EXEC_MASK(mach);
3519    } else {
3520       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3521
3522       mach->Switch.mask = 0x0;
3523
3524       UPDATE_EXEC_MASK(mach);
3525    }
3526 }
3527
3528 static void
3529 exec_switch(struct tgsi_exec_machine *mach,
3530             const struct tgsi_full_instruction *inst)
3531 {
3532    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3533    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3534
3535    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3536    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3537    mach->Switch.mask = 0x0;
3538    mach->Switch.defaultMask = 0x0;
3539
3540    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3541    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3542
3543    UPDATE_EXEC_MASK(mach);
3544 }
3545
3546 static void
3547 exec_case(struct tgsi_exec_machine *mach,
3548           const struct tgsi_full_instruction *inst)
3549 {
3550    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3551    union tgsi_exec_channel src;
3552    uint mask = 0;
3553
3554    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3555
3556    if (mach->Switch.selector.u[0] == src.u[0]) {
3557       mask |= 0x1;
3558    }
3559    if (mach->Switch.selector.u[1] == src.u[1]) {
3560       mask |= 0x2;
3561    }
3562    if (mach->Switch.selector.u[2] == src.u[2]) {
3563       mask |= 0x4;
3564    }
3565    if (mach->Switch.selector.u[3] == src.u[3]) {
3566       mask |= 0x8;
3567    }
3568
3569    mach->Switch.defaultMask |= mask;
3570
3571    mach->Switch.mask |= mask & prevMask;
3572
3573    UPDATE_EXEC_MASK(mach);
3574 }
3575
3576 /* FIXME: this will only work if default is last */
3577 static void
3578 exec_default(struct tgsi_exec_machine *mach)
3579 {
3580    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3581
3582    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3583
3584    UPDATE_EXEC_MASK(mach);
3585 }
3586
3587 static void
3588 exec_endswitch(struct tgsi_exec_machine *mach)
3589 {
3590    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3591    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3592
3593    UPDATE_EXEC_MASK(mach);
3594 }
3595
/**
 * Callback operating on 64-bit (double-width) channels.
 *
 * exec_double_unary() passes a pointer to a single source, while
 * exec_double_binary() and exec_double_trinary() pass an array of two or
 * three sources through the same \p src pointer.
 */
typedef void (* micro_dop)(union tgsi_double_channel *dst,
                           const union tgsi_double_channel *src);

/**
 * Callback taking a 64-bit first source and a 32-bit second source and
 * producing a 64-bit result (see exec_arg0_64_arg1_32()).
 */
typedef void (* micro_dop_sop)(union tgsi_double_channel *dst,
                               const union tgsi_double_channel *src0,
                               union tgsi_exec_channel *src1);

/** Callback producing a 64-bit result from a 32-bit source. */
typedef void (* micro_dop_s)(union tgsi_double_channel *dst,
                             const union tgsi_exec_channel *src);

/** Callback producing a 32-bit result from a 64-bit source. */
typedef void (* micro_sop_d)(union tgsi_exec_channel *dst,
                             const union tgsi_double_channel *src);
3608
3609 static void
3610 fetch_double_channel(struct tgsi_exec_machine *mach,
3611                      union tgsi_double_channel *chan,
3612                      const struct tgsi_full_src_register *reg,
3613                      uint chan_0,
3614                      uint chan_1)
3615 {
3616    union tgsi_exec_channel src[2];
3617    uint i;
3618
3619    fetch_source_d(mach, &src[0], reg, chan_0);
3620    fetch_source_d(mach, &src[1], reg, chan_1);
3621
3622    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3623       chan->u[i][0] = src[0].u[i];
3624       chan->u[i][1] = src[1].u[i];
3625    }
3626    if (reg->Register.Absolute) {
3627       micro_dabs(chan, chan);
3628    }
3629    if (reg->Register.Negate) {
3630       micro_dneg(chan, chan);
3631    }
3632 }
3633
3634 static void
3635 store_double_channel(struct tgsi_exec_machine *mach,
3636                      const union tgsi_double_channel *chan,
3637                      const struct tgsi_full_dst_register *reg,
3638                      const struct tgsi_full_instruction *inst,
3639                      uint chan_0,
3640                      uint chan_1)
3641 {
3642    union tgsi_exec_channel dst[2];
3643    uint i;
3644    union tgsi_double_channel temp;
3645    const uint execmask = mach->ExecMask;
3646
3647    if (!inst->Instruction.Saturate) {
3648       for (i = 0; i < TGSI_QUAD_SIZE; i++)
3649          if (execmask & (1 << i)) {
3650             dst[0].u[i] = chan->u[i][0];
3651             dst[1].u[i] = chan->u[i][1];
3652          }
3653    }
3654    else {
3655       for (i = 0; i < TGSI_QUAD_SIZE; i++)
3656          if (execmask & (1 << i)) {
3657             if (chan->d[i] < 0.0)
3658                temp.d[i] = 0.0;
3659             else if (chan->d[i] > 1.0)
3660                temp.d[i] = 1.0;
3661             else
3662                temp.d[i] = chan->d[i];
3663
3664             dst[0].u[i] = temp.u[i][0];
3665             dst[1].u[i] = temp.u[i][1];
3666          }
3667    }
3668
3669    store_dest_double(mach, &dst[0], reg, chan_0, TGSI_EXEC_DATA_UINT);
3670    if (chan_1 != (unsigned)-1)
3671       store_dest_double(mach, &dst[1], reg, chan_1, TGSI_EXEC_DATA_UINT);
3672 }
3673
3674 static void
3675 exec_double_unary(struct tgsi_exec_machine *mach,
3676                   const struct tgsi_full_instruction *inst,
3677                   micro_dop op)
3678 {
3679    union tgsi_double_channel src;
3680    union tgsi_double_channel dst;
3681
3682    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3683       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3684       op(&dst, &src);
3685       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3686    }
3687    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3688       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3689       op(&dst, &src);
3690       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3691    }
3692 }
3693
3694 static void
3695 exec_double_binary(struct tgsi_exec_machine *mach,
3696                    const struct tgsi_full_instruction *inst,
3697                    micro_dop op,
3698                    enum tgsi_exec_datatype dst_datatype)
3699 {
3700    union tgsi_double_channel src[2];
3701    union tgsi_double_channel dst;
3702    int first_dest_chan, second_dest_chan;
3703    int wmask;
3704
3705    wmask = inst->Dst[0].Register.WriteMask;
3706    /* these are & because of the way DSLT etc store their destinations */
3707    if (wmask & TGSI_WRITEMASK_XY) {
3708       first_dest_chan = TGSI_CHAN_X;
3709       second_dest_chan = TGSI_CHAN_Y;
3710       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3711          first_dest_chan = (wmask & TGSI_WRITEMASK_X) ? TGSI_CHAN_X : TGSI_CHAN_Y;
3712          second_dest_chan = -1;
3713       }
3714
3715       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3716       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3717       op(&dst, src);
3718       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3719    }
3720
3721    if (wmask & TGSI_WRITEMASK_ZW) {
3722       first_dest_chan = TGSI_CHAN_Z;
3723       second_dest_chan = TGSI_CHAN_W;
3724       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3725          first_dest_chan = (wmask & TGSI_WRITEMASK_Z) ? TGSI_CHAN_Z : TGSI_CHAN_W;
3726          second_dest_chan = -1;
3727       }
3728
3729       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3730       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3731       op(&dst, src);
3732       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3733    }
3734 }
3735
3736 static void
3737 exec_double_trinary(struct tgsi_exec_machine *mach,
3738                     const struct tgsi_full_instruction *inst,
3739                     micro_dop op)
3740 {
3741    union tgsi_double_channel src[3];
3742    union tgsi_double_channel dst;
3743
3744    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3745       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3746       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3747       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_X, TGSI_CHAN_Y);
3748       op(&dst, src);
3749       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3750    }
3751    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3752       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3753       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3754       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_CHAN_W);
3755       op(&dst, src);
3756       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3757    }
3758 }
3759
3760 static void
3761 exec_dldexp(struct tgsi_exec_machine *mach,
3762             const struct tgsi_full_instruction *inst)
3763 {
3764    union tgsi_double_channel src0;
3765    union tgsi_exec_channel src1;
3766    union tgsi_double_channel dst;
3767    int wmask;
3768
3769    wmask = inst->Dst[0].Register.WriteMask;
3770    if (wmask & TGSI_WRITEMASK_XY) {
3771       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3772       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3773       micro_dldexp(&dst, &src0, &src1);
3774       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3775    }
3776
3777    if (wmask & TGSI_WRITEMASK_ZW) {
3778       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3779       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3780       micro_dldexp(&dst, &src0, &src1);
3781       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3782    }
3783 }
3784
3785 static void
3786 exec_dfracexp(struct tgsi_exec_machine *mach,
3787               const struct tgsi_full_instruction *inst)
3788 {
3789    union tgsi_double_channel src;
3790    union tgsi_double_channel dst;
3791    union tgsi_exec_channel dst_exp;
3792
3793    fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3794    micro_dfracexp(&dst, &dst_exp, &src);
3795    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)
3796       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3797    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)
3798       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3799    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3800       if (inst->Dst[1].Register.WriteMask & (1 << chan))
3801          store_dest(mach, &dst_exp, &inst->Dst[1], inst, chan, TGSI_EXEC_DATA_INT);
3802    }
3803 }
3804
3805 static void
3806 exec_arg0_64_arg1_32(struct tgsi_exec_machine *mach,
3807             const struct tgsi_full_instruction *inst,
3808             micro_dop_sop op)
3809 {
3810    union tgsi_double_channel src0;
3811    union tgsi_exec_channel src1;
3812    union tgsi_double_channel dst;
3813    int wmask;
3814
3815    wmask = inst->Dst[0].Register.WriteMask;
3816    if (wmask & TGSI_WRITEMASK_XY) {
3817       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3818       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3819       op(&dst, &src0, &src1);
3820       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3821    }
3822
3823    if (wmask & TGSI_WRITEMASK_ZW) {
3824       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3825       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3826       op(&dst, &src0, &src1);
3827       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3828    }
3829 }
3830
3831 static int
3832 get_image_coord_dim(unsigned tgsi_tex)
3833 {
3834    int dim;
3835    switch (tgsi_tex) {
3836    case TGSI_TEXTURE_BUFFER:
3837    case TGSI_TEXTURE_1D:
3838       dim = 1;
3839       break;
3840    case TGSI_TEXTURE_2D:
3841    case TGSI_TEXTURE_RECT:
3842    case TGSI_TEXTURE_1D_ARRAY:
3843    case TGSI_TEXTURE_2D_MSAA:
3844       dim = 2;
3845       break;
3846    case TGSI_TEXTURE_3D:
3847    case TGSI_TEXTURE_CUBE:
3848    case TGSI_TEXTURE_2D_ARRAY:
3849    case TGSI_TEXTURE_2D_ARRAY_MSAA:
3850    case TGSI_TEXTURE_CUBE_ARRAY:
3851       dim = 3;
3852       break;
3853    default:
3854       assert(!"unknown texture target");
3855       dim = 0;
3856       break;
3857    }
3858
3859    return dim;
3860 }
3861
3862 static int
3863 get_image_coord_sample(unsigned tgsi_tex)
3864 {
3865    int sample = 0;
3866    switch (tgsi_tex) {
3867    case TGSI_TEXTURE_2D_MSAA:
3868       sample = 3;
3869       break;
3870    case TGSI_TEXTURE_2D_ARRAY_MSAA:
3871       sample = 4;
3872       break;
3873    default:
3874       break;
3875    }
3876    return sample;
3877 }
3878
3879 static void
3880 exec_load_img(struct tgsi_exec_machine *mach,
3881               const struct tgsi_full_instruction *inst)
3882 {
3883    union tgsi_exec_channel r[4], sample_r;
3884    uint unit;
3885    int sample;
3886    int i, j;
3887    int dim;
3888    uint chan;
3889    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3890    struct tgsi_image_params params;
3891    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3892
3893    unit = fetch_sampler_unit(mach, inst, 0);
3894    dim = get_image_coord_dim(inst->Memory.Texture);
3895    sample = get_image_coord_sample(inst->Memory.Texture);
3896    assert(dim <= 3);
3897
3898    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3899    params.unit = unit;
3900    params.tgsi_tex_instr = inst->Memory.Texture;
3901    params.format = inst->Memory.Format;
3902
3903    for (i = 0; i < dim; i++) {
3904       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
3905    }
3906
3907    if (sample)
3908       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
3909
3910    mach->Image->load(mach->Image, &params,
3911                      r[0].i, r[1].i, r[2].i, sample_r.i,
3912                      rgba);
3913    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3914       r[0].f[j] = rgba[0][j];
3915       r[1].f[j] = rgba[1][j];
3916       r[2].f[j] = rgba[2][j];
3917       r[3].f[j] = rgba[3][j];
3918    }
3919    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3920       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3921          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3922       }
3923    }
3924 }
3925
3926 static void
3927 exec_load_buf(struct tgsi_exec_machine *mach,
3928               const struct tgsi_full_instruction *inst)
3929 {
3930    union tgsi_exec_channel r[4];
3931    uint unit;
3932    int j;
3933    uint chan;
3934    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3935    struct tgsi_buffer_params params;
3936    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3937
3938    unit = fetch_sampler_unit(mach, inst, 0);
3939
3940    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3941    params.unit = unit;
3942    IFETCH(&r[0], 1, TGSI_CHAN_X);
3943
3944    mach->Buffer->load(mach->Buffer, &params,
3945                       r[0].i, rgba);
3946    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3947       r[0].f[j] = rgba[0][j];
3948       r[1].f[j] = rgba[1][j];
3949       r[2].f[j] = rgba[2][j];
3950       r[3].f[j] = rgba[3][j];
3951    }
3952    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3953       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3954          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3955       }
3956    }
3957 }
3958
3959 static void
3960 exec_load_mem(struct tgsi_exec_machine *mach,
3961               const struct tgsi_full_instruction *inst)
3962 {
3963    union tgsi_exec_channel r[4];
3964    uint chan;
3965    char *ptr = mach->LocalMem;
3966    uint32_t offset;
3967    int j;
3968
3969    IFETCH(&r[0], 1, TGSI_CHAN_X);
3970    if (r[0].u[0] >= mach->LocalMemSize)
3971       return;
3972
3973    offset = r[0].u[0];
3974    ptr += offset;
3975
3976    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3977       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3978          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3979             memcpy(&r[chan].u[j], ptr + (4 * chan), 4);
3980          }
3981       }
3982    }
3983
3984    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3985       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3986          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3987       }
3988    }
3989 }
3990
3991 static void
3992 exec_load(struct tgsi_exec_machine *mach,
3993           const struct tgsi_full_instruction *inst)
3994 {
3995    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
3996       exec_load_img(mach, inst);
3997    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
3998       exec_load_buf(mach, inst);
3999    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
4000       exec_load_mem(mach, inst);
4001 }
4002
4003 static uint
4004 fetch_store_img_unit(struct tgsi_exec_machine *mach,
4005                      const struct tgsi_full_dst_register *dst)
4006 {
4007    uint unit = 0;
4008    int i;
4009    if (dst->Register.Indirect) {
4010       union tgsi_exec_channel indir_index, index2;
4011       const uint execmask = mach->ExecMask;
4012       index2.i[0] =
4013       index2.i[1] =
4014       index2.i[2] =
4015       index2.i[3] = dst->Indirect.Index;
4016
4017       fetch_src_file_channel(mach,
4018                              dst->Indirect.File,
4019                              dst->Indirect.Swizzle,
4020                              &index2,
4021                              &ZeroVec,
4022                              &indir_index);
4023       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4024          if (execmask & (1 << i)) {
4025             unit = dst->Register.Index + indir_index.i[i];
4026             break;
4027          }
4028       }
4029    } else {
4030       unit = dst->Register.Index;
4031    }
4032    return unit;
4033 }
4034
4035 static void
4036 exec_store_img(struct tgsi_exec_machine *mach,
4037                const struct tgsi_full_instruction *inst)
4038 {
4039    union tgsi_exec_channel r[3], sample_r;
4040    union tgsi_exec_channel value[4];
4041    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4042    struct tgsi_image_params params;
4043    int dim;
4044    int sample;
4045    int i, j;
4046    uint unit;
4047    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4048    unit = fetch_store_img_unit(mach, &inst->Dst[0]);
4049    dim = get_image_coord_dim(inst->Memory.Texture);
4050    sample = get_image_coord_sample(inst->Memory.Texture);
4051    assert(dim <= 3);
4052
4053    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4054    params.unit = unit;
4055    params.tgsi_tex_instr = inst->Memory.Texture;
4056    params.format = inst->Memory.Format;
4057
4058    for (i = 0; i < dim; i++) {
4059       IFETCH(&r[i], 0, TGSI_CHAN_X + i);
4060    }
4061
4062    for (i = 0; i < 4; i++) {
4063       FETCH(&value[i], 1, TGSI_CHAN_X + i);
4064    }
4065    if (sample)
4066       IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
4067
4068    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4069       rgba[0][j] = value[0].f[j];
4070       rgba[1][j] = value[1].f[j];
4071       rgba[2][j] = value[2].f[j];
4072       rgba[3][j] = value[3].f[j];
4073    }
4074
4075    mach->Image->store(mach->Image, &params,
4076                       r[0].i, r[1].i, r[2].i, sample_r.i,
4077                       rgba);
4078 }
4079
4080 static void
4081 exec_store_buf(struct tgsi_exec_machine *mach,
4082                const struct tgsi_full_instruction *inst)
4083 {
4084    union tgsi_exec_channel r[3];
4085    union tgsi_exec_channel value[4];
4086    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4087    struct tgsi_buffer_params params;
4088    int i, j;
4089    uint unit;
4090    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4091
4092    unit = fetch_store_img_unit(mach, &inst->Dst[0]);
4093
4094    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4095    params.unit = unit;
4096    params.writemask = inst->Dst[0].Register.WriteMask;
4097
4098    IFETCH(&r[0], 0, TGSI_CHAN_X);
4099    for (i = 0; i < 4; i++) {
4100       FETCH(&value[i], 1, TGSI_CHAN_X + i);
4101    }
4102
4103    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4104       rgba[0][j] = value[0].f[j];
4105       rgba[1][j] = value[1].f[j];
4106       rgba[2][j] = value[2].f[j];
4107       rgba[3][j] = value[3].f[j];
4108    }
4109
4110    mach->Buffer->store(mach->Buffer, &params,
4111                       r[0].i,
4112                       rgba);
4113 }
4114
4115 static void
4116 exec_store_mem(struct tgsi_exec_machine *mach,
4117                const struct tgsi_full_instruction *inst)
4118 {
4119    union tgsi_exec_channel r[3];
4120    union tgsi_exec_channel value[4];
4121    uint i, chan;
4122    char *ptr = mach->LocalMem;
4123    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4124    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4125
4126    IFETCH(&r[0], 0, TGSI_CHAN_X);
4127
4128    for (i = 0; i < 4; i++) {
4129       FETCH(&value[i], 1, TGSI_CHAN_X + i);
4130    }
4131
4132    if (r[0].u[0] >= mach->LocalMemSize)
4133       return;
4134    ptr += r[0].u[0];
4135
4136    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4137       if (execmask & (1 << i)) {
4138          for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4139             if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4140                memcpy(ptr + (chan * 4), &value[chan].u[0], 4);
4141             }
4142          }
4143       }
4144    }
4145 }
4146
4147 static void
4148 exec_store(struct tgsi_exec_machine *mach,
4149            const struct tgsi_full_instruction *inst)
4150 {
4151    if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
4152       exec_store_img(mach, inst);
4153    else if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
4154       exec_store_buf(mach, inst);
4155    else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
4156       exec_store_mem(mach, inst);
4157 }
4158
4159 static void
4160 exec_atomop_img(struct tgsi_exec_machine *mach,
4161                 const struct tgsi_full_instruction *inst)
4162 {
4163    union tgsi_exec_channel r[4], sample_r;
4164    union tgsi_exec_channel value[4], value2[4];
4165    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4166    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4167    struct tgsi_image_params params;
4168    int dim;
4169    int sample;
4170    int i, j;
4171    uint unit, chan;
4172    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4173    unit = fetch_sampler_unit(mach, inst, 0);
4174    dim = get_image_coord_dim(inst->Memory.Texture);
4175    sample = get_image_coord_sample(inst->Memory.Texture);
4176    assert(dim <= 3);
4177
4178    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4179    params.unit = unit;
4180    params.tgsi_tex_instr = inst->Memory.Texture;
4181    params.format = inst->Memory.Format;
4182
4183    for (i = 0; i < dim; i++) {
4184       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
4185    }
4186
4187    for (i = 0; i < 4; i++) {
4188       FETCH(&value[i], 2, TGSI_CHAN_X + i);
4189       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4190          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4191    }
4192    if (sample)
4193       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
4194
4195    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4196       rgba[0][j] = value[0].f[j];
4197       rgba[1][j] = value[1].f[j];
4198       rgba[2][j] = value[2].f[j];
4199       rgba[3][j] = value[3].f[j];
4200    }
4201    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4202       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4203          rgba2[0][j] = value2[0].f[j];
4204          rgba2[1][j] = value2[1].f[j];
4205          rgba2[2][j] = value2[2].f[j];
4206          rgba2[3][j] = value2[3].f[j];
4207       }
4208    }
4209
4210    mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
4211                    r[0].i, r[1].i, r[2].i, sample_r.i,
4212                    rgba, rgba2);
4213
4214    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4215       r[0].f[j] = rgba[0][j];
4216       r[1].f[j] = rgba[1][j];
4217       r[2].f[j] = rgba[2][j];
4218       r[3].f[j] = rgba[3][j];
4219    }
4220    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4221       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4222          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4223       }
4224    }
4225 }
4226
4227 static void
4228 exec_atomop_buf(struct tgsi_exec_machine *mach,
4229                 const struct tgsi_full_instruction *inst)
4230 {
4231    union tgsi_exec_channel r[4];
4232    union tgsi_exec_channel value[4], value2[4];
4233    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4234    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4235    struct tgsi_buffer_params params;
4236    int i, j;
4237    uint unit, chan;
4238    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4239
4240    unit = fetch_sampler_unit(mach, inst, 0);
4241
4242    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4243    params.unit = unit;
4244    params.writemask = inst->Dst[0].Register.WriteMask;
4245
4246    IFETCH(&r[0], 1, TGSI_CHAN_X);
4247
4248    for (i = 0; i < 4; i++) {
4249       FETCH(&value[i], 2, TGSI_CHAN_X + i);
4250       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4251          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4252    }
4253
4254    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4255       rgba[0][j] = value[0].f[j];
4256       rgba[1][j] = value[1].f[j];
4257       rgba[2][j] = value[2].f[j];
4258       rgba[3][j] = value[3].f[j];
4259    }
4260    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4261       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4262          rgba2[0][j] = value2[0].f[j];
4263          rgba2[1][j] = value2[1].f[j];
4264          rgba2[2][j] = value2[2].f[j];
4265          rgba2[3][j] = value2[3].f[j];
4266       }
4267    }
4268
4269    mach->Buffer->op(mach->Buffer, &params, inst->Instruction.Opcode,
4270                    r[0].i,
4271                    rgba, rgba2);
4272
4273    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4274       r[0].f[j] = rgba[0][j];
4275       r[1].f[j] = rgba[1][j];
4276       r[2].f[j] = rgba[2][j];
4277       r[3].f[j] = rgba[3][j];
4278    }
4279    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4280       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4281          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4282       }
4283    }
4284 }
4285
4286 static void
4287 exec_atomop_mem(struct tgsi_exec_machine *mach,
4288                 const struct tgsi_full_instruction *inst)
4289 {
4290    union tgsi_exec_channel r[4];
4291    union tgsi_exec_channel value[4], value2[4];
4292    char *ptr = mach->LocalMem;
4293    uint32_t val;
4294    uint chan, i;
4295    uint32_t offset;
4296    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4297    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4298    IFETCH(&r[0], 1, TGSI_CHAN_X);
4299
4300    if (r[0].u[0] >= mach->LocalMemSize)
4301       return;
4302
4303    offset = r[0].u[0];
4304    ptr += offset;
4305    for (i = 0; i < 4; i++) {
4306       FETCH(&value[i], 2, TGSI_CHAN_X + i);
4307       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4308          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4309    }
4310
4311    memcpy(&r[0].u[0], ptr, 4);
4312    val = r[0].u[0];
4313    switch (inst->Instruction.Opcode) {
4314    case TGSI_OPCODE_ATOMUADD:
4315       val += value[0].u[0];
4316       break;
4317    case TGSI_OPCODE_ATOMXOR:
4318       val ^= value[0].u[0];
4319       break;
4320    case TGSI_OPCODE_ATOMOR:
4321       val |= value[0].u[0];
4322       break;
4323    case TGSI_OPCODE_ATOMAND:
4324       val &= value[0].u[0];
4325       break;
4326    case TGSI_OPCODE_ATOMUMIN:
4327       val = MIN2(val, value[0].u[0]);
4328       break;
4329    case TGSI_OPCODE_ATOMUMAX:
4330       val = MAX2(val, value[0].u[0]);
4331       break;
4332    case TGSI_OPCODE_ATOMIMIN:
4333       val = MIN2(r[0].i[0], value[0].i[0]);
4334       break;
4335    case TGSI_OPCODE_ATOMIMAX:
4336       val = MAX2(r[0].i[0], value[0].i[0]);
4337       break;
4338    case TGSI_OPCODE_ATOMXCHG:
4339       val = value[0].i[0];
4340       break;
4341    case TGSI_OPCODE_ATOMCAS:
4342       if (val == value[0].u[0])
4343          val = value2[0].u[0];
4344       break;
4345    case TGSI_OPCODE_ATOMFADD:
4346       val = fui(r[0].f[0] + value[0].f[0]);
4347       break;
4348    default:
4349       break;
4350    }
4351    for (i = 0; i < TGSI_QUAD_SIZE; i++)
4352       if (execmask & (1 << i))
4353          memcpy(ptr, &val, 4);
4354
4355    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4356       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4357          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4358       }
4359    }
4360 }
4361
4362 static void
4363 exec_atomop(struct tgsi_exec_machine *mach,
4364             const struct tgsi_full_instruction *inst)
4365 {
4366    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4367       exec_atomop_img(mach, inst);
4368    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
4369       exec_atomop_buf(mach, inst);
4370    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
4371       exec_atomop_mem(mach, inst);
4372 }
4373
4374 static void
4375 exec_resq_img(struct tgsi_exec_machine *mach,
4376               const struct tgsi_full_instruction *inst)
4377 {
4378    int result[4];
4379    union tgsi_exec_channel r[4];
4380    uint unit;
4381    int i, chan, j;
4382    struct tgsi_image_params params;
4383    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4384
4385    unit = fetch_sampler_unit(mach, inst, 0);
4386
4387    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4388    params.unit = unit;
4389    params.tgsi_tex_instr = inst->Memory.Texture;
4390    params.format = inst->Memory.Format;
4391
4392    mach->Image->get_dims(mach->Image, &params, result);
4393
4394    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4395       for (j = 0; j < 4; j++) {
4396          r[j].i[i] = result[j];
4397       }
4398    }
4399
4400    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4401       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4402          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4403                     TGSI_EXEC_DATA_INT);
4404       }
4405    }
4406 }
4407
4408 static void
4409 exec_resq_buf(struct tgsi_exec_machine *mach,
4410               const struct tgsi_full_instruction *inst)
4411 {
4412    int result;
4413    union tgsi_exec_channel r[4];
4414    uint unit;
4415    int i, chan;
4416    struct tgsi_buffer_params params;
4417    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4418
4419    unit = fetch_sampler_unit(mach, inst, 0);
4420
4421    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4422    params.unit = unit;
4423
4424    mach->Buffer->get_dims(mach->Buffer, &params, &result);
4425
4426    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4427       r[0].i[i] = result;
4428    }
4429
4430    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4431       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4432          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4433                     TGSI_EXEC_DATA_INT);
4434       }
4435    }
4436 }
4437
4438 static void
4439 exec_resq(struct tgsi_exec_machine *mach,
4440           const struct tgsi_full_instruction *inst)
4441 {
4442    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4443       exec_resq_img(mach, inst);
4444    else
4445       exec_resq_buf(mach, inst);
4446 }
4447
4448 static void
4449 micro_f2u64(union tgsi_double_channel *dst,
4450             const union tgsi_exec_channel *src)
4451 {
4452    dst->u64[0] = (uint64_t)src->f[0];
4453    dst->u64[1] = (uint64_t)src->f[1];
4454    dst->u64[2] = (uint64_t)src->f[2];
4455    dst->u64[3] = (uint64_t)src->f[3];
4456 }
4457
4458 static void
4459 micro_f2i64(union tgsi_double_channel *dst,
4460             const union tgsi_exec_channel *src)
4461 {
4462    dst->i64[0] = (int64_t)src->f[0];
4463    dst->i64[1] = (int64_t)src->f[1];
4464    dst->i64[2] = (int64_t)src->f[2];
4465    dst->i64[3] = (int64_t)src->f[3];
4466 }
4467
4468 static void
4469 micro_u2i64(union tgsi_double_channel *dst,
4470             const union tgsi_exec_channel *src)
4471 {
4472    dst->u64[0] = (uint64_t)src->u[0];
4473    dst->u64[1] = (uint64_t)src->u[1];
4474    dst->u64[2] = (uint64_t)src->u[2];
4475    dst->u64[3] = (uint64_t)src->u[3];
4476 }
4477
4478 static void
4479 micro_i2i64(union tgsi_double_channel *dst,
4480             const union tgsi_exec_channel *src)
4481 {
4482    dst->i64[0] = (int64_t)src->i[0];
4483    dst->i64[1] = (int64_t)src->i[1];
4484    dst->i64[2] = (int64_t)src->i[2];
4485    dst->i64[3] = (int64_t)src->i[3];
4486 }
4487
4488 static void
4489 micro_d2u64(union tgsi_double_channel *dst,
4490            const union tgsi_double_channel *src)
4491 {
4492    dst->u64[0] = (uint64_t)src->d[0];
4493    dst->u64[1] = (uint64_t)src->d[1];
4494    dst->u64[2] = (uint64_t)src->d[2];
4495    dst->u64[3] = (uint64_t)src->d[3];
4496 }
4497
4498 static void
4499 micro_d2i64(union tgsi_double_channel *dst,
4500            const union tgsi_double_channel *src)
4501 {
4502    dst->i64[0] = (int64_t)src->d[0];
4503    dst->i64[1] = (int64_t)src->d[1];
4504    dst->i64[2] = (int64_t)src->d[2];
4505    dst->i64[3] = (int64_t)src->d[3];
4506 }
4507
4508 static void
4509 micro_u642d(union tgsi_double_channel *dst,
4510            const union tgsi_double_channel *src)
4511 {
4512    dst->d[0] = (double)src->u64[0];
4513    dst->d[1] = (double)src->u64[1];
4514    dst->d[2] = (double)src->u64[2];
4515    dst->d[3] = (double)src->u64[3];
4516 }
4517
4518 static void
4519 micro_i642d(union tgsi_double_channel *dst,
4520            const union tgsi_double_channel *src)
4521 {
4522    dst->d[0] = (double)src->i64[0];
4523    dst->d[1] = (double)src->i64[1];
4524    dst->d[2] = (double)src->i64[2];
4525    dst->d[3] = (double)src->i64[3];
4526 }
4527
4528 static void
4529 micro_u642f(union tgsi_exec_channel *dst,
4530             const union tgsi_double_channel *src)
4531 {
4532    dst->f[0] = (float)src->u64[0];
4533    dst->f[1] = (float)src->u64[1];
4534    dst->f[2] = (float)src->u64[2];
4535    dst->f[3] = (float)src->u64[3];
4536 }
4537
4538 static void
4539 micro_i642f(union tgsi_exec_channel *dst,
4540             const union tgsi_double_channel *src)
4541 {
4542    dst->f[0] = (float)src->i64[0];
4543    dst->f[1] = (float)src->i64[1];
4544    dst->f[2] = (float)src->i64[2];
4545    dst->f[3] = (float)src->i64[3];
4546 }
4547
4548 static void
4549 exec_t_2_64(struct tgsi_exec_machine *mach,
4550           const struct tgsi_full_instruction *inst,
4551           micro_dop_s op,
4552           enum tgsi_exec_datatype src_datatype)
4553 {
4554    union tgsi_exec_channel src;
4555    union tgsi_double_channel dst;
4556
4557    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
4558       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
4559       op(&dst, &src);
4560       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
4561    }
4562    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
4563       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, src_datatype);
4564       op(&dst, &src);
4565       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
4566    }
4567 }
4568
4569 static void
4570 exec_64_2_t(struct tgsi_exec_machine *mach,
4571             const struct tgsi_full_instruction *inst,
4572             micro_sop_d op,
4573             enum tgsi_exec_datatype dst_datatype)
4574 {
4575    union tgsi_double_channel src;
4576    union tgsi_exec_channel dst;
4577    int wm = inst->Dst[0].Register.WriteMask;
4578    int i;
4579    int bit;
4580    for (i = 0; i < 2; i++) {
4581       bit = ffs(wm);
4582       if (bit) {
4583          wm &= ~(1 << (bit - 1));
4584          if (i == 0)
4585             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
4586          else
4587             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
4588          op(&dst, &src);
4589          store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1, dst_datatype);
4590       }
4591    }
4592 }
4593
4594 static void
4595 micro_i2f(union tgsi_exec_channel *dst,
4596           const union tgsi_exec_channel *src)
4597 {
4598    dst->f[0] = (float)src->i[0];
4599    dst->f[1] = (float)src->i[1];
4600    dst->f[2] = (float)src->i[2];
4601    dst->f[3] = (float)src->i[3];
4602 }
4603
4604 static void
4605 micro_not(union tgsi_exec_channel *dst,
4606           const union tgsi_exec_channel *src)
4607 {
4608    dst->u[0] = ~src->u[0];
4609    dst->u[1] = ~src->u[1];
4610    dst->u[2] = ~src->u[2];
4611    dst->u[3] = ~src->u[3];
4612 }
4613
4614 static void
4615 micro_shl(union tgsi_exec_channel *dst,
4616           const union tgsi_exec_channel *src0,
4617           const union tgsi_exec_channel *src1)
4618 {
4619    unsigned masked_count;
4620    masked_count = src1->u[0] & 0x1f;
4621    dst->u[0] = src0->u[0] << masked_count;
4622    masked_count = src1->u[1] & 0x1f;
4623    dst->u[1] = src0->u[1] << masked_count;
4624    masked_count = src1->u[2] & 0x1f;
4625    dst->u[2] = src0->u[2] << masked_count;
4626    masked_count = src1->u[3] & 0x1f;
4627    dst->u[3] = src0->u[3] << masked_count;
4628 }
4629
4630 static void
4631 micro_and(union tgsi_exec_channel *dst,
4632           const union tgsi_exec_channel *src0,
4633           const union tgsi_exec_channel *src1)
4634 {
4635    dst->u[0] = src0->u[0] & src1->u[0];
4636    dst->u[1] = src0->u[1] & src1->u[1];
4637    dst->u[2] = src0->u[2] & src1->u[2];
4638    dst->u[3] = src0->u[3] & src1->u[3];
4639 }
4640
4641 static void
4642 micro_or(union tgsi_exec_channel *dst,
4643          const union tgsi_exec_channel *src0,
4644          const union tgsi_exec_channel *src1)
4645 {
4646    dst->u[0] = src0->u[0] | src1->u[0];
4647    dst->u[1] = src0->u[1] | src1->u[1];
4648    dst->u[2] = src0->u[2] | src1->u[2];
4649    dst->u[3] = src0->u[3] | src1->u[3];
4650 }
4651
4652 static void
4653 micro_xor(union tgsi_exec_channel *dst,
4654           const union tgsi_exec_channel *src0,
4655           const union tgsi_exec_channel *src1)
4656 {
4657    dst->u[0] = src0->u[0] ^ src1->u[0];
4658    dst->u[1] = src0->u[1] ^ src1->u[1];
4659    dst->u[2] = src0->u[2] ^ src1->u[2];
4660    dst->u[3] = src0->u[3] ^ src1->u[3];
4661 }
4662
4663 static void
4664 micro_mod(union tgsi_exec_channel *dst,
4665           const union tgsi_exec_channel *src0,
4666           const union tgsi_exec_channel *src1)
4667 {
4668    dst->i[0] = src1->i[0] ? src0->i[0] % src1->i[0] : ~0;
4669    dst->i[1] = src1->i[1] ? src0->i[1] % src1->i[1] : ~0;
4670    dst->i[2] = src1->i[2] ? src0->i[2] % src1->i[2] : ~0;
4671    dst->i[3] = src1->i[3] ? src0->i[3] % src1->i[3] : ~0;
4672 }
4673
4674 static void
4675 micro_f2i(union tgsi_exec_channel *dst,
4676           const union tgsi_exec_channel *src)
4677 {
4678    dst->i[0] = (int)src->f[0];
4679    dst->i[1] = (int)src->f[1];
4680    dst->i[2] = (int)src->f[2];
4681    dst->i[3] = (int)src->f[3];
4682 }
4683
4684 static void
4685 micro_fseq(union tgsi_exec_channel *dst,
4686            const union tgsi_exec_channel *src0,
4687            const union tgsi_exec_channel *src1)
4688 {
4689    dst->u[0] = src0->f[0] == src1->f[0] ? ~0 : 0;
4690    dst->u[1] = src0->f[1] == src1->f[1] ? ~0 : 0;
4691    dst->u[2] = src0->f[2] == src1->f[2] ? ~0 : 0;
4692    dst->u[3] = src0->f[3] == src1->f[3] ? ~0 : 0;
4693 }
4694
4695 static void
4696 micro_fsge(union tgsi_exec_channel *dst,
4697            const union tgsi_exec_channel *src0,
4698            const union tgsi_exec_channel *src1)
4699 {
4700    dst->u[0] = src0->f[0] >= src1->f[0] ? ~0 : 0;
4701    dst->u[1] = src0->f[1] >= src1->f[1] ? ~0 : 0;
4702    dst->u[2] = src0->f[2] >= src1->f[2] ? ~0 : 0;
4703    dst->u[3] = src0->f[3] >= src1->f[3] ? ~0 : 0;
4704 }
4705
4706 static void
4707 micro_fslt(union tgsi_exec_channel *dst,
4708            const union tgsi_exec_channel *src0,
4709            const union tgsi_exec_channel *src1)
4710 {
4711    dst->u[0] = src0->f[0] < src1->f[0] ? ~0 : 0;
4712    dst->u[1] = src0->f[1] < src1->f[1] ? ~0 : 0;
4713    dst->u[2] = src0->f[2] < src1->f[2] ? ~0 : 0;
4714    dst->u[3] = src0->f[3] < src1->f[3] ? ~0 : 0;
4715 }
4716
4717 static void
4718 micro_fsne(union tgsi_exec_channel *dst,
4719            const union tgsi_exec_channel *src0,
4720            const union tgsi_exec_channel *src1)
4721 {
4722    dst->u[0] = src0->f[0] != src1->f[0] ? ~0 : 0;
4723    dst->u[1] = src0->f[1] != src1->f[1] ? ~0 : 0;
4724    dst->u[2] = src0->f[2] != src1->f[2] ? ~0 : 0;
4725    dst->u[3] = src0->f[3] != src1->f[3] ? ~0 : 0;
4726 }
4727
4728 static void
4729 micro_idiv(union tgsi_exec_channel *dst,
4730            const union tgsi_exec_channel *src0,
4731            const union tgsi_exec_channel *src1)
4732 {
4733    dst->i[0] = src1->i[0] ? src0->i[0] / src1->i[0] : 0;
4734    dst->i[1] = src1->i[1] ? src0->i[1] / src1->i[1] : 0;
4735    dst->i[2] = src1->i[2] ? src0->i[2] / src1->i[2] : 0;
4736    dst->i[3] = src1->i[3] ? src0->i[3] / src1->i[3] : 0;
4737 }
4738
4739 static void
4740 micro_imax(union tgsi_exec_channel *dst,
4741            const union tgsi_exec_channel *src0,
4742            const union tgsi_exec_channel *src1)
4743 {
4744    dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
4745    dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
4746    dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
4747    dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
4748 }
4749
4750 static void
4751 micro_imin(union tgsi_exec_channel *dst,
4752            const union tgsi_exec_channel *src0,
4753            const union tgsi_exec_channel *src1)
4754 {
4755    dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
4756    dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
4757    dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
4758    dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
4759 }
4760
4761 static void
4762 micro_isge(union tgsi_exec_channel *dst,
4763            const union tgsi_exec_channel *src0,
4764            const union tgsi_exec_channel *src1)
4765 {
4766    dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
4767    dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
4768    dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
4769    dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
4770 }
4771
4772 static void
4773 micro_ishr(union tgsi_exec_channel *dst,
4774            const union tgsi_exec_channel *src0,
4775            const union tgsi_exec_channel *src1)
4776 {
4777    unsigned masked_count;
4778    masked_count = src1->i[0] & 0x1f;
4779    dst->i[0] = src0->i[0] >> masked_count;
4780    masked_count = src1->i[1] & 0x1f;
4781    dst->i[1] = src0->i[1] >> masked_count;
4782    masked_count = src1->i[2] & 0x1f;
4783    dst->i[2] = src0->i[2] >> masked_count;
4784    masked_count = src1->i[3] & 0x1f;
4785    dst->i[3] = src0->i[3] >> masked_count;
4786 }
4787
4788 static void
4789 micro_islt(union tgsi_exec_channel *dst,
4790            const union tgsi_exec_channel *src0,
4791            const union tgsi_exec_channel *src1)
4792 {
4793    dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
4794    dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
4795    dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
4796    dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
4797 }
4798
4799 static void
4800 micro_f2u(union tgsi_exec_channel *dst,
4801           const union tgsi_exec_channel *src)
4802 {
4803    dst->u[0] = (uint)src->f[0];
4804    dst->u[1] = (uint)src->f[1];
4805    dst->u[2] = (uint)src->f[2];
4806    dst->u[3] = (uint)src->f[3];
4807 }
4808
4809 static void
4810 micro_u2f(union tgsi_exec_channel *dst,
4811           const union tgsi_exec_channel *src)
4812 {
4813    dst->f[0] = (float)src->u[0];
4814    dst->f[1] = (float)src->u[1];
4815    dst->f[2] = (float)src->u[2];
4816    dst->f[3] = (float)src->u[3];
4817 }
4818
4819 static void
4820 micro_uadd(union tgsi_exec_channel *dst,
4821            const union tgsi_exec_channel *src0,
4822            const union tgsi_exec_channel *src1)
4823 {
4824    dst->u[0] = src0->u[0] + src1->u[0];
4825    dst->u[1] = src0->u[1] + src1->u[1];
4826    dst->u[2] = src0->u[2] + src1->u[2];
4827    dst->u[3] = src0->u[3] + src1->u[3];
4828 }
4829
4830 static void
4831 micro_udiv(union tgsi_exec_channel *dst,
4832            const union tgsi_exec_channel *src0,
4833            const union tgsi_exec_channel *src1)
4834 {
4835    dst->u[0] = src1->u[0] ? src0->u[0] / src1->u[0] : ~0u;
4836    dst->u[1] = src1->u[1] ? src0->u[1] / src1->u[1] : ~0u;
4837    dst->u[2] = src1->u[2] ? src0->u[2] / src1->u[2] : ~0u;
4838    dst->u[3] = src1->u[3] ? src0->u[3] / src1->u[3] : ~0u;
4839 }
4840
4841 static void
4842 micro_umad(union tgsi_exec_channel *dst,
4843            const union tgsi_exec_channel *src0,
4844            const union tgsi_exec_channel *src1,
4845            const union tgsi_exec_channel *src2)
4846 {
4847    dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
4848    dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
4849    dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
4850    dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
4851 }
4852
4853 static void
4854 micro_umax(union tgsi_exec_channel *dst,
4855            const union tgsi_exec_channel *src0,
4856            const union tgsi_exec_channel *src1)
4857 {
4858    dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
4859    dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
4860    dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
4861    dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
4862 }
4863
4864 static void
4865 micro_umin(union tgsi_exec_channel *dst,
4866            const union tgsi_exec_channel *src0,
4867            const union tgsi_exec_channel *src1)
4868 {
4869    dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
4870    dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
4871    dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
4872    dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
4873 }
4874
4875 static void
4876 micro_umod(union tgsi_exec_channel *dst,
4877            const union tgsi_exec_channel *src0,
4878            const union tgsi_exec_channel *src1)
4879 {
4880    dst->u[0] = src1->u[0] ? src0->u[0] % src1->u[0] : ~0u;
4881    dst->u[1] = src1->u[1] ? src0->u[1] % src1->u[1] : ~0u;
4882    dst->u[2] = src1->u[2] ? src0->u[2] % src1->u[2] : ~0u;
4883    dst->u[3] = src1->u[3] ? src0->u[3] % src1->u[3] : ~0u;
4884 }
4885
4886 static void
4887 micro_umul(union tgsi_exec_channel *dst,
4888            const union tgsi_exec_channel *src0,
4889            const union tgsi_exec_channel *src1)
4890 {
4891    dst->u[0] = src0->u[0] * src1->u[0];
4892    dst->u[1] = src0->u[1] * src1->u[1];
4893    dst->u[2] = src0->u[2] * src1->u[2];
4894    dst->u[3] = src0->u[3] * src1->u[3];
4895 }
4896
4897 static void
4898 micro_imul_hi(union tgsi_exec_channel *dst,
4899               const union tgsi_exec_channel *src0,
4900               const union tgsi_exec_channel *src1)
4901 {
4902 #define I64M(x, y) ((((int64_t)x) * ((int64_t)y)) >> 32)
4903    dst->i[0] = I64M(src0->i[0], src1->i[0]);
4904    dst->i[1] = I64M(src0->i[1], src1->i[1]);
4905    dst->i[2] = I64M(src0->i[2], src1->i[2]);
4906    dst->i[3] = I64M(src0->i[3], src1->i[3]);
4907 #undef I64M
4908 }
4909
4910 static void
4911 micro_umul_hi(union tgsi_exec_channel *dst,
4912               const union tgsi_exec_channel *src0,
4913               const union tgsi_exec_channel *src1)
4914 {
4915 #define U64M(x, y) ((((uint64_t)x) * ((uint64_t)y)) >> 32)
4916    dst->u[0] = U64M(src0->u[0], src1->u[0]);
4917    dst->u[1] = U64M(src0->u[1], src1->u[1]);
4918    dst->u[2] = U64M(src0->u[2], src1->u[2]);
4919    dst->u[3] = U64M(src0->u[3], src1->u[3]);
4920 #undef U64M
4921 }
4922
4923 static void
4924 micro_useq(union tgsi_exec_channel *dst,
4925            const union tgsi_exec_channel *src0,
4926            const union tgsi_exec_channel *src1)
4927 {
4928    dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
4929    dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
4930    dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
4931    dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
4932 }
4933
4934 static void
4935 micro_usge(union tgsi_exec_channel *dst,
4936            const union tgsi_exec_channel *src0,
4937            const union tgsi_exec_channel *src1)
4938 {
4939    dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
4940    dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
4941    dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
4942    dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
4943 }
4944
4945 static void
4946 micro_ushr(union tgsi_exec_channel *dst,
4947            const union tgsi_exec_channel *src0,
4948            const union tgsi_exec_channel *src1)
4949 {
4950    unsigned masked_count;
4951    masked_count = src1->u[0] & 0x1f;
4952    dst->u[0] = src0->u[0] >> masked_count;
4953    masked_count = src1->u[1] & 0x1f;
4954    dst->u[1] = src0->u[1] >> masked_count;
4955    masked_count = src1->u[2] & 0x1f;
4956    dst->u[2] = src0->u[2] >> masked_count;
4957    masked_count = src1->u[3] & 0x1f;
4958    dst->u[3] = src0->u[3] >> masked_count;
4959 }
4960
4961 static void
4962 micro_uslt(union tgsi_exec_channel *dst,
4963            const union tgsi_exec_channel *src0,
4964            const union tgsi_exec_channel *src1)
4965 {
4966    dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
4967    dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
4968    dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
4969    dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
4970 }
4971
4972 static void
4973 micro_usne(union tgsi_exec_channel *dst,
4974            const union tgsi_exec_channel *src0,
4975            const union tgsi_exec_channel *src1)
4976 {
4977    dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
4978    dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
4979    dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
4980    dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
4981 }
4982
4983 static void
4984 micro_uarl(union tgsi_exec_channel *dst,
4985            const union tgsi_exec_channel *src)
4986 {
4987    dst->i[0] = src->u[0];
4988    dst->i[1] = src->u[1];
4989    dst->i[2] = src->u[2];
4990    dst->i[3] = src->u[3];
4991 }
4992
4993 /**
4994  * Signed bitfield extract (i.e. sign-extend the extracted bits)
4995  */
4996 static void
4997 micro_ibfe(union tgsi_exec_channel *dst,
4998            const union tgsi_exec_channel *src0,
4999            const union tgsi_exec_channel *src1,
5000            const union tgsi_exec_channel *src2)
5001 {
5002    int i;
5003    for (i = 0; i < 4; i++) {
5004       int width = src2->i[i];
5005       int offset = src1->i[i] & 0x1f;
5006       if (width == 32 && offset == 0) {
5007          dst->i[i] = src0->i[i];
5008          continue;
5009       }
5010       width &= 0x1f;
5011       if (width == 0)
5012          dst->i[i] = 0;
5013       else if (width + offset < 32)
5014          dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
5015       else
5016          dst->i[i] = src0->i[i] >> offset;
5017    }
5018 }
5019
5020 /**
5021  * Unsigned bitfield extract
5022  */
5023 static void
5024 micro_ubfe(union tgsi_exec_channel *dst,
5025            const union tgsi_exec_channel *src0,
5026            const union tgsi_exec_channel *src1,
5027            const union tgsi_exec_channel *src2)
5028 {
5029    int i;
5030    for (i = 0; i < 4; i++) {
5031       int width = src2->u[i];
5032       int offset = src1->u[i] & 0x1f;
5033       if (width == 32 && offset == 0) {
5034          dst->u[i] = src0->u[i];
5035          continue;
5036       }
5037       width &= 0x1f;
5038       if (width == 0)
5039          dst->u[i] = 0;
5040       else if (width + offset < 32)
5041          dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
5042       else
5043          dst->u[i] = src0->u[i] >> offset;
5044    }
5045 }
5046
5047 /**
5048  * Bitfield insert: copy low bits from src1 into a region of src0.
5049  */
5050 static void
5051 micro_bfi(union tgsi_exec_channel *dst,
5052           const union tgsi_exec_channel *src0,
5053           const union tgsi_exec_channel *src1,
5054           const union tgsi_exec_channel *src2,
5055           const union tgsi_exec_channel *src3)
5056 {
5057    int i;
5058    for (i = 0; i < 4; i++) {
5059       int width = src3->u[i];
5060       int offset = src2->u[i] & 0x1f;
5061       if (width == 32) {
5062          dst->u[i] = src1->u[i];
5063       } else {
5064          int bitmask = ((1 << width) - 1) << offset;
5065          dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
5066       }
5067    }
5068 }
5069
5070 static void
5071 micro_brev(union tgsi_exec_channel *dst,
5072            const union tgsi_exec_channel *src)
5073 {
5074    dst->u[0] = util_bitreverse(src->u[0]);
5075    dst->u[1] = util_bitreverse(src->u[1]);
5076    dst->u[2] = util_bitreverse(src->u[2]);
5077    dst->u[3] = util_bitreverse(src->u[3]);
5078 }
5079
5080 static void
5081 micro_popc(union tgsi_exec_channel *dst,
5082            const union tgsi_exec_channel *src)
5083 {
5084    dst->u[0] = util_bitcount(src->u[0]);
5085    dst->u[1] = util_bitcount(src->u[1]);
5086    dst->u[2] = util_bitcount(src->u[2]);
5087    dst->u[3] = util_bitcount(src->u[3]);
5088 }
5089
5090 static void
5091 micro_lsb(union tgsi_exec_channel *dst,
5092           const union tgsi_exec_channel *src)
5093 {
5094    dst->i[0] = ffs(src->u[0]) - 1;
5095    dst->i[1] = ffs(src->u[1]) - 1;
5096    dst->i[2] = ffs(src->u[2]) - 1;
5097    dst->i[3] = ffs(src->u[3]) - 1;
5098 }
5099
5100 static void
5101 micro_imsb(union tgsi_exec_channel *dst,
5102            const union tgsi_exec_channel *src)
5103 {
5104    dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
5105    dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
5106    dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
5107    dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
5108 }
5109
5110 static void
5111 micro_umsb(union tgsi_exec_channel *dst,
5112            const union tgsi_exec_channel *src)
5113 {
5114    dst->i[0] = util_last_bit(src->u[0]) - 1;
5115    dst->i[1] = util_last_bit(src->u[1]) - 1;
5116    dst->i[2] = util_last_bit(src->u[2]) - 1;
5117    dst->i[3] = util_last_bit(src->u[3]) - 1;
5118 }
5119
5120
5121 static void
5122 exec_interp_at_sample(struct tgsi_exec_machine *mach,
5123                       const struct tgsi_full_instruction *inst)
5124 {
5125    union tgsi_exec_channel index;
5126    union tgsi_exec_channel index2D;
5127    union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
5128    const struct tgsi_full_src_register *reg = &inst->Src[0];
5129
5130    assert(reg->Register.File == TGSI_FILE_INPUT);
5131    assert(inst->Src[1].Register.File == TGSI_FILE_IMMEDIATE);
5132
5133    get_index_registers(mach, reg, &index, &index2D);
5134    float sample = mach->Imms[inst->Src[1].Register.Index][inst->Src[1].Register.SwizzleX];
5135
5136    /* Short cut: sample 0 is like a normal fetch */
5137    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
5138       if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
5139          continue;
5140
5141       fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D,
5142                              &result[chan]);
5143       if (sample != 0.0f) {
5144
5145       /* TODO: define the samples > 0, but so far we only do fake MSAA */
5146          float x = 0;
5147          float y = 0;
5148
5149          unsigned pos = index2D.i[chan] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index.i[chan];
5150          assert(pos >= 0);
5151          assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
5152          mach->InputSampleOffsetApply[pos](mach, pos, chan, x, y, &result[chan]);
5153       }
5154       store_dest(mach, &result[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
5155    }
5156 }
5157
5158
5159 static void
5160 exec_interp_at_offset(struct tgsi_exec_machine *mach,
5161                       const struct tgsi_full_instruction *inst)
5162 {
5163    union tgsi_exec_channel index;
5164    union tgsi_exec_channel index2D;
5165    union tgsi_exec_channel ofsx;
5166    union tgsi_exec_channel ofsy;
5167    const struct tgsi_full_src_register *reg = &inst->Src[0];
5168
5169    assert(reg->Register.File == TGSI_FILE_INPUT);
5170
5171    get_index_registers(mach, reg, &index, &index2D);
5172    unsigned pos = index2D.i[0] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index.i[0];
5173
5174    fetch_source(mach, &ofsx, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
5175    fetch_source(mach, &ofsy, &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
5176
5177    for (int chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
5178       if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
5179          continue;
5180       union tgsi_exec_channel result;
5181       fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D, &result);
5182       mach->InputSampleOffsetApply[pos](mach, pos, chan, ofsx.f[chan], ofsy.f[chan], &result);
5183       store_dest(mach, &result, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
5184    }
5185 }
5186
5187
5188 static void
5189 exec_interp_at_centroid(struct tgsi_exec_machine *mach,
5190                         const struct tgsi_full_instruction *inst)
5191 {
5192    union tgsi_exec_channel index;
5193    union tgsi_exec_channel index2D;
5194    union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
5195    const struct tgsi_full_src_register *reg = &inst->Src[0];
5196
5197    assert(reg->Register.File == TGSI_FILE_INPUT);
5198    get_index_registers(mach, reg, &index, &index2D);
5199
5200    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
5201       if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
5202          continue;
5203
5204       /* Here we should add the change to use a sample that lies within the
5205        * primitive (Section 15.2):
5206        *
5207        * "When interpolating variables declared using centroid in ,
5208        * the variable is sampled at a location within the pixel covered
5209        * by the primitive generating the fragment.
5210        * ...
5211        * The built-in functions interpolateAtCentroid ... will sample
5212        * variables as though they were declared with the centroid ...
5213        * qualifier[s]."
5214        *
5215        * Since we only support 1 sample currently, this is just a pass-through.
5216        */
5217       fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D,
5218                              &result[chan]);
5219       store_dest(mach, &result[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
5220    }
5221
5222 }
5223
5224
5225 /**
5226  * Execute a TGSI instruction.
5227  * Returns TRUE if a barrier instruction is hit,
5228  * otherwise FALSE.
5229  */
5230 static boolean
5231 exec_instruction(
5232    struct tgsi_exec_machine *mach,
5233    const struct tgsi_full_instruction *inst,
5234    int *pc )
5235 {
5236    union tgsi_exec_channel r[10];
5237
5238    (*pc)++;
5239
5240    switch (inst->Instruction.Opcode) {
5241    case TGSI_OPCODE_ARL:
5242       exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5243       break;
5244
5245    case TGSI_OPCODE_MOV:
5246       exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5247       break;
5248
5249    case TGSI_OPCODE_LIT:
5250       exec_lit(mach, inst);
5251       break;
5252
5253    case TGSI_OPCODE_RCP:
5254       exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5255       break;
5256
5257    case TGSI_OPCODE_RSQ:
5258       exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5259       break;
5260
5261    case TGSI_OPCODE_EXP:
5262       exec_exp(mach, inst);
5263       break;
5264
5265    case TGSI_OPCODE_LOG:
5266       exec_log(mach, inst);
5267       break;
5268
5269    case TGSI_OPCODE_MUL:
5270       exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5271       break;
5272
5273    case TGSI_OPCODE_ADD:
5274       exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5275       break;
5276
5277    case TGSI_OPCODE_DP3:
5278       exec_dp3(mach, inst);
5279       break;
5280
5281    case TGSI_OPCODE_DP4:
5282       exec_dp4(mach, inst);
5283       break;
5284
5285    case TGSI_OPCODE_DST:
5286       exec_dst(mach, inst);
5287       break;
5288
5289    case TGSI_OPCODE_MIN:
5290       exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5291       break;
5292
5293    case TGSI_OPCODE_MAX:
5294       exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5295       break;
5296
5297    case TGSI_OPCODE_SLT:
5298       exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5299       break;
5300
5301    case TGSI_OPCODE_SGE:
5302       exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5303       break;
5304
5305    case TGSI_OPCODE_MAD:
5306       exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5307       break;
5308
5309    case TGSI_OPCODE_LRP:
5310       exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5311       break;
5312
5313    case TGSI_OPCODE_SQRT:
5314       exec_scalar_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5315       break;
5316
5317    case TGSI_OPCODE_FRC:
5318       exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5319       break;
5320
5321    case TGSI_OPCODE_FLR:
5322       exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5323       break;
5324
5325    case TGSI_OPCODE_ROUND:
5326       exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5327       break;
5328
5329    case TGSI_OPCODE_EX2:
5330       exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5331       break;
5332
5333    case TGSI_OPCODE_LG2:
5334       exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5335       break;
5336
5337    case TGSI_OPCODE_POW:
5338       exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5339       break;
5340
5341    case TGSI_OPCODE_LDEXP:
5342       exec_vector_binary(mach, inst, micro_ldexp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5343       break;
5344
5345    case TGSI_OPCODE_COS:
5346       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5347       break;
5348
5349    case TGSI_OPCODE_DDX_FINE:
5350       exec_vector_unary(mach, inst, micro_ddx_fine, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5351       break;
5352
5353    case TGSI_OPCODE_DDX:
5354       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5355       break;
5356
5357    case TGSI_OPCODE_DDY_FINE:
5358       exec_vector_unary(mach, inst, micro_ddy_fine, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5359       break;
5360
5361    case TGSI_OPCODE_DDY:
5362       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5363       break;
5364
5365    case TGSI_OPCODE_KILL:
5366       exec_kill (mach);
5367       break;
5368
5369    case TGSI_OPCODE_KILL_IF:
5370       exec_kill_if (mach, inst);
5371       break;
5372
5373    case TGSI_OPCODE_PK2H:
5374       exec_pk2h(mach, inst);
5375       break;
5376
5377    case TGSI_OPCODE_PK2US:
5378       assert (0);
5379       break;
5380
5381    case TGSI_OPCODE_PK4B:
5382       assert (0);
5383       break;
5384
5385    case TGSI_OPCODE_PK4UB:
5386       assert (0);
5387       break;
5388
5389    case TGSI_OPCODE_SEQ:
5390       exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5391       break;
5392
5393    case TGSI_OPCODE_SGT:
5394       exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5395       break;
5396
5397    case TGSI_OPCODE_SIN:
5398       exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5399       break;
5400
5401    case TGSI_OPCODE_SLE:
5402       exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5403       break;
5404
5405    case TGSI_OPCODE_SNE:
5406       exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5407       break;
5408
5409    case TGSI_OPCODE_TEX:
5410       /* simple texture lookup */
5411       /* src[0] = texcoord */
5412       /* src[1] = sampler unit */
5413       exec_tex(mach, inst, TEX_MODIFIER_NONE, 1);
5414       break;
5415
5416    case TGSI_OPCODE_TXB:
5417       /* Texture lookup with lod bias */
5418       /* src[0] = texcoord (src[0].w = LOD bias) */
5419       /* src[1] = sampler unit */
5420       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 1);
5421       break;
5422
5423    case TGSI_OPCODE_TXD:
5424       /* Texture lookup with explicit partial derivatives */
5425       /* src[0] = texcoord */
5426       /* src[1] = d[strq]/dx */
5427       /* src[2] = d[strq]/dy */
5428       /* src[3] = sampler unit */
5429       exec_txd(mach, inst);
5430       break;
5431
5432    case TGSI_OPCODE_TXL:
5433       /* Texture lookup with explicit LOD */
5434       /* src[0] = texcoord (src[0].w = LOD) */
5435       /* src[1] = sampler unit */
5436       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 1);
5437       break;
5438
5439    case TGSI_OPCODE_TXP:
5440       /* Texture lookup with projection */
5441       /* src[0] = texcoord (src[0].w = projection) */
5442       /* src[1] = sampler unit */
5443       exec_tex(mach, inst, TEX_MODIFIER_PROJECTED, 1);
5444       break;
5445
5446    case TGSI_OPCODE_TG4:
5447       /* src[0] = texcoord */
5448       /* src[1] = component */
5449       /* src[2] = sampler unit */
5450       exec_tex(mach, inst, TEX_MODIFIER_GATHER, 2);
5451       break;
5452
5453    case TGSI_OPCODE_LODQ:
5454       /* src[0] = texcoord */
5455       /* src[1] = sampler unit */
5456       exec_lodq(mach, inst);
5457       break;
5458
5459    case TGSI_OPCODE_UP2H:
5460       exec_up2h(mach, inst);
5461       break;
5462
5463    case TGSI_OPCODE_UP2US:
5464       assert (0);
5465       break;
5466
5467    case TGSI_OPCODE_UP4B:
5468       assert (0);
5469       break;
5470
5471    case TGSI_OPCODE_UP4UB:
5472       assert (0);
5473       break;
5474
5475    case TGSI_OPCODE_ARR:
5476       exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5477       break;
5478
5479    case TGSI_OPCODE_CAL:
5480       /* skip the call if no execution channels are enabled */
5481       if (mach->ExecMask) {
5482          /* do the call */
5483
5484          /* First, record the depths of the execution stacks.
5485           * This is important for deeply nested/looped return statements.
5486           * We have to unwind the stacks by the correct amount.  For a
5487           * real code generator, we could determine the number of entries
5488           * to pop off each stack with simple static analysis and avoid
5489           * implementing this data structure at run time.
5490           */
5491          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
5492          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
5493          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
5494          mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
5495          mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
5496          /* note that PC was already incremented above */
5497          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
5498
5499          mach->CallStackTop++;
5500
5501          /* Second, push the Cond, Loop, Cont, Switch, Break, Func stacks */
5502          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5503          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5504          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5505          assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
5506          assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5507          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
5508
5509          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5510          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5511          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5512          mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
5513          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5514          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
5515
5516          /* Finally, jump to the subroutine.  The label is a pointer
5517           * (an instruction number) to the BGNSUB instruction.
5518           */
5519          *pc = inst->Label.Label;
5520          assert(mach->Instructions[*pc].Instruction.Opcode
5521                 == TGSI_OPCODE_BGNSUB);
5522       }
5523       break;
5524
5525    case TGSI_OPCODE_RET:
5526       mach->FuncMask &= ~mach->ExecMask;
5527       UPDATE_EXEC_MASK(mach);
5528
5529       if (mach->FuncMask == 0x0) {
5530          /* really return now (otherwise, keep executing) */
5531
5532          if (mach->CallStackTop == 0) {
5533             /* returning from main() */
5534             mach->CondStackTop = 0;
5535             mach->LoopStackTop = 0;
5536             mach->ContStackTop = 0;
5537             mach->LoopLabelStackTop = 0;
5538             mach->SwitchStackTop = 0;
5539             mach->BreakStackTop = 0;
5540             *pc = -1;
5541             return FALSE;
5542          }
5543
5544          assert(mach->CallStackTop > 0);
5545          mach->CallStackTop--;
5546
5547          mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5548          mach->CondMask = mach->CondStack[mach->CondStackTop];
5549
5550          mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5551          mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5552
5553          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5554          mach->ContMask = mach->ContStack[mach->ContStackTop];
5555
5556          mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5557          mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5558
5559          mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5560          mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5561
5562          assert(mach->FuncStackTop > 0);
5563          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5564
5565          *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5566
5567          UPDATE_EXEC_MASK(mach);
5568       }
5569       break;
5570
5571    case TGSI_OPCODE_SSG:
5572       exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5573       break;
5574
5575    case TGSI_OPCODE_CMP:
5576       exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5577       break;
5578
5579    case TGSI_OPCODE_DIV:
5580       exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5581       break;
5582
5583    case TGSI_OPCODE_DP2:
5584       exec_dp2(mach, inst);
5585       break;
5586
5587    case TGSI_OPCODE_IF:
5588       /* push CondMask */
5589       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5590       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5591       FETCH( &r[0], 0, TGSI_CHAN_X );
5592       /* update CondMask */
5593       if( ! r[0].f[0] ) {
5594          mach->CondMask &= ~0x1;
5595       }
5596       if( ! r[0].f[1] ) {
5597          mach->CondMask &= ~0x2;
5598       }
5599       if( ! r[0].f[2] ) {
5600          mach->CondMask &= ~0x4;
5601       }
5602       if( ! r[0].f[3] ) {
5603          mach->CondMask &= ~0x8;
5604       }
5605       UPDATE_EXEC_MASK(mach);
5606       /* Todo: If CondMask==0, jump to ELSE */
5607       break;
5608
5609    case TGSI_OPCODE_UIF:
5610       /* push CondMask */
5611       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5612       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5613       IFETCH( &r[0], 0, TGSI_CHAN_X );
5614       /* update CondMask */
5615       if( ! r[0].u[0] ) {
5616          mach->CondMask &= ~0x1;
5617       }
5618       if( ! r[0].u[1] ) {
5619          mach->CondMask &= ~0x2;
5620       }
5621       if( ! r[0].u[2] ) {
5622          mach->CondMask &= ~0x4;
5623       }
5624       if( ! r[0].u[3] ) {
5625          mach->CondMask &= ~0x8;
5626       }
5627       UPDATE_EXEC_MASK(mach);
5628       /* Todo: If CondMask==0, jump to ELSE */
5629       break;
5630
5631    case TGSI_OPCODE_ELSE:
5632       /* invert CondMask wrt previous mask */
5633       {
5634          uint prevMask;
5635          assert(mach->CondStackTop > 0);
5636          prevMask = mach->CondStack[mach->CondStackTop - 1];
5637          mach->CondMask = ~mach->CondMask & prevMask;
5638          UPDATE_EXEC_MASK(mach);
5639          /* Todo: If CondMask==0, jump to ENDIF */
5640       }
5641       break;
5642
5643    case TGSI_OPCODE_ENDIF:
5644       /* pop CondMask */
5645       assert(mach->CondStackTop > 0);
5646       mach->CondMask = mach->CondStack[--mach->CondStackTop];
5647       UPDATE_EXEC_MASK(mach);
5648       break;
5649
5650    case TGSI_OPCODE_END:
5651       /* make sure we end primitives which haven't
5652        * been explicitly emitted */
5653       conditional_emit_primitive(mach);
5654       /* halt execution */
5655       *pc = -1;
5656       break;
5657
5658    case TGSI_OPCODE_CEIL:
5659       exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5660       break;
5661
5662    case TGSI_OPCODE_I2F:
5663       exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
5664       break;
5665
5666    case TGSI_OPCODE_NOT:
5667       exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5668       break;
5669
5670    case TGSI_OPCODE_TRUNC:
5671       exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5672       break;
5673
5674    case TGSI_OPCODE_SHL:
5675       exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5676       break;
5677
5678    case TGSI_OPCODE_AND:
5679       exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5680       break;
5681
5682    case TGSI_OPCODE_OR:
5683       exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5684       break;
5685
5686    case TGSI_OPCODE_MOD:
5687       exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5688       break;
5689
5690    case TGSI_OPCODE_XOR:
5691       exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5692       break;
5693
5694    case TGSI_OPCODE_TXF:
5695       exec_txf(mach, inst);
5696       break;
5697
5698    case TGSI_OPCODE_TXQ:
5699       exec_txq(mach, inst);
5700       break;
5701
5702    case TGSI_OPCODE_EMIT:
5703       emit_vertex(mach, inst);
5704       break;
5705
5706    case TGSI_OPCODE_ENDPRIM:
5707       emit_primitive(mach, inst);
5708       break;
5709
5710    case TGSI_OPCODE_BGNLOOP:
5711       /* push LoopMask and ContMasks */
5712       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5713       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5714       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5715       assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5716
5717       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5718       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5719       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
5720       mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5721       mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
5722       break;
5723
5724    case TGSI_OPCODE_ENDLOOP:
5725       /* Restore ContMask, but don't pop */
5726       assert(mach->ContStackTop > 0);
5727       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
5728       UPDATE_EXEC_MASK(mach);
5729       if (mach->ExecMask) {
5730          /* repeat loop: jump to instruction just past BGNLOOP */
5731          assert(mach->LoopLabelStackTop > 0);
5732          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
5733       }
5734       else {
5735          /* exit loop: pop LoopMask */
5736          assert(mach->LoopStackTop > 0);
5737          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
5738          /* pop ContMask */
5739          assert(mach->ContStackTop > 0);
5740          mach->ContMask = mach->ContStack[--mach->ContStackTop];
5741          assert(mach->LoopLabelStackTop > 0);
5742          --mach->LoopLabelStackTop;
5743
5744          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
5745       }
5746       UPDATE_EXEC_MASK(mach);
5747       break;
5748
5749    case TGSI_OPCODE_BRK:
5750       exec_break(mach);
5751       break;
5752
5753    case TGSI_OPCODE_CONT:
5754       /* turn off cont channels for each enabled exec channel */
5755       mach->ContMask &= ~mach->ExecMask;
5756       /* Todo: if mach->LoopMask == 0, jump to end of loop */
5757       UPDATE_EXEC_MASK(mach);
5758       break;
5759
5760    case TGSI_OPCODE_BGNSUB:
5761       /* no-op */
5762       break;
5763
5764    case TGSI_OPCODE_ENDSUB:
5765       /*
5766        * XXX: This really should be a no-op. We should never reach this opcode.
5767        */
5768
5769       assert(mach->CallStackTop > 0);
5770       mach->CallStackTop--;
5771
5772       mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5773       mach->CondMask = mach->CondStack[mach->CondStackTop];
5774
5775       mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5776       mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5777
5778       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5779       mach->ContMask = mach->ContStack[mach->ContStackTop];
5780
5781       mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5782       mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5783
5784       mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5785       mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5786
5787       assert(mach->FuncStackTop > 0);
5788       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5789
5790       *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5791
5792       UPDATE_EXEC_MASK(mach);
5793       break;
5794
5795    case TGSI_OPCODE_NOP:
5796       break;
5797
5798    case TGSI_OPCODE_F2I:
5799       exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5800       break;
5801
5802    case TGSI_OPCODE_FSEQ:
5803       exec_vector_binary(mach, inst, micro_fseq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5804       break;
5805
5806    case TGSI_OPCODE_FSGE:
5807       exec_vector_binary(mach, inst, micro_fsge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5808       break;
5809
5810    case TGSI_OPCODE_FSLT:
5811       exec_vector_binary(mach, inst, micro_fslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5812       break;
5813
5814    case TGSI_OPCODE_FSNE:
5815       exec_vector_binary(mach, inst, micro_fsne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5816       break;
5817
5818    case TGSI_OPCODE_IDIV:
5819       exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5820       break;
5821
5822    case TGSI_OPCODE_IMAX:
5823       exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5824       break;
5825
5826    case TGSI_OPCODE_IMIN:
5827       exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5828       break;
5829
5830    case TGSI_OPCODE_INEG:
5831       exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5832       break;
5833
5834    case TGSI_OPCODE_ISGE:
5835       exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5836       break;
5837
5838    case TGSI_OPCODE_ISHR:
5839       exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5840       break;
5841
5842    case TGSI_OPCODE_ISLT:
5843       exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5844       break;
5845
5846    case TGSI_OPCODE_F2U:
5847       exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5848       break;
5849
5850    case TGSI_OPCODE_U2F:
5851       exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
5852       break;
5853
5854    case TGSI_OPCODE_UADD:
5855       exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5856       break;
5857
5858    case TGSI_OPCODE_UDIV:
5859       exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5860       break;
5861
5862    case TGSI_OPCODE_UMAD:
5863       exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5864       break;
5865
5866    case TGSI_OPCODE_UMAX:
5867       exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5868       break;
5869
5870    case TGSI_OPCODE_UMIN:
5871       exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5872       break;
5873
5874    case TGSI_OPCODE_UMOD:
5875       exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5876       break;
5877
5878    case TGSI_OPCODE_UMUL:
5879       exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5880       break;
5881
5882    case TGSI_OPCODE_IMUL_HI:
5883       exec_vector_binary(mach, inst, micro_imul_hi, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5884       break;
5885
5886    case TGSI_OPCODE_UMUL_HI:
5887       exec_vector_binary(mach, inst, micro_umul_hi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5888       break;
5889
5890    case TGSI_OPCODE_USEQ:
5891       exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5892       break;
5893
5894    case TGSI_OPCODE_USGE:
5895       exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5896       break;
5897
5898    case TGSI_OPCODE_USHR:
5899       exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5900       break;
5901
5902    case TGSI_OPCODE_USLT:
5903       exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5904       break;
5905
5906    case TGSI_OPCODE_USNE:
5907       exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5908       break;
5909
5910    case TGSI_OPCODE_SWITCH:
5911       exec_switch(mach, inst);
5912       break;
5913
5914    case TGSI_OPCODE_CASE:
5915       exec_case(mach, inst);
5916       break;
5917
5918    case TGSI_OPCODE_DEFAULT:
5919       exec_default(mach);
5920       break;
5921
5922    case TGSI_OPCODE_ENDSWITCH:
5923       exec_endswitch(mach);
5924       break;
5925
5926    case TGSI_OPCODE_SAMPLE_I:
5927       exec_txf(mach, inst);
5928       break;
5929
5930    case TGSI_OPCODE_SAMPLE_I_MS:
5931       exec_txf(mach, inst);
5932       break;
5933
5934    case TGSI_OPCODE_SAMPLE:
5935       exec_sample(mach, inst, TEX_MODIFIER_NONE, FALSE);
5936       break;
5937
5938    case TGSI_OPCODE_SAMPLE_B:
5939       exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS, FALSE);
5940       break;
5941
5942    case TGSI_OPCODE_SAMPLE_C:
5943       exec_sample(mach, inst, TEX_MODIFIER_NONE, TRUE);
5944       break;
5945
5946    case TGSI_OPCODE_SAMPLE_C_LZ:
5947       exec_sample(mach, inst, TEX_MODIFIER_LEVEL_ZERO, TRUE);
5948       break;
5949
5950    case TGSI_OPCODE_SAMPLE_D:
5951       exec_sample_d(mach, inst);
5952       break;
5953
5954    case TGSI_OPCODE_SAMPLE_L:
5955       exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, FALSE);
5956       break;
5957
5958    case TGSI_OPCODE_GATHER4:
5959       exec_sample(mach, inst, TEX_MODIFIER_GATHER, FALSE);
5960       break;
5961
5962    case TGSI_OPCODE_SVIEWINFO:
5963       exec_txq(mach, inst);
5964       break;
5965
5966    case TGSI_OPCODE_SAMPLE_POS:
5967       assert(0);
5968       break;
5969
5970    case TGSI_OPCODE_SAMPLE_INFO:
5971       assert(0);
5972       break;
5973
5974    case TGSI_OPCODE_LOD:
5975       exec_lodq(mach, inst);
5976       break;
5977
5978    case TGSI_OPCODE_UARL:
5979       exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
5980       break;
5981
5982    case TGSI_OPCODE_UCMP:
5983       exec_ucmp(mach, inst);
5984       break;
5985
5986    case TGSI_OPCODE_IABS:
5987       exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5988       break;
5989
5990    case TGSI_OPCODE_ISSG:
5991       exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5992       break;
5993
5994    case TGSI_OPCODE_TEX2:
5995       /* simple texture lookup */
5996       /* src[0] = texcoord */
5997       /* src[1] = compare */
5998       /* src[2] = sampler unit */
5999       exec_tex(mach, inst, TEX_MODIFIER_NONE, 2);
6000       break;
6001    case TGSI_OPCODE_TXB2:
6002       /* Texture lookup with lod bias */
6003       /* src[0] = texcoord */
6004       /* src[1] = bias */
6005       /* src[2] = sampler unit */
6006       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 2);
6007       break;
6008    case TGSI_OPCODE_TXL2:
6009       /* Texture lookup with explicit LOD */
6010       /* src[0] = texcoord */
6011       /* src[1] = lod */
6012       /* src[2] = sampler unit */
6013       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
6014       break;
6015
6016    case TGSI_OPCODE_IBFE:
6017       exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
6018       break;
6019    case TGSI_OPCODE_UBFE:
6020       exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6021       break;
6022    case TGSI_OPCODE_BFI:
6023       exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6024       break;
6025    case TGSI_OPCODE_BREV:
6026       exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6027       break;
6028    case TGSI_OPCODE_POPC:
6029       exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6030       break;
6031    case TGSI_OPCODE_LSB:
6032       exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
6033       break;
6034    case TGSI_OPCODE_IMSB:
6035       exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
6036       break;
6037    case TGSI_OPCODE_UMSB:
6038       exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
6039       break;
6040
6041    case TGSI_OPCODE_F2D:
6042       exec_t_2_64(mach, inst, micro_f2d, TGSI_EXEC_DATA_FLOAT);
6043       break;
6044
6045    case TGSI_OPCODE_D2F:
6046       exec_64_2_t(mach, inst, micro_d2f, TGSI_EXEC_DATA_FLOAT);
6047       break;
6048
6049    case TGSI_OPCODE_DABS:
6050       exec_double_unary(mach, inst, micro_dabs);
6051       break;
6052
6053    case TGSI_OPCODE_DNEG:
6054       exec_double_unary(mach, inst, micro_dneg);
6055       break;
6056
6057    case TGSI_OPCODE_DADD:
6058       exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
6059       break;
6060
6061    case TGSI_OPCODE_DDIV:
6062       exec_double_binary(mach, inst, micro_ddiv, TGSI_EXEC_DATA_DOUBLE);
6063       break;
6064
6065    case TGSI_OPCODE_DMUL:
6066       exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
6067       break;
6068
6069    case TGSI_OPCODE_DMAX:
6070       exec_double_binary(mach, inst, micro_dmax, TGSI_EXEC_DATA_DOUBLE);
6071       break;
6072
6073    case TGSI_OPCODE_DMIN:
6074       exec_double_binary(mach, inst, micro_dmin, TGSI_EXEC_DATA_DOUBLE);
6075       break;
6076
6077    case TGSI_OPCODE_DSLT:
6078       exec_double_binary(mach, inst, micro_dslt, TGSI_EXEC_DATA_UINT);
6079       break;
6080
6081    case TGSI_OPCODE_DSGE:
6082       exec_double_binary(mach, inst, micro_dsge, TGSI_EXEC_DATA_UINT);
6083       break;
6084
6085    case TGSI_OPCODE_DSEQ:
6086       exec_double_binary(mach, inst, micro_dseq, TGSI_EXEC_DATA_UINT);
6087       break;
6088
6089    case TGSI_OPCODE_DSNE:
6090       exec_double_binary(mach, inst, micro_dsne, TGSI_EXEC_DATA_UINT);
6091       break;
6092
6093    case TGSI_OPCODE_DRCP:
6094       exec_double_unary(mach, inst, micro_drcp);
6095       break;
6096
6097    case TGSI_OPCODE_DSQRT:
6098       exec_double_unary(mach, inst, micro_dsqrt);
6099       break;
6100
6101    case TGSI_OPCODE_DRSQ:
6102       exec_double_unary(mach, inst, micro_drsq);
6103       break;
6104
6105    case TGSI_OPCODE_DMAD:
6106       exec_double_trinary(mach, inst, micro_dmad);
6107       break;
6108
6109    case TGSI_OPCODE_DFRAC:
6110       exec_double_unary(mach, inst, micro_dfrac);
6111       break;
6112
6113    case TGSI_OPCODE_DLDEXP:
6114       exec_dldexp(mach, inst);
6115       break;
6116
6117    case TGSI_OPCODE_DFRACEXP:
6118       exec_dfracexp(mach, inst);
6119       break;
6120
6121    case TGSI_OPCODE_I2D:
6122       exec_t_2_64(mach, inst, micro_i2d, TGSI_EXEC_DATA_INT);
6123       break;
6124
6125    case TGSI_OPCODE_D2I:
6126       exec_64_2_t(mach, inst, micro_d2i, TGSI_EXEC_DATA_INT);
6127       break;
6128
6129    case TGSI_OPCODE_U2D:
6130       exec_t_2_64(mach, inst, micro_u2d, TGSI_EXEC_DATA_UINT);
6131       break;
6132
6133    case TGSI_OPCODE_D2U:
6134       exec_64_2_t(mach, inst, micro_d2u, TGSI_EXEC_DATA_INT);
6135       break;
6136
6137    case TGSI_OPCODE_LOAD:
6138       exec_load(mach, inst);
6139       break;
6140
6141    case TGSI_OPCODE_STORE:
6142       exec_store(mach, inst);
6143       break;
6144
6145    case TGSI_OPCODE_ATOMUADD:
6146    case TGSI_OPCODE_ATOMXCHG:
6147    case TGSI_OPCODE_ATOMCAS:
6148    case TGSI_OPCODE_ATOMAND:
6149    case TGSI_OPCODE_ATOMOR:
6150    case TGSI_OPCODE_ATOMXOR:
6151    case TGSI_OPCODE_ATOMUMIN:
6152    case TGSI_OPCODE_ATOMUMAX:
6153    case TGSI_OPCODE_ATOMIMIN:
6154    case TGSI_OPCODE_ATOMIMAX:
6155    case TGSI_OPCODE_ATOMFADD:
6156       exec_atomop(mach, inst);
6157       break;
6158
6159    case TGSI_OPCODE_RESQ:
6160       exec_resq(mach, inst);
6161       break;
6162    case TGSI_OPCODE_BARRIER:
6163    case TGSI_OPCODE_MEMBAR:
6164       return TRUE;
6165       break;
6166
6167    case TGSI_OPCODE_I64ABS:
6168       exec_double_unary(mach, inst, micro_i64abs);
6169       break;
6170
6171    case TGSI_OPCODE_I64SSG:
6172       exec_double_unary(mach, inst, micro_i64sgn);
6173       break;
6174
6175    case TGSI_OPCODE_I64NEG:
6176       exec_double_unary(mach, inst, micro_i64neg);
6177       break;
6178
6179    case TGSI_OPCODE_U64SEQ:
6180       exec_double_binary(mach, inst, micro_u64seq, TGSI_EXEC_DATA_UINT);
6181       break;
6182
6183    case TGSI_OPCODE_U64SNE:
6184       exec_double_binary(mach, inst, micro_u64sne, TGSI_EXEC_DATA_UINT);
6185       break;
6186
6187    case TGSI_OPCODE_I64SLT:
6188       exec_double_binary(mach, inst, micro_i64slt, TGSI_EXEC_DATA_UINT);
6189       break;
6190    case TGSI_OPCODE_U64SLT:
6191       exec_double_binary(mach, inst, micro_u64slt, TGSI_EXEC_DATA_UINT);
6192       break;
6193
6194    case TGSI_OPCODE_I64SGE:
6195       exec_double_binary(mach, inst, micro_i64sge, TGSI_EXEC_DATA_UINT);
6196       break;
6197    case TGSI_OPCODE_U64SGE:
6198       exec_double_binary(mach, inst, micro_u64sge, TGSI_EXEC_DATA_UINT);
6199       break;
6200
6201    case TGSI_OPCODE_I64MIN:
6202       exec_double_binary(mach, inst, micro_i64min, TGSI_EXEC_DATA_INT64);
6203       break;
6204    case TGSI_OPCODE_U64MIN:
6205       exec_double_binary(mach, inst, micro_u64min, TGSI_EXEC_DATA_UINT64);
6206       break;
6207    case TGSI_OPCODE_I64MAX:
6208       exec_double_binary(mach, inst, micro_i64max, TGSI_EXEC_DATA_INT64);
6209       break;
6210    case TGSI_OPCODE_U64MAX:
6211       exec_double_binary(mach, inst, micro_u64max, TGSI_EXEC_DATA_UINT64);
6212       break;
6213    case TGSI_OPCODE_U64ADD:
6214       exec_double_binary(mach, inst, micro_u64add, TGSI_EXEC_DATA_UINT64);
6215       break;
6216    case TGSI_OPCODE_U64MUL:
6217       exec_double_binary(mach, inst, micro_u64mul, TGSI_EXEC_DATA_UINT64);
6218       break;
6219    case TGSI_OPCODE_U64SHL:
6220       exec_arg0_64_arg1_32(mach, inst, micro_u64shl);
6221       break;
6222    case TGSI_OPCODE_I64SHR:
6223       exec_arg0_64_arg1_32(mach, inst, micro_i64shr);
6224       break;
6225    case TGSI_OPCODE_U64SHR:
6226       exec_arg0_64_arg1_32(mach, inst, micro_u64shr);
6227       break;
6228    case TGSI_OPCODE_U64DIV:
6229       exec_double_binary(mach, inst, micro_u64div, TGSI_EXEC_DATA_UINT64);
6230       break;
6231    case TGSI_OPCODE_I64DIV:
6232       exec_double_binary(mach, inst, micro_i64div, TGSI_EXEC_DATA_INT64);
6233       break;
6234    case TGSI_OPCODE_U64MOD:
6235       exec_double_binary(mach, inst, micro_u64mod, TGSI_EXEC_DATA_UINT64);
6236       break;
6237    case TGSI_OPCODE_I64MOD:
6238       exec_double_binary(mach, inst, micro_i64mod, TGSI_EXEC_DATA_INT64);
6239       break;
6240
6241    case TGSI_OPCODE_F2U64:
6242       exec_t_2_64(mach, inst, micro_f2u64, TGSI_EXEC_DATA_FLOAT);
6243       break;
6244
6245    case TGSI_OPCODE_F2I64:
6246       exec_t_2_64(mach, inst, micro_f2i64, TGSI_EXEC_DATA_FLOAT);
6247       break;
6248
6249    case TGSI_OPCODE_U2I64:
6250       exec_t_2_64(mach, inst, micro_u2i64, TGSI_EXEC_DATA_INT);
6251       break;
6252    case TGSI_OPCODE_I2I64:
6253       exec_t_2_64(mach, inst, micro_i2i64, TGSI_EXEC_DATA_INT);
6254       break;
6255
6256    case TGSI_OPCODE_D2U64:
6257       exec_double_unary(mach, inst, micro_d2u64);
6258       break;
6259
6260    case TGSI_OPCODE_D2I64:
6261       exec_double_unary(mach, inst, micro_d2i64);
6262       break;
6263
6264    case TGSI_OPCODE_U642F:
6265       exec_64_2_t(mach, inst, micro_u642f, TGSI_EXEC_DATA_FLOAT);
6266       break;
6267    case TGSI_OPCODE_I642F:
6268       exec_64_2_t(mach, inst, micro_i642f, TGSI_EXEC_DATA_FLOAT);
6269       break;
6270
6271    case TGSI_OPCODE_U642D:
6272       exec_double_unary(mach, inst, micro_u642d);
6273       break;
6274    case TGSI_OPCODE_I642D:
6275       exec_double_unary(mach, inst, micro_i642d);
6276       break;
6277    case TGSI_OPCODE_INTERP_SAMPLE:
6278       exec_interp_at_sample(mach, inst);
6279       break;
6280    case TGSI_OPCODE_INTERP_OFFSET:
6281       exec_interp_at_offset(mach, inst);
6282       break;
6283    case TGSI_OPCODE_INTERP_CENTROID:
6284       exec_interp_at_centroid(mach, inst);
6285       break;
6286    default:
6287       assert( 0 );
6288    }
6289    return FALSE;
6290 }
6291
6292 static void
6293 tgsi_exec_machine_setup_masks(struct tgsi_exec_machine *mach)
6294 {
6295    uint default_mask = 0xf;
6296
6297    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
6298    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
6299
6300    if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
6301       for (unsigned i = 0; i < TGSI_MAX_VERTEX_STREAMS; i++) {
6302          mach->Temps[temp_prim_idxs[i].idx].xyzw[temp_prim_idxs[i].chan].u[0] = 0;
6303          mach->Primitives[i][0] = 0;
6304       }
6305       /* GS runs on a single primitive for now */
6306       default_mask = 0x1;
6307    }
6308
6309    if (mach->NonHelperMask == 0)
6310       mach->NonHelperMask = default_mask;
6311    mach->CondMask = default_mask;
6312    mach->LoopMask = default_mask;
6313    mach->ContMask = default_mask;
6314    mach->FuncMask = default_mask;
6315    mach->ExecMask = default_mask;
6316
6317    mach->Switch.mask = default_mask;
6318
6319    assert(mach->CondStackTop == 0);
6320    assert(mach->LoopStackTop == 0);
6321    assert(mach->ContStackTop == 0);
6322    assert(mach->SwitchStackTop == 0);
6323    assert(mach->BreakStackTop == 0);
6324    assert(mach->CallStackTop == 0);
6325 }
6326
6327 /**
6328  * Run TGSI interpreter.
6329  * \return bitmask of "alive" quad components
6330  */
6331 uint
6332 tgsi_exec_machine_run( struct tgsi_exec_machine *mach, int start_pc )
6333 {
6334    uint i;
6335
6336    mach->pc = start_pc;
6337
6338    if (!start_pc) {
6339       tgsi_exec_machine_setup_masks(mach);
6340
6341       /* execute declarations (interpolants) */
6342       for (i = 0; i < mach->NumDeclarations; i++) {
6343          exec_declaration( mach, mach->Declarations+i );
6344       }
6345    }
6346
6347    {
6348 #if DEBUG_EXECUTION
6349       struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
6350       struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
6351       uint inst = 1;
6352
6353       if (!start_pc) {
6354          memset(mach->Temps, 0, sizeof(temps));
6355          if (mach->Outputs)
6356             memset(mach->Outputs, 0, sizeof(outputs));
6357          memset(temps, 0, sizeof(temps));
6358          memset(outputs, 0, sizeof(outputs));
6359       }
6360 #endif
6361
6362       /* execute instructions, until pc is set to -1 */
6363       while (mach->pc != -1) {
6364          boolean barrier_hit;
6365 #if DEBUG_EXECUTION
6366          uint i;
6367
6368          tgsi_dump_instruction(&mach->Instructions[mach->pc], inst++);
6369 #endif
6370
6371          assert(mach->pc < (int) mach->NumInstructions);
6372          barrier_hit = exec_instruction(mach, mach->Instructions + mach->pc, &mach->pc);
6373
6374          /* for compute shaders if we hit a barrier return now for later rescheduling */
6375          if (barrier_hit && mach->ShaderType == PIPE_SHADER_COMPUTE)
6376             return 0;
6377
6378 #if DEBUG_EXECUTION
6379          for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
6380             if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
6381                uint j;
6382
6383                memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
6384                debug_printf("TEMP[%2u] = ", i);
6385                for (j = 0; j < 4; j++) {
6386                   if (j > 0) {
6387                      debug_printf("           ");
6388                   }
6389                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6390                                temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
6391                                temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
6392                                temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
6393                                temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
6394                }
6395             }
6396          }
6397          if (mach->Outputs) {
6398             for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
6399                if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
6400                   uint j;
6401
6402                   memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
6403                   debug_printf("OUT[%2u] =  ", i);
6404                   for (j = 0; j < 4; j++) {
6405                      if (j > 0) {
6406                         debug_printf("           ");
6407                      }
6408                      debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6409                                   outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
6410                                   outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
6411                                   outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
6412                                   outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
6413                   }
6414                }
6415             }
6416          }
6417 #endif
6418       }
6419    }
6420
6421 #if 0
6422    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
6423    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
6424       /*
6425        * Scale back depth component.
6426        */
6427       for (i = 0; i < 4; i++)
6428          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
6429    }
6430 #endif
6431
6432    /* Strictly speaking, these assertions aren't really needed but they
6433     * can potentially catch some bugs in the control flow code.
6434     */
6435    assert(mach->CondStackTop == 0);
6436    assert(mach->LoopStackTop == 0);
6437    assert(mach->ContStackTop == 0);
6438    assert(mach->SwitchStackTop == 0);
6439    assert(mach->BreakStackTop == 0);
6440    assert(mach->CallStackTop == 0);
6441
6442    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
6443 }