src/gallium/auxiliary/tgsi/tgsi_exec.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 VMware, Inc.
   4  * All Rights Reserved.
   5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the
   9  * "Software"), to deal in the Software without restriction, including
  10  * without limitation the rights to use, copy, modify, merge, publish,
  11  * distribute, sub license, and/or sell copies of the Software, and to
  12  * permit persons to whom the Software is furnished to do so, subject to
  13  * the following conditions:
  14  *
  15  * The above copyright notice and this permission notice (including the
  16  * next paragraph) shall be included in all copies or substantial portions
  17  * of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  22  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  26  *
  27  **************************************************************************/
  28
  29 /**
  30  * TGSI interpreter/executor.
  31  *
  32  * Flow control information:
  33  *
  34  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
  35  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
  36  * care since a condition may be true for some quad components but false
  37  * for other components.
  38  *
  39  * We basically execute all statements (even if they're in the part of
  40  * an IF/ELSE clause that's "not taken") and use a special mask to
  41  * control writing to destination registers.  This is the ExecMask.
  42  * See store_dest().
  43  *
 * The ExecMask is computed from three other masks (CondMask, LoopMask and
 * ContMask) which are controlled by the flow control instructions (namely:
 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
  47  *
  48  *
  49  * Authors:
  50  *   Michal Krol
  51  *   Brian Paul
  52  */
  53
  54 #include "pipe/p_compiler.h"
  55 #include "pipe/p_state.h"
  56 #include "pipe/p_shader_tokens.h"
  57 #include "tgsi/tgsi_dump.h"
  58 #include "tgsi/tgsi_parse.h"
  59 #include "tgsi/tgsi_util.h"
  60 #include "tgsi_exec.h"
  61 #include "util/u_half.h"
  62 #include "util/u_memory.h"
  63 #include "util/u_math.h"
  64 #include "util/rounding.h"
  65
  66
/* Compile-time debug switch.  Its uses lie outside this part of the file;
 * presumably it enables execution tracing when non-zero (TODO confirm). */
#define DEBUG_EXECUTION 0


/* When non-zero, micro_exp2() and micro_lg2() call util_fast_exp2() /
 * util_fast_log2() instead of the libm-based code paths. */
#define FAST_MATH 0
  71
/* Channel indices of the four elements of a quad, named by their position
 * in a 2x2 tile.  The derivative helpers (micro_ddx*, micro_ddy*) use them
 * to subtract horizontally or vertically adjacent elements. */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
  76
/**
 * One 64-bit value per quad element, viewable under several types.
 */
union tgsi_double_channel {
   double d[TGSI_QUAD_SIZE];        /* double-precision float view */
   unsigned u[TGSI_QUAD_SIZE][2];   /* two words per element; comparison helpers store their mask in word [0] */
   uint64_t u64[TGSI_QUAD_SIZE];    /* unsigned 64-bit integer view */
   int64_t i64[TGSI_QUAD_SIZE];     /* signed 64-bit integer view */
};
  83
/**
 * A pair of 64-bit channels.
 * NOTE(review): presumably the two 64-bit values packed into the x/y and
 * z/w component pairs of a four-component register -- confirm against users.
 */
struct tgsi_double_vector {
   union tgsi_double_channel xy;
   union tgsi_double_channel zw;
};
  88
  89 static void
  90 micro_abs(union tgsi_exec_channel *dst,
  91           const union tgsi_exec_channel *src)
  92 {
  93    dst->f[0] = fabsf(src->f[0]);
  94    dst->f[1] = fabsf(src->f[1]);
  95    dst->f[2] = fabsf(src->f[2]);
  96    dst->f[3] = fabsf(src->f[3]);
  97 }
  98
  99 static void
 100 micro_arl(union tgsi_exec_channel *dst,
 101           const union tgsi_exec_channel *src)
 102 {
 103    dst->i[0] = (int)floorf(src->f[0]);
 104    dst->i[1] = (int)floorf(src->f[1]);
 105    dst->i[2] = (int)floorf(src->f[2]);
 106    dst->i[3] = (int)floorf(src->f[3]);
 107 }
 108
 109 static void
 110 micro_arr(union tgsi_exec_channel *dst,
 111           const union tgsi_exec_channel *src)
 112 {
 113    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
 114    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
 115    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
 116    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
 117 }
 118
 119 static void
 120 micro_ceil(union tgsi_exec_channel *dst,
 121            const union tgsi_exec_channel *src)
 122 {
 123    dst->f[0] = ceilf(src->f[0]);
 124    dst->f[1] = ceilf(src->f[1]);
 125    dst->f[2] = ceilf(src->f[2]);
 126    dst->f[3] = ceilf(src->f[3]);
 127 }
 128
 129 static void
 130 micro_cmp(union tgsi_exec_channel *dst,
 131           const union tgsi_exec_channel *src0,
 132           const union tgsi_exec_channel *src1,
 133           const union tgsi_exec_channel *src2)
 134 {
 135    dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
 136    dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
 137    dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
 138    dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
 139 }
 140
 141 static void
 142 micro_cos(union tgsi_exec_channel *dst,
 143           const union tgsi_exec_channel *src)
 144 {
 145    dst->f[0] = cosf(src->f[0]);
 146    dst->f[1] = cosf(src->f[1]);
 147    dst->f[2] = cosf(src->f[2]);
 148    dst->f[3] = cosf(src->f[3]);
 149 }
 150
 151 static void
 152 micro_d2f(union tgsi_exec_channel *dst,
 153           const union tgsi_double_channel *src)
 154 {
 155    dst->f[0] = (float)src->d[0];
 156    dst->f[1] = (float)src->d[1];
 157    dst->f[2] = (float)src->d[2];
 158    dst->f[3] = (float)src->d[3];
 159 }
 160
 161 static void
 162 micro_d2i(union tgsi_exec_channel *dst,
 163           const union tgsi_double_channel *src)
 164 {
 165    dst->i[0] = (int)src->d[0];
 166    dst->i[1] = (int)src->d[1];
 167    dst->i[2] = (int)src->d[2];
 168    dst->i[3] = (int)src->d[3];
 169 }
 170
 171 static void
 172 micro_d2u(union tgsi_exec_channel *dst,
 173           const union tgsi_double_channel *src)
 174 {
 175    dst->u[0] = (unsigned)src->d[0];
 176    dst->u[1] = (unsigned)src->d[1];
 177    dst->u[2] = (unsigned)src->d[2];
 178    dst->u[3] = (unsigned)src->d[3];
 179 }
 180 static void
 181 micro_dabs(union tgsi_double_channel *dst,
 182            const union tgsi_double_channel *src)
 183 {
 184    dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
 185    dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
 186    dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
 187    dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
 188 }
 189
 190 static void
 191 micro_dadd(union tgsi_double_channel *dst,
 192           const union tgsi_double_channel *src)
 193 {
 194    dst->d[0] = src[0].d[0] + src[1].d[0];
 195    dst->d[1] = src[0].d[1] + src[1].d[1];
 196    dst->d[2] = src[0].d[2] + src[1].d[2];
 197    dst->d[3] = src[0].d[3] + src[1].d[3];
 198 }
 199
 200 static void
 201 micro_ddiv(union tgsi_double_channel *dst,
 202           const union tgsi_double_channel *src)
 203 {
 204    dst->d[0] = src[0].d[0] / src[1].d[0];
 205    dst->d[1] = src[0].d[1] / src[1].d[1];
 206    dst->d[2] = src[0].d[2] / src[1].d[2];
 207    dst->d[3] = src[0].d[3] / src[1].d[3];
 208 }
 209
 210 static void
 211 micro_ddx(union tgsi_exec_channel *dst,
 212           const union tgsi_exec_channel *src)
 213 {
 214    dst->f[0] =
 215    dst->f[1] =
 216    dst->f[2] =
 217    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 218 }
 219
 220 static void
 221 micro_ddx_fine(union tgsi_exec_channel *dst,
 222           const union tgsi_exec_channel *src)
 223 {
 224    dst->f[0] =
 225    dst->f[1] = src->f[TILE_TOP_RIGHT] - src->f[TILE_TOP_LEFT];
 226    dst->f[2] =
 227    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
 228 }
 229
 230
 231 static void
 232 micro_ddy(union tgsi_exec_channel *dst,
 233           const union tgsi_exec_channel *src)
 234 {
 235    dst->f[0] =
 236    dst->f[1] =
 237    dst->f[2] =
 238    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
 239 }
 240
 241 static void
 242 micro_ddy_fine(union tgsi_exec_channel *dst,
 243           const union tgsi_exec_channel *src)
 244 {
 245    dst->f[0] =
 246    dst->f[2] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
 247    dst->f[1] =
 248    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_TOP_RIGHT];
 249 }
 250
 251 static void
 252 micro_dmul(union tgsi_double_channel *dst,
 253            const union tgsi_double_channel *src)
 254 {
 255    dst->d[0] = src[0].d[0] * src[1].d[0];
 256    dst->d[1] = src[0].d[1] * src[1].d[1];
 257    dst->d[2] = src[0].d[2] * src[1].d[2];
 258    dst->d[3] = src[0].d[3] * src[1].d[3];
 259 }
 260
 261 static void
 262 micro_dmax(union tgsi_double_channel *dst,
 263            const union tgsi_double_channel *src)
 264 {
 265    dst->d[0] = src[0].d[0] > src[1].d[0] ? src[0].d[0] : src[1].d[0];
 266    dst->d[1] = src[0].d[1] > src[1].d[1] ? src[0].d[1] : src[1].d[1];
 267    dst->d[2] = src[0].d[2] > src[1].d[2] ? src[0].d[2] : src[1].d[2];
 268    dst->d[3] = src[0].d[3] > src[1].d[3] ? src[0].d[3] : src[1].d[3];
 269 }
 270
 271 static void
 272 micro_dmin(union tgsi_double_channel *dst,
 273            const union tgsi_double_channel *src)
 274 {
 275    dst->d[0] = src[0].d[0] < src[1].d[0] ? src[0].d[0] : src[1].d[0];
 276    dst->d[1] = src[0].d[1] < src[1].d[1] ? src[0].d[1] : src[1].d[1];
 277    dst->d[2] = src[0].d[2] < src[1].d[2] ? src[0].d[2] : src[1].d[2];
 278    dst->d[3] = src[0].d[3] < src[1].d[3] ? src[0].d[3] : src[1].d[3];
 279 }
 280
 281 static void
 282 micro_dneg(union tgsi_double_channel *dst,
 283            const union tgsi_double_channel *src)
 284 {
 285    dst->d[0] = -src->d[0];
 286    dst->d[1] = -src->d[1];
 287    dst->d[2] = -src->d[2];
 288    dst->d[3] = -src->d[3];
 289 }
 290
 291 static void
 292 micro_dslt(union tgsi_double_channel *dst,
 293            const union tgsi_double_channel *src)
 294 {
 295    dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
 296    dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
 297    dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
 298    dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
 299 }
 300
 301 static void
 302 micro_dsne(union tgsi_double_channel *dst,
 303            const union tgsi_double_channel *src)
 304 {
 305    dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
 306    dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
 307    dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
 308    dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
 309 }
 310
 311 static void
 312 micro_dsge(union tgsi_double_channel *dst,
 313            const union tgsi_double_channel *src)
 314 {
 315    dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
 316    dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
 317    dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
 318    dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
 319 }
 320
 321 static void
 322 micro_dseq(union tgsi_double_channel *dst,
 323            const union tgsi_double_channel *src)
 324 {
 325    dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
 326    dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
 327    dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
 328    dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
 329 }
 330
 331 static void
 332 micro_drcp(union tgsi_double_channel *dst,
 333            const union tgsi_double_channel *src)
 334 {
 335    dst->d[0] = 1.0 / src->d[0];
 336    dst->d[1] = 1.0 / src->d[1];
 337    dst->d[2] = 1.0 / src->d[2];
 338    dst->d[3] = 1.0 / src->d[3];
 339 }
 340
 341 static void
 342 micro_dsqrt(union tgsi_double_channel *dst,
 343             const union tgsi_double_channel *src)
 344 {
 345    dst->d[0] = sqrt(src->d[0]);
 346    dst->d[1] = sqrt(src->d[1]);
 347    dst->d[2] = sqrt(src->d[2]);
 348    dst->d[3] = sqrt(src->d[3]);
 349 }
 350
 351 static void
 352 micro_drsq(union tgsi_double_channel *dst,
 353           const union tgsi_double_channel *src)
 354 {
 355    dst->d[0] = 1.0 / sqrt(src->d[0]);
 356    dst->d[1] = 1.0 / sqrt(src->d[1]);
 357    dst->d[2] = 1.0 / sqrt(src->d[2]);
 358    dst->d[3] = 1.0 / sqrt(src->d[3]);
 359 }
 360
 361 static void
 362 micro_dmad(union tgsi_double_channel *dst,
 363            const union tgsi_double_channel *src)
 364 {
 365    dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
 366    dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
 367    dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
 368    dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
 369 }
 370
 371 static void
 372 micro_dfrac(union tgsi_double_channel *dst,
 373             const union tgsi_double_channel *src)
 374 {
 375    dst->d[0] = src->d[0] - floor(src->d[0]);
 376    dst->d[1] = src->d[1] - floor(src->d[1]);
 377    dst->d[2] = src->d[2] - floor(src->d[2]);
 378    dst->d[3] = src->d[3] - floor(src->d[3]);
 379 }
 380
 381 static void
 382 micro_dflr(union tgsi_double_channel *dst,
 383            const union tgsi_double_channel *src)
 384 {
 385    dst->d[0] = floor(src->d[0]);
 386    dst->d[1] = floor(src->d[1]);
 387    dst->d[2] = floor(src->d[2]);
 388    dst->d[3] = floor(src->d[3]);
 389 }
 390
 391 static void
 392 micro_dldexp(union tgsi_double_channel *dst,
 393              const union tgsi_double_channel *src0,
 394              union tgsi_exec_channel *src1)
 395 {
 396    dst->d[0] = ldexp(src0->d[0], src1->i[0]);
 397    dst->d[1] = ldexp(src0->d[1], src1->i[1]);
 398    dst->d[2] = ldexp(src0->d[2], src1->i[2]);
 399    dst->d[3] = ldexp(src0->d[3], src1->i[3]);
 400 }
 401
 402 static void
 403 micro_dfracexp(union tgsi_double_channel *dst,
 404                union tgsi_exec_channel *dst_exp,
 405                const union tgsi_double_channel *src)
 406 {
 407    dst->d[0] = frexp(src->d[0], &dst_exp->i[0]);
 408    dst->d[1] = frexp(src->d[1], &dst_exp->i[1]);
 409    dst->d[2] = frexp(src->d[2], &dst_exp->i[2]);
 410    dst->d[3] = frexp(src->d[3], &dst_exp->i[3]);
 411 }
 412
 413 static void
 414 micro_exp2(union tgsi_exec_channel *dst,
 415            const union tgsi_exec_channel *src)
 416 {
 417 #if FAST_MATH
 418    dst->f[0] = util_fast_exp2(src->f[0]);
 419    dst->f[1] = util_fast_exp2(src->f[1]);
 420    dst->f[2] = util_fast_exp2(src->f[2]);
 421    dst->f[3] = util_fast_exp2(src->f[3]);
 422 #else
 423 #if DEBUG
 424    /* Inf is okay for this instruction, so clamp it to silence assertions. */
 425    uint i;
 426    union tgsi_exec_channel clamped;
 427
 428    for (i = 0; i < 4; i++) {
 429       if (src->f[i] > 127.99999f) {
 430          clamped.f[i] = 127.99999f;
 431       } else if (src->f[i] < -126.99999f) {
 432          clamped.f[i] = -126.99999f;
 433       } else {
 434          clamped.f[i] = src->f[i];
 435       }
 436    }
 437    src = &clamped;
 438 #endif /* DEBUG */
 439
 440    dst->f[0] = powf(2.0f, src->f[0]);
 441    dst->f[1] = powf(2.0f, src->f[1]);
 442    dst->f[2] = powf(2.0f, src->f[2]);
 443    dst->f[3] = powf(2.0f, src->f[3]);
 444 #endif /* FAST_MATH */
 445 }
 446
 447 static void
 448 micro_f2d(union tgsi_double_channel *dst,
 449           const union tgsi_exec_channel *src)
 450 {
 451    dst->d[0] = (double)src->f[0];
 452    dst->d[1] = (double)src->f[1];
 453    dst->d[2] = (double)src->f[2];
 454    dst->d[3] = (double)src->f[3];
 455 }
 456
 457 static void
 458 micro_flr(union tgsi_exec_channel *dst,
 459           const union tgsi_exec_channel *src)
 460 {
 461    dst->f[0] = floorf(src->f[0]);
 462    dst->f[1] = floorf(src->f[1]);
 463    dst->f[2] = floorf(src->f[2]);
 464    dst->f[3] = floorf(src->f[3]);
 465 }
 466
 467 static void
 468 micro_frc(union tgsi_exec_channel *dst,
 469           const union tgsi_exec_channel *src)
 470 {
 471    dst->f[0] = src->f[0] - floorf(src->f[0]);
 472    dst->f[1] = src->f[1] - floorf(src->f[1]);
 473    dst->f[2] = src->f[2] - floorf(src->f[2]);
 474    dst->f[3] = src->f[3] - floorf(src->f[3]);
 475 }
 476
 477 static void
 478 micro_i2d(union tgsi_double_channel *dst,
 479           const union tgsi_exec_channel *src)
 480 {
 481    dst->d[0] = (double)src->i[0];
 482    dst->d[1] = (double)src->i[1];
 483    dst->d[2] = (double)src->i[2];
 484    dst->d[3] = (double)src->i[3];
 485 }
 486
 487 static void
 488 micro_iabs(union tgsi_exec_channel *dst,
 489            const union tgsi_exec_channel *src)
 490 {
 491    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
 492    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
 493    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
 494    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
 495 }
 496
 497 static void
 498 micro_ineg(union tgsi_exec_channel *dst,
 499            const union tgsi_exec_channel *src)
 500 {
 501    dst->i[0] = -src->i[0];
 502    dst->i[1] = -src->i[1];
 503    dst->i[2] = -src->i[2];
 504    dst->i[3] = -src->i[3];
 505 }
 506
 507 static void
 508 micro_lg2(union tgsi_exec_channel *dst,
 509           const union tgsi_exec_channel *src)
 510 {
 511 #if FAST_MATH
 512    dst->f[0] = util_fast_log2(src->f[0]);
 513    dst->f[1] = util_fast_log2(src->f[1]);
 514    dst->f[2] = util_fast_log2(src->f[2]);
 515    dst->f[3] = util_fast_log2(src->f[3]);
 516 #else
 517    dst->f[0] = logf(src->f[0]) * 1.442695f;
 518    dst->f[1] = logf(src->f[1]) * 1.442695f;
 519    dst->f[2] = logf(src->f[2]) * 1.442695f;
 520    dst->f[3] = logf(src->f[3]) * 1.442695f;
 521 #endif
 522 }
 523
 524 static void
 525 micro_lrp(union tgsi_exec_channel *dst,
 526           const union tgsi_exec_channel *src0,
 527           const union tgsi_exec_channel *src1,
 528           const union tgsi_exec_channel *src2)
 529 {
 530    dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
 531    dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
 532    dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
 533    dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
 534 }
 535
 536 static void
 537 micro_mad(union tgsi_exec_channel *dst,
 538           const union tgsi_exec_channel *src0,
 539           const union tgsi_exec_channel *src1,
 540           const union tgsi_exec_channel *src2)
 541 {
 542    dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
 543    dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
 544    dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
 545    dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
 546 }
 547
 548 static void
 549 micro_mov(union tgsi_exec_channel *dst,
 550           const union tgsi_exec_channel *src)
 551 {
 552    dst->u[0] = src->u[0];
 553    dst->u[1] = src->u[1];
 554    dst->u[2] = src->u[2];
 555    dst->u[3] = src->u[3];
 556 }
 557
 558 static void
 559 micro_rcp(union tgsi_exec_channel *dst,
 560           const union tgsi_exec_channel *src)
 561 {
 562 #if 0 /* for debugging */
 563    assert(src->f[0] != 0.0f);
 564    assert(src->f[1] != 0.0f);
 565    assert(src->f[2] != 0.0f);
 566    assert(src->f[3] != 0.0f);
 567 #endif
 568    dst->f[0] = 1.0f / src->f[0];
 569    dst->f[1] = 1.0f / src->f[1];
 570    dst->f[2] = 1.0f / src->f[2];
 571    dst->f[3] = 1.0f / src->f[3];
 572 }
 573
 574 static void
 575 micro_rnd(union tgsi_exec_channel *dst,
 576           const union tgsi_exec_channel *src)
 577 {
 578    dst->f[0] = _mesa_roundevenf(src->f[0]);
 579    dst->f[1] = _mesa_roundevenf(src->f[1]);
 580    dst->f[2] = _mesa_roundevenf(src->f[2]);
 581    dst->f[3] = _mesa_roundevenf(src->f[3]);
 582 }
 583
 584 static void
 585 micro_rsq(union tgsi_exec_channel *dst,
 586           const union tgsi_exec_channel *src)
 587 {
 588 #if 0 /* for debugging */
 589    assert(src->f[0] != 0.0f);
 590    assert(src->f[1] != 0.0f);
 591    assert(src->f[2] != 0.0f);
 592    assert(src->f[3] != 0.0f);
 593 #endif
 594    dst->f[0] = 1.0f / sqrtf(src->f[0]);
 595    dst->f[1] = 1.0f / sqrtf(src->f[1]);
 596    dst->f[2] = 1.0f / sqrtf(src->f[2]);
 597    dst->f[3] = 1.0f / sqrtf(src->f[3]);
 598 }
 599
 600 static void
 601 micro_sqrt(union tgsi_exec_channel *dst,
 602            const union tgsi_exec_channel *src)
 603 {
 604    dst->f[0] = sqrtf(src->f[0]);
 605    dst->f[1] = sqrtf(src->f[1]);
 606    dst->f[2] = sqrtf(src->f[2]);
 607    dst->f[3] = sqrtf(src->f[3]);
 608 }
 609
 610 static void
 611 micro_seq(union tgsi_exec_channel *dst,
 612           const union tgsi_exec_channel *src0,
 613           const union tgsi_exec_channel *src1)
 614 {
 615    dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
 616    dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
 617    dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
 618    dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
 619 }
 620
 621 static void
 622 micro_sge(union tgsi_exec_channel *dst,
 623           const union tgsi_exec_channel *src0,
 624           const union tgsi_exec_channel *src1)
 625 {
 626    dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
 627    dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
 628    dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
 629    dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
 630 }
 631
 632 static void
 633 micro_sgn(union tgsi_exec_channel *dst,
 634           const union tgsi_exec_channel *src)
 635 {
 636    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
 637    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
 638    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
 639    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
 640 }
 641
 642 static void
 643 micro_isgn(union tgsi_exec_channel *dst,
 644           const union tgsi_exec_channel *src)
 645 {
 646    dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
 647    dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
 648    dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
 649    dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
 650 }
 651
 652 static void
 653 micro_sgt(union tgsi_exec_channel *dst,
 654           const union tgsi_exec_channel *src0,
 655           const union tgsi_exec_channel *src1)
 656 {
 657    dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
 658    dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
 659    dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
 660    dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
 661 }
 662
 663 static void
 664 micro_sin(union tgsi_exec_channel *dst,
 665           const union tgsi_exec_channel *src)
 666 {
 667    dst->f[0] = sinf(src->f[0]);
 668    dst->f[1] = sinf(src->f[1]);
 669    dst->f[2] = sinf(src->f[2]);
 670    dst->f[3] = sinf(src->f[3]);
 671 }
 672
 673 static void
 674 micro_sle(union tgsi_exec_channel *dst,
 675           const union tgsi_exec_channel *src0,
 676           const union tgsi_exec_channel *src1)
 677 {
 678    dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
 679    dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
 680    dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
 681    dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
 682 }
 683
 684 static void
 685 micro_slt(union tgsi_exec_channel *dst,
 686           const union tgsi_exec_channel *src0,
 687           const union tgsi_exec_channel *src1)
 688 {
 689    dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
 690    dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
 691    dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
 692    dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
 693 }
 694
 695 static void
 696 micro_sne(union tgsi_exec_channel *dst,
 697           const union tgsi_exec_channel *src0,
 698           const union tgsi_exec_channel *src1)
 699 {
 700    dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
 701    dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
 702    dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
 703    dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
 704 }
 705
 706 static void
 707 micro_trunc(union tgsi_exec_channel *dst,
 708             const union tgsi_exec_channel *src)
 709 {
 710    dst->f[0] = truncf(src->f[0]);
 711    dst->f[1] = truncf(src->f[1]);
 712    dst->f[2] = truncf(src->f[2]);
 713    dst->f[3] = truncf(src->f[3]);
 714 }
 715
 716 static void
 717 micro_u2d(union tgsi_double_channel *dst,
 718           const union tgsi_exec_channel *src)
 719 {
 720    dst->d[0] = (double)src->u[0];
 721    dst->d[1] = (double)src->u[1];
 722    dst->d[2] = (double)src->u[2];
 723    dst->d[3] = (double)src->u[3];
 724 }
 725
 726 static void
 727 micro_i64abs(union tgsi_double_channel *dst,
 728              const union tgsi_double_channel *src)
 729 {
 730    dst->i64[0] = src->i64[0] >= 0.0 ? src->i64[0] : -src->i64[0];
 731    dst->i64[1] = src->i64[1] >= 0.0 ? src->i64[1] : -src->i64[1];
 732    dst->i64[2] = src->i64[2] >= 0.0 ? src->i64[2] : -src->i64[2];
 733    dst->i64[3] = src->i64[3] >= 0.0 ? src->i64[3] : -src->i64[3];
 734 }
 735
 736 static void
 737 micro_i64sgn(union tgsi_double_channel *dst,
 738              const union tgsi_double_channel *src)
 739 {
 740    dst->i64[0] = src->i64[0] < 0 ? -1 : src->i64[0] > 0 ? 1 : 0;
 741    dst->i64[1] = src->i64[1] < 0 ? -1 : src->i64[1] > 0 ? 1 : 0;
 742    dst->i64[2] = src->i64[2] < 0 ? -1 : src->i64[2] > 0 ? 1 : 0;
 743    dst->i64[3] = src->i64[3] < 0 ? -1 : src->i64[3] > 0 ? 1 : 0;
 744 }
 745
 746 static void
 747 micro_i64neg(union tgsi_double_channel *dst,
 748              const union tgsi_double_channel *src)
 749 {
 750    dst->i64[0] = -src->i64[0];
 751    dst->i64[1] = -src->i64[1];
 752    dst->i64[2] = -src->i64[2];
 753    dst->i64[3] = -src->i64[3];
 754 }
 755
 756 static void
 757 micro_u64seq(union tgsi_double_channel *dst,
 758            const union tgsi_double_channel *src)
 759 {
 760    dst->u[0][0] = src[0].u64[0] == src[1].u64[0] ? ~0U : 0U;
 761    dst->u[1][0] = src[0].u64[1] == src[1].u64[1] ? ~0U : 0U;
 762    dst->u[2][0] = src[0].u64[2] == src[1].u64[2] ? ~0U : 0U;
 763    dst->u[3][0] = src[0].u64[3] == src[1].u64[3] ? ~0U : 0U;
 764 }
 765
 766 static void
 767 micro_u64sne(union tgsi_double_channel *dst,
 768              const union tgsi_double_channel *src)
 769 {
 770    dst->u[0][0] = src[0].u64[0] != src[1].u64[0] ? ~0U : 0U;
 771    dst->u[1][0] = src[0].u64[1] != src[1].u64[1] ? ~0U : 0U;
 772    dst->u[2][0] = src[0].u64[2] != src[1].u64[2] ? ~0U : 0U;
 773    dst->u[3][0] = src[0].u64[3] != src[1].u64[3] ? ~0U : 0U;
 774 }
 775
 776 static void
 777 micro_i64slt(union tgsi_double_channel *dst,
 778              const union tgsi_double_channel *src)
 779 {
 780    dst->u[0][0] = src[0].i64[0] < src[1].i64[0] ? ~0U : 0U;
 781    dst->u[1][0] = src[0].i64[1] < src[1].i64[1] ? ~0U : 0U;
 782    dst->u[2][0] = src[0].i64[2] < src[1].i64[2] ? ~0U : 0U;
 783    dst->u[3][0] = src[0].i64[3] < src[1].i64[3] ? ~0U : 0U;
 784 }
 785
 786 static void
 787 micro_u64slt(union tgsi_double_channel *dst,
 788              const union tgsi_double_channel *src)
 789 {
 790    dst->u[0][0] = src[0].u64[0] < src[1].u64[0] ? ~0U : 0U;
 791    dst->u[1][0] = src[0].u64[1] < src[1].u64[1] ? ~0U : 0U;
 792    dst->u[2][0] = src[0].u64[2] < src[1].u64[2] ? ~0U : 0U;
 793    dst->u[3][0] = src[0].u64[3] < src[1].u64[3] ? ~0U : 0U;
 794 }
 795
 796 static void
 797 micro_i64sge(union tgsi_double_channel *dst,
 798            const union tgsi_double_channel *src)
 799 {
 800    dst->u[0][0] = src[0].i64[0] >= src[1].i64[0] ? ~0U : 0U;
 801    dst->u[1][0] = src[0].i64[1] >= src[1].i64[1] ? ~0U : 0U;
 802    dst->u[2][0] = src[0].i64[2] >= src[1].i64[2] ? ~0U : 0U;
 803    dst->u[3][0] = src[0].i64[3] >= src[1].i64[3] ? ~0U : 0U;
 804 }
 805
 806 static void
 807 micro_u64sge(union tgsi_double_channel *dst,
 808              const union tgsi_double_channel *src)
 809 {
 810    dst->u[0][0] = src[0].u64[0] >= src[1].u64[0] ? ~0U : 0U;
 811    dst->u[1][0] = src[0].u64[1] >= src[1].u64[1] ? ~0U : 0U;
 812    dst->u[2][0] = src[0].u64[2] >= src[1].u64[2] ? ~0U : 0U;
 813    dst->u[3][0] = src[0].u64[3] >= src[1].u64[3] ? ~0U : 0U;
 814 }
 815
 816 static void
 817 micro_u64max(union tgsi_double_channel *dst,
 818              const union tgsi_double_channel *src)
 819 {
 820    dst->u64[0] = src[0].u64[0] > src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
 821    dst->u64[1] = src[0].u64[1] > src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
 822    dst->u64[2] = src[0].u64[2] > src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
 823    dst->u64[3] = src[0].u64[3] > src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
 824 }
 825
 826 static void
 827 micro_i64max(union tgsi_double_channel *dst,
 828              const union tgsi_double_channel *src)
 829 {
 830    dst->i64[0] = src[0].i64[0] > src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
 831    dst->i64[1] = src[0].i64[1] > src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
 832    dst->i64[2] = src[0].i64[2] > src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
 833    dst->i64[3] = src[0].i64[3] > src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
 834 }
 835
 836 static void
 837 micro_u64min(union tgsi_double_channel *dst,
 838              const union tgsi_double_channel *src)
 839 {
 840    dst->u64[0] = src[0].u64[0] < src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
 841    dst->u64[1] = src[0].u64[1] < src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
 842    dst->u64[2] = src[0].u64[2] < src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
 843    dst->u64[3] = src[0].u64[3] < src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
 844 }
 845
 846 static void
 847 micro_i64min(union tgsi_double_channel *dst,
 848              const union tgsi_double_channel *src)
 849 {
 850    dst->i64[0] = src[0].i64[0] < src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
 851    dst->i64[1] = src[0].i64[1] < src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
 852    dst->i64[2] = src[0].i64[2] < src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
 853    dst->i64[3] = src[0].i64[3] < src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
 854 }
 855
 856 static void
 857 micro_u64add(union tgsi_double_channel *dst,
 858              const union tgsi_double_channel *src)
 859 {
 860    dst->u64[0] = src[0].u64[0] + src[1].u64[0];
 861    dst->u64[1] = src[0].u64[1] + src[1].u64[1];
 862    dst->u64[2] = src[0].u64[2] + src[1].u64[2];
 863    dst->u64[3] = src[0].u64[3] + src[1].u64[3];
 864 }
 865
 866 static void
 867 micro_u64mul(union tgsi_double_channel *dst,
 868              const union tgsi_double_channel *src)
 869 {
 870    dst->u64[0] = src[0].u64[0] * src[1].u64[0];
 871    dst->u64[1] = src[0].u64[1] * src[1].u64[1];
 872    dst->u64[2] = src[0].u64[2] * src[1].u64[2];
 873    dst->u64[3] = src[0].u64[3] * src[1].u64[3];
 874 }
 875
 876 static void
 877 micro_u64div(union tgsi_double_channel *dst,
 878              const union tgsi_double_channel *src)
 879 {
 880    dst->u64[0] = src[1].u64[0] ? src[0].u64[0] / src[1].u64[0] : ~0ull;
 881    dst->u64[1] = src[1].u64[1] ? src[0].u64[1] / src[1].u64[1] : ~0ull;
 882    dst->u64[2] = src[1].u64[2] ? src[0].u64[2] / src[1].u64[2] : ~0ull;
 883    dst->u64[3] = src[1].u64[3] ? src[0].u64[3] / src[1].u64[3] : ~0ull;
 884 }
 885
 886 static void
 887 micro_i64div(union tgsi_double_channel *dst,
 888              const union tgsi_double_channel *src)
 889 {
 890    dst->i64[0] = src[1].i64[0] ? src[0].i64[0] / src[1].i64[0] : 0;
 891    dst->i64[1] = src[1].i64[1] ? src[0].i64[1] / src[1].i64[1] : 0;
 892    dst->i64[2] = src[1].i64[2] ? src[0].i64[2] / src[1].i64[2] : 0;
 893    dst->i64[3] = src[1].i64[3] ? src[0].i64[3] / src[1].i64[3] : 0;
 894 }
 895
 896 static void
 897 micro_u64mod(union tgsi_double_channel *dst,
 898              const union tgsi_double_channel *src)
 899 {
 900    dst->u64[0] = src[1].u64[0] ? src[0].u64[0] % src[1].u64[0] : ~0ull;
 901    dst->u64[1] = src[1].u64[1] ? src[0].u64[1] % src[1].u64[1] : ~0ull;
 902    dst->u64[2] = src[1].u64[2] ? src[0].u64[2] % src[1].u64[2] : ~0ull;
 903    dst->u64[3] = src[1].u64[3] ? src[0].u64[3] % src[1].u64[3] : ~0ull;
 904 }
 905
 906 static void
 907 micro_i64mod(union tgsi_double_channel *dst,
 908              const union tgsi_double_channel *src)
 909 {
 910    dst->i64[0] = src[1].i64[0] ? src[0].i64[0] % src[1].i64[0] : ~0ll;
 911    dst->i64[1] = src[1].i64[1] ? src[0].i64[1] % src[1].i64[1] : ~0ll;
 912    dst->i64[2] = src[1].i64[2] ? src[0].i64[2] % src[1].i64[2] : ~0ll;
 913    dst->i64[3] = src[1].i64[3] ? src[0].i64[3] % src[1].i64[3] : ~0ll;
 914 }
 915
 916 static void
 917 micro_u64shl(union tgsi_double_channel *dst,
 918              const union tgsi_double_channel *src0,
 919              union tgsi_exec_channel *src1)
 920 {
 921    unsigned masked_count;
 922    masked_count = src1->u[0] & 0x3f;
 923    dst->u64[0] = src0->u64[0] << masked_count;
 924    masked_count = src1->u[1] & 0x3f;
 925    dst->u64[1] = src0->u64[1] << masked_count;
 926    masked_count = src1->u[2] & 0x3f;
 927    dst->u64[2] = src0->u64[2] << masked_count;
 928    masked_count = src1->u[3] & 0x3f;
 929    dst->u64[3] = src0->u64[3] << masked_count;
 930 }
 931
 932 static void
 933 micro_i64shr(union tgsi_double_channel *dst,
 934              const union tgsi_double_channel *src0,
 935              union tgsi_exec_channel *src1)
 936 {
 937    unsigned masked_count;
 938    masked_count = src1->u[0] & 0x3f;
 939    dst->i64[0] = src0->i64[0] >> masked_count;
 940    masked_count = src1->u[1] & 0x3f;
 941    dst->i64[1] = src0->i64[1] >> masked_count;
 942    masked_count = src1->u[2] & 0x3f;
 943    dst->i64[2] = src0->i64[2] >> masked_count;
 944    masked_count = src1->u[3] & 0x3f;
 945    dst->i64[3] = src0->i64[3] >> masked_count;
 946 }
 947
 948 static void
 949 micro_u64shr(union tgsi_double_channel *dst,
 950              const union tgsi_double_channel *src0,
 951              union tgsi_exec_channel *src1)
 952 {
 953    unsigned masked_count;
 954    masked_count = src1->u[0] & 0x3f;
 955    dst->u64[0] = src0->u64[0] >> masked_count;
 956    masked_count = src1->u[1] & 0x3f;
 957    dst->u64[1] = src0->u64[1] >> masked_count;
 958    masked_count = src1->u[2] & 0x3f;
 959    dst->u64[2] = src0->u64[2] >> masked_count;
 960    masked_count = src1->u[3] & 0x3f;
 961    dst->u64[3] = src0->u64[3] >> masked_count;
 962 }
 963
/**
 * Tag describing how the bits of a channel are to be interpreted.
 * fetch_source() compares against TGSI_EXEC_DATA_FLOAT to choose between
 * the float and the integer flavours of the abs/negate source modifiers.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT,
   TGSI_EXEC_DATA_DOUBLE,
   TGSI_EXEC_DATA_INT64,
   TGSI_EXEC_DATA_UINT64,
};
 972
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 * Each name is a plain alias for the TGSI_EXEC_TEMP_* constant with the
 * same suffix.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_PRIMITIVE_S1_I   TGSI_EXEC_TEMP_PRIMITIVE_S1_I
#define TEMP_PRIMITIVE_S1_C   TGSI_EXEC_TEMP_PRIMITIVE_S1_C
#define TEMP_PRIMITIVE_S2_I   TGSI_EXEC_TEMP_PRIMITIVE_S2_I
#define TEMP_PRIMITIVE_S2_C   TGSI_EXEC_TEMP_PRIMITIVE_S2_C
#define TEMP_PRIMITIVE_S3_I   TGSI_EXEC_TEMP_PRIMITIVE_S3_I
#define TEMP_PRIMITIVE_S3_C   TGSI_EXEC_TEMP_PRIMITIVE_S3_C
 988
/*
 * Lookup table pairing each TEMP_PRIMITIVE* register index with its channel.
 * Entry 0 is the unsuffixed TEMP_PRIMITIVE pair; entries 1..3 are the
 * _S1, _S2 and _S3 pairs, in that order.
 */
static const struct {
   int idx;
   int chan;
} temp_prim_idxs[] = {
   { TEMP_PRIMITIVE_I, TEMP_PRIMITIVE_C },
   { TEMP_PRIMITIVE_S1_I, TEMP_PRIMITIVE_S1_C },
   { TEMP_PRIMITIVE_S2_I, TEMP_PRIMITIVE_S2_C },
   { TEMP_PRIMITIVE_S3_I, TEMP_PRIMITIVE_S3_C },
};
 998
 999 /** The execution mask depends on the conditional mask and the loop mask */
1000 #define UPDATE_EXEC_MASK(MACH) \
1001       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
1002
1003
1004 static const union tgsi_exec_channel ZeroVec =
1005    { { 0.0, 0.0, 0.0, 0.0 } };
1006
1007 static const union tgsi_exec_channel OneVec = {
1008    {1.0f, 1.0f, 1.0f, 1.0f}
1009 };
1010
1011 static const union tgsi_exec_channel P128Vec = {
1012    {128.0f, 128.0f, 128.0f, 128.0f}
1013 };
1014
1015 static const union tgsi_exec_channel M128Vec = {
1016    {-128.0f, -128.0f, -128.0f, -128.0f}
1017 };
1018
1019
1020 /**
1021  * Assert that none of the float values in 'chan' are infinite or NaN.
1022  * NaN and Inf may occur normally during program execution and should
1023  * not lead to crashes, etc.  But when debugging, it's helpful to catch
1024  * them.
1025  */
1026 static inline void
1027 check_inf_or_nan(const union tgsi_exec_channel *chan)
1028 {
1029    assert(!util_is_inf_or_nan((chan)->f[0]));
1030    assert(!util_is_inf_or_nan((chan)->f[1]));
1031    assert(!util_is_inf_or_nan((chan)->f[2]));
1032    assert(!util_is_inf_or_nan((chan)->f[3]));
1033 }
1034
1035
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of 'chan', prefixed by 'msg'.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
1044
1045
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index' of 'mach', one line per
 * X/Y/Z/W component, each showing the four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char comp_names[] = "XYZW";
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int comp = 0;

   debug_printf("Temp[%u] =\n", index);
   while (comp < 4) {
      const union tgsi_exec_channel *c = &vec->xyzw[comp];

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   comp_names[comp],
                   c->f[0], c->f[1], c->f[2], c->f[3]);
      comp++;
   }
}
#endif
1063
1064
1065 void
1066 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
1067                                unsigned num_bufs,
1068                                const void **bufs,
1069                                const unsigned *buf_sizes)
1070 {
1071    unsigned i;
1072
1073    for (i = 0; i < num_bufs; i++) {
1074       mach->Consts[i] = bufs[i];
1075       mach->ConstsSize[i] = buf_sizes[i];
1076    }
1077 }
1078
1079 /**
1080  * Initialize machine state by expanding tokens to full instructions,
1081  * allocating temporary storage, setting up constants, etc.
1082  * After this, we can call tgsi_exec_machine_run() many times.
1083  */
1084 void
1085 tgsi_exec_machine_bind_shader(
1086    struct tgsi_exec_machine *mach,
1087    const struct tgsi_token *tokens,
1088    struct tgsi_sampler *sampler,
1089    struct tgsi_image *image,
1090    struct tgsi_buffer *buffer)
1091 {
1092    uint k;
1093    struct tgsi_parse_context parse;
1094    struct tgsi_full_instruction *instructions;
1095    struct tgsi_full_declaration *declarations;
1096    uint maxInstructions = 10, numInstructions = 0;
1097    uint maxDeclarations = 10, numDeclarations = 0;
1098
1099 #if 0
1100    tgsi_dump(tokens, 0);
1101 #endif
1102
1103    util_init_math();
1104
1105
1106    mach->Tokens = tokens;
1107    mach->Sampler = sampler;
1108    mach->Image = image;
1109    mach->Buffer = buffer;
1110
1111    if (!tokens) {
1112       /* unbind and free all */
1113       FREE(mach->Declarations);
1114       mach->Declarations = NULL;
1115       mach->NumDeclarations = 0;
1116
1117       FREE(mach->Instructions);
1118       mach->Instructions = NULL;
1119       mach->NumInstructions = 0;
1120
1121       return;
1122    }
1123
1124    k = tgsi_parse_init (&parse, mach->Tokens);
1125    if (k != TGSI_PARSE_OK) {
1126       debug_printf( "Problem parsing!\n" );
1127       return;
1128    }
1129
1130    mach->ImmLimit = 0;
1131    mach->NumOutputs = 0;
1132
1133    for (k = 0; k < TGSI_SEMANTIC_COUNT; k++)
1134       mach->SysSemanticToIndex[k] = -1;
1135
1136    if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
1137        !mach->UsedGeometryShader) {
1138       struct tgsi_exec_vector *inputs;
1139       struct tgsi_exec_vector *outputs;
1140
1141       inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1142                             TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
1143                             16);
1144
1145       if (!inputs)
1146          return;
1147
1148       outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1149                              TGSI_MAX_TOTAL_VERTICES, 16);
1150
1151       if (!outputs) {
1152          align_free(inputs);
1153          return;
1154       }
1155
1156       align_free(mach->Inputs);
1157       align_free(mach->Outputs);
1158
1159       mach->Inputs = inputs;
1160       mach->Outputs = outputs;
1161       mach->UsedGeometryShader = TRUE;
1162    }
1163
1164    declarations = (struct tgsi_full_declaration *)
1165       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
1166
1167    if (!declarations) {
1168       return;
1169    }
1170
1171    instructions = (struct tgsi_full_instruction *)
1172       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
1173
1174    if (!instructions) {
1175       FREE( declarations );
1176       return;
1177    }
1178
1179    while( !tgsi_parse_end_of_tokens( &parse ) ) {
1180       uint i;
1181
1182       tgsi_parse_token( &parse );
1183       switch( parse.FullToken.Token.Type ) {
1184       case TGSI_TOKEN_TYPE_DECLARATION:
1185          /* save expanded declaration */
1186          if (numDeclarations == maxDeclarations) {
1187             declarations = REALLOC(declarations,
1188                                    maxDeclarations
1189                                    * sizeof(struct tgsi_full_declaration),
1190                                    (maxDeclarations + 10)
1191                                    * sizeof(struct tgsi_full_declaration));
1192             maxDeclarations += 10;
1193          }
1194          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT)
1195             mach->NumOutputs = MAX2(mach->NumOutputs, parse.FullToken.FullDeclaration.Range.Last + 1);
1196          else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1197             const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
1198             mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
1199          }
1200
1201          memcpy(declarations + numDeclarations,
1202                 &parse.FullToken.FullDeclaration,
1203                 sizeof(declarations[0]));
1204          numDeclarations++;
1205          break;
1206
1207       case TGSI_TOKEN_TYPE_IMMEDIATE:
1208          {
1209             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
1210             assert( size <= 4 );
1211             if (mach->ImmLimit >= mach->ImmsReserved) {
1212                unsigned newReserved = mach->ImmsReserved ? 2 * mach->ImmsReserved : 128;
1213                float4 *imms = REALLOC(mach->Imms, mach->ImmsReserved, newReserved * sizeof(float4));
1214                if (imms) {
1215                   mach->ImmsReserved = newReserved;
1216                   mach->Imms = imms;
1217                } else {
1218                   debug_printf("Unable to (re)allocate space for immidiate constants\n");
1219                   break;
1220                }
1221             }
1222
1223             for( i = 0; i < size; i++ ) {
1224                mach->Imms[mach->ImmLimit][i] =
1225                   parse.FullToken.FullImmediate.u[i].Float;
1226             }
1227             mach->ImmLimit += 1;
1228          }
1229          break;
1230
1231       case TGSI_TOKEN_TYPE_INSTRUCTION:
1232
1233          /* save expanded instruction */
1234          if (numInstructions == maxInstructions) {
1235             instructions = REALLOC(instructions,
1236                                    maxInstructions
1237                                    * sizeof(struct tgsi_full_instruction),
1238                                    (maxInstructions + 10)
1239                                    * sizeof(struct tgsi_full_instruction));
1240             maxInstructions += 10;
1241          }
1242
1243          memcpy(instructions + numInstructions,
1244                 &parse.FullToken.FullInstruction,
1245                 sizeof(instructions[0]));
1246
1247          numInstructions++;
1248          break;
1249
1250       case TGSI_TOKEN_TYPE_PROPERTY:
1251          if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
1252             if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
1253                mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
1254             }
1255          }
1256          break;
1257
1258       default:
1259          assert( 0 );
1260       }
1261    }
1262    tgsi_parse_free (&parse);
1263
1264    FREE(mach->Declarations);
1265    mach->Declarations = declarations;
1266    mach->NumDeclarations = numDeclarations;
1267
1268    FREE(mach->Instructions);
1269    mach->Instructions = instructions;
1270    mach->NumInstructions = numInstructions;
1271 }
1272
1273
1274 struct tgsi_exec_machine *
1275 tgsi_exec_machine_create(enum pipe_shader_type shader_type)
1276 {
1277    struct tgsi_exec_machine *mach;
1278
1279    mach = align_malloc( sizeof *mach, 16 );
1280    if (!mach)
1281       goto fail;
1282
1283    memset(mach, 0, sizeof(*mach));
1284
1285    mach->ShaderType = shader_type;
1286    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
1287    mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
1288
1289    if (shader_type != PIPE_SHADER_COMPUTE) {
1290       mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
1291       mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
1292       if (!mach->Inputs || !mach->Outputs)
1293          goto fail;
1294    }
1295
1296    if (shader_type == PIPE_SHADER_FRAGMENT) {
1297       mach->InputSampleOffsetApply = align_malloc(sizeof(apply_sample_offset_func) * PIPE_MAX_SHADER_INPUTS, 16);
1298       if (!mach->InputSampleOffsetApply)
1299          goto fail;
1300    }
1301
1302 #ifdef DEBUG
1303    /* silence warnings */
1304    (void) print_chan;
1305    (void) print_temp;
1306 #endif
1307
1308    return mach;
1309
1310 fail:
1311    if (mach) {
1312       align_free(mach->InputSampleOffsetApply);
1313       align_free(mach->Inputs);
1314       align_free(mach->Outputs);
1315       align_free(mach);
1316    }
1317    return NULL;
1318 }
1319
1320
1321 void
1322 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
1323 {
1324    if (mach) {
1325       FREE(mach->Instructions);
1326       FREE(mach->Declarations);
1327       FREE(mach->Imms);
1328
1329       align_free(mach->InputSampleOffsetApply);
1330       align_free(mach->Inputs);
1331       align_free(mach->Outputs);
1332
1333       align_free(mach);
1334    }
1335 }
1336
1337 static void
1338 micro_add(union tgsi_exec_channel *dst,
1339           const union tgsi_exec_channel *src0,
1340           const union tgsi_exec_channel *src1)
1341 {
1342    dst->f[0] = src0->f[0] + src1->f[0];
1343    dst->f[1] = src0->f[1] + src1->f[1];
1344    dst->f[2] = src0->f[2] + src1->f[2];
1345    dst->f[3] = src0->f[3] + src1->f[3];
1346 }
1347
1348 static void
1349 micro_div(
1350    union tgsi_exec_channel *dst,
1351    const union tgsi_exec_channel *src0,
1352    const union tgsi_exec_channel *src1 )
1353 {
1354    if (src1->f[0] != 0) {
1355       dst->f[0] = src0->f[0] / src1->f[0];
1356    }
1357    if (src1->f[1] != 0) {
1358       dst->f[1] = src0->f[1] / src1->f[1];
1359    }
1360    if (src1->f[2] != 0) {
1361       dst->f[2] = src0->f[2] / src1->f[2];
1362    }
1363    if (src1->f[3] != 0) {
1364       dst->f[3] = src0->f[3] / src1->f[3];
1365    }
1366 }
1367
1368 static void
1369 micro_lt(
1370    union tgsi_exec_channel *dst,
1371    const union tgsi_exec_channel *src0,
1372    const union tgsi_exec_channel *src1,
1373    const union tgsi_exec_channel *src2,
1374    const union tgsi_exec_channel *src3 )
1375 {
1376    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
1377    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
1378    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
1379    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
1380 }
1381
1382 static void
1383 micro_max(union tgsi_exec_channel *dst,
1384           const union tgsi_exec_channel *src0,
1385           const union tgsi_exec_channel *src1)
1386 {
1387    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
1388    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
1389    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
1390    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
1391 }
1392
1393 static void
1394 micro_min(union tgsi_exec_channel *dst,
1395           const union tgsi_exec_channel *src0,
1396           const union tgsi_exec_channel *src1)
1397 {
1398    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
1399    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
1400    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
1401    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
1402 }
1403
1404 static void
1405 micro_mul(union tgsi_exec_channel *dst,
1406           const union tgsi_exec_channel *src0,
1407           const union tgsi_exec_channel *src1)
1408 {
1409    dst->f[0] = src0->f[0] * src1->f[0];
1410    dst->f[1] = src0->f[1] * src1->f[1];
1411    dst->f[2] = src0->f[2] * src1->f[2];
1412    dst->f[3] = src0->f[3] * src1->f[3];
1413 }
1414
1415 static void
1416 micro_neg(
1417    union tgsi_exec_channel *dst,
1418    const union tgsi_exec_channel *src )
1419 {
1420    dst->f[0] = -src->f[0];
1421    dst->f[1] = -src->f[1];
1422    dst->f[2] = -src->f[2];
1423    dst->f[3] = -src->f[3];
1424 }
1425
1426 static void
1427 micro_pow(
1428    union tgsi_exec_channel *dst,
1429    const union tgsi_exec_channel *src0,
1430    const union tgsi_exec_channel *src1 )
1431 {
1432 #if FAST_MATH
1433    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1434    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1435    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1436    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1437 #else
1438    dst->f[0] = powf( src0->f[0], src1->f[0] );
1439    dst->f[1] = powf( src0->f[1], src1->f[1] );
1440    dst->f[2] = powf( src0->f[2], src1->f[2] );
1441    dst->f[3] = powf( src0->f[3], src1->f[3] );
1442 #endif
1443 }
1444
1445 static void
1446 micro_ldexp(union tgsi_exec_channel *dst,
1447             const union tgsi_exec_channel *src0,
1448             const union tgsi_exec_channel *src1)
1449 {
1450    dst->f[0] = ldexpf(src0->f[0], src1->i[0]);
1451    dst->f[1] = ldexpf(src0->f[1], src1->i[1]);
1452    dst->f[2] = ldexpf(src0->f[2], src1->i[2]);
1453    dst->f[3] = ldexpf(src0->f[3], src1->i[3]);
1454 }
1455
1456 static void
1457 micro_sub(union tgsi_exec_channel *dst,
1458           const union tgsi_exec_channel *src0,
1459           const union tgsi_exec_channel *src1)
1460 {
1461    dst->f[0] = src0->f[0] - src1->f[0];
1462    dst->f[1] = src0->f[1] - src1->f[1];
1463    dst->f[2] = src0->f[2] - src1->f[2];
1464    dst->f[3] = src0->f[3] - src1->f[3];
1465 }
1466
1467 static void
1468 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1469                        const uint file,
1470                        const uint swizzle,
1471                        const union tgsi_exec_channel *index,
1472                        const union tgsi_exec_channel *index2D,
1473                        union tgsi_exec_channel *chan)
1474 {
1475    uint i;
1476
1477    assert(swizzle < 4);
1478
1479    switch (file) {
1480    case TGSI_FILE_CONSTANT:
1481       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1482          assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1483          assert(mach->Consts[index2D->i[i]]);
1484
1485          if (index->i[i] < 0) {
1486             chan->u[i] = 0;
1487          } else {
1488             /* NOTE: copying the const value as a uint instead of float */
1489             const uint constbuf = index2D->i[i];
1490             const uint *buf = (const uint *)mach->Consts[constbuf];
1491             const int pos = index->i[i] * 4 + swizzle;
1492             /* const buffer bounds check */
1493             if (pos < 0 || pos >= (int) mach->ConstsSize[constbuf]) {
1494                if (0) {
1495                   /* Debug: print warning */
1496                   static int count = 0;
1497                   if (count++ < 100)
1498                      debug_printf("TGSI Exec: const buffer index %d"
1499                                   " out of bounds\n", pos);
1500                }
1501                chan->u[i] = 0;
1502             }
1503             else
1504                chan->u[i] = buf[pos];
1505          }
1506       }
1507       break;
1508
1509    case TGSI_FILE_INPUT:
1510       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1511          /*
1512          if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1513             debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1514                          index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1515                          index2D->i[i], index->i[i]);
1516                          }*/
1517          int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1518          assert(pos >= 0);
1519          assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1520          chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1521       }
1522       break;
1523
1524    case TGSI_FILE_SYSTEM_VALUE:
1525       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1526          chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
1527       }
1528       break;
1529
1530    case TGSI_FILE_TEMPORARY:
1531       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1532          assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1533          assert(index2D->i[i] == 0);
1534
1535          chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1536       }
1537       break;
1538
1539    case TGSI_FILE_IMMEDIATE:
1540       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1541          assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1542          assert(index2D->i[i] == 0);
1543
1544          chan->f[i] = mach->Imms[index->i[i]][swizzle];
1545       }
1546       break;
1547
1548    case TGSI_FILE_ADDRESS:
1549       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1550          assert(index->i[i] >= 0);
1551          assert(index2D->i[i] == 0);
1552
1553          chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1554       }
1555       break;
1556
1557    case TGSI_FILE_OUTPUT:
1558       /* vertex/fragment output vars can be read too */
1559       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1560          assert(index->i[i] >= 0);
1561          assert(index2D->i[i] == 0);
1562
1563          chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1564       }
1565       break;
1566
1567    default:
1568       assert(0);
1569       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1570          chan->u[i] = 0;
1571       }
1572    }
1573 }
1574
1575 static void
1576 get_index_registers(const struct tgsi_exec_machine *mach,
1577                     const struct tgsi_full_src_register *reg,
1578                     union tgsi_exec_channel *index,
1579                     union tgsi_exec_channel *index2D)
1580 {
1581    uint swizzle;
1582
1583    /* We start with a direct index into a register file.
1584     *
1585     *    file[1],
1586     *    where:
1587     *       file = Register.File
1588     *       [1] = Register.Index
1589     */
1590    index->i[0] =
1591    index->i[1] =
1592    index->i[2] =
1593    index->i[3] = reg->Register.Index;
1594
1595    /* There is an extra source register that indirectly subscripts
1596     * a register file. The direct index now becomes an offset
1597     * that is being added to the indirect register.
1598     *
1599     *    file[ind[2].x+1],
1600     *    where:
1601     *       ind = Indirect.File
1602     *       [2] = Indirect.Index
1603     *       .x = Indirect.SwizzleX
1604     */
1605    if (reg->Register.Indirect) {
1606       union tgsi_exec_channel index2;
1607       union tgsi_exec_channel indir_index;
1608       const uint execmask = mach->ExecMask;
1609       uint i;
1610
1611       /* which address register (always zero now) */
1612       index2.i[0] =
1613       index2.i[1] =
1614       index2.i[2] =
1615       index2.i[3] = reg->Indirect.Index;
1616       /* get current value of address register[swizzle] */
1617       swizzle = reg->Indirect.Swizzle;
1618       fetch_src_file_channel(mach,
1619                              reg->Indirect.File,
1620                              swizzle,
1621                              &index2,
1622                              &ZeroVec,
1623                              &indir_index);
1624
1625       /* add value of address register to the offset */
1626       index->i[0] += indir_index.i[0];
1627       index->i[1] += indir_index.i[1];
1628       index->i[2] += indir_index.i[2];
1629       index->i[3] += indir_index.i[3];
1630
1631       /* for disabled execution channels, zero-out the index to
1632        * avoid using a potential garbage value.
1633        */
1634       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1635          if ((execmask & (1 << i)) == 0)
1636             index->i[i] = 0;
1637       }
1638    }
1639
1640    /* There is an extra source register that is a second
1641     * subscript to a register file. Effectively it means that
1642     * the register file is actually a 2D array of registers.
1643     *
1644     *    file[3][1],
1645     *    where:
1646     *       [3] = Dimension.Index
1647     */
1648    if (reg->Register.Dimension) {
1649       index2D->i[0] =
1650       index2D->i[1] =
1651       index2D->i[2] =
1652       index2D->i[3] = reg->Dimension.Index;
1653
1654       /* Again, the second subscript index can be addressed indirectly
1655        * identically to the first one.
1656        * Nothing stops us from indirectly addressing the indirect register,
1657        * but there is no need for that, so we won't exercise it.
1658        *
1659        *    file[ind[4].y+3][1],
1660        *    where:
1661        *       ind = DimIndirect.File
1662        *       [4] = DimIndirect.Index
1663        *       .y = DimIndirect.SwizzleX
1664        */
1665       if (reg->Dimension.Indirect) {
1666          union tgsi_exec_channel index2;
1667          union tgsi_exec_channel indir_index;
1668          const uint execmask = mach->ExecMask;
1669          uint i;
1670
1671          index2.i[0] =
1672          index2.i[1] =
1673          index2.i[2] =
1674          index2.i[3] = reg->DimIndirect.Index;
1675
1676          swizzle = reg->DimIndirect.Swizzle;
1677          fetch_src_file_channel(mach,
1678                                 reg->DimIndirect.File,
1679                                 swizzle,
1680                                 &index2,
1681                                 &ZeroVec,
1682                                 &indir_index);
1683
1684          index2D->i[0] += indir_index.i[0];
1685          index2D->i[1] += indir_index.i[1];
1686          index2D->i[2] += indir_index.i[2];
1687          index2D->i[3] += indir_index.i[3];
1688
1689          /* for disabled execution channels, zero-out the index to
1690           * avoid using a potential garbage value.
1691           */
1692          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1693             if ((execmask & (1 << i)) == 0) {
1694                index2D->i[i] = 0;
1695             }
1696          }
1697       }
1698
1699       /* If by any chance there was a need for a 3D array of register
1700        * files, we would have to check whether Dimension is followed
1701        * by a dimension register and continue the saga.
1702        */
1703    } else {
1704       index2D->i[0] =
1705       index2D->i[1] =
1706       index2D->i[2] =
1707       index2D->i[3] = 0;
1708    }
1709 }
1710
1711
1712 static void
1713 fetch_source_d(const struct tgsi_exec_machine *mach,
1714                union tgsi_exec_channel *chan,
1715                const struct tgsi_full_src_register *reg,
1716                const uint chan_index)
1717 {
1718    union tgsi_exec_channel index;
1719    union tgsi_exec_channel index2D;
1720    uint swizzle;
1721
1722    get_index_registers(mach, reg, &index, &index2D);
1723
1724
1725    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1726    fetch_src_file_channel(mach,
1727                           reg->Register.File,
1728                           swizzle,
1729                           &index,
1730                           &index2D,
1731                           chan);
1732 }
1733
1734 static void
1735 fetch_source(const struct tgsi_exec_machine *mach,
1736              union tgsi_exec_channel *chan,
1737              const struct tgsi_full_src_register *reg,
1738              const uint chan_index,
1739              enum tgsi_exec_datatype src_datatype)
1740 {
1741    fetch_source_d(mach, chan, reg, chan_index);
1742
1743    if (reg->Register.Absolute) {
1744       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1745          micro_abs(chan, chan);
1746       } else {
1747          micro_iabs(chan, chan);
1748       }
1749    }
1750
1751    if (reg->Register.Negate) {
1752       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1753          micro_neg(chan, chan);
1754       } else {
1755          micro_ineg(chan, chan);
1756       }
1757    }
1758 }
1759
1760 static union tgsi_exec_channel *
1761 store_dest_dstret(struct tgsi_exec_machine *mach,
1762                  const union tgsi_exec_channel *chan,
1763                  const struct tgsi_full_dst_register *reg,
1764                  uint chan_index,
1765                  enum tgsi_exec_datatype dst_datatype)
1766 {
1767    static union tgsi_exec_channel null;
1768    union tgsi_exec_channel *dst;
1769    union tgsi_exec_channel index2D;
1770    int offset = 0;  /* indirection offset */
1771    int index;
1772
1773    /* for debugging */
1774    if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1775       check_inf_or_nan(chan);
1776    }
1777
1778    /* There is an extra source register that indirectly subscripts
1779     * a register file. The direct index now becomes an offset
1780     * that is being added to the indirect register.
1781     *
1782     *    file[ind[2].x+1],
1783     *    where:
1784     *       ind = Indirect.File
1785     *       [2] = Indirect.Index
1786     *       .x = Indirect.SwizzleX
1787     */
1788    if (reg->Register.Indirect) {
1789       union tgsi_exec_channel index;
1790       union tgsi_exec_channel indir_index;
1791       uint swizzle;
1792
1793       /* which address register (always zero for now) */
1794       index.i[0] =
1795       index.i[1] =
1796       index.i[2] =
1797       index.i[3] = reg->Indirect.Index;
1798
1799       /* get current value of address register[swizzle] */
1800       swizzle = reg->Indirect.Swizzle;
1801
1802       /* fetch values from the address/indirection register */
1803       fetch_src_file_channel(mach,
1804                              reg->Indirect.File,
1805                              swizzle,
1806                              &index,
1807                              &ZeroVec,
1808                              &indir_index);
1809
1810       /* save indirection offset */
1811       offset = indir_index.i[0];
1812    }
1813
1814    /* There is an extra source register that is a second
1815     * subscript to a register file. Effectively it means that
1816     * the register file is actually a 2D array of registers.
1817     *
1818     *    file[3][1],
1819     *    where:
1820     *       [3] = Dimension.Index
1821     */
1822    if (reg->Register.Dimension) {
1823       index2D.i[0] =
1824       index2D.i[1] =
1825       index2D.i[2] =
1826       index2D.i[3] = reg->Dimension.Index;
1827
1828       /* Again, the second subscript index can be addressed indirectly
1829        * identically to the first one.
1830        * Nothing stops us from indirectly addressing the indirect register,
1831        * but there is no need for that, so we won't exercise it.
1832        *
1833        *    file[ind[4].y+3][1],
1834        *    where:
1835        *       ind = DimIndirect.File
1836        *       [4] = DimIndirect.Index
1837        *       .y = DimIndirect.SwizzleX
1838        */
1839       if (reg->Dimension.Indirect) {
1840          union tgsi_exec_channel index2;
1841          union tgsi_exec_channel indir_index;
1842          const uint execmask = mach->ExecMask;
1843          unsigned swizzle;
1844          uint i;
1845
1846          index2.i[0] =
1847          index2.i[1] =
1848          index2.i[2] =
1849          index2.i[3] = reg->DimIndirect.Index;
1850
1851          swizzle = reg->DimIndirect.Swizzle;
1852          fetch_src_file_channel(mach,
1853                                 reg->DimIndirect.File,
1854                                 swizzle,
1855                                 &index2,
1856                                 &ZeroVec,
1857                                 &indir_index);
1858
1859          index2D.i[0] += indir_index.i[0];
1860          index2D.i[1] += indir_index.i[1];
1861          index2D.i[2] += indir_index.i[2];
1862          index2D.i[3] += indir_index.i[3];
1863
1864          /* for disabled execution channels, zero-out the index to
1865           * avoid using a potential garbage value.
1866           */
1867          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1868             if ((execmask & (1 << i)) == 0) {
1869                index2D.i[i] = 0;
1870             }
1871          }
1872       }
1873
1874       /* If by any chance there was a need for a 3D array of register
1875        * files, we would have to check whether Dimension is followed
1876        * by a dimension register and continue the saga.
1877        */
1878    } else {
1879       index2D.i[0] =
1880       index2D.i[1] =
1881       index2D.i[2] =
1882       index2D.i[3] = 0;
1883    }
1884
1885    switch (reg->Register.File) {
1886    case TGSI_FILE_NULL:
1887       dst = &null;
1888       break;
1889
1890    case TGSI_FILE_OUTPUT:
1891       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1892          + reg->Register.Index;
1893       dst = &mach->Outputs[offset + index].xyzw[chan_index];
1894 #if 0
1895       debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
1896                    mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
1897                    reg->Register.Index);
1898       if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1899          debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1900          for (i = 0; i < TGSI_QUAD_SIZE; i++)
1901             if (execmask & (1 << i))
1902                debug_printf("%f, ", chan->f[i]);
1903          debug_printf(")\n");
1904       }
1905 #endif
1906       break;
1907
1908    case TGSI_FILE_TEMPORARY:
1909       index = reg->Register.Index;
1910       assert( index < TGSI_EXEC_NUM_TEMPS );
1911       dst = &mach->Temps[offset + index].xyzw[chan_index];
1912       break;
1913
1914    case TGSI_FILE_ADDRESS:
1915       index = reg->Register.Index;
1916       dst = &mach->Addrs[index].xyzw[chan_index];
1917       break;
1918
1919    default:
1920       assert( 0 );
1921       return NULL;
1922    }
1923
1924    return dst;
1925 }
1926
1927 static void
1928 store_dest_double(struct tgsi_exec_machine *mach,
1929                  const union tgsi_exec_channel *chan,
1930                  const struct tgsi_full_dst_register *reg,
1931                  uint chan_index,
1932                  enum tgsi_exec_datatype dst_datatype)
1933 {
1934    union tgsi_exec_channel *dst;
1935    const uint execmask = mach->ExecMask;
1936    int i;
1937
1938    dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
1939    if (!dst)
1940       return;
1941
1942    /* doubles path */
1943    for (i = 0; i < TGSI_QUAD_SIZE; i++)
1944       if (execmask & (1 << i))
1945          dst->i[i] = chan->i[i];
1946 }
1947
1948 static void
1949 store_dest(struct tgsi_exec_machine *mach,
1950            const union tgsi_exec_channel *chan,
1951            const struct tgsi_full_dst_register *reg,
1952            const struct tgsi_full_instruction *inst,
1953            uint chan_index,
1954            enum tgsi_exec_datatype dst_datatype)
1955 {
1956    union tgsi_exec_channel *dst;
1957    const uint execmask = mach->ExecMask;
1958    int i;
1959
1960    dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
1961    if (!dst)
1962       return;
1963
1964    if (!inst->Instruction.Saturate) {
1965       for (i = 0; i < TGSI_QUAD_SIZE; i++)
1966          if (execmask & (1 << i))
1967             dst->i[i] = chan->i[i];
1968    }
1969    else {
1970       for (i = 0; i < TGSI_QUAD_SIZE; i++)
1971          if (execmask & (1 << i)) {
1972             if (chan->f[i] < 0.0f)
1973                dst->f[i] = 0.0f;
1974             else if (chan->f[i] > 1.0f)
1975                dst->f[i] = 1.0f;
1976             else
1977                dst->i[i] = chan->i[i];
1978          }
1979    }
1980 }
1981
/*
 * Shorthand for fetch_source() on an operand of the instruction currently
 * being executed.  Both macros expect variables named `mach` and `inst` to
 * be in scope where they are expanded.  They differ only in the datatype
 * tag passed along: FETCH uses TGSI_EXEC_DATA_FLOAT, IFETCH uses
 * TGSI_EXEC_DATA_INT.
 *
 *    DST      where to put the fetched channel
 *    SRC_IDX  index into inst->Src[]
 *    SWZ      channel of that source to fetch
 */
#define FETCH(DST, SRC_IDX, SWZ) \
   fetch_source(mach, DST, &inst->Src[SRC_IDX], SWZ, TGSI_EXEC_DATA_FLOAT)

#define IFETCH(DST, SRC_IDX, SWZ) \
   fetch_source(mach, DST, &inst->Src[SRC_IDX], SWZ, TGSI_EXEC_DATA_INT)
1987
1988
1989 /**
1990  * Execute ARB-style KIL which is predicated by a src register.
1991  * Kill fragment if any of the four values is less than zero.
1992  */
1993 static void
1994 exec_kill_if(struct tgsi_exec_machine *mach,
1995              const struct tgsi_full_instruction *inst)
1996 {
1997    uint uniquemask;
1998    uint chan_index;
1999    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
2000    union tgsi_exec_channel r[1];
2001
2002    /* This mask stores component bits that were already tested. */
2003    uniquemask = 0;
2004
2005    for (chan_index = 0; chan_index < 4; chan_index++)
2006    {
2007       uint swizzle;
2008       uint i;
2009
2010       /* unswizzle channel */
2011       swizzle = tgsi_util_get_full_src_register_swizzle (
2012                         &inst->Src[0],
2013                         chan_index);
2014
2015       /* check if the component has not been already tested */
2016       if (uniquemask & (1 << swizzle))
2017          continue;
2018       uniquemask |= 1 << swizzle;
2019
2020       FETCH(&r[0], 0, chan_index);
2021       for (i = 0; i < 4; i++)
2022          if (r[0].f[i] < 0.0f)
2023             kilmask |= 1 << i;
2024    }
2025
2026    /* restrict to fragments currently executing */
2027    kilmask &= mach->ExecMask;
2028
2029    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2030 }
2031
2032 /**
2033  * Unconditional fragment kill/discard.
2034  */
2035 static void
2036 exec_kill(struct tgsi_exec_machine *mach)
2037 {
2038    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
2039
2040    /* kill fragment for all fragments currently executing */
2041    kilmask = mach->ExecMask;
2042    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2043 }
2044
2045 static void
2046 emit_vertex(struct tgsi_exec_machine *mach,
2047             const struct tgsi_full_instruction *inst)
2048 {
2049    union tgsi_exec_channel r[1];
2050    unsigned stream_id;
2051    unsigned *prim_count;
2052    /* FIXME: check for exec mask correctly
2053    unsigned i;
2054    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2055          if ((mach->ExecMask & (1 << i)))
2056    */
2057    IFETCH(&r[0], 0, TGSI_CHAN_X);
2058    stream_id = r[0].u[0];
2059    prim_count = &mach->Temps[temp_prim_idxs[stream_id].idx].xyzw[temp_prim_idxs[stream_id].chan].u[0];
2060    if (mach->ExecMask) {
2061       if (mach->Primitives[stream_id][*prim_count] >= mach->MaxOutputVertices)
2062          return;
2063
2064       if (mach->Primitives[stream_id][*prim_count] == 0)
2065          mach->PrimitiveOffsets[stream_id][*prim_count] = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0];
2066       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
2067       mach->Primitives[stream_id][*prim_count]++;
2068    }
2069 }
2070
2071 static void
2072 emit_primitive(struct tgsi_exec_machine *mach,
2073                const struct tgsi_full_instruction *inst)
2074 {
2075    unsigned *prim_count;
2076    union tgsi_exec_channel r[1];
2077    unsigned stream_id = 0;
2078    /* FIXME: check for exec mask correctly
2079    unsigned i;
2080    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2081          if ((mach->ExecMask & (1 << i)))
2082    */
2083    if (inst) {
2084       IFETCH(&r[0], 0, TGSI_CHAN_X);
2085       stream_id = r[0].u[0];
2086    }
2087    prim_count = &mach->Temps[temp_prim_idxs[stream_id].idx].xyzw[temp_prim_idxs[stream_id].chan].u[0];
2088    if (mach->ExecMask) {
2089       ++(*prim_count);
2090       debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
2091       mach->Primitives[stream_id][*prim_count] = 0;
2092    }
2093 }
2094
2095 static void
2096 conditional_emit_primitive(struct tgsi_exec_machine *mach)
2097 {
2098    if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
2099       int emitted_verts =
2100          mach->Primitives[0][mach->Temps[temp_prim_idxs[0].idx].xyzw[temp_prim_idxs[0].chan].u[0]];
2101       if (emitted_verts) {
2102          emit_primitive(mach, NULL);
2103       }
2104    }
2105 }
2106
2107
2108 /*
2109  * Fetch four texture samples using STR texture coordinates.
2110  */
2111 static void
2112 fetch_texel( struct tgsi_sampler *sampler,
2113              const unsigned sview_idx,
2114              const unsigned sampler_idx,
2115              const union tgsi_exec_channel *s,
2116              const union tgsi_exec_channel *t,
2117              const union tgsi_exec_channel *p,
2118              const union tgsi_exec_channel *c0,
2119              const union tgsi_exec_channel *c1,
2120              float derivs[3][2][TGSI_QUAD_SIZE],
2121              const int8_t offset[3],
2122              enum tgsi_sampler_control control,
2123              union tgsi_exec_channel *r,
2124              union tgsi_exec_channel *g,
2125              union tgsi_exec_channel *b,
2126              union tgsi_exec_channel *a )
2127 {
2128    uint j;
2129    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2130
2131    /* FIXME: handle explicit derivs, offsets */
2132    sampler->get_samples(sampler, sview_idx, sampler_idx,
2133                         s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
2134
2135    for (j = 0; j < 4; j++) {
2136       r->f[j] = rgba[0][j];
2137       g->f[j] = rgba[1][j];
2138       b->f[j] = rgba[2][j];
2139       a->f[j] = rgba[3][j];
2140    }
2141 }
2142
2143
/*
 * Values for the `modifier` argument of exec_tex() and exec_sample().
 *
 * NONE          no extra operand; the sampler control stays
 *               TGSI_SAMPLER_LOD_NONE
 * PROJECTED     exec_tex() divides the coordinates (and the shadow
 *               reference) by the modifier operand; exec_sample() asserts
 *               that it is never passed this value
 * LOD_BIAS      selects TGSI_SAMPLER_LOD_BIAS
 * EXPLICIT_LOD  selects TGSI_SAMPLER_LOD_EXPLICIT
 * LEVEL_ZERO    exec_sample() selects TGSI_SAMPLER_LOD_ZERO; exec_tex()
 *               asserts that it is never passed this value
 * GATHER        selects TGSI_SAMPLER_GATHER
 */
#define TEX_MODIFIER_NONE           0
#define TEX_MODIFIER_PROJECTED      1
#define TEX_MODIFIER_LOD_BIAS       2
#define TEX_MODIFIER_EXPLICIT_LOD   3
#define TEX_MODIFIER_LEVEL_ZERO     4
#define TEX_MODIFIER_GATHER         5
2150
2151 /*
2152  * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
2153  */
2154 static void
2155 fetch_texel_offsets(struct tgsi_exec_machine *mach,
2156                     const struct tgsi_full_instruction *inst,
2157                     int8_t offsets[3])
2158 {
2159    if (inst->Texture.NumOffsets == 1) {
2160       union tgsi_exec_channel index;
2161       union tgsi_exec_channel offset[3];
2162       index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
2163       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2164                              inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2165       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2166                              inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2167       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2168                              inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2169      offsets[0] = offset[0].i[0];
2170      offsets[1] = offset[1].i[0];
2171      offsets[2] = offset[2].i[0];
2172    } else {
2173      assert(inst->Texture.NumOffsets == 0);
2174      offsets[0] = offsets[1] = offsets[2] = 0;
2175    }
2176 }
2177
2178
2179 /*
2180  * Fetch dx and dy values for one channel (s, t or r).
2181  * Put dx values into one float array, dy values into another.
2182  */
2183 static void
2184 fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
2185                            const struct tgsi_full_instruction *inst,
2186                            unsigned regdsrcx,
2187                            unsigned chan,
2188                            float derivs[2][TGSI_QUAD_SIZE])
2189 {
2190    union tgsi_exec_channel d;
2191    FETCH(&d, regdsrcx, chan);
2192    derivs[0][0] = d.f[0];
2193    derivs[0][1] = d.f[1];
2194    derivs[0][2] = d.f[2];
2195    derivs[0][3] = d.f[3];
2196    FETCH(&d, regdsrcx + 1, chan);
2197    derivs[1][0] = d.f[0];
2198    derivs[1][1] = d.f[1];
2199    derivs[1][2] = d.f[2];
2200    derivs[1][3] = d.f[3];
2201 }
2202
2203 static uint
2204 fetch_sampler_unit(struct tgsi_exec_machine *mach,
2205                    const struct tgsi_full_instruction *inst,
2206                    uint sampler)
2207 {
2208    uint unit = 0;
2209    int i;
2210    if (inst->Src[sampler].Register.Indirect) {
2211       const struct tgsi_full_src_register *reg = &inst->Src[sampler];
2212       union tgsi_exec_channel indir_index, index2;
2213       const uint execmask = mach->ExecMask;
2214       index2.i[0] =
2215       index2.i[1] =
2216       index2.i[2] =
2217       index2.i[3] = reg->Indirect.Index;
2218
2219       fetch_src_file_channel(mach,
2220                              reg->Indirect.File,
2221                              reg->Indirect.Swizzle,
2222                              &index2,
2223                              &ZeroVec,
2224                              &indir_index);
2225       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2226          if (execmask & (1 << i)) {
2227             unit = inst->Src[sampler].Register.Index + indir_index.i[i];
2228             break;
2229          }
2230       }
2231
2232    } else {
2233       unit = inst->Src[sampler].Register.Index;
2234    }
2235    return unit;
2236 }
2237
2238 /*
2239  * execute a texture instruction.
2240  *
2241  * modifier is used to control the channel routing for the
2242  * instruction variants like proj, lod, and texture with lod bias.
2243  * sampler indicates which src register the sampler is contained in.
2244  */
2245 static void
2246 exec_tex(struct tgsi_exec_machine *mach,
2247          const struct tgsi_full_instruction *inst,
2248          uint modifier, uint sampler)
2249 {
2250    const union tgsi_exec_channel *args[5], *proj = NULL;
2251    union tgsi_exec_channel r[5];
2252    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2253    uint chan;
2254    uint unit;
2255    int8_t offsets[3];
2256    int dim, shadow_ref, i;
2257
2258    unit = fetch_sampler_unit(mach, inst, sampler);
2259    /* always fetch all 3 offsets, overkill but keeps code simple */
2260    fetch_texel_offsets(mach, inst, offsets);
2261
2262    assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
2263    assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
2264
2265    dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2266    shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
2267
2268    assert(dim <= 4);
2269    if (shadow_ref >= 0)
2270       assert(shadow_ref >= dim && shadow_ref < (int)ARRAY_SIZE(args));
2271
2272    /* fetch modifier to the last argument */
2273    if (modifier != TEX_MODIFIER_NONE) {
2274       const int last = ARRAY_SIZE(args) - 1;
2275
2276       /* fetch modifier from src0.w or src1.x */
2277       if (sampler == 1) {
2278          assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
2279          FETCH(&r[last], 0, TGSI_CHAN_W);
2280       }
2281       else {
2282          FETCH(&r[last], 1, TGSI_CHAN_X);
2283       }
2284
2285       if (modifier != TEX_MODIFIER_PROJECTED) {
2286          args[last] = &r[last];
2287       }
2288       else {
2289          proj = &r[last];
2290          args[last] = &ZeroVec;
2291       }
2292
2293       /* point unused arguments to zero vector */
2294       for (i = dim; i < last; i++)
2295          args[i] = &ZeroVec;
2296
2297       if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
2298          control = TGSI_SAMPLER_LOD_EXPLICIT;
2299       else if (modifier == TEX_MODIFIER_LOD_BIAS)
2300          control = TGSI_SAMPLER_LOD_BIAS;
2301       else if (modifier == TEX_MODIFIER_GATHER)
2302          control = TGSI_SAMPLER_GATHER;
2303    }
2304    else {
2305       for (i = dim; i < (int)ARRAY_SIZE(args); i++)
2306          args[i] = &ZeroVec;
2307    }
2308
2309    /* fetch coordinates */
2310    for (i = 0; i < dim; i++) {
2311       FETCH(&r[i], 0, TGSI_CHAN_X + i);
2312
2313       if (proj)
2314          micro_div(&r[i], &r[i], proj);
2315
2316       args[i] = &r[i];
2317    }
2318
2319    /* fetch reference value */
2320    if (shadow_ref >= 0) {
2321       FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));
2322
2323       if (proj)
2324          micro_div(&r[shadow_ref], &r[shadow_ref], proj);
2325
2326       args[shadow_ref] = &r[shadow_ref];
2327    }
2328
2329    fetch_texel(mach->Sampler, unit, unit,
2330          args[0], args[1], args[2], args[3], args[4],
2331          NULL, offsets, control,
2332          &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2333
2334 #if 0
2335    debug_printf("fetch r: %g %g %g %g\n",
2336          r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
2337    debug_printf("fetch g: %g %g %g %g\n",
2338          r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
2339    debug_printf("fetch b: %g %g %g %g\n",
2340          r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
2341    debug_printf("fetch a: %g %g %g %g\n",
2342          r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
2343 #endif
2344
2345    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2346       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2347          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2348       }
2349    }
2350 }
2351
2352 static void
2353 exec_lodq(struct tgsi_exec_machine *mach,
2354           const struct tgsi_full_instruction *inst)
2355 {
2356    uint resource_unit, sampler_unit;
2357    unsigned dim;
2358    unsigned i;
2359    union tgsi_exec_channel coords[4];
2360    const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
2361    union tgsi_exec_channel r[2];
2362
2363    resource_unit = fetch_sampler_unit(mach, inst, 1);
2364    if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2365       uint target = mach->SamplerViews[resource_unit].Resource;
2366       dim = tgsi_util_get_texture_coord_dim(target);
2367       sampler_unit = fetch_sampler_unit(mach, inst, 2);
2368    } else {
2369       dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2370       sampler_unit = resource_unit;
2371    }
2372    assert(dim <= ARRAY_SIZE(coords));
2373    /* fetch coordinates */
2374    for (i = 0; i < dim; i++) {
2375       FETCH(&coords[i], 0, TGSI_CHAN_X + i);
2376       args[i] = &coords[i];
2377    }
2378    for (i = dim; i < ARRAY_SIZE(coords); i++) {
2379       args[i] = &ZeroVec;
2380    }
2381    mach->Sampler->query_lod(mach->Sampler, resource_unit, sampler_unit,
2382                             args[0]->f,
2383                             args[1]->f,
2384                             args[2]->f,
2385                             args[3]->f,
2386                             TGSI_SAMPLER_LOD_NONE,
2387                             r[0].f,
2388                             r[1].f);
2389
2390    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2391       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2392                  TGSI_EXEC_DATA_FLOAT);
2393    }
2394    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2395       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2396                  TGSI_EXEC_DATA_FLOAT);
2397    }
2398    if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2399       unsigned char swizzles[4];
2400       unsigned chan;
2401       swizzles[0] = inst->Src[1].Register.SwizzleX;
2402       swizzles[1] = inst->Src[1].Register.SwizzleY;
2403       swizzles[2] = inst->Src[1].Register.SwizzleZ;
2404       swizzles[3] = inst->Src[1].Register.SwizzleW;
2405
2406       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2407          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2408             if (swizzles[chan] >= 2) {
2409                store_dest(mach, &ZeroVec,
2410                           &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2411             } else {
2412                store_dest(mach, &r[swizzles[chan]],
2413                           &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2414             }
2415          }
2416       }
2417    } else {
2418       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2419          store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2420                     TGSI_EXEC_DATA_FLOAT);
2421       }
2422       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2423          store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2424                     TGSI_EXEC_DATA_FLOAT);
2425       }
2426    }
2427 }
2428
2429 static void
2430 exec_txd(struct tgsi_exec_machine *mach,
2431          const struct tgsi_full_instruction *inst)
2432 {
2433    union tgsi_exec_channel r[4];
2434    float derivs[3][2][TGSI_QUAD_SIZE];
2435    uint chan;
2436    uint unit;
2437    int8_t offsets[3];
2438
2439    unit = fetch_sampler_unit(mach, inst, 3);
2440    /* always fetch all 3 offsets, overkill but keeps code simple */
2441    fetch_texel_offsets(mach, inst, offsets);
2442
2443    switch (inst->Texture.Texture) {
2444    case TGSI_TEXTURE_1D:
2445       FETCH(&r[0], 0, TGSI_CHAN_X);
2446
2447       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2448
2449       fetch_texel(mach->Sampler, unit, unit,
2450                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2451                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2452                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2453       break;
2454
2455    case TGSI_TEXTURE_SHADOW1D:
2456    case TGSI_TEXTURE_1D_ARRAY:
2457    case TGSI_TEXTURE_SHADOW1D_ARRAY:
2458       /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
2459       FETCH(&r[0], 0, TGSI_CHAN_X);
2460       FETCH(&r[1], 0, TGSI_CHAN_Y);
2461       FETCH(&r[2], 0, TGSI_CHAN_Z);
2462
2463       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2464
2465       fetch_texel(mach->Sampler, unit, unit,
2466                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2467                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2468                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2469       break;
2470
2471    case TGSI_TEXTURE_2D:
2472    case TGSI_TEXTURE_RECT:
2473       FETCH(&r[0], 0, TGSI_CHAN_X);
2474       FETCH(&r[1], 0, TGSI_CHAN_Y);
2475
2476       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2477       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2478
2479       fetch_texel(mach->Sampler, unit, unit,
2480                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2481                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2482                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2483       break;
2484
2485
2486    case TGSI_TEXTURE_SHADOW2D:
2487    case TGSI_TEXTURE_SHADOWRECT:
2488    case TGSI_TEXTURE_2D_ARRAY:
2489    case TGSI_TEXTURE_SHADOW2D_ARRAY:
2490       /* only SHADOW2D_ARRAY actually needs W */
2491       FETCH(&r[0], 0, TGSI_CHAN_X);
2492       FETCH(&r[1], 0, TGSI_CHAN_Y);
2493       FETCH(&r[2], 0, TGSI_CHAN_Z);
2494       FETCH(&r[3], 0, TGSI_CHAN_W);
2495
2496       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2497       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2498
2499       fetch_texel(mach->Sampler, unit, unit,
2500                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2501                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2502                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2503       break;
2504
2505    case TGSI_TEXTURE_3D:
2506    case TGSI_TEXTURE_CUBE:
2507    case TGSI_TEXTURE_CUBE_ARRAY:
2508    case TGSI_TEXTURE_SHADOWCUBE:
2509       /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
2510       FETCH(&r[0], 0, TGSI_CHAN_X);
2511       FETCH(&r[1], 0, TGSI_CHAN_Y);
2512       FETCH(&r[2], 0, TGSI_CHAN_Z);
2513       FETCH(&r[3], 0, TGSI_CHAN_W);
2514
2515       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2516       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2517       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
2518
2519       fetch_texel(mach->Sampler, unit, unit,
2520                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2521                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2522                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2523       break;
2524
2525    default:
2526       assert(0);
2527    }
2528
2529    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2530       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2531          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2532       }
2533    }
2534 }
2535
2536
2537 static void
2538 exec_txf(struct tgsi_exec_machine *mach,
2539          const struct tgsi_full_instruction *inst)
2540 {
2541    union tgsi_exec_channel r[4];
2542    uint chan;
2543    uint unit;
2544    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2545    int j;
2546    int8_t offsets[3];
2547    unsigned target;
2548
2549    unit = fetch_sampler_unit(mach, inst, 1);
2550    /* always fetch all 3 offsets, overkill but keeps code simple */
2551    fetch_texel_offsets(mach, inst, offsets);
2552
2553    IFETCH(&r[3], 0, TGSI_CHAN_W);
2554
2555    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2556        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2557       target = mach->SamplerViews[unit].Resource;
2558    }
2559    else {
2560       target = inst->Texture.Texture;
2561    }
2562    switch(target) {
2563    case TGSI_TEXTURE_3D:
2564    case TGSI_TEXTURE_2D_ARRAY:
2565    case TGSI_TEXTURE_SHADOW2D_ARRAY:
2566    case TGSI_TEXTURE_2D_ARRAY_MSAA:
2567       IFETCH(&r[2], 0, TGSI_CHAN_Z);
2568       /* fallthrough */
2569    case TGSI_TEXTURE_2D:
2570    case TGSI_TEXTURE_RECT:
2571    case TGSI_TEXTURE_SHADOW1D_ARRAY:
2572    case TGSI_TEXTURE_SHADOW2D:
2573    case TGSI_TEXTURE_SHADOWRECT:
2574    case TGSI_TEXTURE_1D_ARRAY:
2575    case TGSI_TEXTURE_2D_MSAA:
2576       IFETCH(&r[1], 0, TGSI_CHAN_Y);
2577       /* fallthrough */
2578    case TGSI_TEXTURE_BUFFER:
2579    case TGSI_TEXTURE_1D:
2580    case TGSI_TEXTURE_SHADOW1D:
2581       IFETCH(&r[0], 0, TGSI_CHAN_X);
2582       break;
2583    default:
2584       assert(0);
2585       break;
2586    }
2587
2588    mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
2589                             offsets, rgba);
2590
2591    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
2592       r[0].f[j] = rgba[0][j];
2593       r[1].f[j] = rgba[1][j];
2594       r[2].f[j] = rgba[2][j];
2595       r[3].f[j] = rgba[3][j];
2596    }
2597
2598    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2599        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2600       unsigned char swizzles[4];
2601       swizzles[0] = inst->Src[1].Register.SwizzleX;
2602       swizzles[1] = inst->Src[1].Register.SwizzleY;
2603       swizzles[2] = inst->Src[1].Register.SwizzleZ;
2604       swizzles[3] = inst->Src[1].Register.SwizzleW;
2605
2606       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2607          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2608             store_dest(mach, &r[swizzles[chan]],
2609                        &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2610          }
2611       }
2612    }
2613    else {
2614       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2615          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2616             store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2617          }
2618       }
2619    }
2620 }
2621
2622 static void
2623 exec_txq(struct tgsi_exec_machine *mach,
2624          const struct tgsi_full_instruction *inst)
2625 {
2626    int result[4];
2627    union tgsi_exec_channel r[4], src;
2628    uint chan;
2629    uint unit;
2630    int i,j;
2631
2632    unit = fetch_sampler_unit(mach, inst, 1);
2633
2634    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
2635
2636    /* XXX: This interface can't return per-pixel values */
2637    mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
2638
2639    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2640       for (j = 0; j < 4; j++) {
2641          r[j].i[i] = result[j];
2642       }
2643    }
2644
2645    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2646       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2647          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2648                     TGSI_EXEC_DATA_INT);
2649       }
2650    }
2651 }
2652
2653 static void
2654 exec_sample(struct tgsi_exec_machine *mach,
2655             const struct tgsi_full_instruction *inst,
2656             uint modifier, boolean compare)
2657 {
2658    const uint resource_unit = inst->Src[1].Register.Index;
2659    const uint sampler_unit = inst->Src[2].Register.Index;
2660    union tgsi_exec_channel r[5], c1;
2661    const union tgsi_exec_channel *lod = &ZeroVec;
2662    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2663    uint chan;
2664    unsigned char swizzles[4];
2665    int8_t offsets[3];
2666
2667    /* always fetch all 3 offsets, overkill but keeps code simple */
2668    fetch_texel_offsets(mach, inst, offsets);
2669
2670    assert(modifier != TEX_MODIFIER_PROJECTED);
2671
2672    if (modifier != TEX_MODIFIER_NONE) {
2673       if (modifier == TEX_MODIFIER_LOD_BIAS) {
2674          FETCH(&c1, 3, TGSI_CHAN_X);
2675          lod = &c1;
2676          control = TGSI_SAMPLER_LOD_BIAS;
2677       }
2678       else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2679          FETCH(&c1, 3, TGSI_CHAN_X);
2680          lod = &c1;
2681          control = TGSI_SAMPLER_LOD_EXPLICIT;
2682       }
2683       else if (modifier == TEX_MODIFIER_GATHER) {
2684          control = TGSI_SAMPLER_GATHER;
2685       }
2686       else {
2687          assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
2688          control = TGSI_SAMPLER_LOD_ZERO;
2689       }
2690    }
2691
2692    FETCH(&r[0], 0, TGSI_CHAN_X);
2693
2694    switch (mach->SamplerViews[resource_unit].Resource) {
2695    case TGSI_TEXTURE_1D:
2696       if (compare) {
2697          FETCH(&r[2], 3, TGSI_CHAN_X);
2698          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2699                      &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
2700                      NULL, offsets, control,
2701                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2702       }
2703       else {
2704          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2705                      &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
2706                      NULL, offsets, control,
2707                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2708       }
2709       break;
2710
2711    case TGSI_TEXTURE_1D_ARRAY:
2712    case TGSI_TEXTURE_2D:
2713    case TGSI_TEXTURE_RECT:
2714       FETCH(&r[1], 0, TGSI_CHAN_Y);
2715       if (compare) {
2716          FETCH(&r[2], 3, TGSI_CHAN_X);
2717          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2718                      &r[0], &r[1], &r[2], &ZeroVec, lod,    /* S, T, P, C, LOD */
2719                      NULL, offsets, control,
2720                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2721       }
2722       else {
2723          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2724                      &r[0], &r[1], &ZeroVec, &ZeroVec, lod,    /* S, T, P, C, LOD */
2725                      NULL, offsets, control,
2726                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2727       }
2728       break;
2729
2730    case TGSI_TEXTURE_2D_ARRAY:
2731    case TGSI_TEXTURE_3D:
2732    case TGSI_TEXTURE_CUBE:
2733       FETCH(&r[1], 0, TGSI_CHAN_Y);
2734       FETCH(&r[2], 0, TGSI_CHAN_Z);
2735       if(compare) {
2736          FETCH(&r[3], 3, TGSI_CHAN_X);
2737          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2738                      &r[0], &r[1], &r[2], &r[3], lod,
2739                      NULL, offsets, control,
2740                      &r[0], &r[1], &r[2], &r[3]);
2741       }
2742       else {
2743          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2744                      &r[0], &r[1], &r[2], &ZeroVec, lod,
2745                      NULL, offsets, control,
2746                      &r[0], &r[1], &r[2], &r[3]);
2747       }
2748       break;
2749
2750    case TGSI_TEXTURE_CUBE_ARRAY:
2751       FETCH(&r[1], 0, TGSI_CHAN_Y);
2752       FETCH(&r[2], 0, TGSI_CHAN_Z);
2753       FETCH(&r[3], 0, TGSI_CHAN_W);
2754       if(compare) {
2755          FETCH(&r[4], 3, TGSI_CHAN_X);
2756          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2757                      &r[0], &r[1], &r[2], &r[3], &r[4],
2758                      NULL, offsets, control,
2759                      &r[0], &r[1], &r[2], &r[3]);
2760       }
2761       else {
2762          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2763                      &r[0], &r[1], &r[2], &r[3], lod,
2764                      NULL, offsets, control,
2765                      &r[0], &r[1], &r[2], &r[3]);
2766       }
2767       break;
2768
2769
2770    default:
2771       assert(0);
2772    }
2773
2774    swizzles[0] = inst->Src[1].Register.SwizzleX;
2775    swizzles[1] = inst->Src[1].Register.SwizzleY;
2776    swizzles[2] = inst->Src[1].Register.SwizzleZ;
2777    swizzles[3] = inst->Src[1].Register.SwizzleW;
2778
2779    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2780       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2781          store_dest(mach, &r[swizzles[chan]],
2782                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2783       }
2784    }
2785 }
2786
2787 static void
2788 exec_sample_d(struct tgsi_exec_machine *mach,
2789               const struct tgsi_full_instruction *inst)
2790 {
2791    const uint resource_unit = inst->Src[1].Register.Index;
2792    const uint sampler_unit = inst->Src[2].Register.Index;
2793    union tgsi_exec_channel r[4];
2794    float derivs[3][2][TGSI_QUAD_SIZE];
2795    uint chan;
2796    unsigned char swizzles[4];
2797    int8_t offsets[3];
2798
2799    /* always fetch all 3 offsets, overkill but keeps code simple */
2800    fetch_texel_offsets(mach, inst, offsets);
2801
2802    FETCH(&r[0], 0, TGSI_CHAN_X);
2803
2804    switch (mach->SamplerViews[resource_unit].Resource) {
2805    case TGSI_TEXTURE_1D:
2806    case TGSI_TEXTURE_1D_ARRAY:
2807       /* only 1D array actually needs Y */
2808       FETCH(&r[1], 0, TGSI_CHAN_Y);
2809
2810       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2811
2812       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2813                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2814                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2815                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2816       break;
2817
2818    case TGSI_TEXTURE_2D:
2819    case TGSI_TEXTURE_RECT:
2820    case TGSI_TEXTURE_2D_ARRAY:
2821       /* only 2D array actually needs Z */
2822       FETCH(&r[1], 0, TGSI_CHAN_Y);
2823       FETCH(&r[2], 0, TGSI_CHAN_Z);
2824
2825       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2826       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2827
2828       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2829                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* inputs */
2830                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2831                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2832       break;
2833
2834    case TGSI_TEXTURE_3D:
2835    case TGSI_TEXTURE_CUBE:
2836    case TGSI_TEXTURE_CUBE_ARRAY:
2837       /* only cube array actually needs W */
2838       FETCH(&r[1], 0, TGSI_CHAN_Y);
2839       FETCH(&r[2], 0, TGSI_CHAN_Z);
2840       FETCH(&r[3], 0, TGSI_CHAN_W);
2841
2842       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2843       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2844       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
2845
2846       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2847                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,
2848                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2849                   &r[0], &r[1], &r[2], &r[3]);
2850       break;
2851
2852    default:
2853       assert(0);
2854    }
2855
2856    swizzles[0] = inst->Src[1].Register.SwizzleX;
2857    swizzles[1] = inst->Src[1].Register.SwizzleY;
2858    swizzles[2] = inst->Src[1].Register.SwizzleZ;
2859    swizzles[3] = inst->Src[1].Register.SwizzleW;
2860
2861    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2862       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2863          store_dest(mach, &r[swizzles[chan]],
2864                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2865       }
2866    }
2867 }
2868
2869
2870 /**
2871  * Evaluate a constant-valued coefficient at the position of the
2872  * current quad.
2873  */
2874 static void
2875 eval_constant_coef(
2876    struct tgsi_exec_machine *mach,
2877    unsigned attrib,
2878    unsigned chan )
2879 {
2880    unsigned i;
2881
2882    for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
2883       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2884    }
2885 }
2886
2887 static void
2888 interp_constant_offset(
2889       UNUSED const struct tgsi_exec_machine *mach,
2890       UNUSED unsigned attrib,
2891       UNUSED unsigned chan,
2892       UNUSED float ofs_x,
2893       UNUSED float ofs_y,
2894       UNUSED union tgsi_exec_channel *out_chan)
2895 {
2896 }
2897
2898 /**
2899  * Evaluate a linear-valued coefficient at the position of the
2900  * current quad.
2901  */
2902 static void
2903 interp_linear_offset(
2904       const struct tgsi_exec_machine *mach,
2905       unsigned attrib,
2906       unsigned chan,
2907       float ofs_x,
2908       float ofs_y,
2909       union tgsi_exec_channel *out_chan)
2910 {
2911    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2912    const float dady = mach->InterpCoefs[attrib].dady[chan];
2913    const float delta = ofs_x * dadx + ofs_y * dady;
2914    out_chan->f[0] += delta;
2915    out_chan->f[1] += delta;
2916    out_chan->f[2] += delta;
2917    out_chan->f[3] += delta;
2918 }
2919
2920 static void
2921 eval_linear_coef(struct tgsi_exec_machine *mach,
2922                  unsigned attrib,
2923                  unsigned chan)
2924 {
2925    const float x = mach->QuadPos.xyzw[0].f[0];
2926    const float y = mach->QuadPos.xyzw[1].f[0];
2927    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2928    const float dady = mach->InterpCoefs[attrib].dady[chan];
2929    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2930
2931    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2932    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2933    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2934    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2935 }
2936
2937 /**
2938  * Evaluate a perspective-valued coefficient at the position of the
2939  * current quad.
2940  */
2941
2942 static void
2943 interp_perspective_offset(
2944    const struct tgsi_exec_machine *mach,
2945    unsigned attrib,
2946    unsigned chan,
2947    float ofs_x,
2948    float ofs_y,
2949    union tgsi_exec_channel *out_chan)
2950 {
2951    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2952    const float dady = mach->InterpCoefs[attrib].dady[chan];
2953    const float *w = mach->QuadPos.xyzw[3].f;
2954    const float delta = ofs_x * dadx + ofs_y * dady;
2955    out_chan->f[0] += delta / w[0];
2956    out_chan->f[1] += delta / w[1];
2957    out_chan->f[2] += delta / w[2];
2958    out_chan->f[3] += delta / w[3];
2959 }
2960
2961 static void
2962 eval_perspective_coef(
2963    struct tgsi_exec_machine *mach,
2964    unsigned attrib,
2965    unsigned chan )
2966 {
2967    const float x = mach->QuadPos.xyzw[0].f[0];
2968    const float y = mach->QuadPos.xyzw[1].f[0];
2969    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2970    const float dady = mach->InterpCoefs[attrib].dady[chan];
2971    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2972    const float *w = mach->QuadPos.xyzw[3].f;
2973    /* divide by W here */
2974    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2975    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2976    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2977    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2978 }
2979
2980
/**
 * Signature shared by the input evaluators (eval_constant_coef,
 * eval_linear_coef, eval_perspective_coef): evaluate channel \p chan of
 * input attribute \p attrib for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
2985
2986 static void
2987 exec_declaration(struct tgsi_exec_machine *mach,
2988                  const struct tgsi_full_declaration *decl)
2989 {
2990    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
2991       mach->SamplerViews[decl->Range.First] = decl->SamplerView;
2992       return;
2993    }
2994
2995    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
2996       if (decl->Declaration.File == TGSI_FILE_INPUT) {
2997          uint first, last, mask;
2998
2999          first = decl->Range.First;
3000          last = decl->Range.Last;
3001          mask = decl->Declaration.UsageMask;
3002
3003          /* XXX we could remove this special-case code since
3004           * mach->InterpCoefs[first].a0 should already have the
3005           * front/back-face value.  But we should first update the
3006           * ureg code to emit the right UsageMask value (WRITEMASK_X).
3007           * Then, we could remove the tgsi_exec_machine::Face field.
3008           */
3009          /* XXX make FACE a system value */
3010          if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
3011             uint i;
3012
3013             assert(decl->Semantic.Index == 0);
3014             assert(first == last);
3015
3016             for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3017                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
3018             }
3019          } else {
3020             eval_coef_func eval;
3021             apply_sample_offset_func interp;
3022             uint i, j;
3023
3024             switch (decl->Interp.Interpolate) {
3025             case TGSI_INTERPOLATE_CONSTANT:
3026                eval = eval_constant_coef;
3027                interp = interp_constant_offset;
3028                break;
3029
3030             case TGSI_INTERPOLATE_LINEAR:
3031                eval = eval_linear_coef;
3032                interp = interp_linear_offset;
3033                break;
3034
3035             case TGSI_INTERPOLATE_PERSPECTIVE:
3036                eval = eval_perspective_coef;
3037                interp = interp_perspective_offset;
3038                break;
3039
3040             case TGSI_INTERPOLATE_COLOR:
3041                eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
3042                interp = mach->flatshade_color ? interp_constant_offset : interp_perspective_offset;
3043                break;
3044
3045             default:
3046                assert(0);
3047                return;
3048             }
3049
3050             for (i = first; i <= last; i++)
3051                mach->InputSampleOffsetApply[i] = interp;
3052
3053             for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
3054                if (mask & (1 << j)) {
3055                   for (i = first; i <= last; i++) {
3056                      eval(mach, i, j);
3057                   }
3058                }
3059             }
3060          }
3061
3062          if (DEBUG_EXECUTION) {
3063             uint i, j;
3064             for (i = first; i <= last; ++i) {
3065                debug_printf("IN[%2u] = ", i);
3066                for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
3067                   if (j > 0) {
3068                      debug_printf("         ");
3069                   }
3070                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3071                                mach->Inputs[i].xyzw[0].f[j], mach->Inputs[i].xyzw[0].u[j],
3072                                mach->Inputs[i].xyzw[1].f[j], mach->Inputs[i].xyzw[1].u[j],
3073                                mach->Inputs[i].xyzw[2].f[j], mach->Inputs[i].xyzw[2].u[j],
3074                                mach->Inputs[i].xyzw[3].f[j], mach->Inputs[i].xyzw[3].u[j]);
3075                }
3076             }
3077          }
3078       }
3079    }
3080
3081 }
3082
/**
 * Callback type for one-operand micro operations: reads \p src and writes
 * the result to \p dst.
 */
typedef void (*micro_unary_op)(union tgsi_exec_channel *dst,
                               const union tgsi_exec_channel *src);
3085
3086 static void
3087 exec_scalar_unary(struct tgsi_exec_machine *mach,
3088                   const struct tgsi_full_instruction *inst,
3089                   micro_unary_op op,
3090                   enum tgsi_exec_datatype dst_datatype,
3091                   enum tgsi_exec_datatype src_datatype)
3092 {
3093    unsigned int chan;
3094    union tgsi_exec_channel src;
3095    union tgsi_exec_channel dst;
3096
3097    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
3098    op(&dst, &src);
3099    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3100       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3101          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
3102       }
3103    }
3104 }
3105
3106 static void
3107 exec_vector_unary(struct tgsi_exec_machine *mach,
3108                   const struct tgsi_full_instruction *inst,
3109                   micro_unary_op op,
3110                   enum tgsi_exec_datatype dst_datatype,
3111                   enum tgsi_exec_datatype src_datatype)
3112 {
3113    unsigned int chan;
3114    struct tgsi_exec_vector dst;
3115
3116    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3117       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3118          union tgsi_exec_channel src;
3119
3120          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
3121          op(&dst.xyzw[chan], &src);
3122       }
3123    }
3124    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3125       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3126          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3127       }
3128    }
3129 }
3130
/**
 * Callback type for two-operand micro operations: reads \p src0 and
 * \p src1 and writes the result to \p dst.
 */
typedef void (*micro_binary_op)(union tgsi_exec_channel *dst,
                                const union tgsi_exec_channel *src0,
                                const union tgsi_exec_channel *src1);
3134
3135 static void
3136 exec_scalar_binary(struct tgsi_exec_machine *mach,
3137                    const struct tgsi_full_instruction *inst,
3138                    micro_binary_op op,
3139                    enum tgsi_exec_datatype dst_datatype,
3140                    enum tgsi_exec_datatype src_datatype)
3141 {
3142    unsigned int chan;
3143    union tgsi_exec_channel src[2];
3144    union tgsi_exec_channel dst;
3145
3146    fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
3147    fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, src_datatype);
3148    op(&dst, &src[0], &src[1]);
3149    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3150       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3151          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
3152       }
3153    }
3154 }
3155
3156 static void
3157 exec_vector_binary(struct tgsi_exec_machine *mach,
3158                    const struct tgsi_full_instruction *inst,
3159                    micro_binary_op op,
3160                    enum tgsi_exec_datatype dst_datatype,
3161                    enum tgsi_exec_datatype src_datatype)
3162 {
3163    unsigned int chan;
3164    struct tgsi_exec_vector dst;
3165
3166    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3167       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3168          union tgsi_exec_channel src[2];
3169
3170          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3171          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3172          op(&dst.xyzw[chan], &src[0], &src[1]);
3173       }
3174    }
3175    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3176       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3177          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3178       }
3179    }
3180 }
3181
/**
 * Callback type for three-operand micro operations: reads \p src0,
 * \p src1 and \p src2 and writes the result to \p dst.
 */
typedef void (*micro_trinary_op)(union tgsi_exec_channel *dst,
                                 const union tgsi_exec_channel *src0,
                                 const union tgsi_exec_channel *src1,
                                 const union tgsi_exec_channel *src2);
3186
3187 static void
3188 exec_vector_trinary(struct tgsi_exec_machine *mach,
3189                     const struct tgsi_full_instruction *inst,
3190                     micro_trinary_op op,
3191                     enum tgsi_exec_datatype dst_datatype,
3192                     enum tgsi_exec_datatype src_datatype)
3193 {
3194    unsigned int chan;
3195    struct tgsi_exec_vector dst;
3196
3197    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3198       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3199          union tgsi_exec_channel src[3];
3200
3201          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3202          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3203          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3204          op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3205       }
3206    }
3207    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3208       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3209          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3210       }
3211    }
3212 }
3213
/**
 * Callback type for four-operand micro operations: reads \p src0 through
 * \p src3 and writes the result to \p dst.
 */
typedef void (*micro_quaternary_op)(union tgsi_exec_channel *dst,
                                    const union tgsi_exec_channel *src0,
                                    const union tgsi_exec_channel *src1,
                                    const union tgsi_exec_channel *src2,
                                    const union tgsi_exec_channel *src3);
3219
3220 static void
3221 exec_vector_quaternary(struct tgsi_exec_machine *mach,
3222                        const struct tgsi_full_instruction *inst,
3223                        micro_quaternary_op op,
3224                        enum tgsi_exec_datatype dst_datatype,
3225                        enum tgsi_exec_datatype src_datatype)
3226 {
3227    unsigned int chan;
3228    struct tgsi_exec_vector dst;
3229
3230    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3231       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3232          union tgsi_exec_channel src[4];
3233
3234          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3235          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3236          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3237          fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
3238          op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
3239       }
3240    }
3241    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3242       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3243          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3244       }
3245    }
3246 }
3247
3248 static void
3249 exec_dp3(struct tgsi_exec_machine *mach,
3250          const struct tgsi_full_instruction *inst)
3251 {
3252    unsigned int chan;
3253    union tgsi_exec_channel arg[3];
3254
3255    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3256    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3257    micro_mul(&arg[2], &arg[0], &arg[1]);
3258
3259    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
3260       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3261       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3262       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3263    }
3264
3265    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3266       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3267          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3268       }
3269    }
3270 }
3271
3272 static void
3273 exec_dp4(struct tgsi_exec_machine *mach,
3274          const struct tgsi_full_instruction *inst)
3275 {
3276    unsigned int chan;
3277    union tgsi_exec_channel arg[3];
3278
3279    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3280    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3281    micro_mul(&arg[2], &arg[0], &arg[1]);
3282
3283    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
3284       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3285       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3286       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3287    }
3288
3289    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3290       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3291          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3292       }
3293    }
3294 }
3295
3296 static void
3297 exec_dp2(struct tgsi_exec_machine *mach,
3298          const struct tgsi_full_instruction *inst)
3299 {
3300    unsigned int chan;
3301    union tgsi_exec_channel arg[3];
3302
3303    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3304    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3305    micro_mul(&arg[2], &arg[0], &arg[1]);
3306
3307    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3308    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3309    micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3310
3311    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3312       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3313          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3314       }
3315    }
3316 }
3317
3318 static void
3319 exec_pk2h(struct tgsi_exec_machine *mach,
3320           const struct tgsi_full_instruction *inst)
3321 {
3322    unsigned chan;
3323    union tgsi_exec_channel arg[2], dst;
3324
3325    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3326    fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3327    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3328       dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
3329          (util_float_to_half(arg[1].f[chan]) << 16);
3330    }
3331    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3332       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3333          store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
3334       }
3335    }
3336 }
3337
3338 static void
3339 exec_up2h(struct tgsi_exec_machine *mach,
3340           const struct tgsi_full_instruction *inst)
3341 {
3342    unsigned chan;
3343    union tgsi_exec_channel arg, dst[2];
3344
3345    fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3346    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3347       dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
3348       dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
3349    }
3350    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3351       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3352          store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3353       }
3354    }
3355 }
3356
3357 static void
3358 micro_ucmp(union tgsi_exec_channel *dst,
3359            const union tgsi_exec_channel *src0,
3360            const union tgsi_exec_channel *src1,
3361            const union tgsi_exec_channel *src2)
3362 {
3363    dst->f[0] = src0->u[0] ? src1->f[0] : src2->f[0];
3364    dst->f[1] = src0->u[1] ? src1->f[1] : src2->f[1];
3365    dst->f[2] = src0->u[2] ? src1->f[2] : src2->f[2];
3366    dst->f[3] = src0->u[3] ? src1->f[3] : src2->f[3];
3367 }
3368
3369 static void
3370 exec_ucmp(struct tgsi_exec_machine *mach,
3371           const struct tgsi_full_instruction *inst)
3372 {
3373    unsigned int chan;
3374    struct tgsi_exec_vector dst;
3375
3376    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3377       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3378          union tgsi_exec_channel src[3];
3379
3380          fetch_source(mach, &src[0], &inst->Src[0], chan,
3381                       TGSI_EXEC_DATA_UINT);
3382          fetch_source(mach, &src[1], &inst->Src[1], chan,
3383                       TGSI_EXEC_DATA_FLOAT);
3384          fetch_source(mach, &src[2], &inst->Src[2], chan,
3385                       TGSI_EXEC_DATA_FLOAT);
3386          micro_ucmp(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3387       }
3388    }
3389    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3390       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3391          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan,
3392                     TGSI_EXEC_DATA_FLOAT);
3393       }
3394    }
3395 }
3396
3397 static void
3398 exec_dst(struct tgsi_exec_machine *mach,
3399          const struct tgsi_full_instruction *inst)
3400 {
3401    union tgsi_exec_channel r[2];
3402    union tgsi_exec_channel d[4];
3403
3404    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3405       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3406       fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3407       micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
3408    }
3409    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3410       fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3411    }
3412    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3413       fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3414    }
3415
3416    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3417       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3418    }
3419    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3420       store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3421    }
3422    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3423       store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3424    }
3425    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3426       store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3427    }
3428 }
3429
3430 static void
3431 exec_log(struct tgsi_exec_machine *mach,
3432          const struct tgsi_full_instruction *inst)
3433 {
3434    union tgsi_exec_channel r[3];
3435
3436    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3437    micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
3438    micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
3439    micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
3440    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3441       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3442    }
3443    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3444       micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
3445       micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
3446       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3447    }
3448    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3449       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3450    }
3451    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3452       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3453    }
3454 }
3455
3456 static void
3457 exec_exp(struct tgsi_exec_machine *mach,
3458          const struct tgsi_full_instruction *inst)
3459 {
3460    union tgsi_exec_channel r[3];
3461
3462    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3463    micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
3464    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3465       micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
3466       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3467    }
3468    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3469       micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
3470       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3471    }
3472    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3473       micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
3474       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3475    }
3476    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3477       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3478    }
3479 }
3480
3481 static void
3482 exec_lit(struct tgsi_exec_machine *mach,
3483          const struct tgsi_full_instruction *inst)
3484 {
3485    union tgsi_exec_channel r[3];
3486    union tgsi_exec_channel d[3];
3487
3488    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3489       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3490       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3491          fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3492          micro_max(&r[1], &r[1], &ZeroVec);
3493
3494          fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3495          micro_min(&r[2], &r[2], &P128Vec);
3496          micro_max(&r[2], &r[2], &M128Vec);
3497          micro_pow(&r[1], &r[1], &r[2]);
3498          micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3499          store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3500       }
3501       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3502          micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
3503          store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3504       }
3505    }
3506    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3507       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3508    }
3509
3510    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3511       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3512    }
3513 }
3514
3515 static void
3516 exec_break(struct tgsi_exec_machine *mach)
3517 {
3518    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3519       /* turn off loop channels for each enabled exec channel */
3520       mach->LoopMask &= ~mach->ExecMask;
3521       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3522       UPDATE_EXEC_MASK(mach);
3523    } else {
3524       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3525
3526       mach->Switch.mask = 0x0;
3527
3528       UPDATE_EXEC_MASK(mach);
3529    }
3530 }
3531
3532 static void
3533 exec_switch(struct tgsi_exec_machine *mach,
3534             const struct tgsi_full_instruction *inst)
3535 {
3536    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3537    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3538
3539    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3540    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3541    mach->Switch.mask = 0x0;
3542    mach->Switch.defaultMask = 0x0;
3543
3544    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3545    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3546
3547    UPDATE_EXEC_MASK(mach);
3548 }
3549
3550 static void
3551 exec_case(struct tgsi_exec_machine *mach,
3552           const struct tgsi_full_instruction *inst)
3553 {
3554    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3555    union tgsi_exec_channel src;
3556    uint mask = 0;
3557
3558    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3559
3560    if (mach->Switch.selector.u[0] == src.u[0]) {
3561       mask |= 0x1;
3562    }
3563    if (mach->Switch.selector.u[1] == src.u[1]) {
3564       mask |= 0x2;
3565    }
3566    if (mach->Switch.selector.u[2] == src.u[2]) {
3567       mask |= 0x4;
3568    }
3569    if (mach->Switch.selector.u[3] == src.u[3]) {
3570       mask |= 0x8;
3571    }
3572
3573    mach->Switch.defaultMask |= mask;
3574
3575    mach->Switch.mask |= mask & prevMask;
3576
3577    UPDATE_EXEC_MASK(mach);
3578 }
3579
3580 /* FIXME: this will only work if default is last */
3581 static void
3582 exec_default(struct tgsi_exec_machine *mach)
3583 {
3584    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3585
3586    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3587
3588    UPDATE_EXEC_MASK(mach);
3589 }
3590
3591 static void
3592 exec_endswitch(struct tgsi_exec_machine *mach)
3593 {
3594    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3595    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3596
3597    UPDATE_EXEC_MASK(mach);
3598 }
3599
/**
 * 64-bit micro-op: 64-bit destination, 64-bit source(s).
 * Callers with more than one operand pass \p src as an array.
 */
typedef void (*micro_dop)(union tgsi_double_channel *dst,
                          const union tgsi_double_channel *src);

/** 64-bit micro-op taking a 64-bit first operand and a 32-bit second one. */
typedef void (*micro_dop_sop)(union tgsi_double_channel *dst,
                              const union tgsi_double_channel *src0,
                              union tgsi_exec_channel *src1);

/** Micro-op producing a 64-bit result from a 32-bit source. */
typedef void (*micro_dop_s)(union tgsi_double_channel *dst,
                            const union tgsi_exec_channel *src);

/** Micro-op producing a 32-bit result from a 64-bit source. */
typedef void (*micro_sop_d)(union tgsi_exec_channel *dst,
                            const union tgsi_double_channel *src);
3612
3613 static void
3614 fetch_double_channel(struct tgsi_exec_machine *mach,
3615                      union tgsi_double_channel *chan,
3616                      const struct tgsi_full_src_register *reg,
3617                      uint chan_0,
3618                      uint chan_1)
3619 {
3620    union tgsi_exec_channel src[2];
3621    uint i;
3622
3623    fetch_source_d(mach, &src[0], reg, chan_0);
3624    fetch_source_d(mach, &src[1], reg, chan_1);
3625
3626    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3627       chan->u[i][0] = src[0].u[i];
3628       chan->u[i][1] = src[1].u[i];
3629    }
3630    if (reg->Register.Absolute) {
3631       micro_dabs(chan, chan);
3632    }
3633    if (reg->Register.Negate) {
3634       micro_dneg(chan, chan);
3635    }
3636 }
3637
3638 static void
3639 store_double_channel(struct tgsi_exec_machine *mach,
3640                      const union tgsi_double_channel *chan,
3641                      const struct tgsi_full_dst_register *reg,
3642                      const struct tgsi_full_instruction *inst,
3643                      uint chan_0,
3644                      uint chan_1)
3645 {
3646    union tgsi_exec_channel dst[2];
3647    uint i;
3648    union tgsi_double_channel temp;
3649    const uint execmask = mach->ExecMask;
3650
3651    if (!inst->Instruction.Saturate) {
3652       for (i = 0; i < TGSI_QUAD_SIZE; i++)
3653          if (execmask & (1 << i)) {
3654             dst[0].u[i] = chan->u[i][0];
3655             dst[1].u[i] = chan->u[i][1];
3656          }
3657    }
3658    else {
3659       for (i = 0; i < TGSI_QUAD_SIZE; i++)
3660          if (execmask & (1 << i)) {
3661             if (chan->d[i] < 0.0)
3662                temp.d[i] = 0.0;
3663             else if (chan->d[i] > 1.0)
3664                temp.d[i] = 1.0;
3665             else
3666                temp.d[i] = chan->d[i];
3667
3668             dst[0].u[i] = temp.u[i][0];
3669             dst[1].u[i] = temp.u[i][1];
3670          }
3671    }
3672
3673    store_dest_double(mach, &dst[0], reg, chan_0, TGSI_EXEC_DATA_UINT);
3674    if (chan_1 != (unsigned)-1)
3675       store_dest_double(mach, &dst[1], reg, chan_1, TGSI_EXEC_DATA_UINT);
3676 }
3677
3678 static void
3679 exec_double_unary(struct tgsi_exec_machine *mach,
3680                   const struct tgsi_full_instruction *inst,
3681                   micro_dop op)
3682 {
3683    union tgsi_double_channel src;
3684    union tgsi_double_channel dst;
3685
3686    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3687       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3688       op(&dst, &src);
3689       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3690    }
3691    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3692       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3693       op(&dst, &src);
3694       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3695    }
3696 }
3697
3698 static void
3699 exec_double_binary(struct tgsi_exec_machine *mach,
3700                    const struct tgsi_full_instruction *inst,
3701                    micro_dop op,
3702                    enum tgsi_exec_datatype dst_datatype)
3703 {
3704    union tgsi_double_channel src[2];
3705    union tgsi_double_channel dst;
3706    int first_dest_chan, second_dest_chan;
3707    int wmask;
3708
3709    wmask = inst->Dst[0].Register.WriteMask;
3710    /* these are & because of the way DSLT etc store their destinations */
3711    if (wmask & TGSI_WRITEMASK_XY) {
3712       first_dest_chan = TGSI_CHAN_X;
3713       second_dest_chan = TGSI_CHAN_Y;
3714       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3715          first_dest_chan = (wmask & TGSI_WRITEMASK_X) ? TGSI_CHAN_X : TGSI_CHAN_Y;
3716          second_dest_chan = -1;
3717       }
3718
3719       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3720       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3721       op(&dst, src);
3722       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3723    }
3724
3725    if (wmask & TGSI_WRITEMASK_ZW) {
3726       first_dest_chan = TGSI_CHAN_Z;
3727       second_dest_chan = TGSI_CHAN_W;
3728       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3729          first_dest_chan = (wmask & TGSI_WRITEMASK_Z) ? TGSI_CHAN_Z : TGSI_CHAN_W;
3730          second_dest_chan = -1;
3731       }
3732
3733       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3734       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3735       op(&dst, src);
3736       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3737    }
3738 }
3739
3740 static void
3741 exec_double_trinary(struct tgsi_exec_machine *mach,
3742                     const struct tgsi_full_instruction *inst,
3743                     micro_dop op)
3744 {
3745    union tgsi_double_channel src[3];
3746    union tgsi_double_channel dst;
3747
3748    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3749       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3750       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3751       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_X, TGSI_CHAN_Y);
3752       op(&dst, src);
3753       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3754    }
3755    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3756       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3757       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3758       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_CHAN_W);
3759       op(&dst, src);
3760       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3761    }
3762 }
3763
3764 static void
3765 exec_dldexp(struct tgsi_exec_machine *mach,
3766             const struct tgsi_full_instruction *inst)
3767 {
3768    union tgsi_double_channel src0;
3769    union tgsi_exec_channel src1;
3770    union tgsi_double_channel dst;
3771    int wmask;
3772
3773    wmask = inst->Dst[0].Register.WriteMask;
3774    if (wmask & TGSI_WRITEMASK_XY) {
3775       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3776       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3777       micro_dldexp(&dst, &src0, &src1);
3778       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3779    }
3780
3781    if (wmask & TGSI_WRITEMASK_ZW) {
3782       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3783       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3784       micro_dldexp(&dst, &src0, &src1);
3785       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3786    }
3787 }
3788
3789 static void
3790 exec_dfracexp(struct tgsi_exec_machine *mach,
3791               const struct tgsi_full_instruction *inst)
3792 {
3793    union tgsi_double_channel src;
3794    union tgsi_double_channel dst;
3795    union tgsi_exec_channel dst_exp;
3796
3797    fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3798    micro_dfracexp(&dst, &dst_exp, &src);
3799    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)
3800       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3801    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)
3802       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3803    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3804       if (inst->Dst[1].Register.WriteMask & (1 << chan))
3805          store_dest(mach, &dst_exp, &inst->Dst[1], inst, chan, TGSI_EXEC_DATA_INT);
3806    }
3807 }
3808
3809 static void
3810 exec_arg0_64_arg1_32(struct tgsi_exec_machine *mach,
3811             const struct tgsi_full_instruction *inst,
3812             micro_dop_sop op)
3813 {
3814    union tgsi_double_channel src0;
3815    union tgsi_exec_channel src1;
3816    union tgsi_double_channel dst;
3817    int wmask;
3818
3819    wmask = inst->Dst[0].Register.WriteMask;
3820    if (wmask & TGSI_WRITEMASK_XY) {
3821       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3822       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3823       op(&dst, &src0, &src1);
3824       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3825    }
3826
3827    if (wmask & TGSI_WRITEMASK_ZW) {
3828       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3829       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3830       op(&dst, &src0, &src1);
3831       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3832    }
3833 }
3834
3835 static int
3836 get_image_coord_dim(unsigned tgsi_tex)
3837 {
3838    int dim;
3839    switch (tgsi_tex) {
3840    case TGSI_TEXTURE_BUFFER:
3841    case TGSI_TEXTURE_1D:
3842       dim = 1;
3843       break;
3844    case TGSI_TEXTURE_2D:
3845    case TGSI_TEXTURE_RECT:
3846    case TGSI_TEXTURE_1D_ARRAY:
3847    case TGSI_TEXTURE_2D_MSAA:
3848       dim = 2;
3849       break;
3850    case TGSI_TEXTURE_3D:
3851    case TGSI_TEXTURE_CUBE:
3852    case TGSI_TEXTURE_2D_ARRAY:
3853    case TGSI_TEXTURE_2D_ARRAY_MSAA:
3854    case TGSI_TEXTURE_CUBE_ARRAY:
3855       dim = 3;
3856       break;
3857    default:
3858       assert(!"unknown texture target");
3859       dim = 0;
3860       break;
3861    }
3862
3863    return dim;
3864 }
3865
3866 static int
3867 get_image_coord_sample(unsigned tgsi_tex)
3868 {
3869    int sample = 0;
3870    switch (tgsi_tex) {
3871    case TGSI_TEXTURE_2D_MSAA:
3872       sample = 3;
3873       break;
3874    case TGSI_TEXTURE_2D_ARRAY_MSAA:
3875       sample = 4;
3876       break;
3877    default:
3878       break;
3879    }
3880    return sample;
3881 }
3882
3883 static void
3884 exec_load_img(struct tgsi_exec_machine *mach,
3885               const struct tgsi_full_instruction *inst)
3886 {
3887    union tgsi_exec_channel r[4], sample_r;
3888    uint unit;
3889    int sample;
3890    int i, j;
3891    int dim;
3892    uint chan;
3893    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3894    struct tgsi_image_params params;
3895    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3896
3897    unit = fetch_sampler_unit(mach, inst, 0);
3898    dim = get_image_coord_dim(inst->Memory.Texture);
3899    sample = get_image_coord_sample(inst->Memory.Texture);
3900    assert(dim <= 3);
3901
3902    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3903    params.unit = unit;
3904    params.tgsi_tex_instr = inst->Memory.Texture;
3905    params.format = inst->Memory.Format;
3906
3907    for (i = 0; i < dim; i++) {
3908       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
3909    }
3910
3911    if (sample)
3912       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
3913
3914    mach->Image->load(mach->Image, &params,
3915                      r[0].i, r[1].i, r[2].i, sample_r.i,
3916                      rgba);
3917    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3918       r[0].f[j] = rgba[0][j];
3919       r[1].f[j] = rgba[1][j];
3920       r[2].f[j] = rgba[2][j];
3921       r[3].f[j] = rgba[3][j];
3922    }
3923    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3924       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3925          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3926       }
3927    }
3928 }
3929
3930 static void
3931 exec_load_buf(struct tgsi_exec_machine *mach,
3932               const struct tgsi_full_instruction *inst)
3933 {
3934    union tgsi_exec_channel r[4];
3935    uint unit;
3936    int j;
3937    uint chan;
3938    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3939    struct tgsi_buffer_params params;
3940    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3941
3942    unit = fetch_sampler_unit(mach, inst, 0);
3943
3944    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3945    params.unit = unit;
3946    IFETCH(&r[0], 1, TGSI_CHAN_X);
3947
3948    mach->Buffer->load(mach->Buffer, &params,
3949                       r[0].i, rgba);
3950    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3951       r[0].f[j] = rgba[0][j];
3952       r[1].f[j] = rgba[1][j];
3953       r[2].f[j] = rgba[2][j];
3954       r[3].f[j] = rgba[3][j];
3955    }
3956    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3957       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3958          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3959       }
3960    }
3961 }
3962
3963 static void
3964 exec_load_mem(struct tgsi_exec_machine *mach,
3965               const struct tgsi_full_instruction *inst)
3966 {
3967    union tgsi_exec_channel r[4];
3968    uint chan;
3969    char *ptr = mach->LocalMem;
3970    uint32_t offset;
3971    int j;
3972
3973    IFETCH(&r[0], 1, TGSI_CHAN_X);
3974    if (r[0].u[0] >= mach->LocalMemSize)
3975       return;
3976
3977    offset = r[0].u[0];
3978    ptr += offset;
3979
3980    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3981       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3982          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3983             memcpy(&r[chan].u[j], ptr + (4 * chan), 4);
3984          }
3985       }
3986    }
3987
3988    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3989       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3990          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3991       }
3992    }
3993 }
3994
3995 static void
3996 exec_load(struct tgsi_exec_machine *mach,
3997           const struct tgsi_full_instruction *inst)
3998 {
3999    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4000       exec_load_img(mach, inst);
4001    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
4002       exec_load_buf(mach, inst);
4003    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
4004       exec_load_mem(mach, inst);
4005 }
4006
4007 static uint
4008 fetch_store_img_unit(struct tgsi_exec_machine *mach,
4009                      const struct tgsi_full_dst_register *dst)
4010 {
4011    uint unit = 0;
4012    int i;
4013    if (dst->Register.Indirect) {
4014       union tgsi_exec_channel indir_index, index2;
4015       const uint execmask = mach->ExecMask;
4016       index2.i[0] =
4017       index2.i[1] =
4018       index2.i[2] =
4019       index2.i[3] = dst->Indirect.Index;
4020
4021       fetch_src_file_channel(mach,
4022                              dst->Indirect.File,
4023                              dst->Indirect.Swizzle,
4024                              &index2,
4025                              &ZeroVec,
4026                              &indir_index);
4027       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4028          if (execmask & (1 << i)) {
4029             unit = dst->Register.Index + indir_index.i[i];
4030             break;
4031          }
4032       }
4033    } else {
4034       unit = dst->Register.Index;
4035    }
4036    return unit;
4037 }
4038
4039 static void
4040 exec_store_img(struct tgsi_exec_machine *mach,
4041                const struct tgsi_full_instruction *inst)
4042 {
4043    union tgsi_exec_channel r[3], sample_r;
4044    union tgsi_exec_channel value[4];
4045    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4046    struct tgsi_image_params params;
4047    int dim;
4048    int sample;
4049    int i, j;
4050    uint unit;
4051    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4052    unit = fetch_store_img_unit(mach, &inst->Dst[0]);
4053    dim = get_image_coord_dim(inst->Memory.Texture);
4054    sample = get_image_coord_sample(inst->Memory.Texture);
4055    assert(dim <= 3);
4056
4057    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4058    params.unit = unit;
4059    params.tgsi_tex_instr = inst->Memory.Texture;
4060    params.format = inst->Memory.Format;
4061
4062    for (i = 0; i < dim; i++) {
4063       IFETCH(&r[i], 0, TGSI_CHAN_X + i);
4064    }
4065
4066    for (i = 0; i < 4; i++) {
4067       FETCH(&value[i], 1, TGSI_CHAN_X + i);
4068    }
4069    if (sample)
4070       IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
4071
4072    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4073       rgba[0][j] = value[0].f[j];
4074       rgba[1][j] = value[1].f[j];
4075       rgba[2][j] = value[2].f[j];
4076       rgba[3][j] = value[3].f[j];
4077    }
4078
4079    mach->Image->store(mach->Image, &params,
4080                       r[0].i, r[1].i, r[2].i, sample_r.i,
4081                       rgba);
4082 }
4083
4084 static void
4085 exec_store_buf(struct tgsi_exec_machine *mach,
4086                const struct tgsi_full_instruction *inst)
4087 {
4088    union tgsi_exec_channel r[3];
4089    union tgsi_exec_channel value[4];
4090    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4091    struct tgsi_buffer_params params;
4092    int i, j;
4093    uint unit;
4094    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4095
4096    unit = fetch_store_img_unit(mach, &inst->Dst[0]);
4097
4098    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4099    params.unit = unit;
4100    params.writemask = inst->Dst[0].Register.WriteMask;
4101
4102    IFETCH(&r[0], 0, TGSI_CHAN_X);
4103    for (i = 0; i < 4; i++) {
4104       FETCH(&value[i], 1, TGSI_CHAN_X + i);
4105    }
4106
4107    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4108       rgba[0][j] = value[0].f[j];
4109       rgba[1][j] = value[1].f[j];
4110       rgba[2][j] = value[2].f[j];
4111       rgba[3][j] = value[3].f[j];
4112    }
4113
4114    mach->Buffer->store(mach->Buffer, &params,
4115                       r[0].i,
4116                       rgba);
4117 }
4118
4119 static void
4120 exec_store_mem(struct tgsi_exec_machine *mach,
4121                const struct tgsi_full_instruction *inst)
4122 {
4123    union tgsi_exec_channel r[3];
4124    union tgsi_exec_channel value[4];
4125    uint i, chan;
4126    char *ptr = mach->LocalMem;
4127    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4128    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4129
4130    IFETCH(&r[0], 0, TGSI_CHAN_X);
4131
4132    for (i = 0; i < 4; i++) {
4133       FETCH(&value[i], 1, TGSI_CHAN_X + i);
4134    }
4135
4136    if (r[0].u[0] >= mach->LocalMemSize)
4137       return;
4138    ptr += r[0].u[0];
4139
4140    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4141       if (execmask & (1 << i)) {
4142          for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4143             if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4144                memcpy(ptr + (chan * 4), &value[chan].u[0], 4);
4145             }
4146          }
4147       }
4148    }
4149 }
4150
4151 static void
4152 exec_store(struct tgsi_exec_machine *mach,
4153            const struct tgsi_full_instruction *inst)
4154 {
4155    if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
4156       exec_store_img(mach, inst);
4157    else if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
4158       exec_store_buf(mach, inst);
4159    else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
4160       exec_store_mem(mach, inst);
4161 }
4162
4163 static void
4164 exec_atomop_img(struct tgsi_exec_machine *mach,
4165                 const struct tgsi_full_instruction *inst)
4166 {
4167    union tgsi_exec_channel r[4], sample_r;
4168    union tgsi_exec_channel value[4], value2[4];
4169    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4170    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4171    struct tgsi_image_params params;
4172    int dim;
4173    int sample;
4174    int i, j;
4175    uint unit, chan;
4176    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4177    unit = fetch_sampler_unit(mach, inst, 0);
4178    dim = get_image_coord_dim(inst->Memory.Texture);
4179    sample = get_image_coord_sample(inst->Memory.Texture);
4180    assert(dim <= 3);
4181
4182    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4183    params.unit = unit;
4184    params.tgsi_tex_instr = inst->Memory.Texture;
4185    params.format = inst->Memory.Format;
4186
4187    for (i = 0; i < dim; i++) {
4188       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
4189    }
4190
4191    for (i = 0; i < 4; i++) {
4192       FETCH(&value[i], 2, TGSI_CHAN_X + i);
4193       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4194          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4195    }
4196    if (sample)
4197       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
4198
4199    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4200       rgba[0][j] = value[0].f[j];
4201       rgba[1][j] = value[1].f[j];
4202       rgba[2][j] = value[2].f[j];
4203       rgba[3][j] = value[3].f[j];
4204    }
4205    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4206       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4207          rgba2[0][j] = value2[0].f[j];
4208          rgba2[1][j] = value2[1].f[j];
4209          rgba2[2][j] = value2[2].f[j];
4210          rgba2[3][j] = value2[3].f[j];
4211       }
4212    }
4213
4214    mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
4215                    r[0].i, r[1].i, r[2].i, sample_r.i,
4216                    rgba, rgba2);
4217
4218    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4219       r[0].f[j] = rgba[0][j];
4220       r[1].f[j] = rgba[1][j];
4221       r[2].f[j] = rgba[2][j];
4222       r[3].f[j] = rgba[3][j];
4223    }
4224    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4225       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4226          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4227       }
4228    }
4229 }
4230
4231 static void
4232 exec_atomop_buf(struct tgsi_exec_machine *mach,
4233                 const struct tgsi_full_instruction *inst)
4234 {
4235    union tgsi_exec_channel r[4];
4236    union tgsi_exec_channel value[4], value2[4];
4237    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4238    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4239    struct tgsi_buffer_params params;
4240    int i, j;
4241    uint unit, chan;
4242    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4243
4244    unit = fetch_sampler_unit(mach, inst, 0);
4245
4246    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4247    params.unit = unit;
4248    params.writemask = inst->Dst[0].Register.WriteMask;
4249
4250    IFETCH(&r[0], 1, TGSI_CHAN_X);
4251
4252    for (i = 0; i < 4; i++) {
4253       FETCH(&value[i], 2, TGSI_CHAN_X + i);
4254       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4255          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4256    }
4257
4258    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4259       rgba[0][j] = value[0].f[j];
4260       rgba[1][j] = value[1].f[j];
4261       rgba[2][j] = value[2].f[j];
4262       rgba[3][j] = value[3].f[j];
4263    }
4264    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4265       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4266          rgba2[0][j] = value2[0].f[j];
4267          rgba2[1][j] = value2[1].f[j];
4268          rgba2[2][j] = value2[2].f[j];
4269          rgba2[3][j] = value2[3].f[j];
4270       }
4271    }
4272
4273    mach->Buffer->op(mach->Buffer, &params, inst->Instruction.Opcode,
4274                    r[0].i,
4275                    rgba, rgba2);
4276
4277    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4278       r[0].f[j] = rgba[0][j];
4279       r[1].f[j] = rgba[1][j];
4280       r[2].f[j] = rgba[2][j];
4281       r[3].f[j] = rgba[3][j];
4282    }
4283    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4284       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4285          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4286       }
4287    }
4288 }
4289
4290 static void
4291 exec_atomop_mem(struct tgsi_exec_machine *mach,
4292                 const struct tgsi_full_instruction *inst)
4293 {
4294    union tgsi_exec_channel r[4];
4295    union tgsi_exec_channel value[4], value2[4];
4296    char *ptr = mach->LocalMem;
4297    uint32_t val;
4298    uint chan, i;
4299    uint32_t offset;
4300    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4301    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4302    IFETCH(&r[0], 1, TGSI_CHAN_X);
4303
4304    if (r[0].u[0] >= mach->LocalMemSize)
4305       return;
4306
4307    offset = r[0].u[0];
4308    ptr += offset;
4309    for (i = 0; i < 4; i++) {
4310       FETCH(&value[i], 2, TGSI_CHAN_X + i);
4311       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4312          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4313    }
4314
4315    memcpy(&r[0].u[0], ptr, 4);
4316    val = r[0].u[0];
4317    switch (inst->Instruction.Opcode) {
4318    case TGSI_OPCODE_ATOMUADD:
4319       val += value[0].u[0];
4320       break;
4321    case TGSI_OPCODE_ATOMXOR:
4322       val ^= value[0].u[0];
4323       break;
4324    case TGSI_OPCODE_ATOMOR:
4325       val |= value[0].u[0];
4326       break;
4327    case TGSI_OPCODE_ATOMAND:
4328       val &= value[0].u[0];
4329       break;
4330    case TGSI_OPCODE_ATOMUMIN:
4331       val = MIN2(val, value[0].u[0]);
4332       break;
4333    case TGSI_OPCODE_ATOMUMAX:
4334       val = MAX2(val, value[0].u[0]);
4335       break;
4336    case TGSI_OPCODE_ATOMIMIN:
4337       val = MIN2(r[0].i[0], value[0].i[0]);
4338       break;
4339    case TGSI_OPCODE_ATOMIMAX:
4340       val = MAX2(r[0].i[0], value[0].i[0]);
4341       break;
4342    case TGSI_OPCODE_ATOMXCHG:
4343       val = value[0].i[0];
4344       break;
4345    case TGSI_OPCODE_ATOMCAS:
4346       if (val == value[0].u[0])
4347          val = value2[0].u[0];
4348       break;
4349    case TGSI_OPCODE_ATOMFADD:
4350       val = fui(r[0].f[0] + value[0].f[0]);
4351       break;
4352    default:
4353       break;
4354    }
4355    for (i = 0; i < TGSI_QUAD_SIZE; i++)
4356       if (execmask & (1 << i))
4357          memcpy(ptr, &val, 4);
4358
4359    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4360       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4361          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4362       }
4363    }
4364 }
4365
4366 static void
4367 exec_atomop(struct tgsi_exec_machine *mach,
4368             const struct tgsi_full_instruction *inst)
4369 {
4370    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4371       exec_atomop_img(mach, inst);
4372    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
4373       exec_atomop_buf(mach, inst);
4374    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
4375       exec_atomop_mem(mach, inst);
4376 }
4377
4378 static void
4379 exec_resq_img(struct tgsi_exec_machine *mach,
4380               const struct tgsi_full_instruction *inst)
4381 {
4382    int result[4];
4383    union tgsi_exec_channel r[4];
4384    uint unit;
4385    int i, chan, j;
4386    struct tgsi_image_params params;
4387    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4388
4389    unit = fetch_sampler_unit(mach, inst, 0);
4390
4391    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4392    params.unit = unit;
4393    params.tgsi_tex_instr = inst->Memory.Texture;
4394    params.format = inst->Memory.Format;
4395
4396    mach->Image->get_dims(mach->Image, &params, result);
4397
4398    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4399       for (j = 0; j < 4; j++) {
4400          r[j].i[i] = result[j];
4401       }
4402    }
4403
4404    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4405       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4406          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4407                     TGSI_EXEC_DATA_INT);
4408       }
4409    }
4410 }
4411
4412 static void
4413 exec_resq_buf(struct tgsi_exec_machine *mach,
4414               const struct tgsi_full_instruction *inst)
4415 {
4416    int result;
4417    union tgsi_exec_channel r[4];
4418    uint unit;
4419    int i, chan;
4420    struct tgsi_buffer_params params;
4421    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4422
4423    unit = fetch_sampler_unit(mach, inst, 0);
4424
4425    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4426    params.unit = unit;
4427
4428    mach->Buffer->get_dims(mach->Buffer, &params, &result);
4429
4430    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4431       r[0].i[i] = result;
4432    }
4433
4434    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4435       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4436          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4437                     TGSI_EXEC_DATA_INT);
4438       }
4439    }
4440 }
4441
4442 static void
4443 exec_resq(struct tgsi_exec_machine *mach,
4444           const struct tgsi_full_instruction *inst)
4445 {
4446    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4447       exec_resq_img(mach, inst);
4448    else
4449       exec_resq_buf(mach, inst);
4450 }
4451
4452 static void
4453 micro_f2u64(union tgsi_double_channel *dst,
4454             const union tgsi_exec_channel *src)
4455 {
4456    dst->u64[0] = (uint64_t)src->f[0];
4457    dst->u64[1] = (uint64_t)src->f[1];
4458    dst->u64[2] = (uint64_t)src->f[2];
4459    dst->u64[3] = (uint64_t)src->f[3];
4460 }
4461
4462 static void
4463 micro_f2i64(union tgsi_double_channel *dst,
4464             const union tgsi_exec_channel *src)
4465 {
4466    dst->i64[0] = (int64_t)src->f[0];
4467    dst->i64[1] = (int64_t)src->f[1];
4468    dst->i64[2] = (int64_t)src->f[2];
4469    dst->i64[3] = (int64_t)src->f[3];
4470 }
4471
4472 static void
4473 micro_u2i64(union tgsi_double_channel *dst,
4474             const union tgsi_exec_channel *src)
4475 {
4476    dst->u64[0] = (uint64_t)src->u[0];
4477    dst->u64[1] = (uint64_t)src->u[1];
4478    dst->u64[2] = (uint64_t)src->u[2];
4479    dst->u64[3] = (uint64_t)src->u[3];
4480 }
4481
4482 static void
4483 micro_i2i64(union tgsi_double_channel *dst,
4484             const union tgsi_exec_channel *src)
4485 {
4486    dst->i64[0] = (int64_t)src->i[0];
4487    dst->i64[1] = (int64_t)src->i[1];
4488    dst->i64[2] = (int64_t)src->i[2];
4489    dst->i64[3] = (int64_t)src->i[3];
4490 }
4491
4492 static void
4493 micro_d2u64(union tgsi_double_channel *dst,
4494            const union tgsi_double_channel *src)
4495 {
4496    dst->u64[0] = (uint64_t)src->d[0];
4497    dst->u64[1] = (uint64_t)src->d[1];
4498    dst->u64[2] = (uint64_t)src->d[2];
4499    dst->u64[3] = (uint64_t)src->d[3];
4500 }
4501
4502 static void
4503 micro_d2i64(union tgsi_double_channel *dst,
4504            const union tgsi_double_channel *src)
4505 {
4506    dst->i64[0] = (int64_t)src->d[0];
4507    dst->i64[1] = (int64_t)src->d[1];
4508    dst->i64[2] = (int64_t)src->d[2];
4509    dst->i64[3] = (int64_t)src->d[3];
4510 }
4511
4512 static void
4513 micro_u642d(union tgsi_double_channel *dst,
4514            const union tgsi_double_channel *src)
4515 {
4516    dst->d[0] = (double)src->u64[0];
4517    dst->d[1] = (double)src->u64[1];
4518    dst->d[2] = (double)src->u64[2];
4519    dst->d[3] = (double)src->u64[3];
4520 }
4521
4522 static void
4523 micro_i642d(union tgsi_double_channel *dst,
4524            const union tgsi_double_channel *src)
4525 {
4526    dst->d[0] = (double)src->i64[0];
4527    dst->d[1] = (double)src->i64[1];
4528    dst->d[2] = (double)src->i64[2];
4529    dst->d[3] = (double)src->i64[3];
4530 }
4531
4532 static void
4533 micro_u642f(union tgsi_exec_channel *dst,
4534             const union tgsi_double_channel *src)
4535 {
4536    dst->f[0] = (float)src->u64[0];
4537    dst->f[1] = (float)src->u64[1];
4538    dst->f[2] = (float)src->u64[2];
4539    dst->f[3] = (float)src->u64[3];
4540 }
4541
4542 static void
4543 micro_i642f(union tgsi_exec_channel *dst,
4544             const union tgsi_double_channel *src)
4545 {
4546    dst->f[0] = (float)src->i64[0];
4547    dst->f[1] = (float)src->i64[1];
4548    dst->f[2] = (float)src->i64[2];
4549    dst->f[3] = (float)src->i64[3];
4550 }
4551
4552 static void
4553 exec_t_2_64(struct tgsi_exec_machine *mach,
4554           const struct tgsi_full_instruction *inst,
4555           micro_dop_s op,
4556           enum tgsi_exec_datatype src_datatype)
4557 {
4558    union tgsi_exec_channel src;
4559    union tgsi_double_channel dst;
4560
4561    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
4562       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
4563       op(&dst, &src);
4564       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
4565    }
4566    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
4567       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, src_datatype);
4568       op(&dst, &src);
4569       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
4570    }
4571 }
4572
4573 static void
4574 exec_64_2_t(struct tgsi_exec_machine *mach,
4575             const struct tgsi_full_instruction *inst,
4576             micro_sop_d op,
4577             enum tgsi_exec_datatype dst_datatype)
4578 {
4579    union tgsi_double_channel src;
4580    union tgsi_exec_channel dst;
4581    int wm = inst->Dst[0].Register.WriteMask;
4582    int i;
4583    int bit;
4584    for (i = 0; i < 2; i++) {
4585       bit = ffs(wm);
4586       if (bit) {
4587          wm &= ~(1 << (bit - 1));
4588          if (i == 0)
4589             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
4590          else
4591             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
4592          op(&dst, &src);
4593          store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1, dst_datatype);
4594       }
4595    }
4596 }
4597
4598 static void
4599 micro_i2f(union tgsi_exec_channel *dst,
4600           const union tgsi_exec_channel *src)
4601 {
4602    dst->f[0] = (float)src->i[0];
4603    dst->f[1] = (float)src->i[1];
4604    dst->f[2] = (float)src->i[2];
4605    dst->f[3] = (float)src->i[3];
4606 }
4607
4608 static void
4609 micro_not(union tgsi_exec_channel *dst,
4610           const union tgsi_exec_channel *src)
4611 {
4612    dst->u[0] = ~src->u[0];
4613    dst->u[1] = ~src->u[1];
4614    dst->u[2] = ~src->u[2];
4615    dst->u[3] = ~src->u[3];
4616 }
4617
4618 static void
4619 micro_shl(union tgsi_exec_channel *dst,
4620           const union tgsi_exec_channel *src0,
4621           const union tgsi_exec_channel *src1)
4622 {
4623    unsigned masked_count;
4624    masked_count = src1->u[0] & 0x1f;
4625    dst->u[0] = src0->u[0] << masked_count;
4626    masked_count = src1->u[1] & 0x1f;
4627    dst->u[1] = src0->u[1] << masked_count;
4628    masked_count = src1->u[2] & 0x1f;
4629    dst->u[2] = src0->u[2] << masked_count;
4630    masked_count = src1->u[3] & 0x1f;
4631    dst->u[3] = src0->u[3] << masked_count;
4632 }
4633
4634 static void
4635 micro_and(union tgsi_exec_channel *dst,
4636           const union tgsi_exec_channel *src0,
4637           const union tgsi_exec_channel *src1)
4638 {
4639    dst->u[0] = src0->u[0] & src1->u[0];
4640    dst->u[1] = src0->u[1] & src1->u[1];
4641    dst->u[2] = src0->u[2] & src1->u[2];
4642    dst->u[3] = src0->u[3] & src1->u[3];
4643 }
4644
4645 static void
4646 micro_or(union tgsi_exec_channel *dst,
4647          const union tgsi_exec_channel *src0,
4648          const union tgsi_exec_channel *src1)
4649 {
4650    dst->u[0] = src0->u[0] | src1->u[0];
4651    dst->u[1] = src0->u[1] | src1->u[1];
4652    dst->u[2] = src0->u[2] | src1->u[2];
4653    dst->u[3] = src0->u[3] | src1->u[3];
4654 }
4655
4656 static void
4657 micro_xor(union tgsi_exec_channel *dst,
4658           const union tgsi_exec_channel *src0,
4659           const union tgsi_exec_channel *src1)
4660 {
4661    dst->u[0] = src0->u[0] ^ src1->u[0];
4662    dst->u[1] = src0->u[1] ^ src1->u[1];
4663    dst->u[2] = src0->u[2] ^ src1->u[2];
4664    dst->u[3] = src0->u[3] ^ src1->u[3];
4665 }
4666
4667 static void
4668 micro_mod(union tgsi_exec_channel *dst,
4669           const union tgsi_exec_channel *src0,
4670           const union tgsi_exec_channel *src1)
4671 {
4672    dst->i[0] = src1->i[0] ? src0->i[0] % src1->i[0] : ~0;
4673    dst->i[1] = src1->i[1] ? src0->i[1] % src1->i[1] : ~0;
4674    dst->i[2] = src1->i[2] ? src0->i[2] % src1->i[2] : ~0;
4675    dst->i[3] = src1->i[3] ? src0->i[3] % src1->i[3] : ~0;
4676 }
4677
4678 static void
4679 micro_f2i(union tgsi_exec_channel *dst,
4680           const union tgsi_exec_channel *src)
4681 {
4682    dst->i[0] = (int)src->f[0];
4683    dst->i[1] = (int)src->f[1];
4684    dst->i[2] = (int)src->f[2];
4685    dst->i[3] = (int)src->f[3];
4686 }
4687
4688 static void
4689 micro_fseq(union tgsi_exec_channel *dst,
4690            const union tgsi_exec_channel *src0,
4691            const union tgsi_exec_channel *src1)
4692 {
4693    dst->u[0] = src0->f[0] == src1->f[0] ? ~0 : 0;
4694    dst->u[1] = src0->f[1] == src1->f[1] ? ~0 : 0;
4695    dst->u[2] = src0->f[2] == src1->f[2] ? ~0 : 0;
4696    dst->u[3] = src0->f[3] == src1->f[3] ? ~0 : 0;
4697 }
4698
4699 static void
4700 micro_fsge(union tgsi_exec_channel *dst,
4701            const union tgsi_exec_channel *src0,
4702            const union tgsi_exec_channel *src1)
4703 {
4704    dst->u[0] = src0->f[0] >= src1->f[0] ? ~0 : 0;
4705    dst->u[1] = src0->f[1] >= src1->f[1] ? ~0 : 0;
4706    dst->u[2] = src0->f[2] >= src1->f[2] ? ~0 : 0;
4707    dst->u[3] = src0->f[3] >= src1->f[3] ? ~0 : 0;
4708 }
4709
4710 static void
4711 micro_fslt(union tgsi_exec_channel *dst,
4712            const union tgsi_exec_channel *src0,
4713            const union tgsi_exec_channel *src1)
4714 {
4715    dst->u[0] = src0->f[0] < src1->f[0] ? ~0 : 0;
4716    dst->u[1] = src0->f[1] < src1->f[1] ? ~0 : 0;
4717    dst->u[2] = src0->f[2] < src1->f[2] ? ~0 : 0;
4718    dst->u[3] = src0->f[3] < src1->f[3] ? ~0 : 0;
4719 }
4720
4721 static void
4722 micro_fsne(union tgsi_exec_channel *dst,
4723            const union tgsi_exec_channel *src0,
4724            const union tgsi_exec_channel *src1)
4725 {
4726    dst->u[0] = src0->f[0] != src1->f[0] ? ~0 : 0;
4727    dst->u[1] = src0->f[1] != src1->f[1] ? ~0 : 0;
4728    dst->u[2] = src0->f[2] != src1->f[2] ? ~0 : 0;
4729    dst->u[3] = src0->f[3] != src1->f[3] ? ~0 : 0;
4730 }
4731
4732 static void
4733 micro_idiv(union tgsi_exec_channel *dst,
4734            const union tgsi_exec_channel *src0,
4735            const union tgsi_exec_channel *src1)
4736 {
4737    dst->i[0] = src1->i[0] ? src0->i[0] / src1->i[0] : 0;
4738    dst->i[1] = src1->i[1] ? src0->i[1] / src1->i[1] : 0;
4739    dst->i[2] = src1->i[2] ? src0->i[2] / src1->i[2] : 0;
4740    dst->i[3] = src1->i[3] ? src0->i[3] / src1->i[3] : 0;
4741 }
4742
4743 static void
4744 micro_imax(union tgsi_exec_channel *dst,
4745            const union tgsi_exec_channel *src0,
4746            const union tgsi_exec_channel *src1)
4747 {
4748    dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
4749    dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
4750    dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
4751    dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
4752 }
4753
4754 static void
4755 micro_imin(union tgsi_exec_channel *dst,
4756            const union tgsi_exec_channel *src0,
4757            const union tgsi_exec_channel *src1)
4758 {
4759    dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
4760    dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
4761    dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
4762    dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
4763 }
4764
4765 static void
4766 micro_isge(union tgsi_exec_channel *dst,
4767            const union tgsi_exec_channel *src0,
4768            const union tgsi_exec_channel *src1)
4769 {
4770    dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
4771    dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
4772    dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
4773    dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
4774 }
4775
4776 static void
4777 micro_ishr(union tgsi_exec_channel *dst,
4778            const union tgsi_exec_channel *src0,
4779            const union tgsi_exec_channel *src1)
4780 {
4781    unsigned masked_count;
4782    masked_count = src1->i[0] & 0x1f;
4783    dst->i[0] = src0->i[0] >> masked_count;
4784    masked_count = src1->i[1] & 0x1f;
4785    dst->i[1] = src0->i[1] >> masked_count;
4786    masked_count = src1->i[2] & 0x1f;
4787    dst->i[2] = src0->i[2] >> masked_count;
4788    masked_count = src1->i[3] & 0x1f;
4789    dst->i[3] = src0->i[3] >> masked_count;
4790 }
4791
4792 static void
4793 micro_islt(union tgsi_exec_channel *dst,
4794            const union tgsi_exec_channel *src0,
4795            const union tgsi_exec_channel *src1)
4796 {
4797    dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
4798    dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
4799    dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
4800    dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
4801 }
4802
4803 static void
4804 micro_f2u(union tgsi_exec_channel *dst,
4805           const union tgsi_exec_channel *src)
4806 {
4807    dst->u[0] = (uint)src->f[0];
4808    dst->u[1] = (uint)src->f[1];
4809    dst->u[2] = (uint)src->f[2];
4810    dst->u[3] = (uint)src->f[3];
4811 }
4812
4813 static void
4814 micro_u2f(union tgsi_exec_channel *dst,
4815           const union tgsi_exec_channel *src)
4816 {
4817    dst->f[0] = (float)src->u[0];
4818    dst->f[1] = (float)src->u[1];
4819    dst->f[2] = (float)src->u[2];
4820    dst->f[3] = (float)src->u[3];
4821 }
4822
4823 static void
4824 micro_uadd(union tgsi_exec_channel *dst,
4825            const union tgsi_exec_channel *src0,
4826            const union tgsi_exec_channel *src1)
4827 {
4828    dst->u[0] = src0->u[0] + src1->u[0];
4829    dst->u[1] = src0->u[1] + src1->u[1];
4830    dst->u[2] = src0->u[2] + src1->u[2];
4831    dst->u[3] = src0->u[3] + src1->u[3];
4832 }
4833
4834 static void
4835 micro_udiv(union tgsi_exec_channel *dst,
4836            const union tgsi_exec_channel *src0,
4837            const union tgsi_exec_channel *src1)
4838 {
4839    dst->u[0] = src1->u[0] ? src0->u[0] / src1->u[0] : ~0u;
4840    dst->u[1] = src1->u[1] ? src0->u[1] / src1->u[1] : ~0u;
4841    dst->u[2] = src1->u[2] ? src0->u[2] / src1->u[2] : ~0u;
4842    dst->u[3] = src1->u[3] ? src0->u[3] / src1->u[3] : ~0u;
4843 }
4844
4845 static void
4846 micro_umad(union tgsi_exec_channel *dst,
4847            const union tgsi_exec_channel *src0,
4848            const union tgsi_exec_channel *src1,
4849            const union tgsi_exec_channel *src2)
4850 {
4851    dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
4852    dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
4853    dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
4854    dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
4855 }
4856
4857 static void
4858 micro_umax(union tgsi_exec_channel *dst,
4859            const union tgsi_exec_channel *src0,
4860            const union tgsi_exec_channel *src1)
4861 {
4862    dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
4863    dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
4864    dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
4865    dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
4866 }
4867
4868 static void
4869 micro_umin(union tgsi_exec_channel *dst,
4870            const union tgsi_exec_channel *src0,
4871            const union tgsi_exec_channel *src1)
4872 {
4873    dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
4874    dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
4875    dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
4876    dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
4877 }
4878
4879 static void
4880 micro_umod(union tgsi_exec_channel *dst,
4881            const union tgsi_exec_channel *src0,
4882            const union tgsi_exec_channel *src1)
4883 {
4884    dst->u[0] = src1->u[0] ? src0->u[0] % src1->u[0] : ~0u;
4885    dst->u[1] = src1->u[1] ? src0->u[1] % src1->u[1] : ~0u;
4886    dst->u[2] = src1->u[2] ? src0->u[2] % src1->u[2] : ~0u;
4887    dst->u[3] = src1->u[3] ? src0->u[3] % src1->u[3] : ~0u;
4888 }
4889
4890 static void
4891 micro_umul(union tgsi_exec_channel *dst,
4892            const union tgsi_exec_channel *src0,
4893            const union tgsi_exec_channel *src1)
4894 {
4895    dst->u[0] = src0->u[0] * src1->u[0];
4896    dst->u[1] = src0->u[1] * src1->u[1];
4897    dst->u[2] = src0->u[2] * src1->u[2];
4898    dst->u[3] = src0->u[3] * src1->u[3];
4899 }
4900
4901 static void
4902 micro_imul_hi(union tgsi_exec_channel *dst,
4903               const union tgsi_exec_channel *src0,
4904               const union tgsi_exec_channel *src1)
4905 {
4906 #define I64M(x, y) ((((int64_t)x) * ((int64_t)y)) >> 32)
4907    dst->i[0] = I64M(src0->i[0], src1->i[0]);
4908    dst->i[1] = I64M(src0->i[1], src1->i[1]);
4909    dst->i[2] = I64M(src0->i[2], src1->i[2]);
4910    dst->i[3] = I64M(src0->i[3], src1->i[3]);
4911 #undef I64M
4912 }
4913
4914 static void
4915 micro_umul_hi(union tgsi_exec_channel *dst,
4916               const union tgsi_exec_channel *src0,
4917               const union tgsi_exec_channel *src1)
4918 {
4919 #define U64M(x, y) ((((uint64_t)x) * ((uint64_t)y)) >> 32)
4920    dst->u[0] = U64M(src0->u[0], src1->u[0]);
4921    dst->u[1] = U64M(src0->u[1], src1->u[1]);
4922    dst->u[2] = U64M(src0->u[2], src1->u[2]);
4923    dst->u[3] = U64M(src0->u[3], src1->u[3]);
4924 #undef U64M
4925 }
4926
4927 static void
4928 micro_useq(union tgsi_exec_channel *dst,
4929            const union tgsi_exec_channel *src0,
4930            const union tgsi_exec_channel *src1)
4931 {
4932    dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
4933    dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
4934    dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
4935    dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
4936 }
4937
4938 static void
4939 micro_usge(union tgsi_exec_channel *dst,
4940            const union tgsi_exec_channel *src0,
4941            const union tgsi_exec_channel *src1)
4942 {
4943    dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
4944    dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
4945    dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
4946    dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
4947 }
4948
4949 static void
4950 micro_ushr(union tgsi_exec_channel *dst,
4951            const union tgsi_exec_channel *src0,
4952            const union tgsi_exec_channel *src1)
4953 {
4954    unsigned masked_count;
4955    masked_count = src1->u[0] & 0x1f;
4956    dst->u[0] = src0->u[0] >> masked_count;
4957    masked_count = src1->u[1] & 0x1f;
4958    dst->u[1] = src0->u[1] >> masked_count;
4959    masked_count = src1->u[2] & 0x1f;
4960    dst->u[2] = src0->u[2] >> masked_count;
4961    masked_count = src1->u[3] & 0x1f;
4962    dst->u[3] = src0->u[3] >> masked_count;
4963 }
4964
4965 static void
4966 micro_uslt(union tgsi_exec_channel *dst,
4967            const union tgsi_exec_channel *src0,
4968            const union tgsi_exec_channel *src1)
4969 {
4970    dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
4971    dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
4972    dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
4973    dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
4974 }
4975
4976 static void
4977 micro_usne(union tgsi_exec_channel *dst,
4978            const union tgsi_exec_channel *src0,
4979            const union tgsi_exec_channel *src1)
4980 {
4981    dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
4982    dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
4983    dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
4984    dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
4985 }
4986
4987 static void
4988 micro_uarl(union tgsi_exec_channel *dst,
4989            const union tgsi_exec_channel *src)
4990 {
4991    dst->i[0] = src->u[0];
4992    dst->i[1] = src->u[1];
4993    dst->i[2] = src->u[2];
4994    dst->i[3] = src->u[3];
4995 }
4996
4997 /**
4998  * Signed bitfield extract (i.e. sign-extend the extracted bits)
4999  */
5000 static void
5001 micro_ibfe(union tgsi_exec_channel *dst,
5002            const union tgsi_exec_channel *src0,
5003            const union tgsi_exec_channel *src1,
5004            const union tgsi_exec_channel *src2)
5005 {
5006    int i;
5007    for (i = 0; i < 4; i++) {
5008       int width = src2->i[i];
5009       int offset = src1->i[i] & 0x1f;
5010       if (width == 32 && offset == 0) {
5011          dst->i[i] = src0->i[i];
5012          continue;
5013       }
5014       width &= 0x1f;
5015       if (width == 0)
5016          dst->i[i] = 0;
5017       else if (width + offset < 32)
5018          dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
5019       else
5020          dst->i[i] = src0->i[i] >> offset;
5021    }
5022 }
5023
5024 /**
5025  * Unsigned bitfield extract
5026  */
5027 static void
5028 micro_ubfe(union tgsi_exec_channel *dst,
5029            const union tgsi_exec_channel *src0,
5030            const union tgsi_exec_channel *src1,
5031            const union tgsi_exec_channel *src2)
5032 {
5033    int i;
5034    for (i = 0; i < 4; i++) {
5035       int width = src2->u[i];
5036       int offset = src1->u[i] & 0x1f;
5037       if (width == 32 && offset == 0) {
5038          dst->u[i] = src0->u[i];
5039          continue;
5040       }
5041       width &= 0x1f;
5042       if (width == 0)
5043          dst->u[i] = 0;
5044       else if (width + offset < 32)
5045          dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
5046       else
5047          dst->u[i] = src0->u[i] >> offset;
5048    }
5049 }
5050
5051 /**
5052  * Bitfield insert: copy low bits from src1 into a region of src0.
5053  */
5054 static void
5055 micro_bfi(union tgsi_exec_channel *dst,
5056           const union tgsi_exec_channel *src0,
5057           const union tgsi_exec_channel *src1,
5058           const union tgsi_exec_channel *src2,
5059           const union tgsi_exec_channel *src3)
5060 {
5061    int i;
5062    for (i = 0; i < 4; i++) {
5063       int width = src3->u[i];
5064       int offset = src2->u[i] & 0x1f;
5065       if (width == 32) {
5066          dst->u[i] = src1->u[i];
5067       } else {
5068          int bitmask = ((1 << width) - 1) << offset;
5069          dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
5070       }
5071    }
5072 }
5073
5074 static void
5075 micro_brev(union tgsi_exec_channel *dst,
5076            const union tgsi_exec_channel *src)
5077 {
5078    dst->u[0] = util_bitreverse(src->u[0]);
5079    dst->u[1] = util_bitreverse(src->u[1]);
5080    dst->u[2] = util_bitreverse(src->u[2]);
5081    dst->u[3] = util_bitreverse(src->u[3]);
5082 }
5083
5084 static void
5085 micro_popc(union tgsi_exec_channel *dst,
5086            const union tgsi_exec_channel *src)
5087 {
5088    dst->u[0] = util_bitcount(src->u[0]);
5089    dst->u[1] = util_bitcount(src->u[1]);
5090    dst->u[2] = util_bitcount(src->u[2]);
5091    dst->u[3] = util_bitcount(src->u[3]);
5092 }
5093
5094 static void
5095 micro_lsb(union tgsi_exec_channel *dst,
5096           const union tgsi_exec_channel *src)
5097 {
5098    dst->i[0] = ffs(src->u[0]) - 1;
5099    dst->i[1] = ffs(src->u[1]) - 1;
5100    dst->i[2] = ffs(src->u[2]) - 1;
5101    dst->i[3] = ffs(src->u[3]) - 1;
5102 }
5103
5104 static void
5105 micro_imsb(union tgsi_exec_channel *dst,
5106            const union tgsi_exec_channel *src)
5107 {
5108    dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
5109    dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
5110    dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
5111    dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
5112 }
5113
5114 static void
5115 micro_umsb(union tgsi_exec_channel *dst,
5116            const union tgsi_exec_channel *src)
5117 {
5118    dst->i[0] = util_last_bit(src->u[0]) - 1;
5119    dst->i[1] = util_last_bit(src->u[1]) - 1;
5120    dst->i[2] = util_last_bit(src->u[2]) - 1;
5121    dst->i[3] = util_last_bit(src->u[3]) - 1;
5122 }
5123
5124
5125 static void
5126 exec_interp_at_sample(struct tgsi_exec_machine *mach,
5127                       const struct tgsi_full_instruction *inst)
5128 {
5129    union tgsi_exec_channel index;
5130    union tgsi_exec_channel index2D;
5131    union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
5132    const struct tgsi_full_src_register *reg = &inst->Src[0];
5133
5134    assert(reg->Register.File == TGSI_FILE_INPUT);
5135    assert(inst->Src[1].Register.File == TGSI_FILE_IMMEDIATE);
5136
5137    get_index_registers(mach, reg, &index, &index2D);
5138    float sample = mach->Imms[inst->Src[1].Register.Index][inst->Src[1].Register.SwizzleX];
5139
5140    /* Short cut: sample 0 is like a normal fetch */
5141    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
5142       if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
5143          continue;
5144
5145       fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D,
5146                              &result[chan]);
5147       if (sample != 0.0f) {
5148
5149       /* TODO: define the samples > 0, but so far we only do fake MSAA */
5150          float x = 0;
5151          float y = 0;
5152
5153          unsigned pos = index2D.i[chan] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index.i[chan];
5154          assert(pos >= 0);
5155          assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
5156          mach->InputSampleOffsetApply[pos](mach, pos, chan, x, y, &result[chan]);
5157       }
5158       store_dest(mach, &result[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
5159    }
5160 }
5161
5162
5163 static void
5164 exec_interp_at_offset(struct tgsi_exec_machine *mach,
5165                       const struct tgsi_full_instruction *inst)
5166 {
5167    union tgsi_exec_channel index;
5168    union tgsi_exec_channel index2D;
5169    union tgsi_exec_channel ofsx;
5170    union tgsi_exec_channel ofsy;
5171    const struct tgsi_full_src_register *reg = &inst->Src[0];
5172
5173    assert(reg->Register.File == TGSI_FILE_INPUT);
5174
5175    get_index_registers(mach, reg, &index, &index2D);
5176    unsigned pos = index2D.i[0] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index.i[0];
5177
5178    fetch_source(mach, &ofsx, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
5179    fetch_source(mach, &ofsy, &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
5180
5181    for (int chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
5182       if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
5183          continue;
5184       union tgsi_exec_channel result;
5185       fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D, &result);
5186       mach->InputSampleOffsetApply[pos](mach, pos, chan, ofsx.f[chan], ofsy.f[chan], &result);
5187       store_dest(mach, &result, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
5188    }
5189 }
5190
5191
5192 static void
5193 exec_interp_at_centroid(struct tgsi_exec_machine *mach,
5194                         const struct tgsi_full_instruction *inst)
5195 {
5196    union tgsi_exec_channel index;
5197    union tgsi_exec_channel index2D;
5198    union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
5199    const struct tgsi_full_src_register *reg = &inst->Src[0];
5200
5201    assert(reg->Register.File == TGSI_FILE_INPUT);
5202    get_index_registers(mach, reg, &index, &index2D);
5203
5204    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
5205       if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
5206          continue;
5207
5208       /* Here we should add the change to use a sample that lies within the
5209        * primitive (Section 15.2):
5210        *
5211        * "When interpolating variables declared using centroid in ,
5212        * the variable is sampled at a location within the pixel covered
5213        * by the primitive generating the fragment.
5214        * ...
5215        * The built-in functions interpolateAtCentroid ... will sample
5216        * variables as though they were declared with the centroid ...
5217        * qualifier[s]."
5218        *
5219        * Since we only support 1 sample currently, this is just a pass-through.
5220        */
5221       fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D,
5222                              &result[chan]);
5223       store_dest(mach, &result[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
5224    }
5225
5226 }
5227
5228
5229 /**
5230  * Execute a TGSI instruction.
5231  * Returns TRUE if a barrier instruction is hit,
5232  * otherwise FALSE.
5233  */
5234 static boolean
5235 exec_instruction(
5236    struct tgsi_exec_machine *mach,
5237    const struct tgsi_full_instruction *inst,
5238    int *pc )
5239 {
5240    union tgsi_exec_channel r[10];
5241
5242    (*pc)++;
5243
5244    switch (inst->Instruction.Opcode) {
5245    case TGSI_OPCODE_ARL:
5246       exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5247       break;
5248
5249    case TGSI_OPCODE_MOV:
5250       exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5251       break;
5252
5253    case TGSI_OPCODE_LIT:
5254       exec_lit(mach, inst);
5255       break;
5256
5257    case TGSI_OPCODE_RCP:
5258       exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5259       break;
5260
5261    case TGSI_OPCODE_RSQ:
5262       exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5263       break;
5264
5265    case TGSI_OPCODE_EXP:
5266       exec_exp(mach, inst);
5267       break;
5268
5269    case TGSI_OPCODE_LOG:
5270       exec_log(mach, inst);
5271       break;
5272
5273    case TGSI_OPCODE_MUL:
5274       exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5275       break;
5276
5277    case TGSI_OPCODE_ADD:
5278       exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5279       break;
5280
5281    case TGSI_OPCODE_DP3:
5282       exec_dp3(mach, inst);
5283       break;
5284
5285    case TGSI_OPCODE_DP4:
5286       exec_dp4(mach, inst);
5287       break;
5288
5289    case TGSI_OPCODE_DST:
5290       exec_dst(mach, inst);
5291       break;
5292
5293    case TGSI_OPCODE_MIN:
5294       exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5295       break;
5296
5297    case TGSI_OPCODE_MAX:
5298       exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5299       break;
5300
5301    case TGSI_OPCODE_SLT:
5302       exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5303       break;
5304
5305    case TGSI_OPCODE_SGE:
5306       exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5307       break;
5308
5309    case TGSI_OPCODE_MAD:
5310       exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5311       break;
5312
5313    case TGSI_OPCODE_LRP:
5314       exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5315       break;
5316
5317    case TGSI_OPCODE_SQRT:
5318       exec_scalar_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5319       break;
5320
5321    case TGSI_OPCODE_FRC:
5322       exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5323       break;
5324
5325    case TGSI_OPCODE_FLR:
5326       exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5327       break;
5328
5329    case TGSI_OPCODE_ROUND:
5330       exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5331       break;
5332
5333    case TGSI_OPCODE_EX2:
5334       exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5335       break;
5336
5337    case TGSI_OPCODE_LG2:
5338       exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5339       break;
5340
5341    case TGSI_OPCODE_POW:
5342       exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5343       break;
5344
5345    case TGSI_OPCODE_LDEXP:
5346       exec_vector_binary(mach, inst, micro_ldexp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5347       break;
5348
5349    case TGSI_OPCODE_COS:
5350       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5351       break;
5352
5353    case TGSI_OPCODE_DDX_FINE:
5354       exec_vector_unary(mach, inst, micro_ddx_fine, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5355       break;
5356
5357    case TGSI_OPCODE_DDX:
5358       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5359       break;
5360
5361    case TGSI_OPCODE_DDY_FINE:
5362       exec_vector_unary(mach, inst, micro_ddy_fine, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5363       break;
5364
5365    case TGSI_OPCODE_DDY:
5366       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5367       break;
5368
5369    case TGSI_OPCODE_KILL:
5370       exec_kill (mach);
5371       break;
5372
5373    case TGSI_OPCODE_KILL_IF:
5374       exec_kill_if (mach, inst);
5375       break;
5376
5377    case TGSI_OPCODE_PK2H:
5378       exec_pk2h(mach, inst);
5379       break;
5380
5381    case TGSI_OPCODE_PK2US:
5382       assert (0);
5383       break;
5384
5385    case TGSI_OPCODE_PK4B:
5386       assert (0);
5387       break;
5388
5389    case TGSI_OPCODE_PK4UB:
5390       assert (0);
5391       break;
5392
5393    case TGSI_OPCODE_SEQ:
5394       exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5395       break;
5396
5397    case TGSI_OPCODE_SGT:
5398       exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5399       break;
5400
5401    case TGSI_OPCODE_SIN:
5402       exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5403       break;
5404
5405    case TGSI_OPCODE_SLE:
5406       exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5407       break;
5408
5409    case TGSI_OPCODE_SNE:
5410       exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5411       break;
5412
5413    case TGSI_OPCODE_TEX:
5414       /* simple texture lookup */
5415       /* src[0] = texcoord */
5416       /* src[1] = sampler unit */
5417       exec_tex(mach, inst, TEX_MODIFIER_NONE, 1);
5418       break;
5419
5420    case TGSI_OPCODE_TXB:
5421       /* Texture lookup with lod bias */
5422       /* src[0] = texcoord (src[0].w = LOD bias) */
5423       /* src[1] = sampler unit */
5424       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 1);
5425       break;
5426
5427    case TGSI_OPCODE_TXD:
5428       /* Texture lookup with explicit partial derivatives */
5429       /* src[0] = texcoord */
5430       /* src[1] = d[strq]/dx */
5431       /* src[2] = d[strq]/dy */
5432       /* src[3] = sampler unit */
5433       exec_txd(mach, inst);
5434       break;
5435
5436    case TGSI_OPCODE_TXL:
5437       /* Texture lookup with explicit LOD */
5438       /* src[0] = texcoord (src[0].w = LOD) */
5439       /* src[1] = sampler unit */
5440       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 1);
5441       break;
5442
5443    case TGSI_OPCODE_TXP:
5444       /* Texture lookup with projection */
5445       /* src[0] = texcoord (src[0].w = projection) */
5446       /* src[1] = sampler unit */
5447       exec_tex(mach, inst, TEX_MODIFIER_PROJECTED, 1);
5448       break;
5449
5450    case TGSI_OPCODE_TG4:
5451       /* src[0] = texcoord */
5452       /* src[1] = component */
5453       /* src[2] = sampler unit */
5454       exec_tex(mach, inst, TEX_MODIFIER_GATHER, 2);
5455       break;
5456
5457    case TGSI_OPCODE_LODQ:
5458       /* src[0] = texcoord */
5459       /* src[1] = sampler unit */
5460       exec_lodq(mach, inst);
5461       break;
5462
5463    case TGSI_OPCODE_UP2H:
5464       exec_up2h(mach, inst);
5465       break;
5466
5467    case TGSI_OPCODE_UP2US:
5468       assert (0);
5469       break;
5470
5471    case TGSI_OPCODE_UP4B:
5472       assert (0);
5473       break;
5474
5475    case TGSI_OPCODE_UP4UB:
5476       assert (0);
5477       break;
5478
5479    case TGSI_OPCODE_ARR:
5480       exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5481       break;
5482
5483    case TGSI_OPCODE_CAL:
5484       /* skip the call if no execution channels are enabled */
5485       if (mach->ExecMask) {
5486          /* do the call */
5487
5488          /* First, record the depths of the execution stacks.
5489           * This is important for deeply nested/looped return statements.
5490           * We have to unwind the stacks by the correct amount.  For a
5491           * real code generator, we could determine the number of entries
5492           * to pop off each stack with simple static analysis and avoid
5493           * implementing this data structure at run time.
5494           */
5495          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
5496          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
5497          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
5498          mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
5499          mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
5500          /* note that PC was already incremented above */
5501          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
5502
5503          mach->CallStackTop++;
5504
5505          /* Second, push the Cond, Loop, Cont, Switch, Break, Func stacks */
5506          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5507          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5508          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5509          assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
5510          assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5511          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
5512
5513          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5514          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5515          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5516          mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
5517          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5518          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
5519
5520          /* Finally, jump to the subroutine.  The label is a pointer
5521           * (an instruction number) to the BGNSUB instruction.
5522           */
5523          *pc = inst->Label.Label;
5524          assert(mach->Instructions[*pc].Instruction.Opcode
5525                 == TGSI_OPCODE_BGNSUB);
5526       }
5527       break;
5528
5529    case TGSI_OPCODE_RET:
5530       mach->FuncMask &= ~mach->ExecMask;
5531       UPDATE_EXEC_MASK(mach);
5532
5533       if (mach->FuncMask == 0x0) {
5534          /* really return now (otherwise, keep executing) */
5535
5536          if (mach->CallStackTop == 0) {
5537             /* returning from main() */
5538             mach->CondStackTop = 0;
5539             mach->LoopStackTop = 0;
5540             mach->ContStackTop = 0;
5541             mach->LoopLabelStackTop = 0;
5542             mach->SwitchStackTop = 0;
5543             mach->BreakStackTop = 0;
5544             *pc = -1;
5545             return FALSE;
5546          }
5547
5548          assert(mach->CallStackTop > 0);
5549          mach->CallStackTop--;
5550
5551          mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5552          mach->CondMask = mach->CondStack[mach->CondStackTop];
5553
5554          mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5555          mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5556
5557          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5558          mach->ContMask = mach->ContStack[mach->ContStackTop];
5559
5560          mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5561          mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5562
5563          mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5564          mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5565
5566          assert(mach->FuncStackTop > 0);
5567          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5568
5569          *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5570
5571          UPDATE_EXEC_MASK(mach);
5572       }
5573       break;
5574
5575    case TGSI_OPCODE_SSG:
5576       exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5577       break;
5578
5579    case TGSI_OPCODE_CMP:
5580       exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5581       break;
5582
5583    case TGSI_OPCODE_DIV:
5584       exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5585       break;
5586
5587    case TGSI_OPCODE_DP2:
5588       exec_dp2(mach, inst);
5589       break;
5590
5591    case TGSI_OPCODE_IF:
5592       /* push CondMask */
5593       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5594       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5595       FETCH( &r[0], 0, TGSI_CHAN_X );
5596       /* update CondMask */
5597       if( ! r[0].f[0] ) {
5598          mach->CondMask &= ~0x1;
5599       }
5600       if( ! r[0].f[1] ) {
5601          mach->CondMask &= ~0x2;
5602       }
5603       if( ! r[0].f[2] ) {
5604          mach->CondMask &= ~0x4;
5605       }
5606       if( ! r[0].f[3] ) {
5607          mach->CondMask &= ~0x8;
5608       }
5609       UPDATE_EXEC_MASK(mach);
5610       /* Todo: If CondMask==0, jump to ELSE */
5611       break;
5612
5613    case TGSI_OPCODE_UIF:
5614       /* push CondMask */
5615       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5616       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5617       IFETCH( &r[0], 0, TGSI_CHAN_X );
5618       /* update CondMask */
5619       if( ! r[0].u[0] ) {
5620          mach->CondMask &= ~0x1;
5621       }
5622       if( ! r[0].u[1] ) {
5623          mach->CondMask &= ~0x2;
5624       }
5625       if( ! r[0].u[2] ) {
5626          mach->CondMask &= ~0x4;
5627       }
5628       if( ! r[0].u[3] ) {
5629          mach->CondMask &= ~0x8;
5630       }
5631       UPDATE_EXEC_MASK(mach);
5632       /* Todo: If CondMask==0, jump to ELSE */
5633       break;
5634
5635    case TGSI_OPCODE_ELSE:
5636       /* invert CondMask wrt previous mask */
5637       {
5638          uint prevMask;
5639          assert(mach->CondStackTop > 0);
5640          prevMask = mach->CondStack[mach->CondStackTop - 1];
5641          mach->CondMask = ~mach->CondMask & prevMask;
5642          UPDATE_EXEC_MASK(mach);
5643          /* Todo: If CondMask==0, jump to ENDIF */
5644       }
5645       break;
5646
5647    case TGSI_OPCODE_ENDIF:
5648       /* pop CondMask */
5649       assert(mach->CondStackTop > 0);
5650       mach->CondMask = mach->CondStack[--mach->CondStackTop];
5651       UPDATE_EXEC_MASK(mach);
5652       break;
5653
5654    case TGSI_OPCODE_END:
5655       /* make sure we end primitives which haven't
5656        * been explicitly emitted */
5657       conditional_emit_primitive(mach);
5658       /* halt execution */
5659       *pc = -1;
5660       break;
5661
5662    case TGSI_OPCODE_CEIL:
5663       exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5664       break;
5665
5666    case TGSI_OPCODE_I2F:
5667       exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
5668       break;
5669
5670    case TGSI_OPCODE_NOT:
5671       exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5672       break;
5673
5674    case TGSI_OPCODE_TRUNC:
5675       exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5676       break;
5677
5678    case TGSI_OPCODE_SHL:
5679       exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5680       break;
5681
5682    case TGSI_OPCODE_AND:
5683       exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5684       break;
5685
5686    case TGSI_OPCODE_OR:
5687       exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5688       break;
5689
5690    case TGSI_OPCODE_MOD:
5691       exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5692       break;
5693
5694    case TGSI_OPCODE_XOR:
5695       exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5696       break;
5697
5698    case TGSI_OPCODE_TXF:
5699       exec_txf(mach, inst);
5700       break;
5701
5702    case TGSI_OPCODE_TXQ:
5703       exec_txq(mach, inst);
5704       break;
5705
5706    case TGSI_OPCODE_EMIT:
5707       emit_vertex(mach, inst);
5708       break;
5709
5710    case TGSI_OPCODE_ENDPRIM:
5711       emit_primitive(mach, inst);
5712       break;
5713
5714    case TGSI_OPCODE_BGNLOOP:
5715       /* push LoopMask and ContMasks */
5716       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5717       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5718       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5719       assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5720
5721       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5722       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5723       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
5724       mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5725       mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
5726       break;
5727
5728    case TGSI_OPCODE_ENDLOOP:
5729       /* Restore ContMask, but don't pop */
5730       assert(mach->ContStackTop > 0);
5731       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
5732       UPDATE_EXEC_MASK(mach);
5733       if (mach->ExecMask) {
5734          /* repeat loop: jump to instruction just past BGNLOOP */
5735          assert(mach->LoopLabelStackTop > 0);
5736          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
5737       }
5738       else {
5739          /* exit loop: pop LoopMask */
5740          assert(mach->LoopStackTop > 0);
5741          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
5742          /* pop ContMask */
5743          assert(mach->ContStackTop > 0);
5744          mach->ContMask = mach->ContStack[--mach->ContStackTop];
5745          assert(mach->LoopLabelStackTop > 0);
5746          --mach->LoopLabelStackTop;
5747
5748          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
5749       }
5750       UPDATE_EXEC_MASK(mach);
5751       break;
5752
5753    case TGSI_OPCODE_BRK:
5754       exec_break(mach);
5755       break;
5756
5757    case TGSI_OPCODE_CONT:
5758       /* turn off cont channels for each enabled exec channel */
5759       mach->ContMask &= ~mach->ExecMask;
5760       /* Todo: if mach->LoopMask == 0, jump to end of loop */
5761       UPDATE_EXEC_MASK(mach);
5762       break;
5763
5764    case TGSI_OPCODE_BGNSUB:
5765       /* no-op */
5766       break;
5767
5768    case TGSI_OPCODE_ENDSUB:
5769       /*
5770        * XXX: This really should be a no-op. We should never reach this opcode.
5771        */
5772
5773       assert(mach->CallStackTop > 0);
5774       mach->CallStackTop--;
5775
5776       mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5777       mach->CondMask = mach->CondStack[mach->CondStackTop];
5778
5779       mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5780       mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5781
5782       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5783       mach->ContMask = mach->ContStack[mach->ContStackTop];
5784
5785       mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5786       mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5787
5788       mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5789       mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5790
5791       assert(mach->FuncStackTop > 0);
5792       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5793
5794       *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5795
5796       UPDATE_EXEC_MASK(mach);
5797       break;
5798
5799    case TGSI_OPCODE_NOP:
5800       break;
5801
5802    case TGSI_OPCODE_F2I:
5803       exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5804       break;
5805
5806    case TGSI_OPCODE_FSEQ:
5807       exec_vector_binary(mach, inst, micro_fseq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5808       break;
5809
5810    case TGSI_OPCODE_FSGE:
5811       exec_vector_binary(mach, inst, micro_fsge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5812       break;
5813
5814    case TGSI_OPCODE_FSLT:
5815       exec_vector_binary(mach, inst, micro_fslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5816       break;
5817
5818    case TGSI_OPCODE_FSNE:
5819       exec_vector_binary(mach, inst, micro_fsne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5820       break;
5821
5822    case TGSI_OPCODE_IDIV:
5823       exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5824       break;
5825
5826    case TGSI_OPCODE_IMAX:
5827       exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5828       break;
5829
5830    case TGSI_OPCODE_IMIN:
5831       exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5832       break;
5833
5834    case TGSI_OPCODE_INEG:
5835       exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5836       break;
5837
5838    case TGSI_OPCODE_ISGE:
5839       exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5840       break;
5841
5842    case TGSI_OPCODE_ISHR:
5843       exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5844       break;
5845
5846    case TGSI_OPCODE_ISLT:
5847       exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5848       break;
5849
5850    case TGSI_OPCODE_F2U:
5851       exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5852       break;
5853
5854    case TGSI_OPCODE_U2F:
5855       exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
5856       break;
5857
5858    case TGSI_OPCODE_UADD:
5859       exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5860       break;
5861
5862    case TGSI_OPCODE_UDIV:
5863       exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5864       break;
5865
5866    case TGSI_OPCODE_UMAD:
5867       exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5868       break;
5869
5870    case TGSI_OPCODE_UMAX:
5871       exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5872       break;
5873
5874    case TGSI_OPCODE_UMIN:
5875       exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5876       break;
5877
5878    case TGSI_OPCODE_UMOD:
5879       exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5880       break;
5881
5882    case TGSI_OPCODE_UMUL:
5883       exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5884       break;
5885
5886    case TGSI_OPCODE_IMUL_HI:
5887       exec_vector_binary(mach, inst, micro_imul_hi, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5888       break;
5889
5890    case TGSI_OPCODE_UMUL_HI:
5891       exec_vector_binary(mach, inst, micro_umul_hi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5892       break;
5893
5894    case TGSI_OPCODE_USEQ:
5895       exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5896       break;
5897
5898    case TGSI_OPCODE_USGE:
5899       exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5900       break;
5901
5902    case TGSI_OPCODE_USHR:
5903       exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5904       break;
5905
5906    case TGSI_OPCODE_USLT:
5907       exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5908       break;
5909
5910    case TGSI_OPCODE_USNE:
5911       exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5912       break;
5913
5914    case TGSI_OPCODE_SWITCH:
5915       exec_switch(mach, inst);
5916       break;
5917
5918    case TGSI_OPCODE_CASE:
5919       exec_case(mach, inst);
5920       break;
5921
5922    case TGSI_OPCODE_DEFAULT:
5923       exec_default(mach);
5924       break;
5925
5926    case TGSI_OPCODE_ENDSWITCH:
5927       exec_endswitch(mach);
5928       break;
5929
5930    case TGSI_OPCODE_SAMPLE_I:
5931       exec_txf(mach, inst);
5932       break;
5933
5934    case TGSI_OPCODE_SAMPLE_I_MS:
5935       exec_txf(mach, inst);
5936       break;
5937
5938    case TGSI_OPCODE_SAMPLE:
5939       exec_sample(mach, inst, TEX_MODIFIER_NONE, FALSE);
5940       break;
5941
5942    case TGSI_OPCODE_SAMPLE_B:
5943       exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS, FALSE);
5944       break;
5945
5946    case TGSI_OPCODE_SAMPLE_C:
5947       exec_sample(mach, inst, TEX_MODIFIER_NONE, TRUE);
5948       break;
5949
5950    case TGSI_OPCODE_SAMPLE_C_LZ:
5951       exec_sample(mach, inst, TEX_MODIFIER_LEVEL_ZERO, TRUE);
5952       break;
5953
5954    case TGSI_OPCODE_SAMPLE_D:
5955       exec_sample_d(mach, inst);
5956       break;
5957
5958    case TGSI_OPCODE_SAMPLE_L:
5959       exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, FALSE);
5960       break;
5961
5962    case TGSI_OPCODE_GATHER4:
5963       exec_sample(mach, inst, TEX_MODIFIER_GATHER, FALSE);
5964       break;
5965
5966    case TGSI_OPCODE_SVIEWINFO:
5967       exec_txq(mach, inst);
5968       break;
5969
5970    case TGSI_OPCODE_SAMPLE_POS:
5971       assert(0);
5972       break;
5973
5974    case TGSI_OPCODE_SAMPLE_INFO:
5975       assert(0);
5976       break;
5977
5978    case TGSI_OPCODE_LOD:
5979       exec_lodq(mach, inst);
5980       break;
5981
5982    case TGSI_OPCODE_UARL:
5983       exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
5984       break;
5985
5986    case TGSI_OPCODE_UCMP:
5987       exec_ucmp(mach, inst);
5988       break;
5989
5990    case TGSI_OPCODE_IABS:
5991       exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5992       break;
5993
5994    case TGSI_OPCODE_ISSG:
5995       exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5996       break;
5997
5998    case TGSI_OPCODE_TEX2:
5999       /* simple texture lookup */
6000       /* src[0] = texcoord */
6001       /* src[1] = compare */
6002       /* src[2] = sampler unit */
6003       exec_tex(mach, inst, TEX_MODIFIER_NONE, 2);
6004       break;
6005    case TGSI_OPCODE_TXB2:
6006       /* Texture lookup with lod bias (bias in separate operand) */
6007       /* src[0] = texcoord */
6008       /* src[1] = bias */
6009       /* src[2] = sampler unit */
6010       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 2);
6011       break;
6012    case TGSI_OPCODE_TXL2:
6013       /* Texture lookup with explicit LOD (LOD in separate operand) */
6014       /* src[0] = texcoord */
6015       /* src[1] = lod */
6016       /* src[2] = sampler unit */
6017       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
6018       break;
6019
6020    case TGSI_OPCODE_IBFE:
6021       exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
6022       break;
6023    case TGSI_OPCODE_UBFE:
6024       exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6025       break;
6026    case TGSI_OPCODE_BFI:
6027       exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6028       break;
6029    case TGSI_OPCODE_BREV:
6030       exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6031       break;
6032    case TGSI_OPCODE_POPC:
6033       exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6034       break;
6035    case TGSI_OPCODE_LSB:
6036       exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
6037       break;
6038    case TGSI_OPCODE_IMSB:
6039       exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
6040       break;
6041    case TGSI_OPCODE_UMSB:
6042       exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
6043       break;
6044
6045    case TGSI_OPCODE_F2D:
6046       exec_t_2_64(mach, inst, micro_f2d, TGSI_EXEC_DATA_FLOAT);
6047       break;
6048
6049    case TGSI_OPCODE_D2F:
6050       exec_64_2_t(mach, inst, micro_d2f, TGSI_EXEC_DATA_FLOAT);
6051       break;
6052
6053    case TGSI_OPCODE_DABS:
6054       exec_double_unary(mach, inst, micro_dabs);
6055       break;
6056
6057    case TGSI_OPCODE_DNEG:
6058       exec_double_unary(mach, inst, micro_dneg);
6059       break;
6060
6061    case TGSI_OPCODE_DADD:
6062       exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
6063       break;
6064
6065    case TGSI_OPCODE_DDIV:
6066       exec_double_binary(mach, inst, micro_ddiv, TGSI_EXEC_DATA_DOUBLE);
6067       break;
6068
6069    case TGSI_OPCODE_DMUL:
6070       exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
6071       break;
6072
6073    case TGSI_OPCODE_DMAX:
6074       exec_double_binary(mach, inst, micro_dmax, TGSI_EXEC_DATA_DOUBLE);
6075       break;
6076
6077    case TGSI_OPCODE_DMIN:
6078       exec_double_binary(mach, inst, micro_dmin, TGSI_EXEC_DATA_DOUBLE);
6079       break;
6080
6081    case TGSI_OPCODE_DSLT:
6082       exec_double_binary(mach, inst, micro_dslt, TGSI_EXEC_DATA_UINT);
6083       break;
6084
6085    case TGSI_OPCODE_DSGE:
6086       exec_double_binary(mach, inst, micro_dsge, TGSI_EXEC_DATA_UINT);
6087       break;
6088
6089    case TGSI_OPCODE_DSEQ:
6090       exec_double_binary(mach, inst, micro_dseq, TGSI_EXEC_DATA_UINT);
6091       break;
6092
6093    case TGSI_OPCODE_DSNE:
6094       exec_double_binary(mach, inst, micro_dsne, TGSI_EXEC_DATA_UINT);
6095       break;
6096
6097    case TGSI_OPCODE_DRCP:
6098       exec_double_unary(mach, inst, micro_drcp);
6099       break;
6100
6101    case TGSI_OPCODE_DSQRT:
6102       exec_double_unary(mach, inst, micro_dsqrt);
6103       break;
6104
6105    case TGSI_OPCODE_DRSQ:
6106       exec_double_unary(mach, inst, micro_drsq);
6107       break;
6108
6109    case TGSI_OPCODE_DMAD:
6110       exec_double_trinary(mach, inst, micro_dmad);
6111       break;
6112
6113    case TGSI_OPCODE_DFRAC:
6114       exec_double_unary(mach, inst, micro_dfrac);
6115       break;
6116
6117    case TGSI_OPCODE_DFLR:
6118       exec_double_unary(mach, inst, micro_dflr);
6119       break;
6120
6121    case TGSI_OPCODE_DLDEXP:
6122       exec_dldexp(mach, inst);
6123       break;
6124
6125    case TGSI_OPCODE_DFRACEXP:
6126       exec_dfracexp(mach, inst);
6127       break;
6128
6129    case TGSI_OPCODE_I2D:
6130       exec_t_2_64(mach, inst, micro_i2d, TGSI_EXEC_DATA_INT);
6131       break;
6132
6133    case TGSI_OPCODE_D2I:
6134       exec_64_2_t(mach, inst, micro_d2i, TGSI_EXEC_DATA_INT);
6135       break;
6136
6137    case TGSI_OPCODE_U2D:
6138       exec_t_2_64(mach, inst, micro_u2d, TGSI_EXEC_DATA_UINT);
6139       break;
6140
6141    case TGSI_OPCODE_D2U:
6142       exec_64_2_t(mach, inst, micro_d2u, TGSI_EXEC_DATA_INT);
6143       break;
6144
6145    case TGSI_OPCODE_LOAD:
6146       exec_load(mach, inst);
6147       break;
6148
6149    case TGSI_OPCODE_STORE:
6150       exec_store(mach, inst);
6151       break;
6152
6153    case TGSI_OPCODE_ATOMUADD:
6154    case TGSI_OPCODE_ATOMXCHG:
6155    case TGSI_OPCODE_ATOMCAS:
6156    case TGSI_OPCODE_ATOMAND:
6157    case TGSI_OPCODE_ATOMOR:
6158    case TGSI_OPCODE_ATOMXOR:
6159    case TGSI_OPCODE_ATOMUMIN:
6160    case TGSI_OPCODE_ATOMUMAX:
6161    case TGSI_OPCODE_ATOMIMIN:
6162    case TGSI_OPCODE_ATOMIMAX:
6163    case TGSI_OPCODE_ATOMFADD:
6164       exec_atomop(mach, inst);
6165       break;
6166
6167    case TGSI_OPCODE_RESQ:
6168       exec_resq(mach, inst);
6169       break;
6170    case TGSI_OPCODE_BARRIER:
6171    case TGSI_OPCODE_MEMBAR:
6172       return TRUE;
6173       break;
6174
6175    case TGSI_OPCODE_I64ABS:
6176       exec_double_unary(mach, inst, micro_i64abs);
6177       break;
6178
6179    case TGSI_OPCODE_I64SSG:
6180       exec_double_unary(mach, inst, micro_i64sgn);
6181       break;
6182
6183    case TGSI_OPCODE_I64NEG:
6184       exec_double_unary(mach, inst, micro_i64neg);
6185       break;
6186
6187    case TGSI_OPCODE_U64SEQ:
6188       exec_double_binary(mach, inst, micro_u64seq, TGSI_EXEC_DATA_UINT);
6189       break;
6190
6191    case TGSI_OPCODE_U64SNE:
6192       exec_double_binary(mach, inst, micro_u64sne, TGSI_EXEC_DATA_UINT);
6193       break;
6194
6195    case TGSI_OPCODE_I64SLT:
6196       exec_double_binary(mach, inst, micro_i64slt, TGSI_EXEC_DATA_UINT);
6197       break;
6198    case TGSI_OPCODE_U64SLT:
6199       exec_double_binary(mach, inst, micro_u64slt, TGSI_EXEC_DATA_UINT);
6200       break;
6201
6202    case TGSI_OPCODE_I64SGE:
6203       exec_double_binary(mach, inst, micro_i64sge, TGSI_EXEC_DATA_UINT);
6204       break;
6205    case TGSI_OPCODE_U64SGE:
6206       exec_double_binary(mach, inst, micro_u64sge, TGSI_EXEC_DATA_UINT);
6207       break;
6208
6209    case TGSI_OPCODE_I64MIN:
6210       exec_double_binary(mach, inst, micro_i64min, TGSI_EXEC_DATA_INT64);
6211       break;
6212    case TGSI_OPCODE_U64MIN:
6213       exec_double_binary(mach, inst, micro_u64min, TGSI_EXEC_DATA_UINT64);
6214       break;
6215    case TGSI_OPCODE_I64MAX:
6216       exec_double_binary(mach, inst, micro_i64max, TGSI_EXEC_DATA_INT64);
6217       break;
6218    case TGSI_OPCODE_U64MAX:
6219       exec_double_binary(mach, inst, micro_u64max, TGSI_EXEC_DATA_UINT64);
6220       break;
6221    case TGSI_OPCODE_U64ADD:
6222       exec_double_binary(mach, inst, micro_u64add, TGSI_EXEC_DATA_UINT64);
6223       break;
6224    case TGSI_OPCODE_U64MUL:
6225       exec_double_binary(mach, inst, micro_u64mul, TGSI_EXEC_DATA_UINT64);
6226       break;
6227    case TGSI_OPCODE_U64SHL:
6228       exec_arg0_64_arg1_32(mach, inst, micro_u64shl);
6229       break;
6230    case TGSI_OPCODE_I64SHR:
6231       exec_arg0_64_arg1_32(mach, inst, micro_i64shr);
6232       break;
6233    case TGSI_OPCODE_U64SHR:
6234       exec_arg0_64_arg1_32(mach, inst, micro_u64shr);
6235       break;
6236    case TGSI_OPCODE_U64DIV:
6237       exec_double_binary(mach, inst, micro_u64div, TGSI_EXEC_DATA_UINT64);
6238       break;
6239    case TGSI_OPCODE_I64DIV:
6240       exec_double_binary(mach, inst, micro_i64div, TGSI_EXEC_DATA_INT64);
6241       break;
6242    case TGSI_OPCODE_U64MOD:
6243       exec_double_binary(mach, inst, micro_u64mod, TGSI_EXEC_DATA_UINT64);
6244       break;
6245    case TGSI_OPCODE_I64MOD:
6246       exec_double_binary(mach, inst, micro_i64mod, TGSI_EXEC_DATA_INT64);
6247       break;
6248
6249    case TGSI_OPCODE_F2U64:
6250       exec_t_2_64(mach, inst, micro_f2u64, TGSI_EXEC_DATA_FLOAT);
6251       break;
6252
6253    case TGSI_OPCODE_F2I64:
6254       exec_t_2_64(mach, inst, micro_f2i64, TGSI_EXEC_DATA_FLOAT);
6255       break;
6256
6257    case TGSI_OPCODE_U2I64:
6258       exec_t_2_64(mach, inst, micro_u2i64, TGSI_EXEC_DATA_INT);
6259       break;
6260    case TGSI_OPCODE_I2I64:
6261       exec_t_2_64(mach, inst, micro_i2i64, TGSI_EXEC_DATA_INT);
6262       break;
6263
6264    case TGSI_OPCODE_D2U64:
6265       exec_double_unary(mach, inst, micro_d2u64);
6266       break;
6267
6268    case TGSI_OPCODE_D2I64:
6269       exec_double_unary(mach, inst, micro_d2i64);
6270       break;
6271
6272    case TGSI_OPCODE_U642F:
6273       exec_64_2_t(mach, inst, micro_u642f, TGSI_EXEC_DATA_FLOAT);
6274       break;
6275    case TGSI_OPCODE_I642F:
6276       exec_64_2_t(mach, inst, micro_i642f, TGSI_EXEC_DATA_FLOAT);
6277       break;
6278
6279    case TGSI_OPCODE_U642D:
6280       exec_double_unary(mach, inst, micro_u642d);
6281       break;
6282    case TGSI_OPCODE_I642D:
6283       exec_double_unary(mach, inst, micro_i642d);
6284       break;
6285    case TGSI_OPCODE_INTERP_SAMPLE:
6286       exec_interp_at_sample(mach, inst);
6287       break;
6288    case TGSI_OPCODE_INTERP_OFFSET:
6289       exec_interp_at_offset(mach, inst);
6290       break;
6291    case TGSI_OPCODE_INTERP_CENTROID:
6292       exec_interp_at_centroid(mach, inst);
6293       break;
6294    default:
6295       assert( 0 );
6296    }
6297    return FALSE;
6298 }
6299
6300 static void
6301 tgsi_exec_machine_setup_masks(struct tgsi_exec_machine *mach)
6302 {
6303    uint default_mask = 0xf;
6304
6305    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
6306    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
6307
6308    if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
6309       for (unsigned i = 0; i < TGSI_MAX_VERTEX_STREAMS; i++) {
6310          mach->Temps[temp_prim_idxs[i].idx].xyzw[temp_prim_idxs[i].chan].u[0] = 0;
6311          mach->Primitives[i][0] = 0;
6312       }
6313       /* GS runs on a single primitive for now */
6314       default_mask = 0x1;
6315    }
6316
6317    if (mach->NonHelperMask == 0)
6318       mach->NonHelperMask = default_mask;
6319    mach->CondMask = default_mask;
6320    mach->LoopMask = default_mask;
6321    mach->ContMask = default_mask;
6322    mach->FuncMask = default_mask;
6323    mach->ExecMask = default_mask;
6324
6325    mach->Switch.mask = default_mask;
6326
6327    assert(mach->CondStackTop == 0);
6328    assert(mach->LoopStackTop == 0);
6329    assert(mach->ContStackTop == 0);
6330    assert(mach->SwitchStackTop == 0);
6331    assert(mach->BreakStackTop == 0);
6332    assert(mach->CallStackTop == 0);
6333 }
6334
6335 /**
6336  * Run TGSI interpreter.
6337  * \return bitmask of "alive" quad components
6338  */
6339 uint
6340 tgsi_exec_machine_run( struct tgsi_exec_machine *mach, int start_pc )
6341 {
6342    uint i;
6343
6344    mach->pc = start_pc;
6345
6346    if (!start_pc) {
6347       tgsi_exec_machine_setup_masks(mach);
6348
6349       /* execute declarations (interpolants) */
6350       for (i = 0; i < mach->NumDeclarations; i++) {
6351          exec_declaration( mach, mach->Declarations+i );
6352       }
6353    }
6354
6355    {
6356 #if DEBUG_EXECUTION
6357       struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
6358       struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
6359       uint inst = 1;
6360
6361       if (!start_pc) {
6362          memset(mach->Temps, 0, sizeof(temps));
6363          if (mach->Outputs)
6364             memset(mach->Outputs, 0, sizeof(outputs));
6365          memset(temps, 0, sizeof(temps));
6366          memset(outputs, 0, sizeof(outputs));
6367       }
6368 #endif
6369
6370       /* execute instructions, until pc is set to -1 */
6371       while (mach->pc != -1) {
6372          boolean barrier_hit;
6373 #if DEBUG_EXECUTION
6374          uint i;
6375
6376          tgsi_dump_instruction(&mach->Instructions[mach->pc], inst++);
6377 #endif
6378
6379          assert(mach->pc < (int) mach->NumInstructions);
6380          barrier_hit = exec_instruction(mach, mach->Instructions + mach->pc, &mach->pc);
6381
6382          /* for compute shaders if we hit a barrier return now for later rescheduling */
6383          if (barrier_hit && mach->ShaderType == PIPE_SHADER_COMPUTE)
6384             return 0;
6385
6386 #if DEBUG_EXECUTION
6387          for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
6388             if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
6389                uint j;
6390
6391                memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
6392                debug_printf("TEMP[%2u] = ", i);
6393                for (j = 0; j < 4; j++) {
6394                   if (j > 0) {
6395                      debug_printf("           ");
6396                   }
6397                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6398                                temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
6399                                temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
6400                                temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
6401                                temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
6402                }
6403             }
6404          }
6405          if (mach->Outputs) {
6406             for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
6407                if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
6408                   uint j;
6409
6410                   memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
6411                   debug_printf("OUT[%2u] =  ", i);
6412                   for (j = 0; j < 4; j++) {
6413                      if (j > 0) {
6414                         debug_printf("           ");
6415                      }
6416                      debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6417                                   outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
6418                                   outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
6419                                   outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
6420                                   outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
6421                   }
6422                }
6423             }
6424          }
6425 #endif
6426       }
6427    }
6428
6429 #if 0
6430    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
6431    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
6432       /*
6433        * Scale back depth component.
6434        */
6435       for (i = 0; i < 4; i++)
6436          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
6437    }
6438 #endif
6439
6440    /* Strictly speaking, these assertions aren't really needed but they
6441     * can potentially catch some bugs in the control flow code.
6442     */
6443    assert(mach->CondStackTop == 0);
6444    assert(mach->LoopStackTop == 0);
6445    assert(mach->ContStackTop == 0);
6446    assert(mach->SwitchStackTop == 0);
6447    assert(mach->BreakStackTop == 0);
6448    assert(mach->CallStackTop == 0);
6449
6450    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
6451 }