src/mesa/program/prog_execute.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  7.3
   4  *
   5  * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file prog_execute.c
  27  * Software interpreter for vertex/fragment programs.
  28  * \author Brian Paul
  29  */
  30
  31 /*
  32  * NOTE: we do everything in single-precision floating point; we don't
  33  * currently observe the single/half/fixed-precision qualifiers.
  34  *
  35  */
  36
  37
  38 #include "main/glheader.h"
  39 #include "main/colormac.h"
  40 #include "main/macros.h"
  41 #include "prog_execute.h"
  42 #include "prog_instruction.h"
  43 #include "prog_parameter.h"
  44 #include "prog_print.h"
  45 #include "prog_noise.h"
  46
  47
/*
 * Debug predicate: set to 1 to enable the printf() tracing in this file
 * that is guarded by DEBUG_PROG tests (per-instruction dumps, operand
 * values and condition-code updates).
 */
#define DEBUG_PROG 0
  50
  51
  52 /**
  53  * Set x to positive or negative infinity.
  54  */
  55 #if defined(USE_IEEE) || defined(_WIN32)
  56 #define SET_POS_INFINITY(x)                  \
  57    do {                                      \
  58          fi_type fi;                         \
  59          fi.i = 0x7F800000;                  \
  60          x = fi.f;                           \
  61    } while (0)
  62 #define SET_NEG_INFINITY(x)                  \
  63    do {                                      \
  64          fi_type fi;                         \
  65          fi.i = 0xFF800000;                  \
  66          x = fi.f;                           \
  67    } while (0)
  68 #elif defined(VMS)
  69 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
  70 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
  71 #else
  72 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
  73 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
  74 #endif
  75
/*
 * Overwrite the storage of x with the integer 'bits' by viewing x through
 * the integer member of fi_type.  The expansion is a bare assignment with
 * an unparenthesized 'bits' argument, so use it only as a stand-alone
 * statement.
 * NOTE(review): presumably x is a GLfloat lvalue -- confirm at call sites.
 */
#define SET_FLOAT_BITS(x, bits) ((fi_type *) (void *) &(x))->i = bits
  77
  78
/*
 * All-zero vector handed out by get_src_register_pointer() when a source
 * register index is out of range, so such reads yield (0, 0, 0, 0).
 */
static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
  80
  81
  82 /**
  83  * Return a pointer to the 4-element float vector specified by the given
  84  * source register.
  85  */
  86 static inline const GLfloat *
  87 get_src_register_pointer(const struct prog_src_register *source,
  88                          const struct gl_program_machine *machine)
  89 {
  90    const struct gl_program *prog = machine->CurProgram;
  91    GLint reg = source->Index;
  92
  93    if (source->RelAddr) {
  94       /* add address register value to src index/offset */
  95       reg += machine->AddressReg[0][0];
  96       if (reg < 0) {
  97          return ZeroVec;
  98       }
  99    }
 100
 101    switch (source->File) {
 102    case PROGRAM_TEMPORARY:
 103       if (reg >= MAX_PROGRAM_TEMPS)
 104          return ZeroVec;
 105       return machine->Temporaries[reg];
 106
 107    case PROGRAM_INPUT:
 108       if (prog->Target == GL_VERTEX_PROGRAM_ARB) {
 109          if (reg >= VERT_ATTRIB_MAX)
 110             return ZeroVec;
 111          return machine->VertAttribs[reg];
 112       }
 113       else {
 114          if (reg >= FRAG_ATTRIB_MAX)
 115             return ZeroVec;
 116          return machine->Attribs[reg][machine->CurElement];
 117       }
 118
 119    case PROGRAM_OUTPUT:
 120       if (reg >= MAX_PROGRAM_OUTPUTS)
 121          return ZeroVec;
 122       return machine->Outputs[reg];
 123
 124    case PROGRAM_LOCAL_PARAM:
 125       if (reg >= MAX_PROGRAM_LOCAL_PARAMS)
 126          return ZeroVec;
 127       return machine->CurProgram->LocalParams[reg];
 128
 129    case PROGRAM_ENV_PARAM:
 130       if (reg >= MAX_PROGRAM_ENV_PARAMS)
 131          return ZeroVec;
 132       return machine->EnvParams[reg];
 133
 134    case PROGRAM_STATE_VAR:
 135       /* Fallthrough */
 136    case PROGRAM_CONSTANT:
 137       /* Fallthrough */
 138    case PROGRAM_UNIFORM:
 139       if (reg >= (GLint) prog->Parameters->NumParameters)
 140          return ZeroVec;
 141       return (GLfloat *) prog->Parameters->ParameterValues[reg];
 142
 143    case PROGRAM_SYSTEM_VALUE:
 144       assert(reg < Elements(machine->SystemValues));
 145       return machine->SystemValues[reg];
 146
 147    default:
 148       _mesa_problem(NULL,
 149          "Invalid src register file %d in get_src_register_pointer()",
 150          source->File);
 151       return NULL;
 152    }
 153 }
 154
 155
 156 /**
 157  * Return a pointer to the 4-element float vector specified by the given
 158  * destination register.
 159  */
 160 static inline GLfloat *
 161 get_dst_register_pointer(const struct prog_dst_register *dest,
 162                          struct gl_program_machine *machine)
 163 {
 164    static GLfloat dummyReg[4];
 165    GLint reg = dest->Index;
 166
 167    if (dest->RelAddr) {
 168       /* add address register value to src index/offset */
 169       reg += machine->AddressReg[0][0];
 170       if (reg < 0) {
 171          return dummyReg;
 172       }
 173    }
 174
 175    switch (dest->File) {
 176    case PROGRAM_TEMPORARY:
 177       if (reg >= MAX_PROGRAM_TEMPS)
 178          return dummyReg;
 179       return machine->Temporaries[reg];
 180
 181    case PROGRAM_OUTPUT:
 182       if (reg >= MAX_PROGRAM_OUTPUTS)
 183          return dummyReg;
 184       return machine->Outputs[reg];
 185
 186    case PROGRAM_WRITE_ONLY:
 187       return dummyReg;
 188
 189    default:
 190       _mesa_problem(NULL,
 191          "Invalid dest register file %d in get_dst_register_pointer()",
 192          dest->File);
 193       return NULL;
 194    }
 195 }
 196
 197
 198
 199 /**
 200  * Fetch a 4-element float vector from the given source register.
 201  * Apply swizzling and negating as needed.
 202  */
 203 static void
 204 fetch_vector4(const struct prog_src_register *source,
 205               const struct gl_program_machine *machine, GLfloat result[4])
 206 {
 207    const GLfloat *src = get_src_register_pointer(source, machine);
 208    ASSERT(src);
 209
 210    if (source->Swizzle == SWIZZLE_NOOP) {
 211       /* no swizzling */
 212       COPY_4V(result, src);
 213    }
 214    else {
 215       ASSERT(GET_SWZ(source->Swizzle, 0) <= 3);
 216       ASSERT(GET_SWZ(source->Swizzle, 1) <= 3);
 217       ASSERT(GET_SWZ(source->Swizzle, 2) <= 3);
 218       ASSERT(GET_SWZ(source->Swizzle, 3) <= 3);
 219       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 220       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 221       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 222       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 223    }
 224
 225    if (source->Abs) {
 226       result[0] = FABSF(result[0]);
 227       result[1] = FABSF(result[1]);
 228       result[2] = FABSF(result[2]);
 229       result[3] = FABSF(result[3]);
 230    }
 231    if (source->Negate) {
 232       ASSERT(source->Negate == NEGATE_XYZW);
 233       result[0] = -result[0];
 234       result[1] = -result[1];
 235       result[2] = -result[2];
 236       result[3] = -result[3];
 237    }
 238
 239 #ifdef NAN_CHECK
 240    assert(!IS_INF_OR_NAN(result[0]));
 241    assert(!IS_INF_OR_NAN(result[0]));
 242    assert(!IS_INF_OR_NAN(result[0]));
 243    assert(!IS_INF_OR_NAN(result[0]));
 244 #endif
 245 }
 246
 247
 248 /**
 249  * Fetch a 4-element uint vector from the given source register.
 250  * Apply swizzling but not negation/abs.
 251  */
 252 static void
 253 fetch_vector4ui(const struct prog_src_register *source,
 254                 const struct gl_program_machine *machine, GLuint result[4])
 255 {
 256    const GLuint *src = (GLuint *) get_src_register_pointer(source, machine);
 257    ASSERT(src);
 258
 259    if (source->Swizzle == SWIZZLE_NOOP) {
 260       /* no swizzling */
 261       COPY_4V(result, src);
 262    }
 263    else {
 264       ASSERT(GET_SWZ(source->Swizzle, 0) <= 3);
 265       ASSERT(GET_SWZ(source->Swizzle, 1) <= 3);
 266       ASSERT(GET_SWZ(source->Swizzle, 2) <= 3);
 267       ASSERT(GET_SWZ(source->Swizzle, 3) <= 3);
 268       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 269       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 270       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 271       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 272    }
 273
 274    /* Note: no Negate or Abs here */
 275 }
 276
 277
 278
 279 /**
 280  * Fetch the derivative with respect to X or Y for the given register.
 281  * XXX this currently only works for fragment program input attribs.
 282  */
 283 static void
 284 fetch_vector4_deriv(struct gl_context * ctx,
 285                     const struct prog_src_register *source,
 286                     const struct gl_program_machine *machine,
 287                     char xOrY, GLfloat result[4])
 288 {
 289    if (source->File == PROGRAM_INPUT &&
 290        source->Index < (GLint) machine->NumDeriv) {
 291       const GLint col = machine->CurElement;
 292       const GLfloat w = machine->Attribs[FRAG_ATTRIB_WPOS][col][3];
 293       const GLfloat invQ = 1.0f / w;
 294       GLfloat deriv[4];
 295
 296       if (xOrY == 'X') {
 297          deriv[0] = machine->DerivX[source->Index][0] * invQ;
 298          deriv[1] = machine->DerivX[source->Index][1] * invQ;
 299          deriv[2] = machine->DerivX[source->Index][2] * invQ;
 300          deriv[3] = machine->DerivX[source->Index][3] * invQ;
 301       }
 302       else {
 303          deriv[0] = machine->DerivY[source->Index][0] * invQ;
 304          deriv[1] = machine->DerivY[source->Index][1] * invQ;
 305          deriv[2] = machine->DerivY[source->Index][2] * invQ;
 306          deriv[3] = machine->DerivY[source->Index][3] * invQ;
 307       }
 308
 309       result[0] = deriv[GET_SWZ(source->Swizzle, 0)];
 310       result[1] = deriv[GET_SWZ(source->Swizzle, 1)];
 311       result[2] = deriv[GET_SWZ(source->Swizzle, 2)];
 312       result[3] = deriv[GET_SWZ(source->Swizzle, 3)];
 313
 314       if (source->Abs) {
 315          result[0] = FABSF(result[0]);
 316          result[1] = FABSF(result[1]);
 317          result[2] = FABSF(result[2]);
 318          result[3] = FABSF(result[3]);
 319       }
 320       if (source->Negate) {
 321          ASSERT(source->Negate == NEGATE_XYZW);
 322          result[0] = -result[0];
 323          result[1] = -result[1];
 324          result[2] = -result[2];
 325          result[3] = -result[3];
 326       }
 327    }
 328    else {
 329       ASSIGN_4V(result, 0.0, 0.0, 0.0, 0.0);
 330    }
 331 }
 332
 333
 334 /**
 335  * As above, but only return result[0] element.
 336  */
 337 static void
 338 fetch_vector1(const struct prog_src_register *source,
 339               const struct gl_program_machine *machine, GLfloat result[4])
 340 {
 341    const GLfloat *src = get_src_register_pointer(source, machine);
 342    ASSERT(src);
 343
 344    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 345
 346    if (source->Abs) {
 347       result[0] = FABSF(result[0]);
 348    }
 349    if (source->Negate) {
 350       result[0] = -result[0];
 351    }
 352 }
 353
 354
 355 static GLuint
 356 fetch_vector1ui(const struct prog_src_register *source,
 357                 const struct gl_program_machine *machine)
 358 {
 359    const GLuint *src = (GLuint *) get_src_register_pointer(source, machine);
 360    return src[GET_SWZ(source->Swizzle, 0)];
 361 }
 362
 363
 364 /**
 365  * Fetch texel from texture.  Use partial derivatives when possible.
 366  */
 367 static inline void
 368 fetch_texel(struct gl_context *ctx,
 369             const struct gl_program_machine *machine,
 370             const struct prog_instruction *inst,
 371             const GLfloat texcoord[4], GLfloat lodBias,
 372             GLfloat color[4])
 373 {
 374    const GLuint unit = machine->Samplers[inst->TexSrcUnit];
 375
 376    /* Note: we only have the right derivatives for fragment input attribs.
 377     */
 378    if (machine->NumDeriv > 0 &&
 379        inst->SrcReg[0].File == PROGRAM_INPUT &&
 380        inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0 + inst->TexSrcUnit) {
 381       /* simple texture fetch for which we should have derivatives */
 382       GLuint attr = inst->SrcReg[0].Index;
 383       machine->FetchTexelDeriv(ctx, texcoord,
 384                                machine->DerivX[attr],
 385                                machine->DerivY[attr],
 386                                lodBias, unit, color);
 387    }
 388    else {
 389       machine->FetchTexelLod(ctx, texcoord, lodBias, unit, color);
 390    }
 391 }
 392
 393
 394 /**
 395  * Test value against zero and return GT, LT, EQ or UN if NaN.
 396  */
 397 static inline GLuint
 398 generate_cc(float value)
 399 {
 400    if (value != value)
 401       return COND_UN;           /* NaN */
 402    if (value > 0.0F)
 403       return COND_GT;
 404    if (value < 0.0F)
 405       return COND_LT;
 406    return COND_EQ;
 407 }
 408
 409
 410 /**
 411  * Test if the ccMaskRule is satisfied by the given condition code.
 412  * Used to mask destination writes according to the current condition code.
 413  */
 414 static inline GLboolean
 415 test_cc(GLuint condCode, GLuint ccMaskRule)
 416 {
 417    switch (ccMaskRule) {
 418    case COND_EQ: return (condCode == COND_EQ);
 419    case COND_NE: return (condCode != COND_EQ);
 420    case COND_LT: return (condCode == COND_LT);
 421    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 422    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 423    case COND_GT: return (condCode == COND_GT);
 424    case COND_TR: return GL_TRUE;
 425    case COND_FL: return GL_FALSE;
 426    default:      return GL_TRUE;
 427    }
 428 }
 429
 430
 431 /**
 432  * Evaluate the 4 condition codes against a predicate and return GL_TRUE
 433  * or GL_FALSE to indicate result.
 434  */
 435 static inline GLboolean
 436 eval_condition(const struct gl_program_machine *machine,
 437                const struct prog_instruction *inst)
 438 {
 439    const GLuint swizzle = inst->DstReg.CondSwizzle;
 440    const GLuint condMask = inst->DstReg.CondMask;
 441    if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 442        test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 443        test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 444        test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 445       return GL_TRUE;
 446    }
 447    else {
 448       return GL_FALSE;
 449    }
 450 }
 451
 452
 453
 454 /**
 455  * Store 4 floats into a register.  Observe the instructions saturate and
 456  * set-condition-code flags.
 457  */
 458 static void
 459 store_vector4(const struct prog_instruction *inst,
 460               struct gl_program_machine *machine, const GLfloat value[4])
 461 {
 462    const struct prog_dst_register *dstReg = &(inst->DstReg);
 463    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 464    GLuint writeMask = dstReg->WriteMask;
 465    GLfloat clampedValue[4];
 466    GLfloat *dst = get_dst_register_pointer(dstReg, machine);
 467
 468 #if 0
 469    if (value[0] > 1.0e10 ||
 470        IS_INF_OR_NAN(value[0]) ||
 471        IS_INF_OR_NAN(value[1]) ||
 472        IS_INF_OR_NAN(value[2]) || IS_INF_OR_NAN(value[3]))
 473       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 474 #endif
 475
 476    if (clamp) {
 477       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 478       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 479       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 480       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 481       value = clampedValue;
 482    }
 483
 484    if (dstReg->CondMask != COND_TR) {
 485       /* condition codes may turn off some writes */
 486       if (writeMask & WRITEMASK_X) {
 487          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 0)],
 488                       dstReg->CondMask))
 489             writeMask &= ~WRITEMASK_X;
 490       }
 491       if (writeMask & WRITEMASK_Y) {
 492          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 1)],
 493                       dstReg->CondMask))
 494             writeMask &= ~WRITEMASK_Y;
 495       }
 496       if (writeMask & WRITEMASK_Z) {
 497          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 2)],
 498                       dstReg->CondMask))
 499             writeMask &= ~WRITEMASK_Z;
 500       }
 501       if (writeMask & WRITEMASK_W) {
 502          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 3)],
 503                       dstReg->CondMask))
 504             writeMask &= ~WRITEMASK_W;
 505       }
 506    }
 507
 508 #ifdef NAN_CHECK
 509    assert(!IS_INF_OR_NAN(value[0]));
 510    assert(!IS_INF_OR_NAN(value[0]));
 511    assert(!IS_INF_OR_NAN(value[0]));
 512    assert(!IS_INF_OR_NAN(value[0]));
 513 #endif
 514
 515    if (writeMask & WRITEMASK_X)
 516       dst[0] = value[0];
 517    if (writeMask & WRITEMASK_Y)
 518       dst[1] = value[1];
 519    if (writeMask & WRITEMASK_Z)
 520       dst[2] = value[2];
 521    if (writeMask & WRITEMASK_W)
 522       dst[3] = value[3];
 523
 524    if (inst->CondUpdate) {
 525       if (writeMask & WRITEMASK_X)
 526          machine->CondCodes[0] = generate_cc(value[0]);
 527       if (writeMask & WRITEMASK_Y)
 528          machine->CondCodes[1] = generate_cc(value[1]);
 529       if (writeMask & WRITEMASK_Z)
 530          machine->CondCodes[2] = generate_cc(value[2]);
 531       if (writeMask & WRITEMASK_W)
 532          machine->CondCodes[3] = generate_cc(value[3]);
 533 #if DEBUG_PROG
 534       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 535              _mesa_condcode_string(machine->CondCodes[0]),
 536              _mesa_condcode_string(machine->CondCodes[1]),
 537              _mesa_condcode_string(machine->CondCodes[2]),
 538              _mesa_condcode_string(machine->CondCodes[3]));
 539 #endif
 540    }
 541 }
 542
 543
 544 /**
 545  * Store 4 uints into a register.  Observe the set-condition-code flags.
 546  */
 547 static void
 548 store_vector4ui(const struct prog_instruction *inst,
 549                 struct gl_program_machine *machine, const GLuint value[4])
 550 {
 551    const struct prog_dst_register *dstReg = &(inst->DstReg);
 552    GLuint writeMask = dstReg->WriteMask;
 553    GLuint *dst = (GLuint *) get_dst_register_pointer(dstReg, machine);
 554
 555    if (dstReg->CondMask != COND_TR) {
 556       /* condition codes may turn off some writes */
 557       if (writeMask & WRITEMASK_X) {
 558          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 0)],
 559                       dstReg->CondMask))
 560             writeMask &= ~WRITEMASK_X;
 561       }
 562       if (writeMask & WRITEMASK_Y) {
 563          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 1)],
 564                       dstReg->CondMask))
 565             writeMask &= ~WRITEMASK_Y;
 566       }
 567       if (writeMask & WRITEMASK_Z) {
 568          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 2)],
 569                       dstReg->CondMask))
 570             writeMask &= ~WRITEMASK_Z;
 571       }
 572       if (writeMask & WRITEMASK_W) {
 573          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 3)],
 574                       dstReg->CondMask))
 575             writeMask &= ~WRITEMASK_W;
 576       }
 577    }
 578
 579    if (writeMask & WRITEMASK_X)
 580       dst[0] = value[0];
 581    if (writeMask & WRITEMASK_Y)
 582       dst[1] = value[1];
 583    if (writeMask & WRITEMASK_Z)
 584       dst[2] = value[2];
 585    if (writeMask & WRITEMASK_W)
 586       dst[3] = value[3];
 587
 588    if (inst->CondUpdate) {
 589       if (writeMask & WRITEMASK_X)
 590          machine->CondCodes[0] = generate_cc((float)value[0]);
 591       if (writeMask & WRITEMASK_Y)
 592          machine->CondCodes[1] = generate_cc((float)value[1]);
 593       if (writeMask & WRITEMASK_Z)
 594          machine->CondCodes[2] = generate_cc((float)value[2]);
 595       if (writeMask & WRITEMASK_W)
 596          machine->CondCodes[3] = generate_cc((float)value[3]);
 597 #if DEBUG_PROG
 598       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 599              _mesa_condcode_string(machine->CondCodes[0]),
 600              _mesa_condcode_string(machine->CondCodes[1]),
 601              _mesa_condcode_string(machine->CondCodes[2]),
 602              _mesa_condcode_string(machine->CondCodes[3]));
 603 #endif
 604    }
 605 }
 606
 607
 608
 609 /**
 610  * Execute the given vertex/fragment program.
 611  *
 612  * \param ctx  rendering context
 613  * \param program  the program to execute
 614  * \param machine  machine state (must be initialized)
 615  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 616  */
 617 GLboolean
 618 _mesa_execute_program(struct gl_context * ctx,
 619                       const struct gl_program *program,
 620                       struct gl_program_machine *machine)
 621 {
 622    const GLuint numInst = program->NumInstructions;
 623    const GLuint maxExec = 65536;
 624    GLuint pc, numExec = 0;
 625
 626    machine->CurProgram = program;
 627
 628    if (DEBUG_PROG) {
 629       printf("execute program %u --------------------\n", program->Id);
 630    }
 631
 632    if (program->Target == GL_VERTEX_PROGRAM_ARB) {
 633       machine->EnvParams = ctx->VertexProgram.Parameters;
 634    }
 635    else {
 636       machine->EnvParams = ctx->FragmentProgram.Parameters;
 637    }
 638
 639    for (pc = 0; pc < numInst; pc++) {
 640       const struct prog_instruction *inst = program->Instructions + pc;
 641
 642       if (DEBUG_PROG) {
 643          _mesa_print_instruction(inst);
 644       }
 645
 646       switch (inst->Opcode) {
 647       case OPCODE_ABS:
 648          {
 649             GLfloat a[4], result[4];
 650             fetch_vector4(&inst->SrcReg[0], machine, a);
 651             result[0] = FABSF(a[0]);
 652             result[1] = FABSF(a[1]);
 653             result[2] = FABSF(a[2]);
 654             result[3] = FABSF(a[3]);
 655             store_vector4(inst, machine, result);
 656          }
 657          break;
 658       case OPCODE_ADD:
 659          {
 660             GLfloat a[4], b[4], result[4];
 661             fetch_vector4(&inst->SrcReg[0], machine, a);
 662             fetch_vector4(&inst->SrcReg[1], machine, b);
 663             result[0] = a[0] + b[0];
 664             result[1] = a[1] + b[1];
 665             result[2] = a[2] + b[2];
 666             result[3] = a[3] + b[3];
 667             store_vector4(inst, machine, result);
 668             if (DEBUG_PROG) {
 669                printf("ADD (%g %g %g %g) = (%g %g %g %g) + (%g %g %g %g)\n",
 670                       result[0], result[1], result[2], result[3],
 671                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 672             }
 673          }
 674          break;
 675       case OPCODE_AND:     /* bitwise AND */
 676          {
 677             GLuint a[4], b[4], result[4];
 678             fetch_vector4ui(&inst->SrcReg[0], machine, a);
 679             fetch_vector4ui(&inst->SrcReg[1], machine, b);
 680             result[0] = a[0] & b[0];
 681             result[1] = a[1] & b[1];
 682             result[2] = a[2] & b[2];
 683             result[3] = a[3] & b[3];
 684             store_vector4ui(inst, machine, result);
 685          }
 686          break;
 687       case OPCODE_ARL:
 688          {
 689             GLfloat t[4];
 690             fetch_vector4(&inst->SrcReg[0], machine, t);
 691             machine->AddressReg[0][0] = IFLOOR(t[0]);
 692             if (DEBUG_PROG) {
 693                printf("ARL %d\n", machine->AddressReg[0][0]);
 694             }
 695          }
 696          break;
 697       case OPCODE_BGNLOOP:
 698          /* no-op */
 699          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 700                 == OPCODE_ENDLOOP);
 701          break;
 702       case OPCODE_ENDLOOP:
 703          /* subtract 1 here since pc is incremented by for(pc) loop */
 704          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 705                 == OPCODE_BGNLOOP);
 706          pc = inst->BranchTarget - 1;   /* go to matching BNGLOOP */
 707          break;
 708       case OPCODE_BGNSUB:      /* begin subroutine */
 709          break;
 710       case OPCODE_ENDSUB:      /* end subroutine */
 711          break;
 712       case OPCODE_BRK:         /* break out of loop (conditional) */
 713          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 714                 == OPCODE_ENDLOOP);
 715          if (eval_condition(machine, inst)) {
 716             /* break out of loop */
 717             /* pc++ at end of for-loop will put us after the ENDLOOP inst */
 718             pc = inst->BranchTarget;
 719          }
 720          break;
 721       case OPCODE_CONT:        /* continue loop (conditional) */
 722          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 723                 == OPCODE_ENDLOOP);
 724          if (eval_condition(machine, inst)) {
 725             /* continue at ENDLOOP */
 726             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 727             pc = inst->BranchTarget - 1;
 728          }
 729          break;
 730       case OPCODE_CAL:         /* Call subroutine (conditional) */
 731          if (eval_condition(machine, inst)) {
 732             /* call the subroutine */
 733             if (machine->StackDepth >= MAX_PROGRAM_CALL_DEPTH) {
 734                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
 735             }
 736             machine->CallStack[machine->StackDepth++] = pc + 1; /* next inst */
 737             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 738             pc = inst->BranchTarget - 1;
 739          }
 740          break;
 741       case OPCODE_CMP:
 742          {
 743             GLfloat a[4], b[4], c[4], result[4];
 744             fetch_vector4(&inst->SrcReg[0], machine, a);
 745             fetch_vector4(&inst->SrcReg[1], machine, b);
 746             fetch_vector4(&inst->SrcReg[2], machine, c);
 747             result[0] = a[0] < 0.0F ? b[0] : c[0];
 748             result[1] = a[1] < 0.0F ? b[1] : c[1];
 749             result[2] = a[2] < 0.0F ? b[2] : c[2];
 750             result[3] = a[3] < 0.0F ? b[3] : c[3];
 751             store_vector4(inst, machine, result);
 752             if (DEBUG_PROG) {
 753                printf("CMP (%g %g %g %g) = (%g %g %g %g) < 0 ? (%g %g %g %g) : (%g %g %g %g)\n",
 754                       result[0], result[1], result[2], result[3],
 755                       a[0], a[1], a[2], a[3],
 756                       b[0], b[1], b[2], b[3],
 757                       c[0], c[1], c[2], c[3]);
 758             }
 759          }
 760          break;
 761       case OPCODE_COS:
 762          {
 763             GLfloat a[4], result[4];
 764             fetch_vector1(&inst->SrcReg[0], machine, a);
 765             result[0] = result[1] = result[2] = result[3]
 766                = (GLfloat) cos(a[0]);
 767             store_vector4(inst, machine, result);
 768          }
 769          break;
 770       case OPCODE_DDX:         /* Partial derivative with respect to X */
 771          {
 772             GLfloat result[4];
 773             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 774                                 'X', result);
 775             store_vector4(inst, machine, result);
 776          }
 777          break;
 778       case OPCODE_DDY:         /* Partial derivative with respect to Y */
 779          {
 780             GLfloat result[4];
 781             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 782                                 'Y', result);
 783             store_vector4(inst, machine, result);
 784          }
 785          break;
 786       case OPCODE_DP2:
 787          {
 788             GLfloat a[4], b[4], result[4];
 789             fetch_vector4(&inst->SrcReg[0], machine, a);
 790             fetch_vector4(&inst->SrcReg[1], machine, b);
 791             result[0] = result[1] = result[2] = result[3] = DOT2(a, b);
 792             store_vector4(inst, machine, result);
 793             if (DEBUG_PROG) {
 794                printf("DP2 %g = (%g %g) . (%g %g)\n",
 795                       result[0], a[0], a[1], b[0], b[1]);
 796             }
 797          }
 798          break;
 799       case OPCODE_DP2A:
 800          {
 801             GLfloat a[4], b[4], c, result[4];
 802             fetch_vector4(&inst->SrcReg[0], machine, a);
 803             fetch_vector4(&inst->SrcReg[1], machine, b);
 804             fetch_vector1(&inst->SrcReg[1], machine, &c);
 805             result[0] = result[1] = result[2] = result[3] = DOT2(a, b) + c;
 806             store_vector4(inst, machine, result);
 807             if (DEBUG_PROG) {
 808                printf("DP2A %g = (%g %g) . (%g %g) + %g\n",
 809                       result[0], a[0], a[1], b[0], b[1], c);
 810             }
 811          }
 812          break;
 813       case OPCODE_DP3:
 814          {
 815             GLfloat a[4], b[4], result[4];
 816             fetch_vector4(&inst->SrcReg[0], machine, a);
 817             fetch_vector4(&inst->SrcReg[1], machine, b);
 818             result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
 819             store_vector4(inst, machine, result);
 820             if (DEBUG_PROG) {
 821                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 822                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 823             }
 824          }
 825          break;
 826       case OPCODE_DP4:
 827          {
 828             GLfloat a[4], b[4], result[4];
 829             fetch_vector4(&inst->SrcReg[0], machine, a);
 830             fetch_vector4(&inst->SrcReg[1], machine, b);
 831             result[0] = result[1] = result[2] = result[3] = DOT4(a, b);
 832             store_vector4(inst, machine, result);
 833             if (DEBUG_PROG) {
 834                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 835                       result[0], a[0], a[1], a[2], a[3],
 836                       b[0], b[1], b[2], b[3]);
 837             }
 838          }
 839          break;
 840       case OPCODE_DPH:
 841          {
 842             GLfloat a[4], b[4], result[4];
 843             fetch_vector4(&inst->SrcReg[0], machine, a);
 844             fetch_vector4(&inst->SrcReg[1], machine, b);
 845             result[0] = result[1] = result[2] = result[3] = DOT3(a, b) + b[3];
 846             store_vector4(inst, machine, result);
 847          }
 848          break;
 849       case OPCODE_DST:         /* Distance vector */
 850          {
 851             GLfloat a[4], b[4], result[4];
 852             fetch_vector4(&inst->SrcReg[0], machine, a);
 853             fetch_vector4(&inst->SrcReg[1], machine, b);
 854             result[0] = 1.0F;
 855             result[1] = a[1] * b[1];
 856             result[2] = a[2];
 857             result[3] = b[3];
 858             store_vector4(inst, machine, result);
 859          }
 860          break;
 861       case OPCODE_EXP:
 862          {
 863             GLfloat t[4], q[4], floor_t0;
 864             fetch_vector1(&inst->SrcReg[0], machine, t);
 865             floor_t0 = FLOORF(t[0]);
 866             if (floor_t0 > FLT_MAX_EXP) {
 867                SET_POS_INFINITY(q[0]);
 868                SET_POS_INFINITY(q[2]);
 869             }
 870             else if (floor_t0 < FLT_MIN_EXP) {
 871                q[0] = 0.0F;
 872                q[2] = 0.0F;
 873             }
 874             else {
 875                q[0] = LDEXPF(1.0, (int) floor_t0);
 876                /* Note: GL_NV_vertex_program expects
 877                 * result.z = result.x * APPX(result.y)
 878                 * We do what the ARB extension says.
 879                 */
 880                q[2] = (GLfloat) pow(2.0, t[0]);
 881             }
 882             q[1] = t[0] - floor_t0;
 883             q[3] = 1.0F;
 884             store_vector4( inst, machine, q );
 885          }
 886          break;
 887       case OPCODE_EX2:         /* Exponential base 2 */
 888          {
 889             GLfloat a[4], result[4], val;
 890             fetch_vector1(&inst->SrcReg[0], machine, a);
 891             val = (GLfloat) pow(2.0, a[0]);
 892             /*
 893             if (IS_INF_OR_NAN(val))
 894                val = 1.0e10;
 895             */
 896             result[0] = result[1] = result[2] = result[3] = val;
 897             store_vector4(inst, machine, result);
 898          }
 899          break;
 900       case OPCODE_FLR:
 901          {
 902             GLfloat a[4], result[4];
 903             fetch_vector4(&inst->SrcReg[0], machine, a);
 904             result[0] = FLOORF(a[0]);
 905             result[1] = FLOORF(a[1]);
 906             result[2] = FLOORF(a[2]);
 907             result[3] = FLOORF(a[3]);
 908             store_vector4(inst, machine, result);
 909          }
 910          break;
 911       case OPCODE_FRC:
 912          {
 913             GLfloat a[4], result[4];
 914             fetch_vector4(&inst->SrcReg[0], machine, a);
 915             result[0] = a[0] - FLOORF(a[0]);
 916             result[1] = a[1] - FLOORF(a[1]);
 917             result[2] = a[2] - FLOORF(a[2]);
 918             result[3] = a[3] - FLOORF(a[3]);
 919             store_vector4(inst, machine, result);
 920          }
 921          break;
 922       case OPCODE_IF:
 923          {
 924             GLboolean cond;
 925             ASSERT(program->Instructions[inst->BranchTarget].Opcode
 926                    == OPCODE_ELSE ||
 927                    program->Instructions[inst->BranchTarget].Opcode
 928                    == OPCODE_ENDIF);
 929             /* eval condition */
 930             if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
 931                GLfloat a[4];
 932                fetch_vector1(&inst->SrcReg[0], machine, a);
 933                cond = (a[0] != 0.0);
 934             }
 935             else {
 936                cond = eval_condition(machine, inst);
 937             }
 938             if (DEBUG_PROG) {
 939                printf("IF: %d\n", cond);
 940             }
 941             /* do if/else */
 942             if (cond) {
 943                /* do if-clause (just continue execution) */
 944             }
 945             else {
 946                /* go to the instruction after ELSE or ENDIF */
 947                assert(inst->BranchTarget >= 0);
 948                pc = inst->BranchTarget;
 949             }
 950          }
 951          break;
 952       case OPCODE_ELSE:
 953          /* goto ENDIF */
 954          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 955                 == OPCODE_ENDIF);
 956          assert(inst->BranchTarget >= 0);
 957          pc = inst->BranchTarget;
 958          break;
 959       case OPCODE_ENDIF:
 960          /* nothing */
 961          break;
 962       case OPCODE_KIL_NV:      /* NV_f_p only (conditional) */
 963          if (eval_condition(machine, inst)) {
 964             return GL_FALSE;
 965          }
 966          break;
 967       case OPCODE_KIL:         /* ARB_f_p only */
 968          {
 969             GLfloat a[4];
 970             fetch_vector4(&inst->SrcReg[0], machine, a);
 971             if (DEBUG_PROG) {
 972                printf("KIL if (%g %g %g %g) <= 0.0\n",
 973                       a[0], a[1], a[2], a[3]);
 974             }
 975
 976             if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 977                return GL_FALSE;
 978             }
 979          }
 980          break;
 981       case OPCODE_LG2:         /* log base 2 */
 982          {
 983             GLfloat a[4], result[4], val;
 984             fetch_vector1(&inst->SrcReg[0], machine, a);
 985             /* The fast LOG2 macro doesn't meet the precision requirements.
 986              */
 987             if (a[0] == 0.0F) {
 988                val = -FLT_MAX;
 989             }
 990             else {
 991                val = (float)(log(a[0]) * 1.442695F);
 992             }
 993             result[0] = result[1] = result[2] = result[3] = val;
 994             store_vector4(inst, machine, result);
 995          }
 996          break;
 997       case OPCODE_LIT:
 998          {
 999             const GLfloat epsilon = 1.0F / 256.0F;      /* from NV VP spec */
1000             GLfloat a[4], result[4];
1001             fetch_vector4(&inst->SrcReg[0], machine, a);
1002             a[0] = MAX2(a[0], 0.0F);
1003             a[1] = MAX2(a[1], 0.0F);
1004             /* XXX ARB version clamps a[3], NV version doesn't */
1005             a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
1006             result[0] = 1.0F;
1007             result[1] = a[0];
1008             /* XXX we could probably just use pow() here */
1009             if (a[0] > 0.0F) {
1010                if (a[1] == 0.0 && a[3] == 0.0)
1011                   result[2] = 1.0F;
1012                else
1013                   result[2] = (GLfloat) pow(a[1], a[3]);
1014             }
1015             else {
1016                result[2] = 0.0F;
1017             }
1018             result[3] = 1.0F;
1019             store_vector4(inst, machine, result);
1020             if (DEBUG_PROG) {
1021                printf("LIT (%g %g %g %g) : (%g %g %g %g)\n",
1022                       result[0], result[1], result[2], result[3],
1023                       a[0], a[1], a[2], a[3]);
1024             }
1025          }
1026          break;
1027       case OPCODE_LOG:
1028          {
1029             GLfloat t[4], q[4], abs_t0;
1030             fetch_vector1(&inst->SrcReg[0], machine, t);
1031             abs_t0 = FABSF(t[0]);
1032             if (abs_t0 != 0.0F) {
1033                /* Since we really can't handle infinite values on VMS
1034                 * like other OSes we'll use __MAXFLOAT to represent
1035                 * infinity.  This may need some tweaking.
1036                 */
1037 #ifdef VMS
1038                if (abs_t0 == __MAXFLOAT)
1039 #else
1040                if (IS_INF_OR_NAN(abs_t0))
1041 #endif
1042                {
1043                   SET_POS_INFINITY(q[0]);
1044                   q[1] = 1.0F;
1045                   SET_POS_INFINITY(q[2]);
1046                }
1047                else {
1048                   int exponent;
1049                   GLfloat mantissa = FREXPF(t[0], &exponent);
1050                   q[0] = (GLfloat) (exponent - 1);
1051                   q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
1052
1053                   /* The fast LOG2 macro doesn't meet the precision
1054                    * requirements.
1055                    */
1056                   q[2] = (float)(log(t[0]) * 1.442695F);
1057                }
1058             }
1059             else {
1060                SET_NEG_INFINITY(q[0]);
1061                q[1] = 1.0F;
1062                SET_NEG_INFINITY(q[2]);
1063             }
1064             q[3] = 1.0;
1065             store_vector4(inst, machine, q);
1066          }
1067          break;
1068       case OPCODE_LRP:
1069          {
1070             GLfloat a[4], b[4], c[4], result[4];
1071             fetch_vector4(&inst->SrcReg[0], machine, a);
1072             fetch_vector4(&inst->SrcReg[1], machine, b);
1073             fetch_vector4(&inst->SrcReg[2], machine, c);
1074             result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
1075             result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
1076             result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
1077             result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
1078             store_vector4(inst, machine, result);
1079             if (DEBUG_PROG) {
1080                printf("LRP (%g %g %g %g) = (%g %g %g %g), "
1081                       "(%g %g %g %g), (%g %g %g %g)\n",
1082                       result[0], result[1], result[2], result[3],
1083                       a[0], a[1], a[2], a[3],
1084                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
1085             }
1086          }
1087          break;
1088       case OPCODE_MAD:
1089          {
1090             GLfloat a[4], b[4], c[4], result[4];
1091             fetch_vector4(&inst->SrcReg[0], machine, a);
1092             fetch_vector4(&inst->SrcReg[1], machine, b);
1093             fetch_vector4(&inst->SrcReg[2], machine, c);
1094             result[0] = a[0] * b[0] + c[0];
1095             result[1] = a[1] * b[1] + c[1];
1096             result[2] = a[2] * b[2] + c[2];
1097             result[3] = a[3] * b[3] + c[3];
1098             store_vector4(inst, machine, result);
1099             if (DEBUG_PROG) {
1100                printf("MAD (%g %g %g %g) = (%g %g %g %g) * "
1101                       "(%g %g %g %g) + (%g %g %g %g)\n",
1102                       result[0], result[1], result[2], result[3],
1103                       a[0], a[1], a[2], a[3],
1104                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
1105             }
1106          }
1107          break;
1108       case OPCODE_MAX:
1109          {
1110             GLfloat a[4], b[4], result[4];
1111             fetch_vector4(&inst->SrcReg[0], machine, a);
1112             fetch_vector4(&inst->SrcReg[1], machine, b);
1113             result[0] = MAX2(a[0], b[0]);
1114             result[1] = MAX2(a[1], b[1]);
1115             result[2] = MAX2(a[2], b[2]);
1116             result[3] = MAX2(a[3], b[3]);
1117             store_vector4(inst, machine, result);
1118             if (DEBUG_PROG) {
1119                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
1120                       result[0], result[1], result[2], result[3],
1121                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1122             }
1123          }
1124          break;
1125       case OPCODE_MIN:
1126          {
1127             GLfloat a[4], b[4], result[4];
1128             fetch_vector4(&inst->SrcReg[0], machine, a);
1129             fetch_vector4(&inst->SrcReg[1], machine, b);
1130             result[0] = MIN2(a[0], b[0]);
1131             result[1] = MIN2(a[1], b[1]);
1132             result[2] = MIN2(a[2], b[2]);
1133             result[3] = MIN2(a[3], b[3]);
1134             store_vector4(inst, machine, result);
1135          }
1136          break;
1137       case OPCODE_MOV:
1138          {
1139             GLfloat result[4];
1140             fetch_vector4(&inst->SrcReg[0], machine, result);
1141             store_vector4(inst, machine, result);
1142             if (DEBUG_PROG) {
1143                printf("MOV (%g %g %g %g)\n",
1144                       result[0], result[1], result[2], result[3]);
1145             }
1146          }
1147          break;
1148       case OPCODE_MUL:
1149          {
1150             GLfloat a[4], b[4], result[4];
1151             fetch_vector4(&inst->SrcReg[0], machine, a);
1152             fetch_vector4(&inst->SrcReg[1], machine, b);
1153             result[0] = a[0] * b[0];
1154             result[1] = a[1] * b[1];
1155             result[2] = a[2] * b[2];
1156             result[3] = a[3] * b[3];
1157             store_vector4(inst, machine, result);
1158             if (DEBUG_PROG) {
1159                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
1160                       result[0], result[1], result[2], result[3],
1161                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1162             }
1163          }
1164          break;
1165       case OPCODE_NOISE1:
1166          {
1167             GLfloat a[4], result[4];
1168             fetch_vector1(&inst->SrcReg[0], machine, a);
1169             result[0] =
1170                result[1] =
1171                result[2] =
1172                result[3] = _mesa_noise1(a[0]);
1173             store_vector4(inst, machine, result);
1174          }
1175          break;
1176       case OPCODE_NOISE2:
1177          {
1178             GLfloat a[4], result[4];
1179             fetch_vector4(&inst->SrcReg[0], machine, a);
1180             result[0] =
1181                result[1] =
1182                result[2] = result[3] = _mesa_noise2(a[0], a[1]);
1183             store_vector4(inst, machine, result);
1184          }
1185          break;
1186       case OPCODE_NOISE3:
1187          {
1188             GLfloat a[4], result[4];
1189             fetch_vector4(&inst->SrcReg[0], machine, a);
1190             result[0] =
1191                result[1] =
1192                result[2] =
1193                result[3] = _mesa_noise3(a[0], a[1], a[2]);
1194             store_vector4(inst, machine, result);
1195          }
1196          break;
1197       case OPCODE_NOISE4:
1198          {
1199             GLfloat a[4], result[4];
1200             fetch_vector4(&inst->SrcReg[0], machine, a);
1201             result[0] =
1202                result[1] =
1203                result[2] =
1204                result[3] = _mesa_noise4(a[0], a[1], a[2], a[3]);
1205             store_vector4(inst, machine, result);
1206          }
1207          break;
1208       case OPCODE_NOP:
1209          break;
1210       case OPCODE_NOT:         /* bitwise NOT */
1211          {
1212             GLuint a[4], result[4];
1213             fetch_vector4ui(&inst->SrcReg[0], machine, a);
1214             result[0] = ~a[0];
1215             result[1] = ~a[1];
1216             result[2] = ~a[2];
1217             result[3] = ~a[3];
1218             store_vector4ui(inst, machine, result);
1219          }
1220          break;
1221       case OPCODE_NRM3:        /* 3-component normalization */
1222          {
1223             GLfloat a[4], result[4];
1224             GLfloat tmp;
1225             fetch_vector4(&inst->SrcReg[0], machine, a);
1226             tmp = a[0] * a[0] + a[1] * a[1] + a[2] * a[2];
1227             if (tmp != 0.0F)
1228                tmp = INV_SQRTF(tmp);
1229             result[0] = tmp * a[0];
1230             result[1] = tmp * a[1];
1231             result[2] = tmp * a[2];
1232             result[3] = 0.0;  /* undefined, but prevent valgrind warnings */
1233             store_vector4(inst, machine, result);
1234          }
1235          break;
1236       case OPCODE_NRM4:        /* 4-component normalization */
1237          {
1238             GLfloat a[4], result[4];
1239             GLfloat tmp;
1240             fetch_vector4(&inst->SrcReg[0], machine, a);
1241             tmp = a[0] * a[0] + a[1] * a[1] + a[2] * a[2] + a[3] * a[3];
1242             if (tmp != 0.0F)
1243                tmp = INV_SQRTF(tmp);
1244             result[0] = tmp * a[0];
1245             result[1] = tmp * a[1];
1246             result[2] = tmp * a[2];
1247             result[3] = tmp * a[3];
1248             store_vector4(inst, machine, result);
1249          }
1250          break;
1251       case OPCODE_OR:          /* bitwise OR */
1252          {
1253             GLuint a[4], b[4], result[4];
1254             fetch_vector4ui(&inst->SrcReg[0], machine, a);
1255             fetch_vector4ui(&inst->SrcReg[1], machine, b);
1256             result[0] = a[0] | b[0];
1257             result[1] = a[1] | b[1];
1258             result[2] = a[2] | b[2];
1259             result[3] = a[3] | b[3];
1260             store_vector4ui(inst, machine, result);
1261          }
1262          break;
1263       case OPCODE_PK2H:        /* pack two 16-bit floats in one 32-bit float */
1264          {
1265             GLfloat a[4];
1266             GLuint result[4];
1267             GLhalfNV hx, hy;
1268             fetch_vector4(&inst->SrcReg[0], machine, a);
1269             hx = _mesa_float_to_half(a[0]);
1270             hy = _mesa_float_to_half(a[1]);
1271             result[0] =
1272             result[1] =
1273             result[2] =
1274             result[3] = hx | (hy << 16);
1275             store_vector4ui(inst, machine, result);
1276          }
1277          break;
1278       case OPCODE_PK2US:       /* pack two GLushorts into one 32-bit float */
1279          {
1280             GLfloat a[4];
1281             GLuint result[4], usx, usy;
1282             fetch_vector4(&inst->SrcReg[0], machine, a);
1283             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1284             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1285             usx = F_TO_I(a[0] * 65535.0F);
1286             usy = F_TO_I(a[1] * 65535.0F);
1287             result[0] =
1288             result[1] =
1289             result[2] =
1290             result[3] = usx | (usy << 16);
1291             store_vector4ui(inst, machine, result);
1292          }
1293          break;
1294       case OPCODE_PK4B:        /* pack four GLbytes into one 32-bit float */
1295          {
1296             GLfloat a[4];
1297             GLuint result[4], ubx, uby, ubz, ubw;
1298             fetch_vector4(&inst->SrcReg[0], machine, a);
1299             a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
1300             a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
1301             a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
1302             a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
1303             ubx = F_TO_I(127.0F * a[0] + 128.0F);
1304             uby = F_TO_I(127.0F * a[1] + 128.0F);
1305             ubz = F_TO_I(127.0F * a[2] + 128.0F);
1306             ubw = F_TO_I(127.0F * a[3] + 128.0F);
1307             result[0] =
1308             result[1] =
1309             result[2] =
1310             result[3] = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1311             store_vector4ui(inst, machine, result);
1312          }
1313          break;
1314       case OPCODE_PK4UB:       /* pack four GLubytes into one 32-bit float */
1315          {
1316             GLfloat a[4];
1317             GLuint result[4], ubx, uby, ubz, ubw;
1318             fetch_vector4(&inst->SrcReg[0], machine, a);
1319             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1320             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1321             a[2] = CLAMP(a[2], 0.0F, 1.0F);
1322             a[3] = CLAMP(a[3], 0.0F, 1.0F);
1323             ubx = F_TO_I(255.0F * a[0]);
1324             uby = F_TO_I(255.0F * a[1]);
1325             ubz = F_TO_I(255.0F * a[2]);
1326             ubw = F_TO_I(255.0F * a[3]);
1327             result[0] =
1328             result[1] =
1329             result[2] =
1330             result[3] = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1331             store_vector4ui(inst, machine, result);
1332          }
1333          break;
1334       case OPCODE_POW:
1335          {
1336             GLfloat a[4], b[4], result[4];
1337             fetch_vector1(&inst->SrcReg[0], machine, a);
1338             fetch_vector1(&inst->SrcReg[1], machine, b);
1339             result[0] = result[1] = result[2] = result[3]
1340                = (GLfloat) pow(a[0], b[0]);
1341             store_vector4(inst, machine, result);
1342          }
1343          break;
1344
1345       case OPCODE_RCP:
1346          {
1347             GLfloat a[4], result[4];
1348             fetch_vector1(&inst->SrcReg[0], machine, a);
1349             if (DEBUG_PROG) {
1350                if (a[0] == 0)
1351                   printf("RCP(0)\n");
1352                else if (IS_INF_OR_NAN(a[0]))
1353                   printf("RCP(inf)\n");
1354             }
1355             result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
1356             store_vector4(inst, machine, result);
1357          }
1358          break;
1359       case OPCODE_RET:         /* return from subroutine (conditional) */
1360          if (eval_condition(machine, inst)) {
1361             if (machine->StackDepth == 0) {
1362                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
1363             }
1364             /* subtract one because of pc++ in the for loop */
1365             pc = machine->CallStack[--machine->StackDepth] - 1;
1366          }
1367          break;
1368       case OPCODE_RFL:         /* reflection vector */
1369          {
1370             GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
1371             fetch_vector4(&inst->SrcReg[0], machine, axis);
1372             fetch_vector4(&inst->SrcReg[1], machine, dir);
1373             tmpW = DOT3(axis, axis);
1374             tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
1375             result[0] = tmpX * axis[0] - dir[0];
1376             result[1] = tmpX * axis[1] - dir[1];
1377             result[2] = tmpX * axis[2] - dir[2];
1378             /* result[3] is never written! XXX enforce in parser! */
1379             store_vector4(inst, machine, result);
1380          }
1381          break;
1382       case OPCODE_RSQ:         /* 1 / sqrt() */
1383          {
1384             GLfloat a[4], result[4];
1385             fetch_vector1(&inst->SrcReg[0], machine, a);
1386             a[0] = FABSF(a[0]);
1387             result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1388             store_vector4(inst, machine, result);
1389             if (DEBUG_PROG) {
1390                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1391             }
1392          }
1393          break;
1394       case OPCODE_SCS:         /* sine and cos */
1395          {
1396             GLfloat a[4], result[4];
1397             fetch_vector1(&inst->SrcReg[0], machine, a);
1398             result[0] = (GLfloat) cos(a[0]);
1399             result[1] = (GLfloat) sin(a[0]);
1400             result[2] = 0.0;    /* undefined! */
1401             result[3] = 0.0;    /* undefined! */
1402             store_vector4(inst, machine, result);
1403          }
1404          break;
1405       case OPCODE_SEQ:         /* set on equal */
1406          {
1407             GLfloat a[4], b[4], result[4];
1408             fetch_vector4(&inst->SrcReg[0], machine, a);
1409             fetch_vector4(&inst->SrcReg[1], machine, b);
1410             result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1411             result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1412             result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1413             result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1414             store_vector4(inst, machine, result);
1415             if (DEBUG_PROG) {
1416                printf("SEQ (%g %g %g %g) = (%g %g %g %g) == (%g %g %g %g)\n",
1417                       result[0], result[1], result[2], result[3],
1418                       a[0], a[1], a[2], a[3],
1419                       b[0], b[1], b[2], b[3]);
1420             }
1421          }
1422          break;
1423       case OPCODE_SFL:         /* set false, operands ignored */
1424          {
1425             static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1426             store_vector4(inst, machine, result);
1427          }
1428          break;
1429       case OPCODE_SGE:         /* set on greater or equal */
1430          {
1431             GLfloat a[4], b[4], result[4];
1432             fetch_vector4(&inst->SrcReg[0], machine, a);
1433             fetch_vector4(&inst->SrcReg[1], machine, b);
1434             result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1435             result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1436             result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1437             result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1438             store_vector4(inst, machine, result);
1439             if (DEBUG_PROG) {
1440                printf("SGE (%g %g %g %g) = (%g %g %g %g) >= (%g %g %g %g)\n",
1441                       result[0], result[1], result[2], result[3],
1442                       a[0], a[1], a[2], a[3],
1443                       b[0], b[1], b[2], b[3]);
1444             }
1445          }
1446          break;
1447       case OPCODE_SGT:         /* set on greater */
1448          {
1449             GLfloat a[4], b[4], result[4];
1450             fetch_vector4(&inst->SrcReg[0], machine, a);
1451             fetch_vector4(&inst->SrcReg[1], machine, b);
1452             result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1453             result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1454             result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1455             result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1456             store_vector4(inst, machine, result);
1457             if (DEBUG_PROG) {
1458                printf("SGT (%g %g %g %g) = (%g %g %g %g) > (%g %g %g %g)\n",
1459                       result[0], result[1], result[2], result[3],
1460                       a[0], a[1], a[2], a[3],
1461                       b[0], b[1], b[2], b[3]);
1462             }
1463          }
1464          break;
1465       case OPCODE_SIN:
1466          {
1467             GLfloat a[4], result[4];
1468             fetch_vector1(&inst->SrcReg[0], machine, a);
1469             result[0] = result[1] = result[2] = result[3]
1470                = (GLfloat) sin(a[0]);
1471             store_vector4(inst, machine, result);
1472          }
1473          break;
1474       case OPCODE_SLE:         /* set on less or equal */
1475          {
1476             GLfloat a[4], b[4], result[4];
1477             fetch_vector4(&inst->SrcReg[0], machine, a);
1478             fetch_vector4(&inst->SrcReg[1], machine, b);
1479             result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1480             result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1481             result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1482             result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1483             store_vector4(inst, machine, result);
1484             if (DEBUG_PROG) {
1485                printf("SLE (%g %g %g %g) = (%g %g %g %g) <= (%g %g %g %g)\n",
1486                       result[0], result[1], result[2], result[3],
1487                       a[0], a[1], a[2], a[3],
1488                       b[0], b[1], b[2], b[3]);
1489             }
1490          }
1491          break;
1492       case OPCODE_SLT:         /* set on less */
1493          {
1494             GLfloat a[4], b[4], result[4];
1495             fetch_vector4(&inst->SrcReg[0], machine, a);
1496             fetch_vector4(&inst->SrcReg[1], machine, b);
1497             result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1498             result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1499             result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1500             result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1501             store_vector4(inst, machine, result);
1502             if (DEBUG_PROG) {
1503                printf("SLT (%g %g %g %g) = (%g %g %g %g) < (%g %g %g %g)\n",
1504                       result[0], result[1], result[2], result[3],
1505                       a[0], a[1], a[2], a[3],
1506                       b[0], b[1], b[2], b[3]);
1507             }
1508          }
1509          break;
1510       case OPCODE_SNE:         /* set on not equal */
1511          {
1512             GLfloat a[4], b[4], result[4];
1513             fetch_vector4(&inst->SrcReg[0], machine, a);
1514             fetch_vector4(&inst->SrcReg[1], machine, b);
1515             result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1516             result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1517             result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1518             result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1519             store_vector4(inst, machine, result);
1520             if (DEBUG_PROG) {
1521                printf("SNE (%g %g %g %g) = (%g %g %g %g) != (%g %g %g %g)\n",
1522                       result[0], result[1], result[2], result[3],
1523                       a[0], a[1], a[2], a[3],
1524                       b[0], b[1], b[2], b[3]);
1525             }
1526          }
1527          break;
1528       case OPCODE_SSG:         /* set sign (-1, 0 or +1) */
1529          {
1530             GLfloat a[4], result[4];
1531             fetch_vector4(&inst->SrcReg[0], machine, a);
1532             result[0] = (GLfloat) ((a[0] > 0.0F) - (a[0] < 0.0F));
1533             result[1] = (GLfloat) ((a[1] > 0.0F) - (a[1] < 0.0F));
1534             result[2] = (GLfloat) ((a[2] > 0.0F) - (a[2] < 0.0F));
1535             result[3] = (GLfloat) ((a[3] > 0.0F) - (a[3] < 0.0F));
1536             store_vector4(inst, machine, result);
1537          }
1538          break;
1539       case OPCODE_STR:         /* set true, operands ignored */
1540          {
1541             static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1542             store_vector4(inst, machine, result);
1543          }
1544          break;
1545       case OPCODE_SUB:
1546          {
1547             GLfloat a[4], b[4], result[4];
1548             fetch_vector4(&inst->SrcReg[0], machine, a);
1549             fetch_vector4(&inst->SrcReg[1], machine, b);
1550             result[0] = a[0] - b[0];
1551             result[1] = a[1] - b[1];
1552             result[2] = a[2] - b[2];
1553             result[3] = a[3] - b[3];
1554             store_vector4(inst, machine, result);
1555             if (DEBUG_PROG) {
1556                printf("SUB (%g %g %g %g) = (%g %g %g %g) - (%g %g %g %g)\n",
1557                       result[0], result[1], result[2], result[3],
1558                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1559             }
1560          }
1561          break;
1562       case OPCODE_SWZ:         /* extended swizzle */
1563          {
1564             const struct prog_src_register *source = &inst->SrcReg[0];
1565             const GLfloat *src = get_src_register_pointer(source, machine);
1566             GLfloat result[4];
1567             GLuint i;
1568             for (i = 0; i < 4; i++) {
1569                const GLuint swz = GET_SWZ(source->Swizzle, i);
1570                if (swz == SWIZZLE_ZERO)
1571                   result[i] = 0.0;
1572                else if (swz == SWIZZLE_ONE)
1573                   result[i] = 1.0;
1574                else {
1575                   ASSERT(swz >= 0);
1576                   ASSERT(swz <= 3);
1577                   result[i] = src[swz];
1578                }
1579                if (source->Negate & (1 << i))
1580                   result[i] = -result[i];
1581             }
1582             store_vector4(inst, machine, result);
1583          }
1584          break;
1585       case OPCODE_TEX:         /* Both ARB and NV frag prog */
1586          /* Simple texel lookup */
1587          {
1588             GLfloat texcoord[4], color[4];
1589             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1590
1591             /* For TEX, texcoord.Q should not be used and its value should not
1592              * matter (at most, we pass coord.xyz to texture3D() in GLSL).
1593              * Set Q=1 so that FetchTexelDeriv() doesn't get a garbage value
1594              * which is effectively what happens when the texcoord swizzle
1595              * is .xyzz
1596              */
1597             texcoord[3] = 1.0f;
1598
1599             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1600
1601             if (DEBUG_PROG) {
1602                printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g]\n",
1603                       color[0], color[1], color[2], color[3],
1604                       inst->TexSrcUnit,
1605                       texcoord[0], texcoord[1], texcoord[2], texcoord[3]);
1606             }
1607             store_vector4(inst, machine, color);
1608          }
1609          break;
1610       case OPCODE_TXB:         /* GL_ARB_fragment_program only */
1611          /* Texel lookup with LOD bias */
1612          {
1613             GLfloat texcoord[4], color[4], lodBias;
1614
1615             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1616
1617             /* texcoord[3] is the bias to add to lambda */
1618             lodBias = texcoord[3];
1619
1620             fetch_texel(ctx, machine, inst, texcoord, lodBias, color);
1621
1622             if (DEBUG_PROG) {
1623                printf("TXB (%g, %g, %g, %g) = texture[%d][%g %g %g %g]"
1624                       "  bias %g\n",
1625                       color[0], color[1], color[2], color[3],
1626                       inst->TexSrcUnit,
1627                       texcoord[0],
1628                       texcoord[1],
1629                       texcoord[2],
1630                       texcoord[3],
1631                       lodBias);
1632             }
1633
1634             store_vector4(inst, machine, color);
1635          }
1636          break;
1637       case OPCODE_TXD:         /* GL_NV_fragment_program only */
1638          /* Texture lookup w/ partial derivatives for LOD */
1639          {
1640             GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1641             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1642             fetch_vector4(&inst->SrcReg[1], machine, dtdx);
1643             fetch_vector4(&inst->SrcReg[2], machine, dtdy);
1644             machine->FetchTexelDeriv(ctx, texcoord, dtdx, dtdy,
1645                                      0.0, /* lodBias */
1646                                      inst->TexSrcUnit, color);
1647             store_vector4(inst, machine, color);
1648          }
1649          break;
1650       case OPCODE_TXL:
1651          /* Texel lookup with explicit LOD */
1652          {
1653             GLfloat texcoord[4], color[4], lod;
1654
1655             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1656
1657             /* texcoord[3] is the LOD */
1658             lod = texcoord[3];
1659
1660             machine->FetchTexelLod(ctx, texcoord, lod,
1661                                    machine->Samplers[inst->TexSrcUnit], color);
1662
1663             store_vector4(inst, machine, color);
1664          }
1665          break;
1666       case OPCODE_TXP:         /* GL_ARB_fragment_program only */
1667          /* Texture lookup w/ projective divide */
1668          {
1669             GLfloat texcoord[4], color[4];
1670
1671             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1672             /* Not so sure about this test - if texcoord[3] is
1673              * zero, we'd probably be fine except for an ASSERT in
1674              * IROUND_POS() which gets triggered by the inf values created.
1675              */
1676             if (texcoord[3] != 0.0) {
1677                texcoord[0] /= texcoord[3];
1678                texcoord[1] /= texcoord[3];
1679                texcoord[2] /= texcoord[3];
1680             }
1681
1682             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1683
1684             store_vector4(inst, machine, color);
1685          }
1686          break;
1687       case OPCODE_TXP_NV:      /* GL_NV_fragment_program only */
1688          /* Texture lookup w/ projective divide, as above, but do not
1689           * do the divide by w if sampling from a cube map.
1690           */
1691          {
1692             GLfloat texcoord[4], color[4];
1693
1694             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1695             if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1696                 texcoord[3] != 0.0) {
1697                texcoord[0] /= texcoord[3];
1698                texcoord[1] /= texcoord[3];
1699                texcoord[2] /= texcoord[3];
1700             }
1701
1702             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1703
1704             store_vector4(inst, machine, color);
1705          }
1706          break;
1707       case OPCODE_TRUNC:       /* truncate toward zero */
1708          {
1709             GLfloat a[4], result[4];
1710             fetch_vector4(&inst->SrcReg[0], machine, a);
1711             result[0] = (GLfloat) (GLint) a[0];
1712             result[1] = (GLfloat) (GLint) a[1];
1713             result[2] = (GLfloat) (GLint) a[2];
1714             result[3] = (GLfloat) (GLint) a[3];
1715             store_vector4(inst, machine, result);
1716          }
1717          break;
1718       case OPCODE_UP2H:        /* unpack two 16-bit floats */
1719          {
1720             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1721             GLfloat result[4];
1722             GLushort hx, hy;
1723             hx = raw & 0xffff;
1724             hy = raw >> 16;
1725             result[0] = result[2] = _mesa_half_to_float(hx);
1726             result[1] = result[3] = _mesa_half_to_float(hy);
1727             store_vector4(inst, machine, result);
1728          }
1729          break;
1730       case OPCODE_UP2US:       /* unpack two GLushorts */
1731          {
1732             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1733             GLfloat result[4];
1734             GLushort usx, usy;
1735             usx = raw & 0xffff;
1736             usy = raw >> 16;
1737             result[0] = result[2] = usx * (1.0f / 65535.0f);
1738             result[1] = result[3] = usy * (1.0f / 65535.0f);
1739             store_vector4(inst, machine, result);
1740          }
1741          break;
1742       case OPCODE_UP4B:        /* unpack four GLbytes */
1743          {
1744             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1745             GLfloat result[4];
1746             result[0] = (((raw >> 0) & 0xff) - 128) / 127.0F;
1747             result[1] = (((raw >> 8) & 0xff) - 128) / 127.0F;
1748             result[2] = (((raw >> 16) & 0xff) - 128) / 127.0F;
1749             result[3] = (((raw >> 24) & 0xff) - 128) / 127.0F;
1750             store_vector4(inst, machine, result);
1751          }
1752          break;
1753       case OPCODE_UP4UB:       /* unpack four GLubytes */
1754          {
1755             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1756             GLfloat result[4];
1757             result[0] = ((raw >> 0) & 0xff) / 255.0F;
1758             result[1] = ((raw >> 8) & 0xff) / 255.0F;
1759             result[2] = ((raw >> 16) & 0xff) / 255.0F;
1760             result[3] = ((raw >> 24) & 0xff) / 255.0F;
1761             store_vector4(inst, machine, result);
1762          }
1763          break;
1764       case OPCODE_XOR:         /* bitwise XOR */
1765          {
1766             GLuint a[4], b[4], result[4];
1767             fetch_vector4ui(&inst->SrcReg[0], machine, a);
1768             fetch_vector4ui(&inst->SrcReg[1], machine, b);
1769             result[0] = a[0] ^ b[0];
1770             result[1] = a[1] ^ b[1];
1771             result[2] = a[2] ^ b[2];
1772             result[3] = a[3] ^ b[3];
1773             store_vector4ui(inst, machine, result);
1774          }
1775          break;
1776       case OPCODE_XPD:         /* cross product */
1777          {
1778             GLfloat a[4], b[4], result[4];
1779             fetch_vector4(&inst->SrcReg[0], machine, a);
1780             fetch_vector4(&inst->SrcReg[1], machine, b);
1781             result[0] = a[1] * b[2] - a[2] * b[1];
1782             result[1] = a[2] * b[0] - a[0] * b[2];
1783             result[2] = a[0] * b[1] - a[1] * b[0];
1784             result[3] = 1.0;
1785             store_vector4(inst, machine, result);
1786             if (DEBUG_PROG) {
1787                printf("XPD (%g %g %g %g) = (%g %g %g) X (%g %g %g)\n",
1788                       result[0], result[1], result[2], result[3],
1789                       a[0], a[1], a[2], b[0], b[1], b[2]);
1790             }
1791          }
1792          break;
1793       case OPCODE_X2D:         /* 2-D matrix transform */
1794          {
1795             GLfloat a[4], b[4], c[4], result[4];
1796             fetch_vector4(&inst->SrcReg[0], machine, a);
1797             fetch_vector4(&inst->SrcReg[1], machine, b);
1798             fetch_vector4(&inst->SrcReg[2], machine, c);
1799             result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1800             result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1801             result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1802             result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1803             store_vector4(inst, machine, result);
1804          }
1805          break;
1806       case OPCODE_PRINT:
1807          {
1808             if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
1809                GLfloat a[4];
1810                fetch_vector4(&inst->SrcReg[0], machine, a);
1811                printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
1812                             a[0], a[1], a[2], a[3]);
1813             }
1814             else {
1815                printf("%s\n", (const char *) inst->Data);
1816             }
1817          }
1818          break;
1819       case OPCODE_END:
1820          return GL_TRUE;
1821       default:
1822          _mesa_problem(ctx, "Bad opcode %d in _mesa_execute_program",
1823                        inst->Opcode);
1824          return GL_TRUE;        /* return value doesn't matter */
1825       }
1826
1827       numExec++;
1828       if (numExec > maxExec) {
1829          static GLboolean reported = GL_FALSE;
1830          if (!reported) {
1831             _mesa_problem(ctx, "Infinite loop detected in fragment program");
1832             reported = GL_TRUE;
1833          }
1834          return GL_TRUE;
1835       }
1836
1837    } /* for pc */
1838
1839    return GL_TRUE;
1840 }