src/mesa/program/prog_execute.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  7.3
   4  *
   5  * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file prog_execute.c
  27  * Software interpreter for vertex/fragment programs.
  28  * \author Brian Paul
  29  */
  30
  31 /*
  32  * NOTE: we do everything in single-precision floating point; we don't
  33  * currently observe the single/half/fixed-precision qualifiers.
  34  *
  35  */
  36
  37
  38 #include "main/glheader.h"
  39 #include "main/colormac.h"
  40 #include "main/macros.h"
  41 #include "prog_execute.h"
  42 #include "prog_instruction.h"
  43 #include "prog_parameter.h"
  44 #include "prog_print.h"
  45 #include "prog_noise.h"
  46
  47
  48 /* debug predicate */
  49 #define DEBUG_PROG 0
  50
  51
  52 /**
  53  * Set x to positive or negative infinity.
  54  */
  55 #if defined(USE_IEEE) || defined(_WIN32)
  56 #define SET_POS_INFINITY(x)                  \
  57    do {                                      \
  58          fi_type fi;                         \
  59          fi.i = 0x7F800000;                  \
  60          x = fi.f;                           \
  61    } while (0)
  62 #define SET_NEG_INFINITY(x)                  \
  63    do {                                      \
  64          fi_type fi;                         \
  65          fi.i = 0xFF800000;                  \
  66          x = fi.f;                           \
  67    } while (0)
  68 #else
  69 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
  70 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
  71 #endif
  72
  73 #define SET_FLOAT_BITS(x, bits) ((fi_type *) (void *) &(x))->i = bits
  74
  75
  76 static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
  77
  78
  79 /**
  80  * Return a pointer to the 4-element float vector specified by the given
  81  * source register.
  82  */
  83 static inline const GLfloat *
  84 get_src_register_pointer(const struct prog_src_register *source,
  85                          const struct gl_program_machine *machine)
  86 {
  87    const struct gl_program *prog = machine->CurProgram;
  88    GLint reg = source->Index;
  89
  90    if (source->RelAddr) {
  91       /* add address register value to src index/offset */
  92       reg += machine->AddressReg[0][0];
  93       if (reg < 0) {
  94          return ZeroVec;
  95       }
  96    }
  97
  98    switch (source->File) {
  99    case PROGRAM_TEMPORARY:
 100       if (reg >= MAX_PROGRAM_TEMPS)
 101          return ZeroVec;
 102       return machine->Temporaries[reg];
 103
 104    case PROGRAM_INPUT:
 105       if (prog->Target == GL_VERTEX_PROGRAM_ARB) {
 106          if (reg >= VERT_ATTRIB_MAX)
 107             return ZeroVec;
 108          return machine->VertAttribs[reg];
 109       }
 110       else {
 111          if (reg >= VARYING_SLOT_MAX)
 112             return ZeroVec;
 113          return machine->Attribs[reg][machine->CurElement];
 114       }
 115
 116    case PROGRAM_OUTPUT:
 117       if (reg >= MAX_PROGRAM_OUTPUTS)
 118          return ZeroVec;
 119       return machine->Outputs[reg];
 120
 121    case PROGRAM_LOCAL_PARAM:
 122       if (reg >= MAX_PROGRAM_LOCAL_PARAMS)
 123          return ZeroVec;
 124       return machine->CurProgram->LocalParams[reg];
 125
 126    case PROGRAM_ENV_PARAM:
 127       if (reg >= MAX_PROGRAM_ENV_PARAMS)
 128          return ZeroVec;
 129       return machine->EnvParams[reg];
 130
 131    case PROGRAM_STATE_VAR:
 132       /* Fallthrough */
 133    case PROGRAM_CONSTANT:
 134       /* Fallthrough */
 135    case PROGRAM_UNIFORM:
 136       if (reg >= (GLint) prog->Parameters->NumParameters)
 137          return ZeroVec;
 138       return (GLfloat *) prog->Parameters->ParameterValues[reg];
 139
 140    case PROGRAM_SYSTEM_VALUE:
 141       assert(reg < Elements(machine->SystemValues));
 142       return machine->SystemValues[reg];
 143
 144    default:
 145       _mesa_problem(NULL,
 146          "Invalid src register file %d in get_src_register_pointer()",
 147          source->File);
 148       return NULL;
 149    }
 150 }
 151
 152
 153 /**
 154  * Return a pointer to the 4-element float vector specified by the given
 155  * destination register.
 156  */
 157 static inline GLfloat *
 158 get_dst_register_pointer(const struct prog_dst_register *dest,
 159                          struct gl_program_machine *machine)
 160 {
 161    static GLfloat dummyReg[4];
 162    GLint reg = dest->Index;
 163
 164    if (dest->RelAddr) {
 165       /* add address register value to src index/offset */
 166       reg += machine->AddressReg[0][0];
 167       if (reg < 0) {
 168          return dummyReg;
 169       }
 170    }
 171
 172    switch (dest->File) {
 173    case PROGRAM_TEMPORARY:
 174       if (reg >= MAX_PROGRAM_TEMPS)
 175          return dummyReg;
 176       return machine->Temporaries[reg];
 177
 178    case PROGRAM_OUTPUT:
 179       if (reg >= MAX_PROGRAM_OUTPUTS)
 180          return dummyReg;
 181       return machine->Outputs[reg];
 182
 183    default:
 184       _mesa_problem(NULL,
 185          "Invalid dest register file %d in get_dst_register_pointer()",
 186          dest->File);
 187       return NULL;
 188    }
 189 }
 190
 191
 192
 193 /**
 194  * Fetch a 4-element float vector from the given source register.
 195  * Apply swizzling and negating as needed.
 196  */
 197 static void
 198 fetch_vector4(const struct prog_src_register *source,
 199               const struct gl_program_machine *machine, GLfloat result[4])
 200 {
 201    const GLfloat *src = get_src_register_pointer(source, machine);
 202    ASSERT(src);
 203
 204    if (source->Swizzle == SWIZZLE_NOOP) {
 205       /* no swizzling */
 206       COPY_4V(result, src);
 207    }
 208    else {
 209       ASSERT(GET_SWZ(source->Swizzle, 0) <= 3);
 210       ASSERT(GET_SWZ(source->Swizzle, 1) <= 3);
 211       ASSERT(GET_SWZ(source->Swizzle, 2) <= 3);
 212       ASSERT(GET_SWZ(source->Swizzle, 3) <= 3);
 213       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 214       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 215       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 216       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 217    }
 218
 219    if (source->Abs) {
 220       result[0] = FABSF(result[0]);
 221       result[1] = FABSF(result[1]);
 222       result[2] = FABSF(result[2]);
 223       result[3] = FABSF(result[3]);
 224    }
 225    if (source->Negate) {
 226       ASSERT(source->Negate == NEGATE_XYZW);
 227       result[0] = -result[0];
 228       result[1] = -result[1];
 229       result[2] = -result[2];
 230       result[3] = -result[3];
 231    }
 232
 233 #ifdef NAN_CHECK
 234    assert(!IS_INF_OR_NAN(result[0]));
 235    assert(!IS_INF_OR_NAN(result[0]));
 236    assert(!IS_INF_OR_NAN(result[0]));
 237    assert(!IS_INF_OR_NAN(result[0]));
 238 #endif
 239 }
 240
 241
 242 /**
 243  * Fetch a 4-element uint vector from the given source register.
 244  * Apply swizzling but not negation/abs.
 245  */
 246 static void
 247 fetch_vector4ui(const struct prog_src_register *source,
 248                 const struct gl_program_machine *machine, GLuint result[4])
 249 {
 250    const GLuint *src = (GLuint *) get_src_register_pointer(source, machine);
 251    ASSERT(src);
 252
 253    if (source->Swizzle == SWIZZLE_NOOP) {
 254       /* no swizzling */
 255       COPY_4V(result, src);
 256    }
 257    else {
 258       ASSERT(GET_SWZ(source->Swizzle, 0) <= 3);
 259       ASSERT(GET_SWZ(source->Swizzle, 1) <= 3);
 260       ASSERT(GET_SWZ(source->Swizzle, 2) <= 3);
 261       ASSERT(GET_SWZ(source->Swizzle, 3) <= 3);
 262       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 263       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 264       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 265       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 266    }
 267
 268    /* Note: no Negate or Abs here */
 269 }
 270
 271
 272
 273 /**
 274  * Fetch the derivative with respect to X or Y for the given register.
 275  * XXX this currently only works for fragment program input attribs.
 276  */
 277 static void
 278 fetch_vector4_deriv(struct gl_context * ctx,
 279                     const struct prog_src_register *source,
 280                     const struct gl_program_machine *machine,
 281                     char xOrY, GLfloat result[4])
 282 {
 283    if (source->File == PROGRAM_INPUT &&
 284        source->Index < (GLint) machine->NumDeriv) {
 285       const GLint col = machine->CurElement;
 286       const GLfloat w = machine->Attribs[VARYING_SLOT_POS][col][3];
 287       const GLfloat invQ = 1.0f / w;
 288       GLfloat deriv[4];
 289
 290       if (xOrY == 'X') {
 291          deriv[0] = machine->DerivX[source->Index][0] * invQ;
 292          deriv[1] = machine->DerivX[source->Index][1] * invQ;
 293          deriv[2] = machine->DerivX[source->Index][2] * invQ;
 294          deriv[3] = machine->DerivX[source->Index][3] * invQ;
 295       }
 296       else {
 297          deriv[0] = machine->DerivY[source->Index][0] * invQ;
 298          deriv[1] = machine->DerivY[source->Index][1] * invQ;
 299          deriv[2] = machine->DerivY[source->Index][2] * invQ;
 300          deriv[3] = machine->DerivY[source->Index][3] * invQ;
 301       }
 302
 303       result[0] = deriv[GET_SWZ(source->Swizzle, 0)];
 304       result[1] = deriv[GET_SWZ(source->Swizzle, 1)];
 305       result[2] = deriv[GET_SWZ(source->Swizzle, 2)];
 306       result[3] = deriv[GET_SWZ(source->Swizzle, 3)];
 307
 308       if (source->Abs) {
 309          result[0] = FABSF(result[0]);
 310          result[1] = FABSF(result[1]);
 311          result[2] = FABSF(result[2]);
 312          result[3] = FABSF(result[3]);
 313       }
 314       if (source->Negate) {
 315          ASSERT(source->Negate == NEGATE_XYZW);
 316          result[0] = -result[0];
 317          result[1] = -result[1];
 318          result[2] = -result[2];
 319          result[3] = -result[3];
 320       }
 321    }
 322    else {
 323       ASSIGN_4V(result, 0.0, 0.0, 0.0, 0.0);
 324    }
 325 }
 326
 327
 328 /**
 329  * As above, but only return result[0] element.
 330  */
 331 static void
 332 fetch_vector1(const struct prog_src_register *source,
 333               const struct gl_program_machine *machine, GLfloat result[4])
 334 {
 335    const GLfloat *src = get_src_register_pointer(source, machine);
 336    ASSERT(src);
 337
 338    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 339
 340    if (source->Abs) {
 341       result[0] = FABSF(result[0]);
 342    }
 343    if (source->Negate) {
 344       result[0] = -result[0];
 345    }
 346 }
 347
 348
 349 static GLuint
 350 fetch_vector1ui(const struct prog_src_register *source,
 351                 const struct gl_program_machine *machine)
 352 {
 353    const GLuint *src = (GLuint *) get_src_register_pointer(source, machine);
 354    return src[GET_SWZ(source->Swizzle, 0)];
 355 }
 356
 357
 358 /**
 359  * Fetch texel from texture.  Use partial derivatives when possible.
 360  */
 361 static inline void
 362 fetch_texel(struct gl_context *ctx,
 363             const struct gl_program_machine *machine,
 364             const struct prog_instruction *inst,
 365             const GLfloat texcoord[4], GLfloat lodBias,
 366             GLfloat color[4])
 367 {
 368    const GLuint unit = machine->Samplers[inst->TexSrcUnit];
 369
 370    /* Note: we only have the right derivatives for fragment input attribs.
 371     */
 372    if (machine->NumDeriv > 0 &&
 373        inst->SrcReg[0].File == PROGRAM_INPUT &&
 374        inst->SrcReg[0].Index == VARYING_SLOT_TEX0 + inst->TexSrcUnit) {
 375       /* simple texture fetch for which we should have derivatives */
 376       GLuint attr = inst->SrcReg[0].Index;
 377       machine->FetchTexelDeriv(ctx, texcoord,
 378                                machine->DerivX[attr],
 379                                machine->DerivY[attr],
 380                                lodBias, unit, color);
 381    }
 382    else {
 383       machine->FetchTexelLod(ctx, texcoord, lodBias, unit, color);
 384    }
 385 }
 386
 387
 388 /**
 389  * Test value against zero and return GT, LT, EQ or UN if NaN.
 390  */
 391 static inline GLuint
 392 generate_cc(float value)
 393 {
 394    if (value != value)
 395       return COND_UN;           /* NaN */
 396    if (value > 0.0F)
 397       return COND_GT;
 398    if (value < 0.0F)
 399       return COND_LT;
 400    return COND_EQ;
 401 }
 402
 403
 404 /**
 405  * Test if the ccMaskRule is satisfied by the given condition code.
 406  * Used to mask destination writes according to the current condition code.
 407  */
 408 static inline GLboolean
 409 test_cc(GLuint condCode, GLuint ccMaskRule)
 410 {
 411    switch (ccMaskRule) {
 412    case COND_EQ: return (condCode == COND_EQ);
 413    case COND_NE: return (condCode != COND_EQ);
 414    case COND_LT: return (condCode == COND_LT);
 415    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 416    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 417    case COND_GT: return (condCode == COND_GT);
 418    case COND_TR: return GL_TRUE;
 419    case COND_FL: return GL_FALSE;
 420    default:      return GL_TRUE;
 421    }
 422 }
 423
 424
 425 /**
 426  * Evaluate the 4 condition codes against a predicate and return GL_TRUE
 427  * or GL_FALSE to indicate result.
 428  */
 429 static inline GLboolean
 430 eval_condition(const struct gl_program_machine *machine,
 431                const struct prog_instruction *inst)
 432 {
 433    const GLuint swizzle = inst->DstReg.CondSwizzle;
 434    const GLuint condMask = inst->DstReg.CondMask;
 435    if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 436        test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 437        test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 438        test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 439       return GL_TRUE;
 440    }
 441    else {
 442       return GL_FALSE;
 443    }
 444 }
 445
 446
 447
 448 /**
 449  * Store 4 floats into a register.  Observe the instructions saturate and
 450  * set-condition-code flags.
 451  */
 452 static void
 453 store_vector4(const struct prog_instruction *inst,
 454               struct gl_program_machine *machine, const GLfloat value[4])
 455 {
 456    const struct prog_dst_register *dstReg = &(inst->DstReg);
 457    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 458    GLuint writeMask = dstReg->WriteMask;
 459    GLfloat clampedValue[4];
 460    GLfloat *dst = get_dst_register_pointer(dstReg, machine);
 461
 462 #if 0
 463    if (value[0] > 1.0e10 ||
 464        IS_INF_OR_NAN(value[0]) ||
 465        IS_INF_OR_NAN(value[1]) ||
 466        IS_INF_OR_NAN(value[2]) || IS_INF_OR_NAN(value[3]))
 467       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 468 #endif
 469
 470    if (clamp) {
 471       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 472       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 473       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 474       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 475       value = clampedValue;
 476    }
 477
 478    if (dstReg->CondMask != COND_TR) {
 479       /* condition codes may turn off some writes */
 480       if (writeMask & WRITEMASK_X) {
 481          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 0)],
 482                       dstReg->CondMask))
 483             writeMask &= ~WRITEMASK_X;
 484       }
 485       if (writeMask & WRITEMASK_Y) {
 486          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 1)],
 487                       dstReg->CondMask))
 488             writeMask &= ~WRITEMASK_Y;
 489       }
 490       if (writeMask & WRITEMASK_Z) {
 491          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 2)],
 492                       dstReg->CondMask))
 493             writeMask &= ~WRITEMASK_Z;
 494       }
 495       if (writeMask & WRITEMASK_W) {
 496          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 3)],
 497                       dstReg->CondMask))
 498             writeMask &= ~WRITEMASK_W;
 499       }
 500    }
 501
 502 #ifdef NAN_CHECK
 503    assert(!IS_INF_OR_NAN(value[0]));
 504    assert(!IS_INF_OR_NAN(value[0]));
 505    assert(!IS_INF_OR_NAN(value[0]));
 506    assert(!IS_INF_OR_NAN(value[0]));
 507 #endif
 508
 509    if (writeMask & WRITEMASK_X)
 510       dst[0] = value[0];
 511    if (writeMask & WRITEMASK_Y)
 512       dst[1] = value[1];
 513    if (writeMask & WRITEMASK_Z)
 514       dst[2] = value[2];
 515    if (writeMask & WRITEMASK_W)
 516       dst[3] = value[3];
 517
 518    if (inst->CondUpdate) {
 519       if (writeMask & WRITEMASK_X)
 520          machine->CondCodes[0] = generate_cc(value[0]);
 521       if (writeMask & WRITEMASK_Y)
 522          machine->CondCodes[1] = generate_cc(value[1]);
 523       if (writeMask & WRITEMASK_Z)
 524          machine->CondCodes[2] = generate_cc(value[2]);
 525       if (writeMask & WRITEMASK_W)
 526          machine->CondCodes[3] = generate_cc(value[3]);
 527 #if DEBUG_PROG
 528       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 529              _mesa_condcode_string(machine->CondCodes[0]),
 530              _mesa_condcode_string(machine->CondCodes[1]),
 531              _mesa_condcode_string(machine->CondCodes[2]),
 532              _mesa_condcode_string(machine->CondCodes[3]));
 533 #endif
 534    }
 535 }
 536
 537
 538 /**
 539  * Store 4 uints into a register.  Observe the set-condition-code flags.
 540  */
 541 static void
 542 store_vector4ui(const struct prog_instruction *inst,
 543                 struct gl_program_machine *machine, const GLuint value[4])
 544 {
 545    const struct prog_dst_register *dstReg = &(inst->DstReg);
 546    GLuint writeMask = dstReg->WriteMask;
 547    GLuint *dst = (GLuint *) get_dst_register_pointer(dstReg, machine);
 548
 549    if (dstReg->CondMask != COND_TR) {
 550       /* condition codes may turn off some writes */
 551       if (writeMask & WRITEMASK_X) {
 552          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 0)],
 553                       dstReg->CondMask))
 554             writeMask &= ~WRITEMASK_X;
 555       }
 556       if (writeMask & WRITEMASK_Y) {
 557          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 1)],
 558                       dstReg->CondMask))
 559             writeMask &= ~WRITEMASK_Y;
 560       }
 561       if (writeMask & WRITEMASK_Z) {
 562          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 2)],
 563                       dstReg->CondMask))
 564             writeMask &= ~WRITEMASK_Z;
 565       }
 566       if (writeMask & WRITEMASK_W) {
 567          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 3)],
 568                       dstReg->CondMask))
 569             writeMask &= ~WRITEMASK_W;
 570       }
 571    }
 572
 573    if (writeMask & WRITEMASK_X)
 574       dst[0] = value[0];
 575    if (writeMask & WRITEMASK_Y)
 576       dst[1] = value[1];
 577    if (writeMask & WRITEMASK_Z)
 578       dst[2] = value[2];
 579    if (writeMask & WRITEMASK_W)
 580       dst[3] = value[3];
 581
 582    if (inst->CondUpdate) {
 583       if (writeMask & WRITEMASK_X)
 584          machine->CondCodes[0] = generate_cc((float)value[0]);
 585       if (writeMask & WRITEMASK_Y)
 586          machine->CondCodes[1] = generate_cc((float)value[1]);
 587       if (writeMask & WRITEMASK_Z)
 588          machine->CondCodes[2] = generate_cc((float)value[2]);
 589       if (writeMask & WRITEMASK_W)
 590          machine->CondCodes[3] = generate_cc((float)value[3]);
 591 #if DEBUG_PROG
 592       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 593              _mesa_condcode_string(machine->CondCodes[0]),
 594              _mesa_condcode_string(machine->CondCodes[1]),
 595              _mesa_condcode_string(machine->CondCodes[2]),
 596              _mesa_condcode_string(machine->CondCodes[3]));
 597 #endif
 598    }
 599 }
 600
 601
 602
 603 /**
 604  * Execute the given vertex/fragment program.
 605  *
 606  * \param ctx  rendering context
 607  * \param program  the program to execute
 608  * \param machine  machine state (must be initialized)
 609  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 610  */
 611 GLboolean
 612 _mesa_execute_program(struct gl_context * ctx,
 613                       const struct gl_program *program,
 614                       struct gl_program_machine *machine)
 615 {
 616    const GLuint numInst = program->NumInstructions;
 617    const GLuint maxExec = 65536;
 618    GLuint pc, numExec = 0;
 619
 620    machine->CurProgram = program;
 621
 622    if (DEBUG_PROG) {
 623       printf("execute program %u --------------------\n", program->Id);
 624    }
 625
 626    if (program->Target == GL_VERTEX_PROGRAM_ARB) {
 627       machine->EnvParams = ctx->VertexProgram.Parameters;
 628    }
 629    else {
 630       machine->EnvParams = ctx->FragmentProgram.Parameters;
 631    }
 632
 633    for (pc = 0; pc < numInst; pc++) {
 634       const struct prog_instruction *inst = program->Instructions + pc;
 635
 636       if (DEBUG_PROG) {
 637          _mesa_print_instruction(inst);
 638       }
 639
 640       switch (inst->Opcode) {
 641       case OPCODE_ABS:
 642          {
 643             GLfloat a[4], result[4];
 644             fetch_vector4(&inst->SrcReg[0], machine, a);
 645             result[0] = FABSF(a[0]);
 646             result[1] = FABSF(a[1]);
 647             result[2] = FABSF(a[2]);
 648             result[3] = FABSF(a[3]);
 649             store_vector4(inst, machine, result);
 650          }
 651          break;
 652       case OPCODE_ADD:
 653          {
 654             GLfloat a[4], b[4], result[4];
 655             fetch_vector4(&inst->SrcReg[0], machine, a);
 656             fetch_vector4(&inst->SrcReg[1], machine, b);
 657             result[0] = a[0] + b[0];
 658             result[1] = a[1] + b[1];
 659             result[2] = a[2] + b[2];
 660             result[3] = a[3] + b[3];
 661             store_vector4(inst, machine, result);
 662             if (DEBUG_PROG) {
 663                printf("ADD (%g %g %g %g) = (%g %g %g %g) + (%g %g %g %g)\n",
 664                       result[0], result[1], result[2], result[3],
 665                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 666             }
 667          }
 668          break;
 669       case OPCODE_AND:     /* bitwise AND */
 670          {
 671             GLuint a[4], b[4], result[4];
 672             fetch_vector4ui(&inst->SrcReg[0], machine, a);
 673             fetch_vector4ui(&inst->SrcReg[1], machine, b);
 674             result[0] = a[0] & b[0];
 675             result[1] = a[1] & b[1];
 676             result[2] = a[2] & b[2];
 677             result[3] = a[3] & b[3];
 678             store_vector4ui(inst, machine, result);
 679          }
 680          break;
 681       case OPCODE_ARL:
 682          {
 683             GLfloat t[4];
 684             fetch_vector4(&inst->SrcReg[0], machine, t);
 685             machine->AddressReg[0][0] = IFLOOR(t[0]);
 686             if (DEBUG_PROG) {
 687                printf("ARL %d\n", machine->AddressReg[0][0]);
 688             }
 689          }
 690          break;
 691       case OPCODE_BGNLOOP:
 692          /* no-op */
 693          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 694                 == OPCODE_ENDLOOP);
 695          break;
 696       case OPCODE_ENDLOOP:
 697          /* subtract 1 here since pc is incremented by for(pc) loop */
 698          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 699                 == OPCODE_BGNLOOP);
 700          pc = inst->BranchTarget - 1;   /* go to matching BNGLOOP */
 701          break;
 702       case OPCODE_BGNSUB:      /* begin subroutine */
 703          break;
 704       case OPCODE_ENDSUB:      /* end subroutine */
 705          break;
 706       case OPCODE_BRK:         /* break out of loop (conditional) */
 707          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 708                 == OPCODE_ENDLOOP);
 709          if (eval_condition(machine, inst)) {
 710             /* break out of loop */
 711             /* pc++ at end of for-loop will put us after the ENDLOOP inst */
 712             pc = inst->BranchTarget;
 713          }
 714          break;
 715       case OPCODE_CONT:        /* continue loop (conditional) */
 716          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 717                 == OPCODE_ENDLOOP);
 718          if (eval_condition(machine, inst)) {
 719             /* continue at ENDLOOP */
 720             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 721             pc = inst->BranchTarget - 1;
 722          }
 723          break;
 724       case OPCODE_CAL:         /* Call subroutine (conditional) */
 725          if (eval_condition(machine, inst)) {
 726             /* call the subroutine */
 727             if (machine->StackDepth >= MAX_PROGRAM_CALL_DEPTH) {
 728                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
 729             }
 730             machine->CallStack[machine->StackDepth++] = pc + 1; /* next inst */
 731             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 732             pc = inst->BranchTarget - 1;
 733          }
 734          break;
 735       case OPCODE_CMP:
 736          {
 737             GLfloat a[4], b[4], c[4], result[4];
 738             fetch_vector4(&inst->SrcReg[0], machine, a);
 739             fetch_vector4(&inst->SrcReg[1], machine, b);
 740             fetch_vector4(&inst->SrcReg[2], machine, c);
 741             result[0] = a[0] < 0.0F ? b[0] : c[0];
 742             result[1] = a[1] < 0.0F ? b[1] : c[1];
 743             result[2] = a[2] < 0.0F ? b[2] : c[2];
 744             result[3] = a[3] < 0.0F ? b[3] : c[3];
 745             store_vector4(inst, machine, result);
 746             if (DEBUG_PROG) {
 747                printf("CMP (%g %g %g %g) = (%g %g %g %g) < 0 ? (%g %g %g %g) : (%g %g %g %g)\n",
 748                       result[0], result[1], result[2], result[3],
 749                       a[0], a[1], a[2], a[3],
 750                       b[0], b[1], b[2], b[3],
 751                       c[0], c[1], c[2], c[3]);
 752             }
 753          }
 754          break;
 755       case OPCODE_COS:
 756          {
 757             GLfloat a[4], result[4];
 758             fetch_vector1(&inst->SrcReg[0], machine, a);
 759             result[0] = result[1] = result[2] = result[3]
 760                = (GLfloat) cos(a[0]);
 761             store_vector4(inst, machine, result);
 762          }
 763          break;
 764       case OPCODE_DDX:         /* Partial derivative with respect to X */
 765          {
 766             GLfloat result[4];
 767             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 768                                 'X', result);
 769             store_vector4(inst, machine, result);
 770          }
 771          break;
 772       case OPCODE_DDY:         /* Partial derivative with respect to Y */
 773          {
 774             GLfloat result[4];
 775             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 776                                 'Y', result);
 777             store_vector4(inst, machine, result);
 778          }
 779          break;
 780       case OPCODE_DP2:
 781          {
 782             GLfloat a[4], b[4], result[4];
 783             fetch_vector4(&inst->SrcReg[0], machine, a);
 784             fetch_vector4(&inst->SrcReg[1], machine, b);
 785             result[0] = result[1] = result[2] = result[3] = DOT2(a, b);
 786             store_vector4(inst, machine, result);
 787             if (DEBUG_PROG) {
 788                printf("DP2 %g = (%g %g) . (%g %g)\n",
 789                       result[0], a[0], a[1], b[0], b[1]);
 790             }
 791          }
 792          break;
 793       case OPCODE_DP2A:
 794          {
 795             GLfloat a[4], b[4], c, result[4];
 796             fetch_vector4(&inst->SrcReg[0], machine, a);
 797             fetch_vector4(&inst->SrcReg[1], machine, b);
 798             fetch_vector1(&inst->SrcReg[1], machine, &c);
 799             result[0] = result[1] = result[2] = result[3] = DOT2(a, b) + c;
 800             store_vector4(inst, machine, result);
 801             if (DEBUG_PROG) {
 802                printf("DP2A %g = (%g %g) . (%g %g) + %g\n",
 803                       result[0], a[0], a[1], b[0], b[1], c);
 804             }
 805          }
 806          break;
 807       case OPCODE_DP3:
 808          {
 809             GLfloat a[4], b[4], result[4];
 810             fetch_vector4(&inst->SrcReg[0], machine, a);
 811             fetch_vector4(&inst->SrcReg[1], machine, b);
 812             result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
 813             store_vector4(inst, machine, result);
 814             if (DEBUG_PROG) {
 815                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 816                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 817             }
 818          }
 819          break;
 820       case OPCODE_DP4:
 821          {
 822             GLfloat a[4], b[4], result[4];
 823             fetch_vector4(&inst->SrcReg[0], machine, a);
 824             fetch_vector4(&inst->SrcReg[1], machine, b);
 825             result[0] = result[1] = result[2] = result[3] = DOT4(a, b);
 826             store_vector4(inst, machine, result);
 827             if (DEBUG_PROG) {
 828                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 829                       result[0], a[0], a[1], a[2], a[3],
 830                       b[0], b[1], b[2], b[3]);
 831             }
 832          }
 833          break;
 834       case OPCODE_DPH:
 835          {
 836             GLfloat a[4], b[4], result[4];
 837             fetch_vector4(&inst->SrcReg[0], machine, a);
 838             fetch_vector4(&inst->SrcReg[1], machine, b);
 839             result[0] = result[1] = result[2] = result[3] = DOT3(a, b) + b[3];
 840             store_vector4(inst, machine, result);
 841          }
 842          break;
 843       case OPCODE_DST:         /* Distance vector */
 844          {
 845             GLfloat a[4], b[4], result[4];
 846             fetch_vector4(&inst->SrcReg[0], machine, a);
 847             fetch_vector4(&inst->SrcReg[1], machine, b);
 848             result[0] = 1.0F;
 849             result[1] = a[1] * b[1];
 850             result[2] = a[2];
 851             result[3] = b[3];
 852             store_vector4(inst, machine, result);
 853          }
 854          break;
 855       case OPCODE_EXP:
 856          {
 857             GLfloat t[4], q[4], floor_t0;
 858             fetch_vector1(&inst->SrcReg[0], machine, t);
 859             floor_t0 = FLOORF(t[0]);
 860             if (floor_t0 > FLT_MAX_EXP) {
 861                SET_POS_INFINITY(q[0]);
 862                SET_POS_INFINITY(q[2]);
 863             }
 864             else if (floor_t0 < FLT_MIN_EXP) {
 865                q[0] = 0.0F;
 866                q[2] = 0.0F;
 867             }
 868             else {
 869                q[0] = LDEXPF(1.0, (int) floor_t0);
 870                /* Note: GL_NV_vertex_program expects
 871                 * result.z = result.x * APPX(result.y)
 872                 * We do what the ARB extension says.
 873                 */
 874                q[2] = (GLfloat) pow(2.0, t[0]);
 875             }
 876             q[1] = t[0] - floor_t0;
 877             q[3] = 1.0F;
 878             store_vector4( inst, machine, q );
 879          }
 880          break;
 881       case OPCODE_EX2:         /* Exponential base 2 */
 882          {
 883             GLfloat a[4], result[4], val;
 884             fetch_vector1(&inst->SrcReg[0], machine, a);
 885             val = (GLfloat) pow(2.0, a[0]);
 886             /*
 887             if (IS_INF_OR_NAN(val))
 888                val = 1.0e10;
 889             */
 890             result[0] = result[1] = result[2] = result[3] = val;
 891             store_vector4(inst, machine, result);
 892          }
 893          break;
 894       case OPCODE_FLR:
 895          {
 896             GLfloat a[4], result[4];
 897             fetch_vector4(&inst->SrcReg[0], machine, a);
 898             result[0] = FLOORF(a[0]);
 899             result[1] = FLOORF(a[1]);
 900             result[2] = FLOORF(a[2]);
 901             result[3] = FLOORF(a[3]);
 902             store_vector4(inst, machine, result);
 903          }
 904          break;
 905       case OPCODE_FRC:
 906          {
 907             GLfloat a[4], result[4];
 908             fetch_vector4(&inst->SrcReg[0], machine, a);
 909             result[0] = a[0] - FLOORF(a[0]);
 910             result[1] = a[1] - FLOORF(a[1]);
 911             result[2] = a[2] - FLOORF(a[2]);
 912             result[3] = a[3] - FLOORF(a[3]);
 913             store_vector4(inst, machine, result);
 914          }
 915          break;
 916       case OPCODE_IF:
 917          {
 918             GLboolean cond;
 919             ASSERT(program->Instructions[inst->BranchTarget].Opcode
 920                    == OPCODE_ELSE ||
 921                    program->Instructions[inst->BranchTarget].Opcode
 922                    == OPCODE_ENDIF);
 923             /* eval condition */
 924             if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
 925                GLfloat a[4];
 926                fetch_vector1(&inst->SrcReg[0], machine, a);
 927                cond = (a[0] != 0.0);
 928             }
 929             else {
 930                cond = eval_condition(machine, inst);
 931             }
 932             if (DEBUG_PROG) {
 933                printf("IF: %d\n", cond);
 934             }
 935             /* do if/else */
 936             if (cond) {
 937                /* do if-clause (just continue execution) */
 938             }
 939             else {
 940                /* go to the instruction after ELSE or ENDIF */
 941                assert(inst->BranchTarget >= 0);
 942                pc = inst->BranchTarget;
 943             }
 944          }
 945          break;
 946       case OPCODE_ELSE:
 947          /* goto ENDIF */
 948          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 949                 == OPCODE_ENDIF);
 950          assert(inst->BranchTarget >= 0);
 951          pc = inst->BranchTarget;
 952          break;
 953       case OPCODE_ENDIF:
 954          /* nothing */
 955          break;
 956       case OPCODE_KIL_NV:      /* NV_f_p only (conditional) */
 957          if (eval_condition(machine, inst)) {
 958             return GL_FALSE;
 959          }
 960          break;
 961       case OPCODE_KIL:         /* ARB_f_p only */
 962          {
 963             GLfloat a[4];
 964             fetch_vector4(&inst->SrcReg[0], machine, a);
 965             if (DEBUG_PROG) {
 966                printf("KIL if (%g %g %g %g) <= 0.0\n",
 967                       a[0], a[1], a[2], a[3]);
 968             }
 969
 970             if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 971                return GL_FALSE;
 972             }
 973          }
 974          break;
 975       case OPCODE_LG2:         /* log base 2 */
 976          {
 977             GLfloat a[4], result[4], val;
 978             fetch_vector1(&inst->SrcReg[0], machine, a);
 979             /* The fast LOG2 macro doesn't meet the precision requirements.
 980              */
 981             if (a[0] == 0.0F) {
 982                val = -FLT_MAX;
 983             }
 984             else {
 985                val = (float)(log(a[0]) * 1.442695F);
 986             }
 987             result[0] = result[1] = result[2] = result[3] = val;
 988             store_vector4(inst, machine, result);
 989          }
 990          break;
 991       case OPCODE_LIT:
 992          {
 993             const GLfloat epsilon = 1.0F / 256.0F;      /* from NV VP spec */
 994             GLfloat a[4], result[4];
 995             fetch_vector4(&inst->SrcReg[0], machine, a);
 996             a[0] = MAX2(a[0], 0.0F);
 997             a[1] = MAX2(a[1], 0.0F);
 998             /* XXX ARB version clamps a[3], NV version doesn't */
 999             a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
1000             result[0] = 1.0F;
1001             result[1] = a[0];
1002             /* XXX we could probably just use pow() here */
1003             if (a[0] > 0.0F) {
1004                if (a[1] == 0.0 && a[3] == 0.0)
1005                   result[2] = 1.0F;
1006                else
1007                   result[2] = (GLfloat) pow(a[1], a[3]);
1008             }
1009             else {
1010                result[2] = 0.0F;
1011             }
1012             result[3] = 1.0F;
1013             store_vector4(inst, machine, result);
1014             if (DEBUG_PROG) {
1015                printf("LIT (%g %g %g %g) : (%g %g %g %g)\n",
1016                       result[0], result[1], result[2], result[3],
1017                       a[0], a[1], a[2], a[3]);
1018             }
1019          }
1020          break;
1021       case OPCODE_LOG:
1022          {
1023             GLfloat t[4], q[4], abs_t0;
1024             fetch_vector1(&inst->SrcReg[0], machine, t);
1025             abs_t0 = FABSF(t[0]);
1026             if (abs_t0 != 0.0F) {
1027                if (IS_INF_OR_NAN(abs_t0))
1028                {
1029                   SET_POS_INFINITY(q[0]);
1030                   q[1] = 1.0F;
1031                   SET_POS_INFINITY(q[2]);
1032                }
1033                else {
1034                   int exponent;
1035                   GLfloat mantissa = FREXPF(t[0], &exponent);
1036                   q[0] = (GLfloat) (exponent - 1);
1037                   q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
1038
1039                   /* The fast LOG2 macro doesn't meet the precision
1040                    * requirements.
1041                    */
1042                   q[2] = (float)(log(t[0]) * 1.442695F);
1043                }
1044             }
1045             else {
1046                SET_NEG_INFINITY(q[0]);
1047                q[1] = 1.0F;
1048                SET_NEG_INFINITY(q[2]);
1049             }
1050             q[3] = 1.0;
1051             store_vector4(inst, machine, q);
1052          }
1053          break;
1054       case OPCODE_LRP:
1055          {
1056             GLfloat a[4], b[4], c[4], result[4];
1057             fetch_vector4(&inst->SrcReg[0], machine, a);
1058             fetch_vector4(&inst->SrcReg[1], machine, b);
1059             fetch_vector4(&inst->SrcReg[2], machine, c);
1060             result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
1061             result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
1062             result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
1063             result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
1064             store_vector4(inst, machine, result);
1065             if (DEBUG_PROG) {
1066                printf("LRP (%g %g %g %g) = (%g %g %g %g), "
1067                       "(%g %g %g %g), (%g %g %g %g)\n",
1068                       result[0], result[1], result[2], result[3],
1069                       a[0], a[1], a[2], a[3],
1070                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
1071             }
1072          }
1073          break;
1074       case OPCODE_MAD:
1075          {
1076             GLfloat a[4], b[4], c[4], result[4];
1077             fetch_vector4(&inst->SrcReg[0], machine, a);
1078             fetch_vector4(&inst->SrcReg[1], machine, b);
1079             fetch_vector4(&inst->SrcReg[2], machine, c);
1080             result[0] = a[0] * b[0] + c[0];
1081             result[1] = a[1] * b[1] + c[1];
1082             result[2] = a[2] * b[2] + c[2];
1083             result[3] = a[3] * b[3] + c[3];
1084             store_vector4(inst, machine, result);
1085             if (DEBUG_PROG) {
1086                printf("MAD (%g %g %g %g) = (%g %g %g %g) * "
1087                       "(%g %g %g %g) + (%g %g %g %g)\n",
1088                       result[0], result[1], result[2], result[3],
1089                       a[0], a[1], a[2], a[3],
1090                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
1091             }
1092          }
1093          break;
1094       case OPCODE_MAX:
1095          {
1096             GLfloat a[4], b[4], result[4];
1097             fetch_vector4(&inst->SrcReg[0], machine, a);
1098             fetch_vector4(&inst->SrcReg[1], machine, b);
1099             result[0] = MAX2(a[0], b[0]);
1100             result[1] = MAX2(a[1], b[1]);
1101             result[2] = MAX2(a[2], b[2]);
1102             result[3] = MAX2(a[3], b[3]);
1103             store_vector4(inst, machine, result);
1104             if (DEBUG_PROG) {
1105                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
1106                       result[0], result[1], result[2], result[3],
1107                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1108             }
1109          }
1110          break;
1111       case OPCODE_MIN:
1112          {
1113             GLfloat a[4], b[4], result[4];
1114             fetch_vector4(&inst->SrcReg[0], machine, a);
1115             fetch_vector4(&inst->SrcReg[1], machine, b);
1116             result[0] = MIN2(a[0], b[0]);
1117             result[1] = MIN2(a[1], b[1]);
1118             result[2] = MIN2(a[2], b[2]);
1119             result[3] = MIN2(a[3], b[3]);
1120             store_vector4(inst, machine, result);
1121          }
1122          break;
1123       case OPCODE_MOV:
1124          {
1125             GLfloat result[4];
1126             fetch_vector4(&inst->SrcReg[0], machine, result);
1127             store_vector4(inst, machine, result);
1128             if (DEBUG_PROG) {
1129                printf("MOV (%g %g %g %g)\n",
1130                       result[0], result[1], result[2], result[3]);
1131             }
1132          }
1133          break;
1134       case OPCODE_MUL:
1135          {
1136             GLfloat a[4], b[4], result[4];
1137             fetch_vector4(&inst->SrcReg[0], machine, a);
1138             fetch_vector4(&inst->SrcReg[1], machine, b);
1139             result[0] = a[0] * b[0];
1140             result[1] = a[1] * b[1];
1141             result[2] = a[2] * b[2];
1142             result[3] = a[3] * b[3];
1143             store_vector4(inst, machine, result);
1144             if (DEBUG_PROG) {
1145                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
1146                       result[0], result[1], result[2], result[3],
1147                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1148             }
1149          }
1150          break;
1151       case OPCODE_NOISE1:
1152          {
1153             GLfloat a[4], result[4];
1154             fetch_vector1(&inst->SrcReg[0], machine, a);
1155             result[0] =
1156                result[1] =
1157                result[2] =
1158                result[3] = _mesa_noise1(a[0]);
1159             store_vector4(inst, machine, result);
1160          }
1161          break;
1162       case OPCODE_NOISE2:
1163          {
1164             GLfloat a[4], result[4];
1165             fetch_vector4(&inst->SrcReg[0], machine, a);
1166             result[0] =
1167                result[1] =
1168                result[2] = result[3] = _mesa_noise2(a[0], a[1]);
1169             store_vector4(inst, machine, result);
1170          }
1171          break;
1172       case OPCODE_NOISE3:
1173          {
1174             GLfloat a[4], result[4];
1175             fetch_vector4(&inst->SrcReg[0], machine, a);
1176             result[0] =
1177                result[1] =
1178                result[2] =
1179                result[3] = _mesa_noise3(a[0], a[1], a[2]);
1180             store_vector4(inst, machine, result);
1181          }
1182          break;
1183       case OPCODE_NOISE4:
1184          {
1185             GLfloat a[4], result[4];
1186             fetch_vector4(&inst->SrcReg[0], machine, a);
1187             result[0] =
1188                result[1] =
1189                result[2] =
1190                result[3] = _mesa_noise4(a[0], a[1], a[2], a[3]);
1191             store_vector4(inst, machine, result);
1192          }
1193          break;
1194       case OPCODE_NOP:
1195          break;
1196       case OPCODE_NOT:         /* bitwise NOT */
1197          {
1198             GLuint a[4], result[4];
1199             fetch_vector4ui(&inst->SrcReg[0], machine, a);
1200             result[0] = ~a[0];
1201             result[1] = ~a[1];
1202             result[2] = ~a[2];
1203             result[3] = ~a[3];
1204             store_vector4ui(inst, machine, result);
1205          }
1206          break;
1207       case OPCODE_NRM3:        /* 3-component normalization */
1208          {
1209             GLfloat a[4], result[4];
1210             GLfloat tmp;
1211             fetch_vector4(&inst->SrcReg[0], machine, a);
1212             tmp = a[0] * a[0] + a[1] * a[1] + a[2] * a[2];
1213             if (tmp != 0.0F)
1214                tmp = INV_SQRTF(tmp);
1215             result[0] = tmp * a[0];
1216             result[1] = tmp * a[1];
1217             result[2] = tmp * a[2];
1218             result[3] = 0.0;  /* undefined, but prevent valgrind warnings */
1219             store_vector4(inst, machine, result);
1220          }
1221          break;
1222       case OPCODE_NRM4:        /* 4-component normalization */
1223          {
1224             GLfloat a[4], result[4];
1225             GLfloat tmp;
1226             fetch_vector4(&inst->SrcReg[0], machine, a);
1227             tmp = a[0] * a[0] + a[1] * a[1] + a[2] * a[2] + a[3] * a[3];
1228             if (tmp != 0.0F)
1229                tmp = INV_SQRTF(tmp);
1230             result[0] = tmp * a[0];
1231             result[1] = tmp * a[1];
1232             result[2] = tmp * a[2];
1233             result[3] = tmp * a[3];
1234             store_vector4(inst, machine, result);
1235          }
1236          break;
1237       case OPCODE_OR:          /* bitwise OR */
1238          {
1239             GLuint a[4], b[4], result[4];
1240             fetch_vector4ui(&inst->SrcReg[0], machine, a);
1241             fetch_vector4ui(&inst->SrcReg[1], machine, b);
1242             result[0] = a[0] | b[0];
1243             result[1] = a[1] | b[1];
1244             result[2] = a[2] | b[2];
1245             result[3] = a[3] | b[3];
1246             store_vector4ui(inst, machine, result);
1247          }
1248          break;
1249       case OPCODE_PK2H:        /* pack two 16-bit floats in one 32-bit float */
1250          {
1251             GLfloat a[4];
1252             GLuint result[4];
1253             GLhalfNV hx, hy;
1254             fetch_vector4(&inst->SrcReg[0], machine, a);
1255             hx = _mesa_float_to_half(a[0]);
1256             hy = _mesa_float_to_half(a[1]);
1257             result[0] =
1258             result[1] =
1259             result[2] =
1260             result[3] = hx | (hy << 16);
1261             store_vector4ui(inst, machine, result);
1262          }
1263          break;
1264       case OPCODE_PK2US:       /* pack two GLushorts into one 32-bit float */
1265          {
1266             GLfloat a[4];
1267             GLuint result[4], usx, usy;
1268             fetch_vector4(&inst->SrcReg[0], machine, a);
1269             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1270             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1271             usx = F_TO_I(a[0] * 65535.0F);
1272             usy = F_TO_I(a[1] * 65535.0F);
1273             result[0] =
1274             result[1] =
1275             result[2] =
1276             result[3] = usx | (usy << 16);
1277             store_vector4ui(inst, machine, result);
1278          }
1279          break;
1280       case OPCODE_PK4B:        /* pack four GLbytes into one 32-bit float */
1281          {
1282             GLfloat a[4];
1283             GLuint result[4], ubx, uby, ubz, ubw;
1284             fetch_vector4(&inst->SrcReg[0], machine, a);
1285             a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
1286             a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
1287             a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
1288             a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
1289             ubx = F_TO_I(127.0F * a[0] + 128.0F);
1290             uby = F_TO_I(127.0F * a[1] + 128.0F);
1291             ubz = F_TO_I(127.0F * a[2] + 128.0F);
1292             ubw = F_TO_I(127.0F * a[3] + 128.0F);
1293             result[0] =
1294             result[1] =
1295             result[2] =
1296             result[3] = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1297             store_vector4ui(inst, machine, result);
1298          }
1299          break;
1300       case OPCODE_PK4UB:       /* pack four GLubytes into one 32-bit float */
1301          {
1302             GLfloat a[4];
1303             GLuint result[4], ubx, uby, ubz, ubw;
1304             fetch_vector4(&inst->SrcReg[0], machine, a);
1305             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1306             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1307             a[2] = CLAMP(a[2], 0.0F, 1.0F);
1308             a[3] = CLAMP(a[3], 0.0F, 1.0F);
1309             ubx = F_TO_I(255.0F * a[0]);
1310             uby = F_TO_I(255.0F * a[1]);
1311             ubz = F_TO_I(255.0F * a[2]);
1312             ubw = F_TO_I(255.0F * a[3]);
1313             result[0] =
1314             result[1] =
1315             result[2] =
1316             result[3] = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1317             store_vector4ui(inst, machine, result);
1318          }
1319          break;
1320       case OPCODE_POW:
1321          {
1322             GLfloat a[4], b[4], result[4];
1323             fetch_vector1(&inst->SrcReg[0], machine, a);
1324             fetch_vector1(&inst->SrcReg[1], machine, b);
1325             result[0] = result[1] = result[2] = result[3]
1326                = (GLfloat) pow(a[0], b[0]);
1327             store_vector4(inst, machine, result);
1328          }
1329          break;
1330
1331       case OPCODE_RCP:
1332          {
1333             GLfloat a[4], result[4];
1334             fetch_vector1(&inst->SrcReg[0], machine, a);
1335             if (DEBUG_PROG) {
1336                if (a[0] == 0)
1337                   printf("RCP(0)\n");
1338                else if (IS_INF_OR_NAN(a[0]))
1339                   printf("RCP(inf)\n");
1340             }
1341             result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
1342             store_vector4(inst, machine, result);
1343          }
1344          break;
1345       case OPCODE_RET:         /* return from subroutine (conditional) */
1346          if (eval_condition(machine, inst)) {
1347             if (machine->StackDepth == 0) {
1348                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
1349             }
1350             /* subtract one because of pc++ in the for loop */
1351             pc = machine->CallStack[--machine->StackDepth] - 1;
1352          }
1353          break;
1354       case OPCODE_RFL:         /* reflection vector */
1355          {
1356             GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
1357             fetch_vector4(&inst->SrcReg[0], machine, axis);
1358             fetch_vector4(&inst->SrcReg[1], machine, dir);
1359             tmpW = DOT3(axis, axis);
1360             tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
1361             result[0] = tmpX * axis[0] - dir[0];
1362             result[1] = tmpX * axis[1] - dir[1];
1363             result[2] = tmpX * axis[2] - dir[2];
1364             /* result[3] is never written! XXX enforce in parser! */
1365             store_vector4(inst, machine, result);
1366          }
1367          break;
1368       case OPCODE_RSQ:         /* 1 / sqrt() */
1369          {
1370             GLfloat a[4], result[4];
1371             fetch_vector1(&inst->SrcReg[0], machine, a);
1372             a[0] = FABSF(a[0]);
1373             result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1374             store_vector4(inst, machine, result);
1375             if (DEBUG_PROG) {
1376                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1377             }
1378          }
1379          break;
1380       case OPCODE_SCS:         /* sine and cos */
1381          {
1382             GLfloat a[4], result[4];
1383             fetch_vector1(&inst->SrcReg[0], machine, a);
1384             result[0] = (GLfloat) cos(a[0]);
1385             result[1] = (GLfloat) sin(a[0]);
1386             result[2] = 0.0;    /* undefined! */
1387             result[3] = 0.0;    /* undefined! */
1388             store_vector4(inst, machine, result);
1389          }
1390          break;
1391       case OPCODE_SEQ:         /* set on equal */
1392          {
1393             GLfloat a[4], b[4], result[4];
1394             fetch_vector4(&inst->SrcReg[0], machine, a);
1395             fetch_vector4(&inst->SrcReg[1], machine, b);
1396             result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1397             result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1398             result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1399             result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1400             store_vector4(inst, machine, result);
1401             if (DEBUG_PROG) {
1402                printf("SEQ (%g %g %g %g) = (%g %g %g %g) == (%g %g %g %g)\n",
1403                       result[0], result[1], result[2], result[3],
1404                       a[0], a[1], a[2], a[3],
1405                       b[0], b[1], b[2], b[3]);
1406             }
1407          }
1408          break;
1409       case OPCODE_SFL:         /* set false, operands ignored */
1410          {
1411             static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1412             store_vector4(inst, machine, result);
1413          }
1414          break;
1415       case OPCODE_SGE:         /* set on greater or equal */
1416          {
1417             GLfloat a[4], b[4], result[4];
1418             fetch_vector4(&inst->SrcReg[0], machine, a);
1419             fetch_vector4(&inst->SrcReg[1], machine, b);
1420             result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1421             result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1422             result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1423             result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1424             store_vector4(inst, machine, result);
1425             if (DEBUG_PROG) {
1426                printf("SGE (%g %g %g %g) = (%g %g %g %g) >= (%g %g %g %g)\n",
1427                       result[0], result[1], result[2], result[3],
1428                       a[0], a[1], a[2], a[3],
1429                       b[0], b[1], b[2], b[3]);
1430             }
1431          }
1432          break;
1433       case OPCODE_SGT:         /* set on greater */
1434          {
1435             GLfloat a[4], b[4], result[4];
1436             fetch_vector4(&inst->SrcReg[0], machine, a);
1437             fetch_vector4(&inst->SrcReg[1], machine, b);
1438             result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1439             result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1440             result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1441             result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1442             store_vector4(inst, machine, result);
1443             if (DEBUG_PROG) {
1444                printf("SGT (%g %g %g %g) = (%g %g %g %g) > (%g %g %g %g)\n",
1445                       result[0], result[1], result[2], result[3],
1446                       a[0], a[1], a[2], a[3],
1447                       b[0], b[1], b[2], b[3]);
1448             }
1449          }
1450          break;
1451       case OPCODE_SIN:
1452          {
1453             GLfloat a[4], result[4];
1454             fetch_vector1(&inst->SrcReg[0], machine, a);
1455             result[0] = result[1] = result[2] = result[3]
1456                = (GLfloat) sin(a[0]);
1457             store_vector4(inst, machine, result);
1458          }
1459          break;
1460       case OPCODE_SLE:         /* set on less or equal */
1461          {
1462             GLfloat a[4], b[4], result[4];
1463             fetch_vector4(&inst->SrcReg[0], machine, a);
1464             fetch_vector4(&inst->SrcReg[1], machine, b);
1465             result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1466             result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1467             result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1468             result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1469             store_vector4(inst, machine, result);
1470             if (DEBUG_PROG) {
1471                printf("SLE (%g %g %g %g) = (%g %g %g %g) <= (%g %g %g %g)\n",
1472                       result[0], result[1], result[2], result[3],
1473                       a[0], a[1], a[2], a[3],
1474                       b[0], b[1], b[2], b[3]);
1475             }
1476          }
1477          break;
1478       case OPCODE_SLT:         /* set on less */
1479          {
1480             GLfloat a[4], b[4], result[4];
1481             fetch_vector4(&inst->SrcReg[0], machine, a);
1482             fetch_vector4(&inst->SrcReg[1], machine, b);
1483             result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1484             result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1485             result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1486             result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1487             store_vector4(inst, machine, result);
1488             if (DEBUG_PROG) {
1489                printf("SLT (%g %g %g %g) = (%g %g %g %g) < (%g %g %g %g)\n",
1490                       result[0], result[1], result[2], result[3],
1491                       a[0], a[1], a[2], a[3],
1492                       b[0], b[1], b[2], b[3]);
1493             }
1494          }
1495          break;
1496       case OPCODE_SNE:         /* set on not equal */
1497          {
1498             GLfloat a[4], b[4], result[4];
1499             fetch_vector4(&inst->SrcReg[0], machine, a);
1500             fetch_vector4(&inst->SrcReg[1], machine, b);
1501             result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1502             result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1503             result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1504             result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1505             store_vector4(inst, machine, result);
1506             if (DEBUG_PROG) {
1507                printf("SNE (%g %g %g %g) = (%g %g %g %g) != (%g %g %g %g)\n",
1508                       result[0], result[1], result[2], result[3],
1509                       a[0], a[1], a[2], a[3],
1510                       b[0], b[1], b[2], b[3]);
1511             }
1512          }
1513          break;
1514       case OPCODE_SSG:         /* set sign (-1, 0 or +1) */
1515          {
1516             GLfloat a[4], result[4];
1517             fetch_vector4(&inst->SrcReg[0], machine, a);
1518             result[0] = (GLfloat) ((a[0] > 0.0F) - (a[0] < 0.0F));
1519             result[1] = (GLfloat) ((a[1] > 0.0F) - (a[1] < 0.0F));
1520             result[2] = (GLfloat) ((a[2] > 0.0F) - (a[2] < 0.0F));
1521             result[3] = (GLfloat) ((a[3] > 0.0F) - (a[3] < 0.0F));
1522             store_vector4(inst, machine, result);
1523          }
1524          break;
1525       case OPCODE_STR:         /* set true, operands ignored */
1526          {
1527             static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1528             store_vector4(inst, machine, result);
1529          }
1530          break;
1531       case OPCODE_SUB:
1532          {
1533             GLfloat a[4], b[4], result[4];
1534             fetch_vector4(&inst->SrcReg[0], machine, a);
1535             fetch_vector4(&inst->SrcReg[1], machine, b);
1536             result[0] = a[0] - b[0];
1537             result[1] = a[1] - b[1];
1538             result[2] = a[2] - b[2];
1539             result[3] = a[3] - b[3];
1540             store_vector4(inst, machine, result);
1541             if (DEBUG_PROG) {
1542                printf("SUB (%g %g %g %g) = (%g %g %g %g) - (%g %g %g %g)\n",
1543                       result[0], result[1], result[2], result[3],
1544                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1545             }
1546          }
1547          break;
1548       case OPCODE_SWZ:         /* extended swizzle */
1549          {
1550             const struct prog_src_register *source = &inst->SrcReg[0];
1551             const GLfloat *src = get_src_register_pointer(source, machine);
1552             GLfloat result[4];
1553             GLuint i;
1554             for (i = 0; i < 4; i++) {
1555                const GLuint swz = GET_SWZ(source->Swizzle, i);
1556                if (swz == SWIZZLE_ZERO)
1557                   result[i] = 0.0;
1558                else if (swz == SWIZZLE_ONE)
1559                   result[i] = 1.0;
1560                else {
1561                   ASSERT(swz >= 0);
1562                   ASSERT(swz <= 3);
1563                   result[i] = src[swz];
1564                }
1565                if (source->Negate & (1 << i))
1566                   result[i] = -result[i];
1567             }
1568             store_vector4(inst, machine, result);
1569          }
1570          break;
1571       case OPCODE_TEX:         /* Both ARB and NV frag prog */
1572          /* Simple texel lookup */
1573          {
1574             GLfloat texcoord[4], color[4];
1575             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1576
1577             /* For TEX, texcoord.Q should not be used and its value should not
1578              * matter (at most, we pass coord.xyz to texture3D() in GLSL).
1579              * Set Q=1 so that FetchTexelDeriv() doesn't get a garbage value
1580              * which is effectively what happens when the texcoord swizzle
1581              * is .xyzz
1582              */
1583             texcoord[3] = 1.0f;
1584
1585             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1586
1587             if (DEBUG_PROG) {
1588                printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g]\n",
1589                       color[0], color[1], color[2], color[3],
1590                       inst->TexSrcUnit,
1591                       texcoord[0], texcoord[1], texcoord[2], texcoord[3]);
1592             }
1593             store_vector4(inst, machine, color);
1594          }
1595          break;
1596       case OPCODE_TXB:         /* GL_ARB_fragment_program only */
1597          /* Texel lookup with LOD bias */
1598          {
1599             GLfloat texcoord[4], color[4], lodBias;
1600
1601             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1602
1603             /* texcoord[3] is the bias to add to lambda */
1604             lodBias = texcoord[3];
1605
1606             fetch_texel(ctx, machine, inst, texcoord, lodBias, color);
1607
1608             if (DEBUG_PROG) {
1609                printf("TXB (%g, %g, %g, %g) = texture[%d][%g %g %g %g]"
1610                       "  bias %g\n",
1611                       color[0], color[1], color[2], color[3],
1612                       inst->TexSrcUnit,
1613                       texcoord[0],
1614                       texcoord[1],
1615                       texcoord[2],
1616                       texcoord[3],
1617                       lodBias);
1618             }
1619
1620             store_vector4(inst, machine, color);
1621          }
1622          break;
1623       case OPCODE_TXD:         /* GL_NV_fragment_program only */
1624          /* Texture lookup w/ partial derivatives for LOD */
1625          {
1626             GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1627             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1628             fetch_vector4(&inst->SrcReg[1], machine, dtdx);
1629             fetch_vector4(&inst->SrcReg[2], machine, dtdy);
1630             machine->FetchTexelDeriv(ctx, texcoord, dtdx, dtdy,
1631                                      0.0, /* lodBias */
1632                                      inst->TexSrcUnit, color);
1633             store_vector4(inst, machine, color);
1634          }
1635          break;
1636       case OPCODE_TXL:
1637          /* Texel lookup with explicit LOD */
1638          {
1639             GLfloat texcoord[4], color[4], lod;
1640
1641             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1642
1643             /* texcoord[3] is the LOD */
1644             lod = texcoord[3];
1645
1646             machine->FetchTexelLod(ctx, texcoord, lod,
1647                                    machine->Samplers[inst->TexSrcUnit], color);
1648
1649             store_vector4(inst, machine, color);
1650          }
1651          break;
1652       case OPCODE_TXP:         /* GL_ARB_fragment_program only */
1653          /* Texture lookup w/ projective divide */
1654          {
1655             GLfloat texcoord[4], color[4];
1656
1657             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1658             /* Not so sure about this test - if texcoord[3] is
1659              * zero, we'd probably be fine except for an ASSERT in
1660              * IROUND_POS() which gets triggered by the inf values created.
1661              */
1662             if (texcoord[3] != 0.0) {
1663                texcoord[0] /= texcoord[3];
1664                texcoord[1] /= texcoord[3];
1665                texcoord[2] /= texcoord[3];
1666             }
1667
1668             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1669
1670             store_vector4(inst, machine, color);
1671          }
1672          break;
1673       case OPCODE_TXP_NV:      /* GL_NV_fragment_program only */
1674          /* Texture lookup w/ projective divide, as above, but do not
1675           * do the divide by w if sampling from a cube map.
1676           */
1677          {
1678             GLfloat texcoord[4], color[4];
1679
1680             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1681             if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1682                 texcoord[3] != 0.0) {
1683                texcoord[0] /= texcoord[3];
1684                texcoord[1] /= texcoord[3];
1685                texcoord[2] /= texcoord[3];
1686             }
1687
1688             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1689
1690             store_vector4(inst, machine, color);
1691          }
1692          break;
1693       case OPCODE_TRUNC:       /* truncate toward zero */
1694          {
1695             GLfloat a[4], result[4];
1696             fetch_vector4(&inst->SrcReg[0], machine, a);
1697             result[0] = (GLfloat) (GLint) a[0];
1698             result[1] = (GLfloat) (GLint) a[1];
1699             result[2] = (GLfloat) (GLint) a[2];
1700             result[3] = (GLfloat) (GLint) a[3];
1701             store_vector4(inst, machine, result);
1702          }
1703          break;
1704       case OPCODE_UP2H:        /* unpack two 16-bit floats */
1705          {
1706             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1707             GLfloat result[4];
1708             GLushort hx, hy;
1709             hx = raw & 0xffff;
1710             hy = raw >> 16;
1711             result[0] = result[2] = _mesa_half_to_float(hx);
1712             result[1] = result[3] = _mesa_half_to_float(hy);
1713             store_vector4(inst, machine, result);
1714          }
1715          break;
1716       case OPCODE_UP2US:       /* unpack two GLushorts */
1717          {
1718             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1719             GLfloat result[4];
1720             GLushort usx, usy;
1721             usx = raw & 0xffff;
1722             usy = raw >> 16;
1723             result[0] = result[2] = usx * (1.0f / 65535.0f);
1724             result[1] = result[3] = usy * (1.0f / 65535.0f);
1725             store_vector4(inst, machine, result);
1726          }
1727          break;
1728       case OPCODE_UP4B:        /* unpack four GLbytes */
1729          {
1730             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1731             GLfloat result[4];
1732             result[0] = (((raw >> 0) & 0xff) - 128) / 127.0F;
1733             result[1] = (((raw >> 8) & 0xff) - 128) / 127.0F;
1734             result[2] = (((raw >> 16) & 0xff) - 128) / 127.0F;
1735             result[3] = (((raw >> 24) & 0xff) - 128) / 127.0F;
1736             store_vector4(inst, machine, result);
1737          }
1738          break;
1739       case OPCODE_UP4UB:       /* unpack four GLubytes */
1740          {
1741             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1742             GLfloat result[4];
1743             result[0] = ((raw >> 0) & 0xff) / 255.0F;
1744             result[1] = ((raw >> 8) & 0xff) / 255.0F;
1745             result[2] = ((raw >> 16) & 0xff) / 255.0F;
1746             result[3] = ((raw >> 24) & 0xff) / 255.0F;
1747             store_vector4(inst, machine, result);
1748          }
1749          break;
1750       case OPCODE_XOR:         /* bitwise XOR */
1751          {
1752             GLuint a[4], b[4], result[4];
1753             fetch_vector4ui(&inst->SrcReg[0], machine, a);
1754             fetch_vector4ui(&inst->SrcReg[1], machine, b);
1755             result[0] = a[0] ^ b[0];
1756             result[1] = a[1] ^ b[1];
1757             result[2] = a[2] ^ b[2];
1758             result[3] = a[3] ^ b[3];
1759             store_vector4ui(inst, machine, result);
1760          }
1761          break;
1762       case OPCODE_XPD:         /* cross product */
1763          {
1764             GLfloat a[4], b[4], result[4];
1765             fetch_vector4(&inst->SrcReg[0], machine, a);
1766             fetch_vector4(&inst->SrcReg[1], machine, b);
1767             result[0] = a[1] * b[2] - a[2] * b[1];
1768             result[1] = a[2] * b[0] - a[0] * b[2];
1769             result[2] = a[0] * b[1] - a[1] * b[0];
1770             result[3] = 1.0;
1771             store_vector4(inst, machine, result);
1772             if (DEBUG_PROG) {
1773                printf("XPD (%g %g %g %g) = (%g %g %g) X (%g %g %g)\n",
1774                       result[0], result[1], result[2], result[3],
1775                       a[0], a[1], a[2], b[0], b[1], b[2]);
1776             }
1777          }
1778          break;
1779       case OPCODE_X2D:         /* 2-D matrix transform */
1780          {
1781             GLfloat a[4], b[4], c[4], result[4];
1782             fetch_vector4(&inst->SrcReg[0], machine, a);
1783             fetch_vector4(&inst->SrcReg[1], machine, b);
1784             fetch_vector4(&inst->SrcReg[2], machine, c);
1785             result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1786             result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1787             result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1788             result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1789             store_vector4(inst, machine, result);
1790          }
1791          break;
1792       case OPCODE_PRINT:
1793          {
1794             if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
1795                GLfloat a[4];
1796                fetch_vector4(&inst->SrcReg[0], machine, a);
1797                printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
1798                             a[0], a[1], a[2], a[3]);
1799             }
1800             else {
1801                printf("%s\n", (const char *) inst->Data);
1802             }
1803          }
1804          break;
1805       case OPCODE_END:
1806          return GL_TRUE;
1807       default:
1808          _mesa_problem(ctx, "Bad opcode %d in _mesa_execute_program",
1809                        inst->Opcode);
1810          return GL_TRUE;        /* return value doesn't matter */
1811       }
1812
1813       numExec++;
1814       if (numExec > maxExec) {
1815          static GLboolean reported = GL_FALSE;
1816          if (!reported) {
1817             _mesa_problem(ctx, "Infinite loop detected in fragment program");
1818             reported = GL_TRUE;
1819          }
1820          return GL_TRUE;
1821       }
1822
1823    } /* for pc */
1824
1825    return GL_TRUE;
1826 }