src/mesa/program/prog_execute.c

   1 /*
   2  * Mesa 3-D graphics library
   3  *
   4  * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included
  14  * in all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file prog_execute.c
  27  * Software interpreter for vertex/fragment programs.
  28  * \author Brian Paul
  29  */
  30
  31 /*
  32  * NOTE: we do everything in single-precision floating point; we don't
  33  * currently observe the single/half/fixed-precision qualifiers.
  34  *
  35  */
  36
  37
  38 #include "main/glheader.h"
  39 #include "main/colormac.h"
  40 #include "main/macros.h"
  41 #include "prog_execute.h"
  42 #include "prog_instruction.h"
  43 #include "prog_parameter.h"
  44 #include "prog_print.h"
  45 #include "prog_noise.h"
  46
  47
  48 /* debug predicate */
  49 #define DEBUG_PROG 0
  50
  51
  52 /**
  53  * Set x to positive or negative infinity.
  54  */
  55 #if defined(USE_IEEE) || defined(_WIN32)
  56 #define SET_POS_INFINITY(x)                  \
  57    do {                                      \
  58          fi_type fi;                         \
  59          fi.i = 0x7F800000;                  \
  60          x = fi.f;                           \
  61    } while (0)
  62 #define SET_NEG_INFINITY(x)                  \
  63    do {                                      \
  64          fi_type fi;                         \
  65          fi.i = 0xFF800000;                  \
  66          x = fi.f;                           \
  67    } while (0)
  68 #else
  69 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
  70 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
  71 #endif
  72
  73 #define SET_FLOAT_BITS(x, bits) ((fi_type *) (void *) &(x))->i = bits
  74
  75
  76 static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
  77
  78
  79 /**
  80  * Return a pointer to the 4-element float vector specified by the given
  81  * source register.
  82  */
  83 static inline const GLfloat *
  84 get_src_register_pointer(const struct prog_src_register *source,
  85                          const struct gl_program_machine *machine)
  86 {
  87    const struct gl_program *prog = machine->CurProgram;
  88    GLint reg = source->Index;
  89
  90    if (source->RelAddr) {
  91       /* add address register value to src index/offset */
  92       reg += machine->AddressReg[0][0];
  93       if (reg < 0) {
  94          return ZeroVec;
  95       }
  96    }
  97
  98    switch (source->File) {
  99    case PROGRAM_TEMPORARY:
 100       if (reg >= MAX_PROGRAM_TEMPS)
 101          return ZeroVec;
 102       return machine->Temporaries[reg];
 103
 104    case PROGRAM_INPUT:
 105       if (prog->Target == GL_VERTEX_PROGRAM_ARB) {
 106          if (reg >= VERT_ATTRIB_MAX)
 107             return ZeroVec;
 108          return machine->VertAttribs[reg];
 109       }
 110       else {
 111          if (reg >= VARYING_SLOT_MAX)
 112             return ZeroVec;
 113          return machine->Attribs[reg][machine->CurElement];
 114       }
 115
 116    case PROGRAM_OUTPUT:
 117       if (reg >= MAX_PROGRAM_OUTPUTS)
 118          return ZeroVec;
 119       return machine->Outputs[reg];
 120
 121    case PROGRAM_LOCAL_PARAM:
 122       if (reg >= MAX_PROGRAM_LOCAL_PARAMS)
 123          return ZeroVec;
 124       return machine->CurProgram->LocalParams[reg];
 125
 126    case PROGRAM_ENV_PARAM:
 127       if (reg >= MAX_PROGRAM_ENV_PARAMS)
 128          return ZeroVec;
 129       return machine->EnvParams[reg];
 130
 131    case PROGRAM_STATE_VAR:
 132       /* Fallthrough */
 133    case PROGRAM_CONSTANT:
 134       /* Fallthrough */
 135    case PROGRAM_UNIFORM:
 136       if (reg >= (GLint) prog->Parameters->NumParameters)
 137          return ZeroVec;
 138       return (GLfloat *) prog->Parameters->ParameterValues[reg];
 139
 140    case PROGRAM_SYSTEM_VALUE:
 141       assert(reg < Elements(machine->SystemValues));
 142       return machine->SystemValues[reg];
 143
 144    default:
 145       _mesa_problem(NULL,
 146          "Invalid src register file %d in get_src_register_pointer()",
 147          source->File);
 148       return ZeroVec;
 149    }
 150 }
 151
 152
 153 /**
 154  * Return a pointer to the 4-element float vector specified by the given
 155  * destination register.
 156  */
 157 static inline GLfloat *
 158 get_dst_register_pointer(const struct prog_dst_register *dest,
 159                          struct gl_program_machine *machine)
 160 {
 161    static GLfloat dummyReg[4];
 162    GLint reg = dest->Index;
 163
 164    if (dest->RelAddr) {
 165       /* add address register value to src index/offset */
 166       reg += machine->AddressReg[0][0];
 167       if (reg < 0) {
 168          return dummyReg;
 169       }
 170    }
 171
 172    switch (dest->File) {
 173    case PROGRAM_TEMPORARY:
 174       if (reg >= MAX_PROGRAM_TEMPS)
 175          return dummyReg;
 176       return machine->Temporaries[reg];
 177
 178    case PROGRAM_OUTPUT:
 179       if (reg >= MAX_PROGRAM_OUTPUTS)
 180          return dummyReg;
 181       return machine->Outputs[reg];
 182
 183    default:
 184       _mesa_problem(NULL,
 185          "Invalid dest register file %d in get_dst_register_pointer()",
 186          dest->File);
 187       return dummyReg;
 188    }
 189 }
 190
 191
 192
 193 /**
 194  * Fetch a 4-element float vector from the given source register.
 195  * Apply swizzling and negating as needed.
 196  */
 197 static void
 198 fetch_vector4(const struct prog_src_register *source,
 199               const struct gl_program_machine *machine, GLfloat result[4])
 200 {
 201    const GLfloat *src = get_src_register_pointer(source, machine);
 202
 203    if (source->Swizzle == SWIZZLE_NOOP) {
 204       /* no swizzling */
 205       COPY_4V(result, src);
 206    }
 207    else {
 208       ASSERT(GET_SWZ(source->Swizzle, 0) <= 3);
 209       ASSERT(GET_SWZ(source->Swizzle, 1) <= 3);
 210       ASSERT(GET_SWZ(source->Swizzle, 2) <= 3);
 211       ASSERT(GET_SWZ(source->Swizzle, 3) <= 3);
 212       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 213       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 214       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 215       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 216    }
 217
 218    if (source->Abs) {
 219       result[0] = FABSF(result[0]);
 220       result[1] = FABSF(result[1]);
 221       result[2] = FABSF(result[2]);
 222       result[3] = FABSF(result[3]);
 223    }
 224    if (source->Negate) {
 225       ASSERT(source->Negate == NEGATE_XYZW);
 226       result[0] = -result[0];
 227       result[1] = -result[1];
 228       result[2] = -result[2];
 229       result[3] = -result[3];
 230    }
 231
 232 #ifdef NAN_CHECK
 233    assert(!IS_INF_OR_NAN(result[0]));
 234    assert(!IS_INF_OR_NAN(result[0]));
 235    assert(!IS_INF_OR_NAN(result[0]));
 236    assert(!IS_INF_OR_NAN(result[0]));
 237 #endif
 238 }
 239
 240
 241 /**
 242  * Fetch the derivative with respect to X or Y for the given register.
 243  * XXX this currently only works for fragment program input attribs.
 244  */
 245 static void
 246 fetch_vector4_deriv(struct gl_context * ctx,
 247                     const struct prog_src_register *source,
 248                     const struct gl_program_machine *machine,
 249                     char xOrY, GLfloat result[4])
 250 {
 251    if (source->File == PROGRAM_INPUT &&
 252        source->Index < (GLint) machine->NumDeriv) {
 253       const GLint col = machine->CurElement;
 254       const GLfloat w = machine->Attribs[VARYING_SLOT_POS][col][3];
 255       const GLfloat invQ = 1.0f / w;
 256       GLfloat deriv[4];
 257
 258       if (xOrY == 'X') {
 259          deriv[0] = machine->DerivX[source->Index][0] * invQ;
 260          deriv[1] = machine->DerivX[source->Index][1] * invQ;
 261          deriv[2] = machine->DerivX[source->Index][2] * invQ;
 262          deriv[3] = machine->DerivX[source->Index][3] * invQ;
 263       }
 264       else {
 265          deriv[0] = machine->DerivY[source->Index][0] * invQ;
 266          deriv[1] = machine->DerivY[source->Index][1] * invQ;
 267          deriv[2] = machine->DerivY[source->Index][2] * invQ;
 268          deriv[3] = machine->DerivY[source->Index][3] * invQ;
 269       }
 270
 271       result[0] = deriv[GET_SWZ(source->Swizzle, 0)];
 272       result[1] = deriv[GET_SWZ(source->Swizzle, 1)];
 273       result[2] = deriv[GET_SWZ(source->Swizzle, 2)];
 274       result[3] = deriv[GET_SWZ(source->Swizzle, 3)];
 275
 276       if (source->Abs) {
 277          result[0] = FABSF(result[0]);
 278          result[1] = FABSF(result[1]);
 279          result[2] = FABSF(result[2]);
 280          result[3] = FABSF(result[3]);
 281       }
 282       if (source->Negate) {
 283          ASSERT(source->Negate == NEGATE_XYZW);
 284          result[0] = -result[0];
 285          result[1] = -result[1];
 286          result[2] = -result[2];
 287          result[3] = -result[3];
 288       }
 289    }
 290    else {
 291       ASSIGN_4V(result, 0.0, 0.0, 0.0, 0.0);
 292    }
 293 }
 294
 295
 296 /**
 297  * As above, but only return result[0] element.
 298  */
 299 static void
 300 fetch_vector1(const struct prog_src_register *source,
 301               const struct gl_program_machine *machine, GLfloat result[4])
 302 {
 303    const GLfloat *src = get_src_register_pointer(source, machine);
 304
 305    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 306
 307    if (source->Abs) {
 308       result[0] = FABSF(result[0]);
 309    }
 310    if (source->Negate) {
 311       result[0] = -result[0];
 312    }
 313 }
 314
 315
 316 static GLuint
 317 fetch_vector1ui(const struct prog_src_register *source,
 318                 const struct gl_program_machine *machine)
 319 {
 320    const GLuint *src = (GLuint *) get_src_register_pointer(source, machine);
 321    return src[GET_SWZ(source->Swizzle, 0)];
 322 }
 323
 324
 325 /**
 326  * Fetch texel from texture.  Use partial derivatives when possible.
 327  */
 328 static inline void
 329 fetch_texel(struct gl_context *ctx,
 330             const struct gl_program_machine *machine,
 331             const struct prog_instruction *inst,
 332             const GLfloat texcoord[4], GLfloat lodBias,
 333             GLfloat color[4])
 334 {
 335    const GLuint unit = machine->Samplers[inst->TexSrcUnit];
 336
 337    /* Note: we only have the right derivatives for fragment input attribs.
 338     */
 339    if (machine->NumDeriv > 0 &&
 340        inst->SrcReg[0].File == PROGRAM_INPUT &&
 341        inst->SrcReg[0].Index == VARYING_SLOT_TEX0 + inst->TexSrcUnit) {
 342       /* simple texture fetch for which we should have derivatives */
 343       GLuint attr = inst->SrcReg[0].Index;
 344       machine->FetchTexelDeriv(ctx, texcoord,
 345                                machine->DerivX[attr],
 346                                machine->DerivY[attr],
 347                                lodBias, unit, color);
 348    }
 349    else {
 350       machine->FetchTexelLod(ctx, texcoord, lodBias, unit, color);
 351    }
 352 }
 353
 354
 355 /**
 356  * Test value against zero and return GT, LT, EQ or UN if NaN.
 357  */
 358 static inline GLuint
 359 generate_cc(float value)
 360 {
 361    if (value != value)
 362       return COND_UN;           /* NaN */
 363    if (value > 0.0F)
 364       return COND_GT;
 365    if (value < 0.0F)
 366       return COND_LT;
 367    return COND_EQ;
 368 }
 369
 370
 371 /**
 372  * Test if the ccMaskRule is satisfied by the given condition code.
 373  * Used to mask destination writes according to the current condition code.
 374  */
 375 static inline GLboolean
 376 test_cc(GLuint condCode, GLuint ccMaskRule)
 377 {
 378    switch (ccMaskRule) {
 379    case COND_EQ: return (condCode == COND_EQ);
 380    case COND_NE: return (condCode != COND_EQ);
 381    case COND_LT: return (condCode == COND_LT);
 382    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 383    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 384    case COND_GT: return (condCode == COND_GT);
 385    case COND_TR: return GL_TRUE;
 386    case COND_FL: return GL_FALSE;
 387    default:      return GL_TRUE;
 388    }
 389 }
 390
 391
 392 /**
 393  * Evaluate the 4 condition codes against a predicate and return GL_TRUE
 394  * or GL_FALSE to indicate result.
 395  */
 396 static inline GLboolean
 397 eval_condition(const struct gl_program_machine *machine,
 398                const struct prog_instruction *inst)
 399 {
 400    const GLuint swizzle = inst->DstReg.CondSwizzle;
 401    const GLuint condMask = inst->DstReg.CondMask;
 402    if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 403        test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 404        test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 405        test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 406       return GL_TRUE;
 407    }
 408    else {
 409       return GL_FALSE;
 410    }
 411 }
 412
 413
 414
 415 /**
 416  * Store 4 floats into a register.  Observe the instructions saturate and
 417  * set-condition-code flags.
 418  */
 419 static void
 420 store_vector4(const struct prog_instruction *inst,
 421               struct gl_program_machine *machine, const GLfloat value[4])
 422 {
 423    const struct prog_dst_register *dstReg = &(inst->DstReg);
 424    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 425    GLuint writeMask = dstReg->WriteMask;
 426    GLfloat clampedValue[4];
 427    GLfloat *dst = get_dst_register_pointer(dstReg, machine);
 428
 429 #if 0
 430    if (value[0] > 1.0e10 ||
 431        IS_INF_OR_NAN(value[0]) ||
 432        IS_INF_OR_NAN(value[1]) ||
 433        IS_INF_OR_NAN(value[2]) || IS_INF_OR_NAN(value[3]))
 434       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 435 #endif
 436
 437    if (clamp) {
 438       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 439       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 440       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 441       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 442       value = clampedValue;
 443    }
 444
 445    if (dstReg->CondMask != COND_TR) {
 446       /* condition codes may turn off some writes */
 447       if (writeMask & WRITEMASK_X) {
 448          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 0)],
 449                       dstReg->CondMask))
 450             writeMask &= ~WRITEMASK_X;
 451       }
 452       if (writeMask & WRITEMASK_Y) {
 453          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 1)],
 454                       dstReg->CondMask))
 455             writeMask &= ~WRITEMASK_Y;
 456       }
 457       if (writeMask & WRITEMASK_Z) {
 458          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 2)],
 459                       dstReg->CondMask))
 460             writeMask &= ~WRITEMASK_Z;
 461       }
 462       if (writeMask & WRITEMASK_W) {
 463          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 3)],
 464                       dstReg->CondMask))
 465             writeMask &= ~WRITEMASK_W;
 466       }
 467    }
 468
 469 #ifdef NAN_CHECK
 470    assert(!IS_INF_OR_NAN(value[0]));
 471    assert(!IS_INF_OR_NAN(value[0]));
 472    assert(!IS_INF_OR_NAN(value[0]));
 473    assert(!IS_INF_OR_NAN(value[0]));
 474 #endif
 475
 476    if (writeMask & WRITEMASK_X)
 477       dst[0] = value[0];
 478    if (writeMask & WRITEMASK_Y)
 479       dst[1] = value[1];
 480    if (writeMask & WRITEMASK_Z)
 481       dst[2] = value[2];
 482    if (writeMask & WRITEMASK_W)
 483       dst[3] = value[3];
 484
 485    if (inst->CondUpdate) {
 486       if (writeMask & WRITEMASK_X)
 487          machine->CondCodes[0] = generate_cc(value[0]);
 488       if (writeMask & WRITEMASK_Y)
 489          machine->CondCodes[1] = generate_cc(value[1]);
 490       if (writeMask & WRITEMASK_Z)
 491          machine->CondCodes[2] = generate_cc(value[2]);
 492       if (writeMask & WRITEMASK_W)
 493          machine->CondCodes[3] = generate_cc(value[3]);
 494 #if DEBUG_PROG
 495       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 496              _mesa_condcode_string(machine->CondCodes[0]),
 497              _mesa_condcode_string(machine->CondCodes[1]),
 498              _mesa_condcode_string(machine->CondCodes[2]),
 499              _mesa_condcode_string(machine->CondCodes[3]));
 500 #endif
 501    }
 502 }
 503
 504
 505 /**
 506  * Store 4 uints into a register.  Observe the set-condition-code flags.
 507  */
 508 static void
 509 store_vector4ui(const struct prog_instruction *inst,
 510                 struct gl_program_machine *machine, const GLuint value[4])
 511 {
 512    const struct prog_dst_register *dstReg = &(inst->DstReg);
 513    GLuint writeMask = dstReg->WriteMask;
 514    GLuint *dst = (GLuint *) get_dst_register_pointer(dstReg, machine);
 515
 516    if (dstReg->CondMask != COND_TR) {
 517       /* condition codes may turn off some writes */
 518       if (writeMask & WRITEMASK_X) {
 519          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 0)],
 520                       dstReg->CondMask))
 521             writeMask &= ~WRITEMASK_X;
 522       }
 523       if (writeMask & WRITEMASK_Y) {
 524          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 1)],
 525                       dstReg->CondMask))
 526             writeMask &= ~WRITEMASK_Y;
 527       }
 528       if (writeMask & WRITEMASK_Z) {
 529          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 2)],
 530                       dstReg->CondMask))
 531             writeMask &= ~WRITEMASK_Z;
 532       }
 533       if (writeMask & WRITEMASK_W) {
 534          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 3)],
 535                       dstReg->CondMask))
 536             writeMask &= ~WRITEMASK_W;
 537       }
 538    }
 539
 540    if (writeMask & WRITEMASK_X)
 541       dst[0] = value[0];
 542    if (writeMask & WRITEMASK_Y)
 543       dst[1] = value[1];
 544    if (writeMask & WRITEMASK_Z)
 545       dst[2] = value[2];
 546    if (writeMask & WRITEMASK_W)
 547       dst[3] = value[3];
 548
 549    if (inst->CondUpdate) {
 550       if (writeMask & WRITEMASK_X)
 551          machine->CondCodes[0] = generate_cc((float)value[0]);
 552       if (writeMask & WRITEMASK_Y)
 553          machine->CondCodes[1] = generate_cc((float)value[1]);
 554       if (writeMask & WRITEMASK_Z)
 555          machine->CondCodes[2] = generate_cc((float)value[2]);
 556       if (writeMask & WRITEMASK_W)
 557          machine->CondCodes[3] = generate_cc((float)value[3]);
 558 #if DEBUG_PROG
 559       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 560              _mesa_condcode_string(machine->CondCodes[0]),
 561              _mesa_condcode_string(machine->CondCodes[1]),
 562              _mesa_condcode_string(machine->CondCodes[2]),
 563              _mesa_condcode_string(machine->CondCodes[3]));
 564 #endif
 565    }
 566 }
 567
 568
 569
 570 /**
 571  * Execute the given vertex/fragment program.
 572  *
 573  * \param ctx  rendering context
 574  * \param program  the program to execute
 575  * \param machine  machine state (must be initialized)
 576  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 577  */
 578 GLboolean
 579 _mesa_execute_program(struct gl_context * ctx,
 580                       const struct gl_program *program,
 581                       struct gl_program_machine *machine)
 582 {
 583    const GLuint numInst = program->NumInstructions;
 584    const GLuint maxExec = 65536;
 585    GLuint pc, numExec = 0;
 586
 587    machine->CurProgram = program;
 588
 589    if (DEBUG_PROG) {
 590       printf("execute program %u --------------------\n", program->Id);
 591    }
 592
 593    if (program->Target == GL_VERTEX_PROGRAM_ARB) {
 594       machine->EnvParams = ctx->VertexProgram.Parameters;
 595    }
 596    else {
 597       machine->EnvParams = ctx->FragmentProgram.Parameters;
 598    }
 599
 600    for (pc = 0; pc < numInst; pc++) {
 601       const struct prog_instruction *inst = program->Instructions + pc;
 602
 603       if (DEBUG_PROG) {
 604          _mesa_print_instruction(inst);
 605       }
 606
 607       switch (inst->Opcode) {
 608       case OPCODE_ABS:
 609          {
 610             GLfloat a[4], result[4];
 611             fetch_vector4(&inst->SrcReg[0], machine, a);
 612             result[0] = FABSF(a[0]);
 613             result[1] = FABSF(a[1]);
 614             result[2] = FABSF(a[2]);
 615             result[3] = FABSF(a[3]);
 616             store_vector4(inst, machine, result);
 617          }
 618          break;
 619       case OPCODE_ADD:
 620          {
 621             GLfloat a[4], b[4], result[4];
 622             fetch_vector4(&inst->SrcReg[0], machine, a);
 623             fetch_vector4(&inst->SrcReg[1], machine, b);
 624             result[0] = a[0] + b[0];
 625             result[1] = a[1] + b[1];
 626             result[2] = a[2] + b[2];
 627             result[3] = a[3] + b[3];
 628             store_vector4(inst, machine, result);
 629             if (DEBUG_PROG) {
 630                printf("ADD (%g %g %g %g) = (%g %g %g %g) + (%g %g %g %g)\n",
 631                       result[0], result[1], result[2], result[3],
 632                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 633             }
 634          }
 635          break;
 636       case OPCODE_ARL:
 637          {
 638             GLfloat t[4];
 639             fetch_vector4(&inst->SrcReg[0], machine, t);
 640             machine->AddressReg[0][0] = IFLOOR(t[0]);
 641             if (DEBUG_PROG) {
 642                printf("ARL %d\n", machine->AddressReg[0][0]);
 643             }
 644          }
 645          break;
 646       case OPCODE_BGNLOOP:
 647          /* no-op */
 648          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 649                 == OPCODE_ENDLOOP);
 650          break;
 651       case OPCODE_ENDLOOP:
 652          /* subtract 1 here since pc is incremented by for(pc) loop */
 653          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 654                 == OPCODE_BGNLOOP);
 655          pc = inst->BranchTarget - 1;   /* go to matching BNGLOOP */
 656          break;
 657       case OPCODE_BGNSUB:      /* begin subroutine */
 658          break;
 659       case OPCODE_ENDSUB:      /* end subroutine */
 660          break;
 661       case OPCODE_BRK:         /* break out of loop (conditional) */
 662          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 663                 == OPCODE_ENDLOOP);
 664          if (eval_condition(machine, inst)) {
 665             /* break out of loop */
 666             /* pc++ at end of for-loop will put us after the ENDLOOP inst */
 667             pc = inst->BranchTarget;
 668          }
 669          break;
 670       case OPCODE_CONT:        /* continue loop (conditional) */
 671          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 672                 == OPCODE_ENDLOOP);
 673          if (eval_condition(machine, inst)) {
 674             /* continue at ENDLOOP */
 675             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 676             pc = inst->BranchTarget - 1;
 677          }
 678          break;
 679       case OPCODE_CAL:         /* Call subroutine (conditional) */
 680          if (eval_condition(machine, inst)) {
 681             /* call the subroutine */
 682             if (machine->StackDepth >= MAX_PROGRAM_CALL_DEPTH) {
 683                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
 684             }
 685             machine->CallStack[machine->StackDepth++] = pc + 1; /* next inst */
 686             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 687             pc = inst->BranchTarget - 1;
 688          }
 689          break;
 690       case OPCODE_CMP:
 691          {
 692             GLfloat a[4], b[4], c[4], result[4];
 693             fetch_vector4(&inst->SrcReg[0], machine, a);
 694             fetch_vector4(&inst->SrcReg[1], machine, b);
 695             fetch_vector4(&inst->SrcReg[2], machine, c);
 696             result[0] = a[0] < 0.0F ? b[0] : c[0];
 697             result[1] = a[1] < 0.0F ? b[1] : c[1];
 698             result[2] = a[2] < 0.0F ? b[2] : c[2];
 699             result[3] = a[3] < 0.0F ? b[3] : c[3];
 700             store_vector4(inst, machine, result);
 701             if (DEBUG_PROG) {
 702                printf("CMP (%g %g %g %g) = (%g %g %g %g) < 0 ? (%g %g %g %g) : (%g %g %g %g)\n",
 703                       result[0], result[1], result[2], result[3],
 704                       a[0], a[1], a[2], a[3],
 705                       b[0], b[1], b[2], b[3],
 706                       c[0], c[1], c[2], c[3]);
 707             }
 708          }
 709          break;
 710       case OPCODE_COS:
 711          {
 712             GLfloat a[4], result[4];
 713             fetch_vector1(&inst->SrcReg[0], machine, a);
 714             result[0] = result[1] = result[2] = result[3]
 715                = (GLfloat) cos(a[0]);
 716             store_vector4(inst, machine, result);
 717          }
 718          break;
 719       case OPCODE_DDX:         /* Partial derivative with respect to X */
 720          {
 721             GLfloat result[4];
 722             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 723                                 'X', result);
 724             store_vector4(inst, machine, result);
 725          }
 726          break;
 727       case OPCODE_DDY:         /* Partial derivative with respect to Y */
 728          {
 729             GLfloat result[4];
 730             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 731                                 'Y', result);
 732             store_vector4(inst, machine, result);
 733          }
 734          break;
 735       case OPCODE_DP2:
 736          {
 737             GLfloat a[4], b[4], result[4];
 738             fetch_vector4(&inst->SrcReg[0], machine, a);
 739             fetch_vector4(&inst->SrcReg[1], machine, b);
 740             result[0] = result[1] = result[2] = result[3] = DOT2(a, b);
 741             store_vector4(inst, machine, result);
 742             if (DEBUG_PROG) {
 743                printf("DP2 %g = (%g %g) . (%g %g)\n",
 744                       result[0], a[0], a[1], b[0], b[1]);
 745             }
 746          }
 747          break;
 748       case OPCODE_DP3:
 749          {
 750             GLfloat a[4], b[4], result[4];
 751             fetch_vector4(&inst->SrcReg[0], machine, a);
 752             fetch_vector4(&inst->SrcReg[1], machine, b);
 753             result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
 754             store_vector4(inst, machine, result);
 755             if (DEBUG_PROG) {
 756                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 757                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 758             }
 759          }
 760          break;
 761       case OPCODE_DP4:
 762          {
 763             GLfloat a[4], b[4], result[4];
 764             fetch_vector4(&inst->SrcReg[0], machine, a);
 765             fetch_vector4(&inst->SrcReg[1], machine, b);
 766             result[0] = result[1] = result[2] = result[3] = DOT4(a, b);
 767             store_vector4(inst, machine, result);
 768             if (DEBUG_PROG) {
 769                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 770                       result[0], a[0], a[1], a[2], a[3],
 771                       b[0], b[1], b[2], b[3]);
 772             }
 773          }
 774          break;
 775       case OPCODE_DPH:
 776          {
 777             GLfloat a[4], b[4], result[4];
 778             fetch_vector4(&inst->SrcReg[0], machine, a);
 779             fetch_vector4(&inst->SrcReg[1], machine, b);
 780             result[0] = result[1] = result[2] = result[3] = DOT3(a, b) + b[3];
 781             store_vector4(inst, machine, result);
 782          }
 783          break;
 784       case OPCODE_DST:         /* Distance vector */
 785          {
 786             GLfloat a[4], b[4], result[4];
 787             fetch_vector4(&inst->SrcReg[0], machine, a);
 788             fetch_vector4(&inst->SrcReg[1], machine, b);
 789             result[0] = 1.0F;
 790             result[1] = a[1] * b[1];
 791             result[2] = a[2];
 792             result[3] = b[3];
 793             store_vector4(inst, machine, result);
 794          }
 795          break;
 796       case OPCODE_EXP:
 797          {
 798             GLfloat t[4], q[4], floor_t0;
 799             fetch_vector1(&inst->SrcReg[0], machine, t);
 800             floor_t0 = FLOORF(t[0]);
 801             if (floor_t0 > FLT_MAX_EXP) {
 802                SET_POS_INFINITY(q[0]);
 803                SET_POS_INFINITY(q[2]);
 804             }
 805             else if (floor_t0 < FLT_MIN_EXP) {
 806                q[0] = 0.0F;
 807                q[2] = 0.0F;
 808             }
 809             else {
 810                q[0] = LDEXPF(1.0, (int) floor_t0);
 811                /* Note: GL_NV_vertex_program expects
 812                 * result.z = result.x * APPX(result.y)
 813                 * We do what the ARB extension says.
 814                 */
 815                q[2] = (GLfloat) pow(2.0, t[0]);
 816             }
 817             q[1] = t[0] - floor_t0;
 818             q[3] = 1.0F;
 819             store_vector4( inst, machine, q );
 820          }
 821          break;
 822       case OPCODE_EX2:         /* Exponential base 2 */
 823          {
 824             GLfloat a[4], result[4], val;
 825             fetch_vector1(&inst->SrcReg[0], machine, a);
 826             val = (GLfloat) pow(2.0, a[0]);
 827             /*
 828             if (IS_INF_OR_NAN(val))
 829                val = 1.0e10;
 830             */
 831             result[0] = result[1] = result[2] = result[3] = val;
 832             store_vector4(inst, machine, result);
 833          }
 834          break;
 835       case OPCODE_FLR:
 836          {
 837             GLfloat a[4], result[4];
 838             fetch_vector4(&inst->SrcReg[0], machine, a);
 839             result[0] = FLOORF(a[0]);
 840             result[1] = FLOORF(a[1]);
 841             result[2] = FLOORF(a[2]);
 842             result[3] = FLOORF(a[3]);
 843             store_vector4(inst, machine, result);
 844          }
 845          break;
 846       case OPCODE_FRC:
 847          {
 848             GLfloat a[4], result[4];
 849             fetch_vector4(&inst->SrcReg[0], machine, a);
 850             result[0] = a[0] - FLOORF(a[0]);
 851             result[1] = a[1] - FLOORF(a[1]);
 852             result[2] = a[2] - FLOORF(a[2]);
 853             result[3] = a[3] - FLOORF(a[3]);
 854             store_vector4(inst, machine, result);
 855          }
 856          break;
 857       case OPCODE_IF:
 858          {
 859             GLboolean cond;
 860             ASSERT(program->Instructions[inst->BranchTarget].Opcode
 861                    == OPCODE_ELSE ||
 862                    program->Instructions[inst->BranchTarget].Opcode
 863                    == OPCODE_ENDIF);
 864             /* eval condition */
 865             if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
 866                GLfloat a[4];
 867                fetch_vector1(&inst->SrcReg[0], machine, a);
 868                cond = (a[0] != 0.0);
 869             }
 870             else {
 871                cond = eval_condition(machine, inst);
 872             }
 873             if (DEBUG_PROG) {
 874                printf("IF: %d\n", cond);
 875             }
 876             /* do if/else */
 877             if (cond) {
 878                /* do if-clause (just continue execution) */
 879             }
 880             else {
 881                /* go to the instruction after ELSE or ENDIF */
 882                assert(inst->BranchTarget >= 0);
 883                pc = inst->BranchTarget;
 884             }
 885          }
 886          break;
 887       case OPCODE_ELSE:
 888          /* goto ENDIF */
 889          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 890                 == OPCODE_ENDIF);
 891          assert(inst->BranchTarget >= 0);
 892          pc = inst->BranchTarget;
 893          break;
 894       case OPCODE_ENDIF:
 895          /* nothing */
 896          break;
 897       case OPCODE_KIL_NV:      /* NV_f_p only (conditional) */
 898          if (eval_condition(machine, inst)) {
 899             return GL_FALSE;
 900          }
 901          break;
 902       case OPCODE_KIL:         /* ARB_f_p only: kill on negative component */
 903          {
 904             GLfloat a[4];
 905             fetch_vector4(&inst->SrcReg[0], machine, a);
 906             if (DEBUG_PROG) {
 907                printf("KIL if (%g %g %g %g) < 0.0\n",
 908                       a[0], a[1], a[2], a[3]);
 909             }
 910             /* Strictly negative only: a zero component does not kill. */
 911             if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 912                return GL_FALSE;
 913             }
 914          }
 915          break;
 916       case OPCODE_LG2:         /* log base 2 */
 917          {
 918             GLfloat a[4], result[4], val;
 919             fetch_vector1(&inst->SrcReg[0], machine, a);
 920             /* The fast LOG2 macro doesn't meet the precision requirements.
 921              */
 922             if (a[0] == 0.0F) {
 923                val = -FLT_MAX;
 924             }
 925             else {
 926                val = (float)(log(a[0]) * 1.442695F);
 927             }
 928             result[0] = result[1] = result[2] = result[3] = val;
 929             store_vector4(inst, machine, result);
 930          }
 931          break;
 932       case OPCODE_LIT:
 933          {
 934             const GLfloat epsilon = 1.0F / 256.0F;      /* from NV VP spec */
 935             GLfloat a[4], result[4];
 936             fetch_vector4(&inst->SrcReg[0], machine, a);
 937             a[0] = MAX2(a[0], 0.0F);
 938             a[1] = MAX2(a[1], 0.0F);
 939             /* XXX ARB version clamps a[3], NV version doesn't */
 940             a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 941             result[0] = 1.0F;
 942             result[1] = a[0];
 943             /* XXX we could probably just use pow() here */
 944             if (a[0] > 0.0F) {
 945                if (a[1] == 0.0 && a[3] == 0.0)
 946                   result[2] = 1.0F;
 947                else
 948                   result[2] = (GLfloat) pow(a[1], a[3]);
 949             }
 950             else {
 951                result[2] = 0.0F;
 952             }
 953             result[3] = 1.0F;
 954             store_vector4(inst, machine, result);
 955             if (DEBUG_PROG) {
 956                printf("LIT (%g %g %g %g) : (%g %g %g %g)\n",
 957                       result[0], result[1], result[2], result[3],
 958                       a[0], a[1], a[2], a[3]);
 959             }
 960          }
 961          break;
 962       case OPCODE_LOG:         /* partial-precision log2 of |src.x| */
 963          {
 964             GLfloat t[4], q[4], abs_t0;
 965             fetch_vector1(&inst->SrcReg[0], machine, t);
 966             abs_t0 = FABSF(t[0]);   /* LOG is defined on the absolute value */
 967             if (abs_t0 != 0.0F) {
 968                if (IS_INF_OR_NAN(abs_t0))
 969                {
 970                   SET_POS_INFINITY(q[0]);
 971                   q[1] = 1.0F;
 972                   SET_POS_INFINITY(q[2]);
 973                }
 974                else {
 975                   int exponent;
 976                   GLfloat mantissa = FREXPF(abs_t0, &exponent);
 977                   q[0] = (GLfloat) (exponent - 1);   /* floor(log2(|t|)) */
 978                   q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 979
 980                   /* The fast LOG2 macro doesn't meet the precision
 981                    * requirements.  Use |t| so a negative source yields a
 982                    * real value, not NaN.  1.442695 ~= 1 / ln(2). */
 983                   q[2] = (float)(log(abs_t0) * 1.442695F);
 984                }
 985             }
 986             else {
 987                SET_NEG_INFINITY(q[0]);   /* log2(0) -> -infinity */
 988                q[1] = 1.0F;
 989                SET_NEG_INFINITY(q[2]);
 990             }
 991             q[3] = 1.0;
 992             store_vector4(inst, machine, q);
 993          }
 994          break;
 995       case OPCODE_LRP:
 996          {
 997             GLfloat a[4], b[4], c[4], result[4];
 998             fetch_vector4(&inst->SrcReg[0], machine, a);
 999             fetch_vector4(&inst->SrcReg[1], machine, b);
1000             fetch_vector4(&inst->SrcReg[2], machine, c);
1001             result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
1002             result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
1003             result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
1004             result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
1005             store_vector4(inst, machine, result);
1006             if (DEBUG_PROG) {
1007                printf("LRP (%g %g %g %g) = (%g %g %g %g), "
1008                       "(%g %g %g %g), (%g %g %g %g)\n",
1009                       result[0], result[1], result[2], result[3],
1010                       a[0], a[1], a[2], a[3],
1011                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
1012             }
1013          }
1014          break;
1015       case OPCODE_MAD:
1016          {
1017             GLfloat a[4], b[4], c[4], result[4];
1018             fetch_vector4(&inst->SrcReg[0], machine, a);
1019             fetch_vector4(&inst->SrcReg[1], machine, b);
1020             fetch_vector4(&inst->SrcReg[2], machine, c);
1021             result[0] = a[0] * b[0] + c[0];
1022             result[1] = a[1] * b[1] + c[1];
1023             result[2] = a[2] * b[2] + c[2];
1024             result[3] = a[3] * b[3] + c[3];
1025             store_vector4(inst, machine, result);
1026             if (DEBUG_PROG) {
1027                printf("MAD (%g %g %g %g) = (%g %g %g %g) * "
1028                       "(%g %g %g %g) + (%g %g %g %g)\n",
1029                       result[0], result[1], result[2], result[3],
1030                       a[0], a[1], a[2], a[3],
1031                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
1032             }
1033          }
1034          break;
1035       case OPCODE_MAX:
1036          {
1037             GLfloat a[4], b[4], result[4];
1038             fetch_vector4(&inst->SrcReg[0], machine, a);
1039             fetch_vector4(&inst->SrcReg[1], machine, b);
1040             result[0] = MAX2(a[0], b[0]);
1041             result[1] = MAX2(a[1], b[1]);
1042             result[2] = MAX2(a[2], b[2]);
1043             result[3] = MAX2(a[3], b[3]);
1044             store_vector4(inst, machine, result);
1045             if (DEBUG_PROG) {
1046                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
1047                       result[0], result[1], result[2], result[3],
1048                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1049             }
1050          }
1051          break;
1052       case OPCODE_MIN:
1053          {
1054             GLfloat a[4], b[4], result[4];
1055             fetch_vector4(&inst->SrcReg[0], machine, a);
1056             fetch_vector4(&inst->SrcReg[1], machine, b);
1057             result[0] = MIN2(a[0], b[0]);
1058             result[1] = MIN2(a[1], b[1]);
1059             result[2] = MIN2(a[2], b[2]);
1060             result[3] = MIN2(a[3], b[3]);
1061             store_vector4(inst, machine, result);
1062          }
1063          break;
1064       case OPCODE_MOV:
1065          {
1066             GLfloat result[4];
1067             fetch_vector4(&inst->SrcReg[0], machine, result);
1068             store_vector4(inst, machine, result);
1069             if (DEBUG_PROG) {
1070                printf("MOV (%g %g %g %g)\n",
1071                       result[0], result[1], result[2], result[3]);
1072             }
1073          }
1074          break;
1075       case OPCODE_MUL:
1076          {
1077             GLfloat a[4], b[4], result[4];
1078             fetch_vector4(&inst->SrcReg[0], machine, a);
1079             fetch_vector4(&inst->SrcReg[1], machine, b);
1080             result[0] = a[0] * b[0];
1081             result[1] = a[1] * b[1];
1082             result[2] = a[2] * b[2];
1083             result[3] = a[3] * b[3];
1084             store_vector4(inst, machine, result);
1085             if (DEBUG_PROG) {
1086                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
1087                       result[0], result[1], result[2], result[3],
1088                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1089             }
1090          }
1091          break;
1092       case OPCODE_NOISE1:
1093          {
1094             GLfloat a[4], result[4];
1095             fetch_vector1(&inst->SrcReg[0], machine, a);
1096             result[0] =
1097                result[1] =
1098                result[2] =
1099                result[3] = _mesa_noise1(a[0]);
1100             store_vector4(inst, machine, result);
1101          }
1102          break;
1103       case OPCODE_NOISE2:
1104          {
1105             GLfloat a[4], result[4];
1106             fetch_vector4(&inst->SrcReg[0], machine, a);
1107             result[0] =
1108                result[1] =
1109                result[2] = result[3] = _mesa_noise2(a[0], a[1]);
1110             store_vector4(inst, machine, result);
1111          }
1112          break;
1113       case OPCODE_NOISE3:
1114          {
1115             GLfloat a[4], result[4];
1116             fetch_vector4(&inst->SrcReg[0], machine, a);
1117             result[0] =
1118                result[1] =
1119                result[2] =
1120                result[3] = _mesa_noise3(a[0], a[1], a[2]);
1121             store_vector4(inst, machine, result);
1122          }
1123          break;
1124       case OPCODE_NOISE4:
1125          {
1126             GLfloat a[4], result[4];
1127             fetch_vector4(&inst->SrcReg[0], machine, a);
1128             result[0] =
1129                result[1] =
1130                result[2] =
1131                result[3] = _mesa_noise4(a[0], a[1], a[2], a[3]);
1132             store_vector4(inst, machine, result);
1133          }
1134          break;
1135       case OPCODE_NOP:
1136          break;
1137       case OPCODE_PK2H:        /* pack two 16-bit floats in one 32-bit float */
1138          {
1139             GLfloat a[4];
1140             GLuint result[4];   /* same packed word goes to all four components */
1141             GLhalfNV hx, hy;
1142             fetch_vector4(&inst->SrcReg[0], machine, a);
1143             hx = _mesa_float_to_half(a[0]);   /* low 16 bits  <- src.x */
1144             hy = _mesa_float_to_half(a[1]);   /* high 16 bits <- src.y */
1145             result[0] =
1146             result[1] =
1147             result[2] =   /* shift as GLuint: int promotion could overflow */
1148             result[3] = (GLuint) hx | ((GLuint) hy << 16);
1149             store_vector4ui(inst, machine, result);
1150          }
1151          break;
1152       case OPCODE_PK2US:       /* pack two GLushorts into one 32-bit float */
1153          {
1154             GLfloat a[4];
1155             GLuint result[4], usx, usy;
1156             fetch_vector4(&inst->SrcReg[0], machine, a);
1157             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1158             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1159             usx = F_TO_I(a[0] * 65535.0F);
1160             usy = F_TO_I(a[1] * 65535.0F);
1161             result[0] =
1162             result[1] =
1163             result[2] =
1164             result[3] = usx | (usy << 16);
1165             store_vector4ui(inst, machine, result);
1166          }
1167          break;
1168       case OPCODE_PK4B:        /* pack four GLbytes into one 32-bit float */
1169          {
1170             GLfloat a[4];
1171             GLuint result[4], ubx, uby, ubz, ubw;
1172             fetch_vector4(&inst->SrcReg[0], machine, a);
1173             a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
1174             a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
1175             a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
1176             a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
1177             ubx = F_TO_I(127.0F * a[0] + 128.0F);
1178             uby = F_TO_I(127.0F * a[1] + 128.0F);
1179             ubz = F_TO_I(127.0F * a[2] + 128.0F);
1180             ubw = F_TO_I(127.0F * a[3] + 128.0F);
1181             result[0] =
1182             result[1] =
1183             result[2] =
1184             result[3] = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1185             store_vector4ui(inst, machine, result);
1186          }
1187          break;
1188       case OPCODE_PK4UB:       /* pack four GLubytes into one 32-bit float */
1189          {
1190             GLfloat a[4];
1191             GLuint result[4], ubx, uby, ubz, ubw;
1192             fetch_vector4(&inst->SrcReg[0], machine, a);
1193             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1194             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1195             a[2] = CLAMP(a[2], 0.0F, 1.0F);
1196             a[3] = CLAMP(a[3], 0.0F, 1.0F);
1197             ubx = F_TO_I(255.0F * a[0]);
1198             uby = F_TO_I(255.0F * a[1]);
1199             ubz = F_TO_I(255.0F * a[2]);
1200             ubw = F_TO_I(255.0F * a[3]);
1201             result[0] =
1202             result[1] =
1203             result[2] =
1204             result[3] = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1205             store_vector4ui(inst, machine, result);
1206          }
1207          break;
1208       case OPCODE_POW:
1209          {
1210             GLfloat a[4], b[4], result[4];
1211             fetch_vector1(&inst->SrcReg[0], machine, a);
1212             fetch_vector1(&inst->SrcReg[1], machine, b);
1213             result[0] = result[1] = result[2] = result[3]
1214                = (GLfloat) pow(a[0], b[0]);
1215             store_vector4(inst, machine, result);
1216          }
1217          break;
1218
1219       case OPCODE_RCP:
1220          {
1221             GLfloat a[4], result[4];
1222             fetch_vector1(&inst->SrcReg[0], machine, a);
1223             if (DEBUG_PROG) {
1224                if (a[0] == 0)
1225                   printf("RCP(0)\n");
1226                else if (IS_INF_OR_NAN(a[0]))
1227                   printf("RCP(inf)\n");
1228             }
1229             result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
1230             store_vector4(inst, machine, result);
1231          }
1232          break;
1233       case OPCODE_RET:         /* return from subroutine (conditional) */
1234          if (eval_condition(machine, inst)) {
1235             if (machine->StackDepth == 0) {
1236                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
1237             }
1238             /* subtract one because of pc++ in the for loop */
1239             pc = machine->CallStack[--machine->StackDepth] - 1;
1240          }
1241          break;
1242       case OPCODE_RFL:         /* reflection vector: 2*(N.I)/(N.N)*N - I */
1243          {
1244             GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
1245             fetch_vector4(&inst->SrcReg[0], machine, axis);
1246             fetch_vector4(&inst->SrcReg[1], machine, dir);
1247             tmpW = DOT3(axis, axis);
1248             tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
1249             result[0] = tmpX * axis[0] - dir[0];
1250             result[1] = tmpX * axis[1] - dir[1];
1251             result[2] = tmpX * axis[2] - dir[2];
1252             result[3] = 0.0F; /* W not computed (XXX enforce in parser!); no garbage */
1253             store_vector4(inst, machine, result);
1254          }
1255          break;
1256       case OPCODE_RSQ:         /* 1 / sqrt() */
1257          {
1258             GLfloat a[4], result[4];
1259             fetch_vector1(&inst->SrcReg[0], machine, a);
1260             a[0] = FABSF(a[0]);
1261             result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1262             store_vector4(inst, machine, result);
1263             if (DEBUG_PROG) {
1264                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1265             }
1266          }
1267          break;
1268       case OPCODE_SCS:         /* sine and cos */
1269          {
1270             GLfloat a[4], result[4];
1271             fetch_vector1(&inst->SrcReg[0], machine, a);
1272             result[0] = (GLfloat) cos(a[0]);
1273             result[1] = (GLfloat) sin(a[0]);
1274             result[2] = 0.0;    /* undefined! */
1275             result[3] = 0.0;    /* undefined! */
1276             store_vector4(inst, machine, result);
1277          }
1278          break;
1279       case OPCODE_SEQ:         /* set on equal */
1280          {
1281             GLfloat a[4], b[4], result[4];
1282             fetch_vector4(&inst->SrcReg[0], machine, a);
1283             fetch_vector4(&inst->SrcReg[1], machine, b);
1284             result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1285             result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1286             result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1287             result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1288             store_vector4(inst, machine, result);
1289             if (DEBUG_PROG) {
1290                printf("SEQ (%g %g %g %g) = (%g %g %g %g) == (%g %g %g %g)\n",
1291                       result[0], result[1], result[2], result[3],
1292                       a[0], a[1], a[2], a[3],
1293                       b[0], b[1], b[2], b[3]);
1294             }
1295          }
1296          break;
1297       case OPCODE_SFL:         /* set false, operands ignored */
1298          {
1299             static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1300             store_vector4(inst, machine, result);
1301          }
1302          break;
1303       case OPCODE_SGE:         /* set on greater or equal */
1304          {
1305             GLfloat a[4], b[4], result[4];
1306             fetch_vector4(&inst->SrcReg[0], machine, a);
1307             fetch_vector4(&inst->SrcReg[1], machine, b);
1308             result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1309             result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1310             result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1311             result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1312             store_vector4(inst, machine, result);
1313             if (DEBUG_PROG) {
1314                printf("SGE (%g %g %g %g) = (%g %g %g %g) >= (%g %g %g %g)\n",
1315                       result[0], result[1], result[2], result[3],
1316                       a[0], a[1], a[2], a[3],
1317                       b[0], b[1], b[2], b[3]);
1318             }
1319          }
1320          break;
1321       case OPCODE_SGT:         /* set on greater */
1322          {
1323             GLfloat a[4], b[4], result[4];
1324             fetch_vector4(&inst->SrcReg[0], machine, a);
1325             fetch_vector4(&inst->SrcReg[1], machine, b);
1326             result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1327             result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1328             result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1329             result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1330             store_vector4(inst, machine, result);
1331             if (DEBUG_PROG) {
1332                printf("SGT (%g %g %g %g) = (%g %g %g %g) > (%g %g %g %g)\n",
1333                       result[0], result[1], result[2], result[3],
1334                       a[0], a[1], a[2], a[3],
1335                       b[0], b[1], b[2], b[3]);
1336             }
1337          }
1338          break;
1339       case OPCODE_SIN:
1340          {
1341             GLfloat a[4], result[4];
1342             fetch_vector1(&inst->SrcReg[0], machine, a);
1343             result[0] = result[1] = result[2] = result[3]
1344                = (GLfloat) sin(a[0]);
1345             store_vector4(inst, machine, result);
1346          }
1347          break;
1348       case OPCODE_SLE:         /* set on less or equal */
1349          {
1350             GLfloat a[4], b[4], result[4];
1351             fetch_vector4(&inst->SrcReg[0], machine, a);
1352             fetch_vector4(&inst->SrcReg[1], machine, b);
1353             result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1354             result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1355             result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1356             result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1357             store_vector4(inst, machine, result);
1358             if (DEBUG_PROG) {
1359                printf("SLE (%g %g %g %g) = (%g %g %g %g) <= (%g %g %g %g)\n",
1360                       result[0], result[1], result[2], result[3],
1361                       a[0], a[1], a[2], a[3],
1362                       b[0], b[1], b[2], b[3]);
1363             }
1364          }
1365          break;
1366       case OPCODE_SLT:         /* set on less */
1367          {
1368             GLfloat a[4], b[4], result[4];
1369             fetch_vector4(&inst->SrcReg[0], machine, a);
1370             fetch_vector4(&inst->SrcReg[1], machine, b);
1371             result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1372             result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1373             result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1374             result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1375             store_vector4(inst, machine, result);
1376             if (DEBUG_PROG) {
1377                printf("SLT (%g %g %g %g) = (%g %g %g %g) < (%g %g %g %g)\n",
1378                       result[0], result[1], result[2], result[3],
1379                       a[0], a[1], a[2], a[3],
1380                       b[0], b[1], b[2], b[3]);
1381             }
1382          }
1383          break;
1384       case OPCODE_SNE:         /* set on not equal */
1385          {
1386             GLfloat a[4], b[4], result[4];
1387             fetch_vector4(&inst->SrcReg[0], machine, a);
1388             fetch_vector4(&inst->SrcReg[1], machine, b);
1389             result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1390             result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1391             result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1392             result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1393             store_vector4(inst, machine, result);
1394             if (DEBUG_PROG) {
1395                printf("SNE (%g %g %g %g) = (%g %g %g %g) != (%g %g %g %g)\n",
1396                       result[0], result[1], result[2], result[3],
1397                       a[0], a[1], a[2], a[3],
1398                       b[0], b[1], b[2], b[3]);
1399             }
1400          }
1401          break;
1402       case OPCODE_SSG:         /* set sign (-1, 0 or +1) */
1403          {
1404             GLfloat a[4], result[4];
1405             fetch_vector4(&inst->SrcReg[0], machine, a);
1406             result[0] = (GLfloat) ((a[0] > 0.0F) - (a[0] < 0.0F));
1407             result[1] = (GLfloat) ((a[1] > 0.0F) - (a[1] < 0.0F));
1408             result[2] = (GLfloat) ((a[2] > 0.0F) - (a[2] < 0.0F));
1409             result[3] = (GLfloat) ((a[3] > 0.0F) - (a[3] < 0.0F));
1410             store_vector4(inst, machine, result);
1411          }
1412          break;
1413       case OPCODE_STR:         /* set true, operands ignored */
1414          {
1415             static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1416             store_vector4(inst, machine, result);
1417          }
1418          break;
1419       case OPCODE_SUB:
1420          {
1421             GLfloat a[4], b[4], result[4];
1422             fetch_vector4(&inst->SrcReg[0], machine, a);
1423             fetch_vector4(&inst->SrcReg[1], machine, b);
1424             result[0] = a[0] - b[0];
1425             result[1] = a[1] - b[1];
1426             result[2] = a[2] - b[2];
1427             result[3] = a[3] - b[3];
1428             store_vector4(inst, machine, result);
1429             if (DEBUG_PROG) {
1430                printf("SUB (%g %g %g %g) = (%g %g %g %g) - (%g %g %g %g)\n",
1431                       result[0], result[1], result[2], result[3],
1432                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1433             }
1434          }
1435          break;
1436       case OPCODE_SWZ:         /* extended swizzle */
1437          {
1438             const struct prog_src_register *source = &inst->SrcReg[0];
1439             const GLfloat *src = get_src_register_pointer(source, machine);
1440             GLfloat result[4];
1441             GLuint i;
1442             for (i = 0; i < 4; i++) {
1443                const GLuint swz = GET_SWZ(source->Swizzle, i);
1444                if (swz == SWIZZLE_ZERO)
1445                   result[i] = 0.0;
1446                else if (swz == SWIZZLE_ONE)
1447                   result[i] = 1.0;
1448                else {
1449                   ASSERT(swz >= 0);
1450                   ASSERT(swz <= 3);
1451                   result[i] = src[swz];
1452                }
1453                if (source->Negate & (1 << i))
1454                   result[i] = -result[i];
1455             }
1456             store_vector4(inst, machine, result);
1457          }
1458          break;
1459       case OPCODE_TEX:         /* Both ARB and NV frag prog */
1460          /* Simple texel lookup */
1461          {
1462             GLfloat texcoord[4], color[4];
1463             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1464
1465             /* For TEX, texcoord.Q should not be used and its value should not
1466              * matter (at most, we pass coord.xyz to texture3D() in GLSL).
1467              * Set Q=1 so that FetchTexelDeriv() doesn't get a garbage value
1468              * which is effectively what happens when the texcoord swizzle
1469              * is .xyzz
1470              */
1471             texcoord[3] = 1.0f;
1472
1473             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1474
1475             if (DEBUG_PROG) {
1476                printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g]\n",
1477                       color[0], color[1], color[2], color[3],
1478                       inst->TexSrcUnit,
1479                       texcoord[0], texcoord[1], texcoord[2], texcoord[3]);
1480             }
1481             store_vector4(inst, machine, color);
1482          }
1483          break;
1484       case OPCODE_TXB:         /* GL_ARB_fragment_program only */
1485          /* Texel lookup with LOD bias */
1486          {
1487             GLfloat texcoord[4], color[4], lodBias;
1488
1489             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1490
1491             /* texcoord[3] is the bias to add to lambda */
1492             lodBias = texcoord[3];
1493
1494             fetch_texel(ctx, machine, inst, texcoord, lodBias, color);
1495
1496             if (DEBUG_PROG) {
1497                printf("TXB (%g, %g, %g, %g) = texture[%d][%g %g %g %g]"
1498                       "  bias %g\n",
1499                       color[0], color[1], color[2], color[3],
1500                       inst->TexSrcUnit,
1501                       texcoord[0],
1502                       texcoord[1],
1503                       texcoord[2],
1504                       texcoord[3],
1505                       lodBias);
1506             }
1507
1508             store_vector4(inst, machine, color);
1509          }
1510          break;
1511       case OPCODE_TXD:         /* GL_NV_fragment_program only */
1512          /* Texture lookup w/ partial derivatives for LOD */
1513          {
1514             GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1515             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1516             fetch_vector4(&inst->SrcReg[1], machine, dtdx);
1517             fetch_vector4(&inst->SrcReg[2], machine, dtdy);
1518             machine->FetchTexelDeriv(ctx, texcoord, dtdx, dtdy,
1519                                      0.0, /* lodBias */
1520                                      inst->TexSrcUnit, color);
1521             store_vector4(inst, machine, color);
1522          }
1523          break;
1524       case OPCODE_TXL:
1525          /* Texel lookup with explicit LOD */
1526          {
1527             GLfloat texcoord[4], color[4], lod;
1528
1529             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1530
1531             /* texcoord[3] is the LOD */
1532             lod = texcoord[3];
1533
1534             machine->FetchTexelLod(ctx, texcoord, lod,
1535                                    machine->Samplers[inst->TexSrcUnit], color);
1536
1537             store_vector4(inst, machine, color);
1538          }
1539          break;
1540       case OPCODE_TXP:         /* GL_ARB_fragment_program only */
1541          /* Texture lookup w/ projective divide */
1542          {
1543             GLfloat texcoord[4], color[4];
1544
1545             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1546             /* Not so sure about this test - if texcoord[3] is
1547              * zero, we'd probably be fine except for an ASSERT in
1548              * IROUND_POS() which gets triggered by the inf values created.
1549              */
1550             if (texcoord[3] != 0.0) {
1551                texcoord[0] /= texcoord[3];
1552                texcoord[1] /= texcoord[3];
1553                texcoord[2] /= texcoord[3];
1554             }
1555
1556             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1557
1558             store_vector4(inst, machine, color);
1559          }
1560          break;
1561       case OPCODE_TXP_NV:      /* GL_NV_fragment_program only */
1562          /* Texture lookup w/ projective divide, as above, but do not
1563           * do the divide by w if sampling from a cube map.
1564           */
1565          {
1566             GLfloat texcoord[4], color[4];
1567
1568             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1569             if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1570                 texcoord[3] != 0.0) {
1571                texcoord[0] /= texcoord[3];
1572                texcoord[1] /= texcoord[3];
1573                texcoord[2] /= texcoord[3];
1574             }
1575
1576             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1577
1578             store_vector4(inst, machine, color);
1579          }
1580          break;
1581       case OPCODE_TRUNC:       /* truncate toward zero */
1582          {
1583             GLfloat a[4], result[4];
1584             fetch_vector4(&inst->SrcReg[0], machine, a);
1585             result[0] = (GLfloat) (GLint) a[0];
1586             result[1] = (GLfloat) (GLint) a[1];
1587             result[2] = (GLfloat) (GLint) a[2];
1588             result[3] = (GLfloat) (GLint) a[3];
1589             store_vector4(inst, machine, result);
1590          }
1591          break;
1592       case OPCODE_UP2H:        /* unpack two 16-bit floats */
1593          {
1594             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1595             GLfloat result[4];
1596             GLushort hx, hy;
1597             hx = raw & 0xffff;
1598             hy = raw >> 16;
1599             result[0] = result[2] = _mesa_half_to_float(hx);
1600             result[1] = result[3] = _mesa_half_to_float(hy);
1601             store_vector4(inst, machine, result);
1602          }
1603          break;
1604       case OPCODE_UP2US:       /* unpack two GLushorts */
1605          {
1606             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1607             GLfloat result[4];
1608             GLushort usx, usy;
1609             usx = raw & 0xffff;
1610             usy = raw >> 16;
1611             result[0] = result[2] = usx * (1.0f / 65535.0f);
1612             result[1] = result[3] = usy * (1.0f / 65535.0f);
1613             store_vector4(inst, machine, result);
1614          }
1615          break;
1616       case OPCODE_UP4B:        /* unpack four GLbytes */
1617          {
1618             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1619             GLfloat result[4];
1620             result[0] = (((raw >> 0) & 0xff) - 128) / 127.0F;
1621             result[1] = (((raw >> 8) & 0xff) - 128) / 127.0F;
1622             result[2] = (((raw >> 16) & 0xff) - 128) / 127.0F;
1623             result[3] = (((raw >> 24) & 0xff) - 128) / 127.0F;
1624             store_vector4(inst, machine, result);
1625          }
1626          break;
1627       case OPCODE_UP4UB:       /* unpack four GLubytes */
1628          {
1629             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1630             GLfloat result[4];
1631             result[0] = ((raw >> 0) & 0xff) / 255.0F;
1632             result[1] = ((raw >> 8) & 0xff) / 255.0F;
1633             result[2] = ((raw >> 16) & 0xff) / 255.0F;
1634             result[3] = ((raw >> 24) & 0xff) / 255.0F;
1635             store_vector4(inst, machine, result);
1636          }
1637          break;
1638       case OPCODE_XPD:         /* cross product */
1639          {
1640             GLfloat a[4], b[4], result[4];
1641             fetch_vector4(&inst->SrcReg[0], machine, a);
1642             fetch_vector4(&inst->SrcReg[1], machine, b);
1643             result[0] = a[1] * b[2] - a[2] * b[1];
1644             result[1] = a[2] * b[0] - a[0] * b[2];
1645             result[2] = a[0] * b[1] - a[1] * b[0];
1646             result[3] = 1.0;
1647             store_vector4(inst, machine, result);
1648             if (DEBUG_PROG) {
1649                printf("XPD (%g %g %g %g) = (%g %g %g) X (%g %g %g)\n",
1650                       result[0], result[1], result[2], result[3],
1651                       a[0], a[1], a[2], b[0], b[1], b[2]);
1652             }
1653          }
1654          break;
1655       case OPCODE_X2D:         /* 2-D matrix transform */
1656          {
1657             GLfloat a[4], b[4], c[4], result[4];
1658             fetch_vector4(&inst->SrcReg[0], machine, a);
1659             fetch_vector4(&inst->SrcReg[1], machine, b);
1660             fetch_vector4(&inst->SrcReg[2], machine, c);
1661             result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1662             result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1663             result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1664             result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1665             store_vector4(inst, machine, result);
1666          }
1667          break;
1668       case OPCODE_END:
1669          return GL_TRUE;
1670       default:
1671          _mesa_problem(ctx, "Bad opcode %d in _mesa_execute_program",
1672                        inst->Opcode);
1673          return GL_TRUE;        /* return value doesn't matter */
1674       }
1675
1676       numExec++;
1677       if (numExec > maxExec) {
1678          static GLboolean reported = GL_FALSE;
1679          if (!reported) {
1680             _mesa_problem(ctx, "Infinite loop detected in fragment program");
1681             reported = GL_TRUE;
1682          }
1683          return GL_TRUE;
1684       }
1685
1686    } /* for pc */
1687
1688    return GL_TRUE;
1689 }