src/mesa/program/prog_execute.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  7.3
   4  *
   5  * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23  * OTHER DEALINGS IN THE SOFTWARE.
  24  */
  25
  26 /**
  27  * \file prog_execute.c
  28  * Software interpreter for vertex/fragment programs.
  29  * \author Brian Paul
  30  */
  31
  32 /*
  33  * NOTE: we do everything in single-precision floating point; we don't
  34  * currently observe the single/half/fixed-precision qualifiers.
  35  *
  36  */
  37
  38
  39 #include "main/glheader.h"
  40 #include "main/colormac.h"
  41 #include "main/macros.h"
  42 #include "prog_execute.h"
  43 #include "prog_instruction.h"
  44 #include "prog_parameter.h"
  45 #include "prog_print.h"
  46 #include "prog_noise.h"
  47
  48
  49 /* debug predicate */
  50 #define DEBUG_PROG 0
  51
  52
  53 /**
  54  * Set x to positive or negative infinity.
  55  */
  56 #if defined(USE_IEEE) || defined(_WIN32)
  57 #define SET_POS_INFINITY(x)                  \
  58    do {                                      \
  59          fi_type fi;                         \
  60          fi.i = 0x7F800000;                  \
  61          x = fi.f;                           \
  62    } while (0)
  63 #define SET_NEG_INFINITY(x)                  \
  64    do {                                      \
  65          fi_type fi;                         \
  66          fi.i = 0xFF800000;                  \
  67          x = fi.f;                           \
  68    } while (0)
  69 #else
  70 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
  71 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
  72 #endif
  73
  74 #define SET_FLOAT_BITS(x, bits) ((fi_type *) (void *) &(x))->i = bits
  75
  76
/**
 * All-zero 4-component vector.  get_src_register_pointer() returns this in
 * place of real register storage when the register index it computed is
 * out of range, so such reads yield (0, 0, 0, 0).
 */
static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
  78
  79
  80 /**
  81  * Return a pointer to the 4-element float vector specified by the given
  82  * source register.
  83  */
  84 static inline const GLfloat *
  85 get_src_register_pointer(const struct prog_src_register *source,
  86                          const struct gl_program_machine *machine)
  87 {
  88    const struct gl_program *prog = machine->CurProgram;
  89    GLint reg = source->Index;
  90
  91    if (source->RelAddr) {
  92       /* add address register value to src index/offset */
  93       reg += machine->AddressReg[0][0];
  94       if (reg < 0) {
  95          return ZeroVec;
  96       }
  97    }
  98
  99    switch (source->File) {
 100    case PROGRAM_TEMPORARY:
 101       if (reg >= MAX_PROGRAM_TEMPS)
 102          return ZeroVec;
 103       return machine->Temporaries[reg];
 104
 105    case PROGRAM_INPUT:
 106       if (prog->Target == GL_VERTEX_PROGRAM_ARB) {
 107          if (reg >= VERT_ATTRIB_MAX)
 108             return ZeroVec;
 109          return machine->VertAttribs[reg];
 110       }
 111       else {
 112          if (reg >= VARYING_SLOT_MAX)
 113             return ZeroVec;
 114          return machine->Attribs[reg][machine->CurElement];
 115       }
 116
 117    case PROGRAM_OUTPUT:
 118       if (reg >= MAX_PROGRAM_OUTPUTS)
 119          return ZeroVec;
 120       return machine->Outputs[reg];
 121
 122    case PROGRAM_LOCAL_PARAM:
 123       if (reg >= MAX_PROGRAM_LOCAL_PARAMS)
 124          return ZeroVec;
 125       return machine->CurProgram->LocalParams[reg];
 126
 127    case PROGRAM_ENV_PARAM:
 128       if (reg >= MAX_PROGRAM_ENV_PARAMS)
 129          return ZeroVec;
 130       return machine->EnvParams[reg];
 131
 132    case PROGRAM_STATE_VAR:
 133       /* Fallthrough */
 134    case PROGRAM_CONSTANT:
 135       /* Fallthrough */
 136    case PROGRAM_UNIFORM:
 137       if (reg >= (GLint) prog->Parameters->NumParameters)
 138          return ZeroVec;
 139       return (GLfloat *) prog->Parameters->ParameterValues[reg];
 140
 141    case PROGRAM_SYSTEM_VALUE:
 142       assert(reg < Elements(machine->SystemValues));
 143       return machine->SystemValues[reg];
 144
 145    default:
 146       _mesa_problem(NULL,
 147          "Invalid src register file %d in get_src_register_pointer()",
 148          source->File);
 149       return NULL;
 150    }
 151 }
 152
 153
 154 /**
 155  * Return a pointer to the 4-element float vector specified by the given
 156  * destination register.
 157  */
 158 static inline GLfloat *
 159 get_dst_register_pointer(const struct prog_dst_register *dest,
 160                          struct gl_program_machine *machine)
 161 {
 162    static GLfloat dummyReg[4];
 163    GLint reg = dest->Index;
 164
 165    if (dest->RelAddr) {
 166       /* add address register value to src index/offset */
 167       reg += machine->AddressReg[0][0];
 168       if (reg < 0) {
 169          return dummyReg;
 170       }
 171    }
 172
 173    switch (dest->File) {
 174    case PROGRAM_TEMPORARY:
 175       if (reg >= MAX_PROGRAM_TEMPS)
 176          return dummyReg;
 177       return machine->Temporaries[reg];
 178
 179    case PROGRAM_OUTPUT:
 180       if (reg >= MAX_PROGRAM_OUTPUTS)
 181          return dummyReg;
 182       return machine->Outputs[reg];
 183
 184    default:
 185       _mesa_problem(NULL,
 186          "Invalid dest register file %d in get_dst_register_pointer()",
 187          dest->File);
 188       return NULL;
 189    }
 190 }
 191
 192
 193
 194 /**
 195  * Fetch a 4-element float vector from the given source register.
 196  * Apply swizzling and negating as needed.
 197  */
 198 static void
 199 fetch_vector4(const struct prog_src_register *source,
 200               const struct gl_program_machine *machine, GLfloat result[4])
 201 {
 202    const GLfloat *src = get_src_register_pointer(source, machine);
 203    ASSERT(src);
 204
 205    if (source->Swizzle == SWIZZLE_NOOP) {
 206       /* no swizzling */
 207       COPY_4V(result, src);
 208    }
 209    else {
 210       ASSERT(GET_SWZ(source->Swizzle, 0) <= 3);
 211       ASSERT(GET_SWZ(source->Swizzle, 1) <= 3);
 212       ASSERT(GET_SWZ(source->Swizzle, 2) <= 3);
 213       ASSERT(GET_SWZ(source->Swizzle, 3) <= 3);
 214       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 215       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 216       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 217       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 218    }
 219
 220    if (source->Abs) {
 221       result[0] = FABSF(result[0]);
 222       result[1] = FABSF(result[1]);
 223       result[2] = FABSF(result[2]);
 224       result[3] = FABSF(result[3]);
 225    }
 226    if (source->Negate) {
 227       ASSERT(source->Negate == NEGATE_XYZW);
 228       result[0] = -result[0];
 229       result[1] = -result[1];
 230       result[2] = -result[2];
 231       result[3] = -result[3];
 232    }
 233
 234 #ifdef NAN_CHECK
 235    assert(!IS_INF_OR_NAN(result[0]));
 236    assert(!IS_INF_OR_NAN(result[0]));
 237    assert(!IS_INF_OR_NAN(result[0]));
 238    assert(!IS_INF_OR_NAN(result[0]));
 239 #endif
 240 }
 241
 242
 243 /**
 244  * Fetch the derivative with respect to X or Y for the given register.
 245  * XXX this currently only works for fragment program input attribs.
 246  */
 247 static void
 248 fetch_vector4_deriv(struct gl_context * ctx,
 249                     const struct prog_src_register *source,
 250                     const struct gl_program_machine *machine,
 251                     char xOrY, GLfloat result[4])
 252 {
 253    if (source->File == PROGRAM_INPUT &&
 254        source->Index < (GLint) machine->NumDeriv) {
 255       const GLint col = machine->CurElement;
 256       const GLfloat w = machine->Attribs[VARYING_SLOT_POS][col][3];
 257       const GLfloat invQ = 1.0f / w;
 258       GLfloat deriv[4];
 259
 260       if (xOrY == 'X') {
 261          deriv[0] = machine->DerivX[source->Index][0] * invQ;
 262          deriv[1] = machine->DerivX[source->Index][1] * invQ;
 263          deriv[2] = machine->DerivX[source->Index][2] * invQ;
 264          deriv[3] = machine->DerivX[source->Index][3] * invQ;
 265       }
 266       else {
 267          deriv[0] = machine->DerivY[source->Index][0] * invQ;
 268          deriv[1] = machine->DerivY[source->Index][1] * invQ;
 269          deriv[2] = machine->DerivY[source->Index][2] * invQ;
 270          deriv[3] = machine->DerivY[source->Index][3] * invQ;
 271       }
 272
 273       result[0] = deriv[GET_SWZ(source->Swizzle, 0)];
 274       result[1] = deriv[GET_SWZ(source->Swizzle, 1)];
 275       result[2] = deriv[GET_SWZ(source->Swizzle, 2)];
 276       result[3] = deriv[GET_SWZ(source->Swizzle, 3)];
 277
 278       if (source->Abs) {
 279          result[0] = FABSF(result[0]);
 280          result[1] = FABSF(result[1]);
 281          result[2] = FABSF(result[2]);
 282          result[3] = FABSF(result[3]);
 283       }
 284       if (source->Negate) {
 285          ASSERT(source->Negate == NEGATE_XYZW);
 286          result[0] = -result[0];
 287          result[1] = -result[1];
 288          result[2] = -result[2];
 289          result[3] = -result[3];
 290       }
 291    }
 292    else {
 293       ASSIGN_4V(result, 0.0, 0.0, 0.0, 0.0);
 294    }
 295 }
 296
 297
 298 /**
 299  * As above, but only return result[0] element.
 300  */
 301 static void
 302 fetch_vector1(const struct prog_src_register *source,
 303               const struct gl_program_machine *machine, GLfloat result[4])
 304 {
 305    const GLfloat *src = get_src_register_pointer(source, machine);
 306    ASSERT(src);
 307
 308    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 309
 310    if (source->Abs) {
 311       result[0] = FABSF(result[0]);
 312    }
 313    if (source->Negate) {
 314       result[0] = -result[0];
 315    }
 316 }
 317
 318
 319 static GLuint
 320 fetch_vector1ui(const struct prog_src_register *source,
 321                 const struct gl_program_machine *machine)
 322 {
 323    const GLuint *src = (GLuint *) get_src_register_pointer(source, machine);
 324    return src[GET_SWZ(source->Swizzle, 0)];
 325 }
 326
 327
 328 /**
 329  * Fetch texel from texture.  Use partial derivatives when possible.
 330  */
 331 static inline void
 332 fetch_texel(struct gl_context *ctx,
 333             const struct gl_program_machine *machine,
 334             const struct prog_instruction *inst,
 335             const GLfloat texcoord[4], GLfloat lodBias,
 336             GLfloat color[4])
 337 {
 338    const GLuint unit = machine->Samplers[inst->TexSrcUnit];
 339
 340    /* Note: we only have the right derivatives for fragment input attribs.
 341     */
 342    if (machine->NumDeriv > 0 &&
 343        inst->SrcReg[0].File == PROGRAM_INPUT &&
 344        inst->SrcReg[0].Index == VARYING_SLOT_TEX0 + inst->TexSrcUnit) {
 345       /* simple texture fetch for which we should have derivatives */
 346       GLuint attr = inst->SrcReg[0].Index;
 347       machine->FetchTexelDeriv(ctx, texcoord,
 348                                machine->DerivX[attr],
 349                                machine->DerivY[attr],
 350                                lodBias, unit, color);
 351    }
 352    else {
 353       machine->FetchTexelLod(ctx, texcoord, lodBias, unit, color);
 354    }
 355 }
 356
 357
 358 /**
 359  * Test value against zero and return GT, LT, EQ or UN if NaN.
 360  */
 361 static inline GLuint
 362 generate_cc(float value)
 363 {
 364    if (value != value)
 365       return COND_UN;           /* NaN */
 366    if (value > 0.0F)
 367       return COND_GT;
 368    if (value < 0.0F)
 369       return COND_LT;
 370    return COND_EQ;
 371 }
 372
 373
 374 /**
 375  * Test if the ccMaskRule is satisfied by the given condition code.
 376  * Used to mask destination writes according to the current condition code.
 377  */
 378 static inline GLboolean
 379 test_cc(GLuint condCode, GLuint ccMaskRule)
 380 {
 381    switch (ccMaskRule) {
 382    case COND_EQ: return (condCode == COND_EQ);
 383    case COND_NE: return (condCode != COND_EQ);
 384    case COND_LT: return (condCode == COND_LT);
 385    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 386    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 387    case COND_GT: return (condCode == COND_GT);
 388    case COND_TR: return GL_TRUE;
 389    case COND_FL: return GL_FALSE;
 390    default:      return GL_TRUE;
 391    }
 392 }
 393
 394
 395 /**
 396  * Evaluate the 4 condition codes against a predicate and return GL_TRUE
 397  * or GL_FALSE to indicate result.
 398  */
 399 static inline GLboolean
 400 eval_condition(const struct gl_program_machine *machine,
 401                const struct prog_instruction *inst)
 402 {
 403    const GLuint swizzle = inst->DstReg.CondSwizzle;
 404    const GLuint condMask = inst->DstReg.CondMask;
 405    if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 406        test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 407        test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 408        test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 409       return GL_TRUE;
 410    }
 411    else {
 412       return GL_FALSE;
 413    }
 414 }
 415
 416
 417
 418 /**
 419  * Store 4 floats into a register.  Observe the instructions saturate and
 420  * set-condition-code flags.
 421  */
 422 static void
 423 store_vector4(const struct prog_instruction *inst,
 424               struct gl_program_machine *machine, const GLfloat value[4])
 425 {
 426    const struct prog_dst_register *dstReg = &(inst->DstReg);
 427    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 428    GLuint writeMask = dstReg->WriteMask;
 429    GLfloat clampedValue[4];
 430    GLfloat *dst = get_dst_register_pointer(dstReg, machine);
 431
 432 #if 0
 433    if (value[0] > 1.0e10 ||
 434        IS_INF_OR_NAN(value[0]) ||
 435        IS_INF_OR_NAN(value[1]) ||
 436        IS_INF_OR_NAN(value[2]) || IS_INF_OR_NAN(value[3]))
 437       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 438 #endif
 439
 440    if (clamp) {
 441       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 442       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 443       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 444       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 445       value = clampedValue;
 446    }
 447
 448    if (dstReg->CondMask != COND_TR) {
 449       /* condition codes may turn off some writes */
 450       if (writeMask & WRITEMASK_X) {
 451          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 0)],
 452                       dstReg->CondMask))
 453             writeMask &= ~WRITEMASK_X;
 454       }
 455       if (writeMask & WRITEMASK_Y) {
 456          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 1)],
 457                       dstReg->CondMask))
 458             writeMask &= ~WRITEMASK_Y;
 459       }
 460       if (writeMask & WRITEMASK_Z) {
 461          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 2)],
 462                       dstReg->CondMask))
 463             writeMask &= ~WRITEMASK_Z;
 464       }
 465       if (writeMask & WRITEMASK_W) {
 466          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 3)],
 467                       dstReg->CondMask))
 468             writeMask &= ~WRITEMASK_W;
 469       }
 470    }
 471
 472 #ifdef NAN_CHECK
 473    assert(!IS_INF_OR_NAN(value[0]));
 474    assert(!IS_INF_OR_NAN(value[0]));
 475    assert(!IS_INF_OR_NAN(value[0]));
 476    assert(!IS_INF_OR_NAN(value[0]));
 477 #endif
 478
 479    if (writeMask & WRITEMASK_X)
 480       dst[0] = value[0];
 481    if (writeMask & WRITEMASK_Y)
 482       dst[1] = value[1];
 483    if (writeMask & WRITEMASK_Z)
 484       dst[2] = value[2];
 485    if (writeMask & WRITEMASK_W)
 486       dst[3] = value[3];
 487
 488    if (inst->CondUpdate) {
 489       if (writeMask & WRITEMASK_X)
 490          machine->CondCodes[0] = generate_cc(value[0]);
 491       if (writeMask & WRITEMASK_Y)
 492          machine->CondCodes[1] = generate_cc(value[1]);
 493       if (writeMask & WRITEMASK_Z)
 494          machine->CondCodes[2] = generate_cc(value[2]);
 495       if (writeMask & WRITEMASK_W)
 496          machine->CondCodes[3] = generate_cc(value[3]);
 497 #if DEBUG_PROG
 498       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 499              _mesa_condcode_string(machine->CondCodes[0]),
 500              _mesa_condcode_string(machine->CondCodes[1]),
 501              _mesa_condcode_string(machine->CondCodes[2]),
 502              _mesa_condcode_string(machine->CondCodes[3]));
 503 #endif
 504    }
 505 }
 506
 507
 508 /**
 509  * Store 4 uints into a register.  Observe the set-condition-code flags.
 510  */
 511 static void
 512 store_vector4ui(const struct prog_instruction *inst,
 513                 struct gl_program_machine *machine, const GLuint value[4])
 514 {
 515    const struct prog_dst_register *dstReg = &(inst->DstReg);
 516    GLuint writeMask = dstReg->WriteMask;
 517    GLuint *dst = (GLuint *) get_dst_register_pointer(dstReg, machine);
 518
 519    if (dstReg->CondMask != COND_TR) {
 520       /* condition codes may turn off some writes */
 521       if (writeMask & WRITEMASK_X) {
 522          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 0)],
 523                       dstReg->CondMask))
 524             writeMask &= ~WRITEMASK_X;
 525       }
 526       if (writeMask & WRITEMASK_Y) {
 527          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 1)],
 528                       dstReg->CondMask))
 529             writeMask &= ~WRITEMASK_Y;
 530       }
 531       if (writeMask & WRITEMASK_Z) {
 532          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 2)],
 533                       dstReg->CondMask))
 534             writeMask &= ~WRITEMASK_Z;
 535       }
 536       if (writeMask & WRITEMASK_W) {
 537          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 3)],
 538                       dstReg->CondMask))
 539             writeMask &= ~WRITEMASK_W;
 540       }
 541    }
 542
 543    if (writeMask & WRITEMASK_X)
 544       dst[0] = value[0];
 545    if (writeMask & WRITEMASK_Y)
 546       dst[1] = value[1];
 547    if (writeMask & WRITEMASK_Z)
 548       dst[2] = value[2];
 549    if (writeMask & WRITEMASK_W)
 550       dst[3] = value[3];
 551
 552    if (inst->CondUpdate) {
 553       if (writeMask & WRITEMASK_X)
 554          machine->CondCodes[0] = generate_cc((float)value[0]);
 555       if (writeMask & WRITEMASK_Y)
 556          machine->CondCodes[1] = generate_cc((float)value[1]);
 557       if (writeMask & WRITEMASK_Z)
 558          machine->CondCodes[2] = generate_cc((float)value[2]);
 559       if (writeMask & WRITEMASK_W)
 560          machine->CondCodes[3] = generate_cc((float)value[3]);
 561 #if DEBUG_PROG
 562       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 563              _mesa_condcode_string(machine->CondCodes[0]),
 564              _mesa_condcode_string(machine->CondCodes[1]),
 565              _mesa_condcode_string(machine->CondCodes[2]),
 566              _mesa_condcode_string(machine->CondCodes[3]));
 567 #endif
 568    }
 569 }
 570
 571
 572
 573 /**
 574  * Execute the given vertex/fragment program.
 575  *
 576  * \param ctx  rendering context
 577  * \param program  the program to execute
 578  * \param machine  machine state (must be initialized)
 579  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 580  */
 581 GLboolean
 582 _mesa_execute_program(struct gl_context * ctx,
 583                       const struct gl_program *program,
 584                       struct gl_program_machine *machine)
 585 {
 586    const GLuint numInst = program->NumInstructions;
 587    const GLuint maxExec = 65536;
 588    GLuint pc, numExec = 0;
 589
 590    machine->CurProgram = program;
 591
 592    if (DEBUG_PROG) {
 593       printf("execute program %u --------------------\n", program->Id);
 594    }
 595
 596    if (program->Target == GL_VERTEX_PROGRAM_ARB) {
 597       machine->EnvParams = ctx->VertexProgram.Parameters;
 598    }
 599    else {
 600       machine->EnvParams = ctx->FragmentProgram.Parameters;
 601    }
 602
 603    for (pc = 0; pc < numInst; pc++) {
 604       const struct prog_instruction *inst = program->Instructions + pc;
 605
 606       if (DEBUG_PROG) {
 607          _mesa_print_instruction(inst);
 608       }
 609
 610       switch (inst->Opcode) {
 611       case OPCODE_ABS:
 612          {
 613             GLfloat a[4], result[4];
 614             fetch_vector4(&inst->SrcReg[0], machine, a);
 615             result[0] = FABSF(a[0]);
 616             result[1] = FABSF(a[1]);
 617             result[2] = FABSF(a[2]);
 618             result[3] = FABSF(a[3]);
 619             store_vector4(inst, machine, result);
 620          }
 621          break;
 622       case OPCODE_ADD:
 623          {
 624             GLfloat a[4], b[4], result[4];
 625             fetch_vector4(&inst->SrcReg[0], machine, a);
 626             fetch_vector4(&inst->SrcReg[1], machine, b);
 627             result[0] = a[0] + b[0];
 628             result[1] = a[1] + b[1];
 629             result[2] = a[2] + b[2];
 630             result[3] = a[3] + b[3];
 631             store_vector4(inst, machine, result);
 632             if (DEBUG_PROG) {
 633                printf("ADD (%g %g %g %g) = (%g %g %g %g) + (%g %g %g %g)\n",
 634                       result[0], result[1], result[2], result[3],
 635                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 636             }
 637          }
 638          break;
 639       case OPCODE_ARL:
 640          {
 641             GLfloat t[4];
 642             fetch_vector4(&inst->SrcReg[0], machine, t);
 643             machine->AddressReg[0][0] = IFLOOR(t[0]);
 644             if (DEBUG_PROG) {
 645                printf("ARL %d\n", machine->AddressReg[0][0]);
 646             }
 647          }
 648          break;
 649       case OPCODE_BGNLOOP:
 650          /* no-op */
 651          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 652                 == OPCODE_ENDLOOP);
 653          break;
 654       case OPCODE_ENDLOOP:
 655          /* subtract 1 here since pc is incremented by for(pc) loop */
 656          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 657                 == OPCODE_BGNLOOP);
 658          pc = inst->BranchTarget - 1;   /* go to matching BNGLOOP */
 659          break;
 660       case OPCODE_BGNSUB:      /* begin subroutine */
 661          break;
 662       case OPCODE_ENDSUB:      /* end subroutine */
 663          break;
 664       case OPCODE_BRK:         /* break out of loop (conditional) */
 665          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 666                 == OPCODE_ENDLOOP);
 667          if (eval_condition(machine, inst)) {
 668             /* break out of loop */
 669             /* pc++ at end of for-loop will put us after the ENDLOOP inst */
 670             pc = inst->BranchTarget;
 671          }
 672          break;
 673       case OPCODE_CONT:        /* continue loop (conditional) */
 674          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 675                 == OPCODE_ENDLOOP);
 676          if (eval_condition(machine, inst)) {
 677             /* continue at ENDLOOP */
 678             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 679             pc = inst->BranchTarget - 1;
 680          }
 681          break;
 682       case OPCODE_CAL:         /* Call subroutine (conditional) */
 683          if (eval_condition(machine, inst)) {
 684             /* call the subroutine */
 685             if (machine->StackDepth >= MAX_PROGRAM_CALL_DEPTH) {
 686                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
 687             }
 688             machine->CallStack[machine->StackDepth++] = pc + 1; /* next inst */
 689             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 690             pc = inst->BranchTarget - 1;
 691          }
 692          break;
 693       case OPCODE_CMP:
 694          {
 695             GLfloat a[4], b[4], c[4], result[4];
 696             fetch_vector4(&inst->SrcReg[0], machine, a);
 697             fetch_vector4(&inst->SrcReg[1], machine, b);
 698             fetch_vector4(&inst->SrcReg[2], machine, c);
 699             result[0] = a[0] < 0.0F ? b[0] : c[0];
 700             result[1] = a[1] < 0.0F ? b[1] : c[1];
 701             result[2] = a[2] < 0.0F ? b[2] : c[2];
 702             result[3] = a[3] < 0.0F ? b[3] : c[3];
 703             store_vector4(inst, machine, result);
 704             if (DEBUG_PROG) {
 705                printf("CMP (%g %g %g %g) = (%g %g %g %g) < 0 ? (%g %g %g %g) : (%g %g %g %g)\n",
 706                       result[0], result[1], result[2], result[3],
 707                       a[0], a[1], a[2], a[3],
 708                       b[0], b[1], b[2], b[3],
 709                       c[0], c[1], c[2], c[3]);
 710             }
 711          }
 712          break;
 713       case OPCODE_COS:
 714          {
 715             GLfloat a[4], result[4];
 716             fetch_vector1(&inst->SrcReg[0], machine, a);
 717             result[0] = result[1] = result[2] = result[3]
 718                = (GLfloat) cos(a[0]);
 719             store_vector4(inst, machine, result);
 720          }
 721          break;
 722       case OPCODE_DDX:         /* Partial derivative with respect to X */
 723          {
 724             GLfloat result[4];
 725             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 726                                 'X', result);
 727             store_vector4(inst, machine, result);
 728          }
 729          break;
 730       case OPCODE_DDY:         /* Partial derivative with respect to Y */
 731          {
 732             GLfloat result[4];
 733             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 734                                 'Y', result);
 735             store_vector4(inst, machine, result);
 736          }
 737          break;
 738       case OPCODE_DP2:
 739          {
 740             GLfloat a[4], b[4], result[4];
 741             fetch_vector4(&inst->SrcReg[0], machine, a);
 742             fetch_vector4(&inst->SrcReg[1], machine, b);
 743             result[0] = result[1] = result[2] = result[3] = DOT2(a, b);
 744             store_vector4(inst, machine, result);
 745             if (DEBUG_PROG) {
 746                printf("DP2 %g = (%g %g) . (%g %g)\n",
 747                       result[0], a[0], a[1], b[0], b[1]);
 748             }
 749          }
 750          break;
 751       case OPCODE_DP3:
 752          {
 753             GLfloat a[4], b[4], result[4];
 754             fetch_vector4(&inst->SrcReg[0], machine, a);
 755             fetch_vector4(&inst->SrcReg[1], machine, b);
 756             result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
 757             store_vector4(inst, machine, result);
 758             if (DEBUG_PROG) {
 759                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 760                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 761             }
 762          }
 763          break;
 764       case OPCODE_DP4:
 765          {
 766             GLfloat a[4], b[4], result[4];
 767             fetch_vector4(&inst->SrcReg[0], machine, a);
 768             fetch_vector4(&inst->SrcReg[1], machine, b);
 769             result[0] = result[1] = result[2] = result[3] = DOT4(a, b);
 770             store_vector4(inst, machine, result);
 771             if (DEBUG_PROG) {
 772                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 773                       result[0], a[0], a[1], a[2], a[3],
 774                       b[0], b[1], b[2], b[3]);
 775             }
 776          }
 777          break;
 778       case OPCODE_DPH:
 779          {
 780             GLfloat a[4], b[4], result[4];
 781             fetch_vector4(&inst->SrcReg[0], machine, a);
 782             fetch_vector4(&inst->SrcReg[1], machine, b);
 783             result[0] = result[1] = result[2] = result[3] = DOT3(a, b) + b[3];
 784             store_vector4(inst, machine, result);
 785          }
 786          break;
 787       case OPCODE_DST:         /* Distance vector */
 788          {
 789             GLfloat a[4], b[4], result[4];
 790             fetch_vector4(&inst->SrcReg[0], machine, a);
 791             fetch_vector4(&inst->SrcReg[1], machine, b);
 792             result[0] = 1.0F;
 793             result[1] = a[1] * b[1];
 794             result[2] = a[2];
 795             result[3] = b[3];
 796             store_vector4(inst, machine, result);
 797          }
 798          break;
 799       case OPCODE_EXP:
 800          {
 801             GLfloat t[4], q[4], floor_t0;
 802             fetch_vector1(&inst->SrcReg[0], machine, t);
 803             floor_t0 = FLOORF(t[0]);
 804             if (floor_t0 > FLT_MAX_EXP) {
 805                SET_POS_INFINITY(q[0]);
 806                SET_POS_INFINITY(q[2]);
 807             }
 808             else if (floor_t0 < FLT_MIN_EXP) {
 809                q[0] = 0.0F;
 810                q[2] = 0.0F;
 811             }
 812             else {
 813                q[0] = LDEXPF(1.0, (int) floor_t0);
 814                /* Note: GL_NV_vertex_program expects
 815                 * result.z = result.x * APPX(result.y)
 816                 * We do what the ARB extension says.
 817                 */
 818                q[2] = (GLfloat) pow(2.0, t[0]);
 819             }
 820             q[1] = t[0] - floor_t0;
 821             q[3] = 1.0F;
 822             store_vector4( inst, machine, q );
 823          }
 824          break;
 825       case OPCODE_EX2:         /* Exponential base 2 */
 826          {
 827             GLfloat a[4], result[4], val;
 828             fetch_vector1(&inst->SrcReg[0], machine, a);
 829             val = (GLfloat) pow(2.0, a[0]);
 830             /*
 831             if (IS_INF_OR_NAN(val))
 832                val = 1.0e10;
 833             */
 834             result[0] = result[1] = result[2] = result[3] = val;
 835             store_vector4(inst, machine, result);
 836          }
 837          break;
 838       case OPCODE_FLR:
 839          {
 840             GLfloat a[4], result[4];
 841             fetch_vector4(&inst->SrcReg[0], machine, a);
 842             result[0] = FLOORF(a[0]);
 843             result[1] = FLOORF(a[1]);
 844             result[2] = FLOORF(a[2]);
 845             result[3] = FLOORF(a[3]);
 846             store_vector4(inst, machine, result);
 847          }
 848          break;
 849       case OPCODE_FRC:
 850          {
 851             GLfloat a[4], result[4];
 852             fetch_vector4(&inst->SrcReg[0], machine, a);
 853             result[0] = a[0] - FLOORF(a[0]);
 854             result[1] = a[1] - FLOORF(a[1]);
 855             result[2] = a[2] - FLOORF(a[2]);
 856             result[3] = a[3] - FLOORF(a[3]);
 857             store_vector4(inst, machine, result);
 858          }
 859          break;
 860       case OPCODE_IF:
 861          {
 862             GLboolean cond;
 863             ASSERT(program->Instructions[inst->BranchTarget].Opcode
 864                    == OPCODE_ELSE ||
 865                    program->Instructions[inst->BranchTarget].Opcode
 866                    == OPCODE_ENDIF);
 867             /* eval condition */
 868             if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
 869                GLfloat a[4];
 870                fetch_vector1(&inst->SrcReg[0], machine, a);
 871                cond = (a[0] != 0.0);
 872             }
 873             else {
 874                cond = eval_condition(machine, inst);
 875             }
 876             if (DEBUG_PROG) {
 877                printf("IF: %d\n", cond);
 878             }
 879             /* do if/else */
 880             if (cond) {
 881                /* do if-clause (just continue execution) */
 882             }
 883             else {
 884                /* go to the instruction after ELSE or ENDIF */
 885                assert(inst->BranchTarget >= 0);
 886                pc = inst->BranchTarget;
 887             }
 888          }
 889          break;
 890       case OPCODE_ELSE:
 891          /* goto ENDIF */
 892          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 893                 == OPCODE_ENDIF);
 894          assert(inst->BranchTarget >= 0);
 895          pc = inst->BranchTarget;
 896          break;
 897       case OPCODE_ENDIF:
 898          /* nothing */
 899          break;
 900       case OPCODE_KIL_NV:      /* NV_f_p only (conditional) */
 901          if (eval_condition(machine, inst)) {
 902             return GL_FALSE;
 903          }
 904          break;
 905       case OPCODE_KIL:         /* ARB_f_p only */
 906          {
 907             GLfloat a[4];
 908             fetch_vector4(&inst->SrcReg[0], machine, a);
 909             if (DEBUG_PROG) {
 910                printf("KIL if (%g %g %g %g) <= 0.0\n",
 911                       a[0], a[1], a[2], a[3]);
 912             }
 913
 914             if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 915                return GL_FALSE;
 916             }
 917          }
 918          break;
 919       case OPCODE_LG2:         /* log base 2 */
 920          {
 921             GLfloat a[4], result[4], val;
 922             fetch_vector1(&inst->SrcReg[0], machine, a);
 923             /* The fast LOG2 macro doesn't meet the precision requirements.
 924              */
 925             if (a[0] == 0.0F) {
 926                val = -FLT_MAX;
 927             }
 928             else {
 929                val = (float)(log(a[0]) * 1.442695F);
 930             }
 931             result[0] = result[1] = result[2] = result[3] = val;
 932             store_vector4(inst, machine, result);
 933          }
 934          break;
 935       case OPCODE_LIT:
 936          {
 937             const GLfloat epsilon = 1.0F / 256.0F;      /* from NV VP spec */
 938             GLfloat a[4], result[4];
 939             fetch_vector4(&inst->SrcReg[0], machine, a);
 940             a[0] = MAX2(a[0], 0.0F);
 941             a[1] = MAX2(a[1], 0.0F);
 942             /* XXX ARB version clamps a[3], NV version doesn't */
 943             a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 944             result[0] = 1.0F;
 945             result[1] = a[0];
 946             /* XXX we could probably just use pow() here */
 947             if (a[0] > 0.0F) {
 948                if (a[1] == 0.0 && a[3] == 0.0)
 949                   result[2] = 1.0F;
 950                else
 951                   result[2] = (GLfloat) pow(a[1], a[3]);
 952             }
 953             else {
 954                result[2] = 0.0F;
 955             }
 956             result[3] = 1.0F;
 957             store_vector4(inst, machine, result);
 958             if (DEBUG_PROG) {
 959                printf("LIT (%g %g %g %g) : (%g %g %g %g)\n",
 960                       result[0], result[1], result[2], result[3],
 961                       a[0], a[1], a[2], a[3]);
 962             }
 963          }
 964          break;
 965       case OPCODE_LOG:
 966          {
 967             GLfloat t[4], q[4], abs_t0;
 968             fetch_vector1(&inst->SrcReg[0], machine, t);
 969             abs_t0 = FABSF(t[0]);
 970             if (abs_t0 != 0.0F) {
 971                if (IS_INF_OR_NAN(abs_t0))
 972                {
 973                   SET_POS_INFINITY(q[0]);
 974                   q[1] = 1.0F;
 975                   SET_POS_INFINITY(q[2]);
 976                }
 977                else {
 978                   int exponent;
 979                   GLfloat mantissa = FREXPF(t[0], &exponent);
 980                   q[0] = (GLfloat) (exponent - 1);
 981                   q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 982
 983                   /* The fast LOG2 macro doesn't meet the precision
 984                    * requirements.
 985                    */
 986                   q[2] = (float)(log(t[0]) * 1.442695F);
 987                }
 988             }
 989             else {
 990                SET_NEG_INFINITY(q[0]);
 991                q[1] = 1.0F;
 992                SET_NEG_INFINITY(q[2]);
 993             }
 994             q[3] = 1.0;
 995             store_vector4(inst, machine, q);
 996          }
 997          break;
 998       case OPCODE_LRP:
 999          {
1000             GLfloat a[4], b[4], c[4], result[4];
1001             fetch_vector4(&inst->SrcReg[0], machine, a);
1002             fetch_vector4(&inst->SrcReg[1], machine, b);
1003             fetch_vector4(&inst->SrcReg[2], machine, c);
1004             result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
1005             result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
1006             result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
1007             result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
1008             store_vector4(inst, machine, result);
1009             if (DEBUG_PROG) {
1010                printf("LRP (%g %g %g %g) = (%g %g %g %g), "
1011                       "(%g %g %g %g), (%g %g %g %g)\n",
1012                       result[0], result[1], result[2], result[3],
1013                       a[0], a[1], a[2], a[3],
1014                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
1015             }
1016          }
1017          break;
1018       case OPCODE_MAD:
1019          {
1020             GLfloat a[4], b[4], c[4], result[4];
1021             fetch_vector4(&inst->SrcReg[0], machine, a);
1022             fetch_vector4(&inst->SrcReg[1], machine, b);
1023             fetch_vector4(&inst->SrcReg[2], machine, c);
1024             result[0] = a[0] * b[0] + c[0];
1025             result[1] = a[1] * b[1] + c[1];
1026             result[2] = a[2] * b[2] + c[2];
1027             result[3] = a[3] * b[3] + c[3];
1028             store_vector4(inst, machine, result);
1029             if (DEBUG_PROG) {
1030                printf("MAD (%g %g %g %g) = (%g %g %g %g) * "
1031                       "(%g %g %g %g) + (%g %g %g %g)\n",
1032                       result[0], result[1], result[2], result[3],
1033                       a[0], a[1], a[2], a[3],
1034                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
1035             }
1036          }
1037          break;
1038       case OPCODE_MAX:
1039          {
1040             GLfloat a[4], b[4], result[4];
1041             fetch_vector4(&inst->SrcReg[0], machine, a);
1042             fetch_vector4(&inst->SrcReg[1], machine, b);
1043             result[0] = MAX2(a[0], b[0]);
1044             result[1] = MAX2(a[1], b[1]);
1045             result[2] = MAX2(a[2], b[2]);
1046             result[3] = MAX2(a[3], b[3]);
1047             store_vector4(inst, machine, result);
1048             if (DEBUG_PROG) {
1049                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
1050                       result[0], result[1], result[2], result[3],
1051                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1052             }
1053          }
1054          break;
1055       case OPCODE_MIN:
1056          {
1057             GLfloat a[4], b[4], result[4];
1058             fetch_vector4(&inst->SrcReg[0], machine, a);
1059             fetch_vector4(&inst->SrcReg[1], machine, b);
1060             result[0] = MIN2(a[0], b[0]);
1061             result[1] = MIN2(a[1], b[1]);
1062             result[2] = MIN2(a[2], b[2]);
1063             result[3] = MIN2(a[3], b[3]);
1064             store_vector4(inst, machine, result);
1065          }
1066          break;
1067       case OPCODE_MOV:
1068          {
1069             GLfloat result[4];
1070             fetch_vector4(&inst->SrcReg[0], machine, result);
1071             store_vector4(inst, machine, result);
1072             if (DEBUG_PROG) {
1073                printf("MOV (%g %g %g %g)\n",
1074                       result[0], result[1], result[2], result[3]);
1075             }
1076          }
1077          break;
1078       case OPCODE_MUL:
1079          {
1080             GLfloat a[4], b[4], result[4];
1081             fetch_vector4(&inst->SrcReg[0], machine, a);
1082             fetch_vector4(&inst->SrcReg[1], machine, b);
1083             result[0] = a[0] * b[0];
1084             result[1] = a[1] * b[1];
1085             result[2] = a[2] * b[2];
1086             result[3] = a[3] * b[3];
1087             store_vector4(inst, machine, result);
1088             if (DEBUG_PROG) {
1089                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
1090                       result[0], result[1], result[2], result[3],
1091                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1092             }
1093          }
1094          break;
1095       case OPCODE_NOISE1:
1096          {
1097             GLfloat a[4], result[4];
1098             fetch_vector1(&inst->SrcReg[0], machine, a);
1099             result[0] =
1100                result[1] =
1101                result[2] =
1102                result[3] = _mesa_noise1(a[0]);
1103             store_vector4(inst, machine, result);
1104          }
1105          break;
1106       case OPCODE_NOISE2:
1107          {
1108             GLfloat a[4], result[4];
1109             fetch_vector4(&inst->SrcReg[0], machine, a);
1110             result[0] =
1111                result[1] =
1112                result[2] = result[3] = _mesa_noise2(a[0], a[1]);
1113             store_vector4(inst, machine, result);
1114          }
1115          break;
1116       case OPCODE_NOISE3:
1117          {
1118             GLfloat a[4], result[4];
1119             fetch_vector4(&inst->SrcReg[0], machine, a);
1120             result[0] =
1121                result[1] =
1122                result[2] =
1123                result[3] = _mesa_noise3(a[0], a[1], a[2]);
1124             store_vector4(inst, machine, result);
1125          }
1126          break;
1127       case OPCODE_NOISE4:
1128          {
1129             GLfloat a[4], result[4];
1130             fetch_vector4(&inst->SrcReg[0], machine, a);
1131             result[0] =
1132                result[1] =
1133                result[2] =
1134                result[3] = _mesa_noise4(a[0], a[1], a[2], a[3]);
1135             store_vector4(inst, machine, result);
1136          }
1137          break;
1138       case OPCODE_NOP:
1139          break;
1140       case OPCODE_PK2H:        /* pack two 16-bit floats in one 32-bit float */
1141          {
1142             GLfloat a[4];
1143             GLuint result[4];
1144             GLhalfNV hx, hy;
1145             fetch_vector4(&inst->SrcReg[0], machine, a);
1146             hx = _mesa_float_to_half(a[0]);
1147             hy = _mesa_float_to_half(a[1]);
1148             result[0] =
1149             result[1] =
1150             result[2] =
1151             result[3] = hx | (hy << 16);
1152             store_vector4ui(inst, machine, result);
1153          }
1154          break;
1155       case OPCODE_PK2US:       /* pack two GLushorts into one 32-bit float */
1156          {
1157             GLfloat a[4];
1158             GLuint result[4], usx, usy;
1159             fetch_vector4(&inst->SrcReg[0], machine, a);
1160             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1161             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1162             usx = F_TO_I(a[0] * 65535.0F);
1163             usy = F_TO_I(a[1] * 65535.0F);
1164             result[0] =
1165             result[1] =
1166             result[2] =
1167             result[3] = usx | (usy << 16);
1168             store_vector4ui(inst, machine, result);
1169          }
1170          break;
1171       case OPCODE_PK4B:        /* pack four GLbytes into one 32-bit float */
1172          {
1173             GLfloat a[4];
1174             GLuint result[4], ubx, uby, ubz, ubw;
1175             fetch_vector4(&inst->SrcReg[0], machine, a);
1176             a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
1177             a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
1178             a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
1179             a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
1180             ubx = F_TO_I(127.0F * a[0] + 128.0F);
1181             uby = F_TO_I(127.0F * a[1] + 128.0F);
1182             ubz = F_TO_I(127.0F * a[2] + 128.0F);
1183             ubw = F_TO_I(127.0F * a[3] + 128.0F);
1184             result[0] =
1185             result[1] =
1186             result[2] =
1187             result[3] = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1188             store_vector4ui(inst, machine, result);
1189          }
1190          break;
1191       case OPCODE_PK4UB:       /* pack four GLubytes into one 32-bit float */
1192          {
1193             GLfloat a[4];
1194             GLuint result[4], ubx, uby, ubz, ubw;
1195             fetch_vector4(&inst->SrcReg[0], machine, a);
1196             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1197             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1198             a[2] = CLAMP(a[2], 0.0F, 1.0F);
1199             a[3] = CLAMP(a[3], 0.0F, 1.0F);
1200             ubx = F_TO_I(255.0F * a[0]);
1201             uby = F_TO_I(255.0F * a[1]);
1202             ubz = F_TO_I(255.0F * a[2]);
1203             ubw = F_TO_I(255.0F * a[3]);
1204             result[0] =
1205             result[1] =
1206             result[2] =
1207             result[3] = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1208             store_vector4ui(inst, machine, result);
1209          }
1210          break;
1211       case OPCODE_POW:
1212          {
1213             GLfloat a[4], b[4], result[4];
1214             fetch_vector1(&inst->SrcReg[0], machine, a);
1215             fetch_vector1(&inst->SrcReg[1], machine, b);
1216             result[0] = result[1] = result[2] = result[3]
1217                = (GLfloat) pow(a[0], b[0]);
1218             store_vector4(inst, machine, result);
1219          }
1220          break;
1221
1222       case OPCODE_RCP:
1223          {
1224             GLfloat a[4], result[4];
1225             fetch_vector1(&inst->SrcReg[0], machine, a);
1226             if (DEBUG_PROG) {
1227                if (a[0] == 0)
1228                   printf("RCP(0)\n");
1229                else if (IS_INF_OR_NAN(a[0]))
1230                   printf("RCP(inf)\n");
1231             }
1232             result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
1233             store_vector4(inst, machine, result);
1234          }
1235          break;
1236       case OPCODE_RET:         /* return from subroutine (conditional) */
1237          if (eval_condition(machine, inst)) {
1238             if (machine->StackDepth == 0) {
1239                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
1240             }
1241             /* subtract one because of pc++ in the for loop */
1242             pc = machine->CallStack[--machine->StackDepth] - 1;
1243          }
1244          break;
1245       case OPCODE_RFL:         /* reflection vector */
1246          {
1247             GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
1248             fetch_vector4(&inst->SrcReg[0], machine, axis);
1249             fetch_vector4(&inst->SrcReg[1], machine, dir);
1250             tmpW = DOT3(axis, axis);
1251             tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
1252             result[0] = tmpX * axis[0] - dir[0];
1253             result[1] = tmpX * axis[1] - dir[1];
1254             result[2] = tmpX * axis[2] - dir[2];
1255             /* result[3] is never written! XXX enforce in parser! */
1256             store_vector4(inst, machine, result);
1257          }
1258          break;
1259       case OPCODE_RSQ:         /* 1 / sqrt() */
1260          {
1261             GLfloat a[4], result[4];
1262             fetch_vector1(&inst->SrcReg[0], machine, a);
1263             a[0] = FABSF(a[0]);
1264             result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1265             store_vector4(inst, machine, result);
1266             if (DEBUG_PROG) {
1267                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1268             }
1269          }
1270          break;
1271       case OPCODE_SCS:         /* sine and cos */
1272          {
1273             GLfloat a[4], result[4];
1274             fetch_vector1(&inst->SrcReg[0], machine, a);
1275             result[0] = (GLfloat) cos(a[0]);
1276             result[1] = (GLfloat) sin(a[0]);
1277             result[2] = 0.0;    /* undefined! */
1278             result[3] = 0.0;    /* undefined! */
1279             store_vector4(inst, machine, result);
1280          }
1281          break;
1282       case OPCODE_SEQ:         /* set on equal */
1283          {
1284             GLfloat a[4], b[4], result[4];
1285             fetch_vector4(&inst->SrcReg[0], machine, a);
1286             fetch_vector4(&inst->SrcReg[1], machine, b);
1287             result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1288             result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1289             result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1290             result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1291             store_vector4(inst, machine, result);
1292             if (DEBUG_PROG) {
1293                printf("SEQ (%g %g %g %g) = (%g %g %g %g) == (%g %g %g %g)\n",
1294                       result[0], result[1], result[2], result[3],
1295                       a[0], a[1], a[2], a[3],
1296                       b[0], b[1], b[2], b[3]);
1297             }
1298          }
1299          break;
1300       case OPCODE_SFL:         /* set false, operands ignored */
1301          {
1302             static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1303             store_vector4(inst, machine, result);
1304          }
1305          break;
1306       case OPCODE_SGE:         /* set on greater or equal */
1307          {
1308             GLfloat a[4], b[4], result[4];
1309             fetch_vector4(&inst->SrcReg[0], machine, a);
1310             fetch_vector4(&inst->SrcReg[1], machine, b);
1311             result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1312             result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1313             result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1314             result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1315             store_vector4(inst, machine, result);
1316             if (DEBUG_PROG) {
1317                printf("SGE (%g %g %g %g) = (%g %g %g %g) >= (%g %g %g %g)\n",
1318                       result[0], result[1], result[2], result[3],
1319                       a[0], a[1], a[2], a[3],
1320                       b[0], b[1], b[2], b[3]);
1321             }
1322          }
1323          break;
1324       case OPCODE_SGT:         /* set on greater */
1325          {
1326             GLfloat a[4], b[4], result[4];
1327             fetch_vector4(&inst->SrcReg[0], machine, a);
1328             fetch_vector4(&inst->SrcReg[1], machine, b);
1329             result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1330             result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1331             result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1332             result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1333             store_vector4(inst, machine, result);
1334             if (DEBUG_PROG) {
1335                printf("SGT (%g %g %g %g) = (%g %g %g %g) > (%g %g %g %g)\n",
1336                       result[0], result[1], result[2], result[3],
1337                       a[0], a[1], a[2], a[3],
1338                       b[0], b[1], b[2], b[3]);
1339             }
1340          }
1341          break;
1342       case OPCODE_SIN:
1343          {
1344             GLfloat a[4], result[4];
1345             fetch_vector1(&inst->SrcReg[0], machine, a);
1346             result[0] = result[1] = result[2] = result[3]
1347                = (GLfloat) sin(a[0]);
1348             store_vector4(inst, machine, result);
1349          }
1350          break;
1351       case OPCODE_SLE:         /* set on less or equal */
1352          {
1353             GLfloat a[4], b[4], result[4];
1354             fetch_vector4(&inst->SrcReg[0], machine, a);
1355             fetch_vector4(&inst->SrcReg[1], machine, b);
1356             result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1357             result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1358             result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1359             result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1360             store_vector4(inst, machine, result);
1361             if (DEBUG_PROG) {
1362                printf("SLE (%g %g %g %g) = (%g %g %g %g) <= (%g %g %g %g)\n",
1363                       result[0], result[1], result[2], result[3],
1364                       a[0], a[1], a[2], a[3],
1365                       b[0], b[1], b[2], b[3]);
1366             }
1367          }
1368          break;
1369       case OPCODE_SLT:         /* set on less */
1370          {
1371             GLfloat a[4], b[4], result[4];
1372             fetch_vector4(&inst->SrcReg[0], machine, a);
1373             fetch_vector4(&inst->SrcReg[1], machine, b);
1374             result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1375             result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1376             result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1377             result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1378             store_vector4(inst, machine, result);
1379             if (DEBUG_PROG) {
1380                printf("SLT (%g %g %g %g) = (%g %g %g %g) < (%g %g %g %g)\n",
1381                       result[0], result[1], result[2], result[3],
1382                       a[0], a[1], a[2], a[3],
1383                       b[0], b[1], b[2], b[3]);
1384             }
1385          }
1386          break;
1387       case OPCODE_SNE:         /* set on not equal */
1388          {
1389             GLfloat a[4], b[4], result[4];
1390             fetch_vector4(&inst->SrcReg[0], machine, a);
1391             fetch_vector4(&inst->SrcReg[1], machine, b);
1392             result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1393             result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1394             result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1395             result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1396             store_vector4(inst, machine, result);
1397             if (DEBUG_PROG) {
1398                printf("SNE (%g %g %g %g) = (%g %g %g %g) != (%g %g %g %g)\n",
1399                       result[0], result[1], result[2], result[3],
1400                       a[0], a[1], a[2], a[3],
1401                       b[0], b[1], b[2], b[3]);
1402             }
1403          }
1404          break;
1405       case OPCODE_SSG:         /* set sign (-1, 0 or +1) */
1406          {
1407             GLfloat a[4], result[4];
1408             fetch_vector4(&inst->SrcReg[0], machine, a);
1409             result[0] = (GLfloat) ((a[0] > 0.0F) - (a[0] < 0.0F));
1410             result[1] = (GLfloat) ((a[1] > 0.0F) - (a[1] < 0.0F));
1411             result[2] = (GLfloat) ((a[2] > 0.0F) - (a[2] < 0.0F));
1412             result[3] = (GLfloat) ((a[3] > 0.0F) - (a[3] < 0.0F));
1413             store_vector4(inst, machine, result);
1414          }
1415          break;
1416       case OPCODE_STR:         /* set true, operands ignored */
1417          {
1418             static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1419             store_vector4(inst, machine, result);
1420          }
1421          break;
1422       case OPCODE_SUB:
1423          {
1424             GLfloat a[4], b[4], result[4];
1425             fetch_vector4(&inst->SrcReg[0], machine, a);
1426             fetch_vector4(&inst->SrcReg[1], machine, b);
1427             result[0] = a[0] - b[0];
1428             result[1] = a[1] - b[1];
1429             result[2] = a[2] - b[2];
1430             result[3] = a[3] - b[3];
1431             store_vector4(inst, machine, result);
1432             if (DEBUG_PROG) {
1433                printf("SUB (%g %g %g %g) = (%g %g %g %g) - (%g %g %g %g)\n",
1434                       result[0], result[1], result[2], result[3],
1435                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1436             }
1437          }
1438          break;
1439       case OPCODE_SWZ:         /* extended swizzle */
1440          {
1441             const struct prog_src_register *source = &inst->SrcReg[0];
1442             const GLfloat *src = get_src_register_pointer(source, machine);
1443             GLfloat result[4];
1444             GLuint i;
1445             for (i = 0; i < 4; i++) {
1446                const GLuint swz = GET_SWZ(source->Swizzle, i);
1447                if (swz == SWIZZLE_ZERO)
1448                   result[i] = 0.0;
1449                else if (swz == SWIZZLE_ONE)
1450                   result[i] = 1.0;
1451                else {
1452                   ASSERT(swz >= 0);
1453                   ASSERT(swz <= 3);
1454                   result[i] = src[swz];
1455                }
1456                if (source->Negate & (1 << i))
1457                   result[i] = -result[i];
1458             }
1459             store_vector4(inst, machine, result);
1460          }
1461          break;
1462       case OPCODE_TEX:         /* Both ARB and NV frag prog */
1463          /* Simple texel lookup */
1464          {
1465             GLfloat texcoord[4], color[4];
1466             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1467
1468             /* For TEX, texcoord.Q should not be used and its value should not
1469              * matter (at most, we pass coord.xyz to texture3D() in GLSL).
1470              * Set Q=1 so that FetchTexelDeriv() doesn't get a garbage value
1471              * which is effectively what happens when the texcoord swizzle
1472              * is .xyzz
1473              */
1474             texcoord[3] = 1.0f;
1475
1476             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1477
1478             if (DEBUG_PROG) {
1479                printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g]\n",
1480                       color[0], color[1], color[2], color[3],
1481                       inst->TexSrcUnit,
1482                       texcoord[0], texcoord[1], texcoord[2], texcoord[3]);
1483             }
1484             store_vector4(inst, machine, color);
1485          }
1486          break;
1487       case OPCODE_TXB:         /* GL_ARB_fragment_program only */
1488          /* Texel lookup with LOD bias */
1489          {
1490             GLfloat texcoord[4], color[4], lodBias;
1491
1492             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1493
1494             /* texcoord[3] is the bias to add to lambda */
1495             lodBias = texcoord[3];
1496
1497             fetch_texel(ctx, machine, inst, texcoord, lodBias, color);
1498
1499             if (DEBUG_PROG) {
1500                printf("TXB (%g, %g, %g, %g) = texture[%d][%g %g %g %g]"
1501                       "  bias %g\n",
1502                       color[0], color[1], color[2], color[3],
1503                       inst->TexSrcUnit,
1504                       texcoord[0],
1505                       texcoord[1],
1506                       texcoord[2],
1507                       texcoord[3],
1508                       lodBias);
1509             }
1510
1511             store_vector4(inst, machine, color);
1512          }
1513          break;
1514       case OPCODE_TXD:         /* GL_NV_fragment_program only */
1515          /* Texture lookup w/ partial derivatives for LOD */
1516          {
1517             GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1518             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1519             fetch_vector4(&inst->SrcReg[1], machine, dtdx);
1520             fetch_vector4(&inst->SrcReg[2], machine, dtdy);
1521             machine->FetchTexelDeriv(ctx, texcoord, dtdx, dtdy,
1522                                      0.0, /* lodBias */
1523                                      inst->TexSrcUnit, color);
1524             store_vector4(inst, machine, color);
1525          }
1526          break;
1527       case OPCODE_TXL:
1528          /* Texel lookup with explicit LOD */
1529          {
1530             GLfloat texcoord[4], color[4], lod;
1531
1532             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1533
1534             /* texcoord[3] is the LOD */
1535             lod = texcoord[3];
1536
1537             machine->FetchTexelLod(ctx, texcoord, lod,
1538                                    machine->Samplers[inst->TexSrcUnit], color);
1539
1540             store_vector4(inst, machine, color);
1541          }
1542          break;
1543       case OPCODE_TXP:         /* GL_ARB_fragment_program only */
1544          /* Texture lookup w/ projective divide */
1545          {
1546             GLfloat texcoord[4], color[4];
1547
1548             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1549             /* Not so sure about this test - if texcoord[3] is
1550              * zero, we'd probably be fine except for an ASSERT in
1551              * IROUND_POS() which gets triggered by the inf values created.
1552              */
1553             if (texcoord[3] != 0.0) {
1554                texcoord[0] /= texcoord[3];
1555                texcoord[1] /= texcoord[3];
1556                texcoord[2] /= texcoord[3];
1557             }
1558
1559             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1560
1561             store_vector4(inst, machine, color);
1562          }
1563          break;
1564       case OPCODE_TXP_NV:      /* GL_NV_fragment_program only */
1565          /* Texture lookup w/ projective divide, as above, but do not
1566           * do the divide by w if sampling from a cube map.
1567           */
1568          {
1569             GLfloat texcoord[4], color[4];
1570
1571             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1572             if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1573                 texcoord[3] != 0.0) {
1574                texcoord[0] /= texcoord[3];
1575                texcoord[1] /= texcoord[3];
1576                texcoord[2] /= texcoord[3];
1577             }
1578
1579             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1580
1581             store_vector4(inst, machine, color);
1582          }
1583          break;
1584       case OPCODE_TRUNC:       /* truncate toward zero */
1585          {
1586             GLfloat a[4], result[4];
1587             fetch_vector4(&inst->SrcReg[0], machine, a);
1588             result[0] = (GLfloat) (GLint) a[0];
1589             result[1] = (GLfloat) (GLint) a[1];
1590             result[2] = (GLfloat) (GLint) a[2];
1591             result[3] = (GLfloat) (GLint) a[3];
1592             store_vector4(inst, machine, result);
1593          }
1594          break;
1595       case OPCODE_UP2H:        /* unpack two 16-bit floats */
1596          {
1597             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1598             GLfloat result[4];
1599             GLushort hx, hy;
1600             hx = raw & 0xffff;
1601             hy = raw >> 16;
1602             result[0] = result[2] = _mesa_half_to_float(hx);
1603             result[1] = result[3] = _mesa_half_to_float(hy);
1604             store_vector4(inst, machine, result);
1605          }
1606          break;
1607       case OPCODE_UP2US:       /* unpack two GLushorts */
1608          {
1609             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1610             GLfloat result[4];
1611             GLushort usx, usy;
1612             usx = raw & 0xffff;
1613             usy = raw >> 16;
1614             result[0] = result[2] = usx * (1.0f / 65535.0f);
1615             result[1] = result[3] = usy * (1.0f / 65535.0f);
1616             store_vector4(inst, machine, result);
1617          }
1618          break;
1619       case OPCODE_UP4B:        /* unpack four GLbytes */
1620          {
1621             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1622             GLfloat result[4];
1623             result[0] = (((raw >> 0) & 0xff) - 128) / 127.0F;
1624             result[1] = (((raw >> 8) & 0xff) - 128) / 127.0F;
1625             result[2] = (((raw >> 16) & 0xff) - 128) / 127.0F;
1626             result[3] = (((raw >> 24) & 0xff) - 128) / 127.0F;
1627             store_vector4(inst, machine, result);
1628          }
1629          break;
1630       case OPCODE_UP4UB:       /* unpack four GLubytes */
1631          {
1632             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1633             GLfloat result[4];
1634             result[0] = ((raw >> 0) & 0xff) / 255.0F;
1635             result[1] = ((raw >> 8) & 0xff) / 255.0F;
1636             result[2] = ((raw >> 16) & 0xff) / 255.0F;
1637             result[3] = ((raw >> 24) & 0xff) / 255.0F;
1638             store_vector4(inst, machine, result);
1639          }
1640          break;
1641       case OPCODE_XPD:         /* cross product */
1642          {
1643             GLfloat a[4], b[4], result[4];
1644             fetch_vector4(&inst->SrcReg[0], machine, a);
1645             fetch_vector4(&inst->SrcReg[1], machine, b);
1646             result[0] = a[1] * b[2] - a[2] * b[1];
1647             result[1] = a[2] * b[0] - a[0] * b[2];
1648             result[2] = a[0] * b[1] - a[1] * b[0];
1649             result[3] = 1.0;
1650             store_vector4(inst, machine, result);
1651             if (DEBUG_PROG) {
1652                printf("XPD (%g %g %g %g) = (%g %g %g) X (%g %g %g)\n",
1653                       result[0], result[1], result[2], result[3],
1654                       a[0], a[1], a[2], b[0], b[1], b[2]);
1655             }
1656          }
1657          break;
1658       case OPCODE_X2D:         /* 2-D matrix transform */
1659          {
1660             GLfloat a[4], b[4], c[4], result[4];
1661             fetch_vector4(&inst->SrcReg[0], machine, a);
1662             fetch_vector4(&inst->SrcReg[1], machine, b);
1663             fetch_vector4(&inst->SrcReg[2], machine, c);
1664             result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1665             result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1666             result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1667             result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1668             store_vector4(inst, machine, result);
1669          }
1670          break;
1671       case OPCODE_END:
1672          return GL_TRUE;
1673       default:
1674          _mesa_problem(ctx, "Bad opcode %d in _mesa_execute_program",
1675                        inst->Opcode);
1676          return GL_TRUE;        /* return value doesn't matter */
1677       }
1678
1679       numExec++;
1680       if (numExec > maxExec) {
1681          static GLboolean reported = GL_FALSE;
1682          if (!reported) {
1683             _mesa_problem(ctx, "Infinite loop detected in fragment program");
1684             reported = GL_TRUE;
1685          }
1686          return GL_TRUE;
1687       }
1688
1689    } /* for pc */
1690
1691    return GL_TRUE;
1692 }