src/mesa/shader/prog_execute.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  7.0.3
   4  *
   5  * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file prog_execute.c
  27  * Software interpreter for vertex/fragment programs.
  28  * \author Brian Paul
  29  */
  30
  31 /*
  32  * NOTE: we do everything in single-precision floating point; we don't
  33  * currently observe the single/half/fixed-precision qualifiers.
  34  *
  35  */
  36
  37
  38 #include "glheader.h"
  39 #include "colormac.h"
  40 #include "context.h"
  41 #include "program.h"
  42 #include "prog_execute.h"
  43 #include "prog_instruction.h"
  44 #include "prog_parameter.h"
  45 #include "prog_print.h"
  46 #include "shader/slang/slang_library_noise.h"
  47
  48
  49 /* debug predicate */
  50 #define DEBUG_PROG 0
  51
  52
  53 /**
  54  * Set x to positive or negative infinity.
  55  */
  56 #if defined(USE_IEEE) || defined(_WIN32)
  57 #define SET_POS_INFINITY(x)  ( *((GLuint *) (void *)&x) = 0x7F800000 )
  58 #define SET_NEG_INFINITY(x)  ( *((GLuint *) (void *)&x) = 0xFF800000 )
  59 #elif defined(VMS)
  60 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
  61 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
  62 #else
  63 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
  64 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
  65 #endif
  66
  67 #define SET_FLOAT_BITS(x, bits) ((fi_type *) (void *) &(x))->i = bits
  68
  69
  70 static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
  71
  72
  73
  74 /**
  75  * Return a pointer to the 4-element float vector specified by the given
  76  * source register.
  77  */
  78 static INLINE const GLfloat *
  79 get_register_pointer(const struct prog_src_register *source,
  80                      const struct gl_program_machine *machine)
  81 {
  82    if (source->RelAddr) {
  83       const GLint reg = source->Index + machine->AddressReg[0][0];
  84       if (source->File == PROGRAM_ENV_PARAM)
  85          if (reg < 0 || reg >= MAX_PROGRAM_ENV_PARAMS)
  86             return ZeroVec;
  87          else
  88             return machine->EnvParams[reg];
  89       else {
  90          const struct gl_program_parameter_list *params;
  91          ASSERT(source->File == PROGRAM_LOCAL_PARAM ||
  92                 source->File == PROGRAM_CONSTANT ||
  93                 source->File == PROGRAM_STATE_VAR);
  94          params = machine->CurProgram->Parameters;
  95          if (reg < 0 || reg >= params->NumParameters)
  96             return ZeroVec;
  97          else
  98             return params->ParameterValues[reg];
  99       }
 100    }
 101
 102    switch (source->File) {
 103    case PROGRAM_TEMPORARY:
 104       ASSERT(source->Index < MAX_PROGRAM_TEMPS);
 105       return machine->Temporaries[source->Index];
 106
 107    case PROGRAM_INPUT:
 108       if (machine->CurProgram->Target == GL_VERTEX_PROGRAM_ARB) {
 109          ASSERT(source->Index < VERT_ATTRIB_MAX);
 110          return machine->VertAttribs[source->Index];
 111       }
 112       else {
 113          ASSERT(source->Index < FRAG_ATTRIB_MAX);
 114          return machine->Attribs[source->Index][machine->CurElement];
 115       }
 116
 117    case PROGRAM_OUTPUT:
 118       ASSERT(source->Index < MAX_PROGRAM_OUTPUTS);
 119       return machine->Outputs[source->Index];
 120
 121    case PROGRAM_LOCAL_PARAM:
 122       ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 123       return machine->CurProgram->LocalParams[source->Index];
 124
 125    case PROGRAM_ENV_PARAM:
 126       ASSERT(source->Index < MAX_PROGRAM_ENV_PARAMS);
 127       return machine->EnvParams[source->Index];
 128
 129    case PROGRAM_STATE_VAR:
 130       /* Fallthrough */
 131    case PROGRAM_CONSTANT:
 132       /* Fallthrough */
 133    case PROGRAM_UNIFORM:
 134       /* Fallthrough */
 135    case PROGRAM_NAMED_PARAM:
 136       ASSERT(source->Index <
 137              (GLint) machine->CurProgram->Parameters->NumParameters);
 138       return machine->CurProgram->Parameters->ParameterValues[source->Index];
 139
 140    default:
 141       _mesa_problem(NULL,
 142                     "Invalid input register file %d in get_register_pointer()",
 143                     source->File);
 144       return NULL;
 145    }
 146 }
 147
 148
 149 #if FEATURE_MESA_program_debug
 150 static struct gl_program_machine *CurrentMachine = NULL;
 151
 152 /**
 153  * For GL_MESA_program_debug.
 154  * Return current value (4*GLfloat) of a program register.
 155  * Called via ctx->Driver.GetProgramRegister().
 156  */
 157 void
 158 _mesa_get_program_register(GLcontext *ctx, enum register_file file,
 159                            GLuint index, GLfloat val[4])
 160 {
 161    if (CurrentMachine) {
 162       struct prog_src_register src;
 163       const GLfloat *reg;
 164       src.File = file;
 165       src.Index = index;
 166       reg = get_register_pointer(&src, CurrentMachine);
 167       COPY_4V(val, reg);
 168    }
 169 }
 170 #endif /* FEATURE_MESA_program_debug */
 171
 172
 173 /**
 174  * Fetch a 4-element float vector from the given source register.
 175  * Apply swizzling and negating as needed.
 176  */
 177 static void
 178 fetch_vector4(const struct prog_src_register *source,
 179               const struct gl_program_machine *machine, GLfloat result[4])
 180 {
 181    const GLfloat *src = get_register_pointer(source, machine);
 182    ASSERT(src);
 183
 184    if (source->Swizzle == SWIZZLE_NOOP) {
 185       /* no swizzling */
 186       COPY_4V(result, src);
 187    }
 188    else {
 189       ASSERT(GET_SWZ(source->Swizzle, 0) <= 3);
 190       ASSERT(GET_SWZ(source->Swizzle, 1) <= 3);
 191       ASSERT(GET_SWZ(source->Swizzle, 2) <= 3);
 192       ASSERT(GET_SWZ(source->Swizzle, 3) <= 3);
 193       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 194       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 195       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 196       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 197    }
 198
 199    if (source->NegateBase) {
 200       result[0] = -result[0];
 201       result[1] = -result[1];
 202       result[2] = -result[2];
 203       result[3] = -result[3];
 204    }
 205    if (source->Abs) {
 206       result[0] = FABSF(result[0]);
 207       result[1] = FABSF(result[1]);
 208       result[2] = FABSF(result[2]);
 209       result[3] = FABSF(result[3]);
 210    }
 211    if (source->NegateAbs) {
 212       result[0] = -result[0];
 213       result[1] = -result[1];
 214       result[2] = -result[2];
 215       result[3] = -result[3];
 216    }
 217 }
 218
 219
 220 /**
 221  * Fetch the derivative with respect to X or Y for the given register.
 222  * XXX this currently only works for fragment program input attribs.
 223  */
 224 static void
 225 fetch_vector4_deriv(GLcontext * ctx,
 226                     const struct prog_src_register *source,
 227                     const struct gl_program_machine *machine,
 228                     char xOrY, GLfloat result[4])
 229 {
 230    if (source->File == PROGRAM_INPUT && source->Index < machine->NumDeriv) {
 231       const GLint col = machine->CurElement;
 232       const GLfloat w = machine->Attribs[FRAG_ATTRIB_WPOS][col][3];
 233       const GLfloat invQ = 1.0f / w;
 234       GLfloat deriv[4];
 235
 236       if (xOrY == 'X') {
 237          deriv[0] = machine->DerivX[source->Index][0] * invQ;
 238          deriv[1] = machine->DerivX[source->Index][1] * invQ;
 239          deriv[2] = machine->DerivX[source->Index][2] * invQ;
 240          deriv[3] = machine->DerivX[source->Index][3] * invQ;
 241       }
 242       else {
 243          deriv[0] = machine->DerivY[source->Index][0] * invQ;
 244          deriv[1] = machine->DerivY[source->Index][1] * invQ;
 245          deriv[2] = machine->DerivY[source->Index][2] * invQ;
 246          deriv[3] = machine->DerivY[source->Index][3] * invQ;
 247       }
 248
 249       result[0] = deriv[GET_SWZ(source->Swizzle, 0)];
 250       result[1] = deriv[GET_SWZ(source->Swizzle, 1)];
 251       result[2] = deriv[GET_SWZ(source->Swizzle, 2)];
 252       result[3] = deriv[GET_SWZ(source->Swizzle, 3)];
 253
 254       if (source->NegateBase) {
 255          result[0] = -result[0];
 256          result[1] = -result[1];
 257          result[2] = -result[2];
 258          result[3] = -result[3];
 259       }
 260       if (source->Abs) {
 261          result[0] = FABSF(result[0]);
 262          result[1] = FABSF(result[1]);
 263          result[2] = FABSF(result[2]);
 264          result[3] = FABSF(result[3]);
 265       }
 266       if (source->NegateAbs) {
 267          result[0] = -result[0];
 268          result[1] = -result[1];
 269          result[2] = -result[2];
 270          result[3] = -result[3];
 271       }
 272    }
 273    else {
 274       ASSIGN_4V(result, 0.0, 0.0, 0.0, 0.0);
 275    }
 276 }
 277
 278
 279 /**
 280  * As above, but only return result[0] element.
 281  */
 282 static void
 283 fetch_vector1(const struct prog_src_register *source,
 284               const struct gl_program_machine *machine, GLfloat result[4])
 285 {
 286    const GLfloat *src = get_register_pointer(source, machine);
 287    ASSERT(src);
 288
 289    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 290
 291    if (source->NegateBase) {
 292       result[0] = -result[0];
 293    }
 294    if (source->Abs) {
 295       result[0] = FABSF(result[0]);
 296    }
 297    if (source->NegateAbs) {
 298       result[0] = -result[0];
 299    }
 300 }
 301
 302
 303 /**
 304  * Fetch texel from texture.  Use partial derivatives when possible.
 305  */
 306 static INLINE void
 307 fetch_texel(GLcontext *ctx,
 308             const struct gl_program_machine *machine,
 309             const struct prog_instruction *inst,
 310             const GLfloat texcoord[4], GLfloat lodBias,
 311             GLfloat color[4])
 312 {
 313    /* Note: we only have the right derivatives for fragment input attribs.
 314     */
 315    if (machine->NumDeriv > 0 &&
 316        inst->SrcReg[0].File == PROGRAM_INPUT &&
 317        inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0 + inst->TexSrcUnit) {
 318       /* simple texture fetch for which we should have derivatives */
 319       GLuint attr = inst->SrcReg[0].Index;
 320       machine->FetchTexelDeriv(ctx, texcoord,
 321                                machine->DerivX[attr],
 322                                machine->DerivY[attr],
 323                                lodBias,
 324                                inst->TexSrcUnit, color);
 325    }
 326    else {
 327       machine->FetchTexelLod(ctx, texcoord, lodBias,
 328                              inst->TexSrcUnit, color);
 329    }
 330 }
 331
 332
 333 /**
 334  * Test value against zero and return GT, LT, EQ or UN if NaN.
 335  */
 336 static INLINE GLuint
 337 generate_cc(float value)
 338 {
 339    if (value != value)
 340       return COND_UN;           /* NaN */
 341    if (value > 0.0F)
 342       return COND_GT;
 343    if (value < 0.0F)
 344       return COND_LT;
 345    return COND_EQ;
 346 }
 347
 348
 349 /**
 350  * Test if the ccMaskRule is satisfied by the given condition code.
 351  * Used to mask destination writes according to the current condition code.
 352  */
 353 static INLINE GLboolean
 354 test_cc(GLuint condCode, GLuint ccMaskRule)
 355 {
 356    switch (ccMaskRule) {
 357    case COND_EQ: return (condCode == COND_EQ);
 358    case COND_NE: return (condCode != COND_EQ);
 359    case COND_LT: return (condCode == COND_LT);
 360    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 361    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 362    case COND_GT: return (condCode == COND_GT);
 363    case COND_TR: return GL_TRUE;
 364    case COND_FL: return GL_FALSE;
 365    default:      return GL_TRUE;
 366    }
 367 }
 368
 369
 370 /**
 371  * Evaluate the 4 condition codes against a predicate and return GL_TRUE
 372  * or GL_FALSE to indicate result.
 373  */
 374 static INLINE GLboolean
 375 eval_condition(const struct gl_program_machine *machine,
 376                const struct prog_instruction *inst)
 377 {
 378    const GLuint swizzle = inst->DstReg.CondSwizzle;
 379    const GLuint condMask = inst->DstReg.CondMask;
 380    if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 381        test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 382        test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 383        test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 384       return GL_TRUE;
 385    }
 386    else {
 387       return GL_FALSE;
 388    }
 389 }
 390
 391
 392
 393 /**
 394  * Store 4 floats into a register.  Observe the instructions saturate and
 395  * set-condition-code flags.
 396  */
 397 static void
 398 store_vector4(const struct prog_instruction *inst,
 399               struct gl_program_machine *machine, const GLfloat value[4])
 400 {
 401    const struct prog_dst_register *dest = &(inst->DstReg);
 402    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 403    GLfloat *dstReg;
 404    GLfloat dummyReg[4];
 405    GLfloat clampedValue[4];
 406    GLuint writeMask = dest->WriteMask;
 407
 408    switch (dest->File) {
 409    case PROGRAM_OUTPUT:
 410       ASSERT(dest->Index < MAX_PROGRAM_OUTPUTS);
 411       dstReg = machine->Outputs[dest->Index];
 412       break;
 413    case PROGRAM_TEMPORARY:
 414       ASSERT(dest->Index < MAX_PROGRAM_TEMPS);
 415       dstReg = machine->Temporaries[dest->Index];
 416       break;
 417    case PROGRAM_WRITE_ONLY:
 418       dstReg = dummyReg;
 419       return;
 420    default:
 421       _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 422       return;
 423    }
 424
 425 #if 0
 426    if (value[0] > 1.0e10 ||
 427        IS_INF_OR_NAN(value[0]) ||
 428        IS_INF_OR_NAN(value[1]) ||
 429        IS_INF_OR_NAN(value[2]) || IS_INF_OR_NAN(value[3]))
 430       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 431 #endif
 432
 433    if (clamp) {
 434       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 435       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 436       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 437       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 438       value = clampedValue;
 439    }
 440
 441    if (dest->CondMask != COND_TR) {
 442       /* condition codes may turn off some writes */
 443       if (writeMask & WRITEMASK_X) {
 444          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 0)],
 445                       dest->CondMask))
 446             writeMask &= ~WRITEMASK_X;
 447       }
 448       if (writeMask & WRITEMASK_Y) {
 449          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 1)],
 450                       dest->CondMask))
 451             writeMask &= ~WRITEMASK_Y;
 452       }
 453       if (writeMask & WRITEMASK_Z) {
 454          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 2)],
 455                       dest->CondMask))
 456             writeMask &= ~WRITEMASK_Z;
 457       }
 458       if (writeMask & WRITEMASK_W) {
 459          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 3)],
 460                       dest->CondMask))
 461             writeMask &= ~WRITEMASK_W;
 462       }
 463    }
 464
 465    if (writeMask & WRITEMASK_X)
 466       dstReg[0] = value[0];
 467    if (writeMask & WRITEMASK_Y)
 468       dstReg[1] = value[1];
 469    if (writeMask & WRITEMASK_Z)
 470       dstReg[2] = value[2];
 471    if (writeMask & WRITEMASK_W)
 472       dstReg[3] = value[3];
 473
 474    if (inst->CondUpdate) {
 475       if (writeMask & WRITEMASK_X)
 476          machine->CondCodes[0] = generate_cc(value[0]);
 477       if (writeMask & WRITEMASK_Y)
 478          machine->CondCodes[1] = generate_cc(value[1]);
 479       if (writeMask & WRITEMASK_Z)
 480          machine->CondCodes[2] = generate_cc(value[2]);
 481       if (writeMask & WRITEMASK_W)
 482          machine->CondCodes[3] = generate_cc(value[3]);
 483 #if DEBUG_PROG
 484       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 485              _mesa_condcode_string(machine->CondCodes[0]),
 486              _mesa_condcode_string(machine->CondCodes[1]),
 487              _mesa_condcode_string(machine->CondCodes[2]),
 488              _mesa_condcode_string(machine->CondCodes[3]));
 489 #endif
 490    }
 491 }
 492
 493
 494 /**
 495  * Execute the given vertex/fragment program.
 496  *
 497  * \param ctx  rendering context
 498  * \param program  the program to execute
 499  * \param machine  machine state (must be initialized)
 500  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 501  */
 502 GLboolean
 503 _mesa_execute_program(GLcontext * ctx,
 504                       const struct gl_program *program,
 505                       struct gl_program_machine *machine)
 506 {
 507    const GLuint numInst = program->NumInstructions;
 508    const GLuint maxExec = 10000;
 509    GLint pc, numExec = 0;
 510
 511    machine->CurProgram = program;
 512
 513    if (DEBUG_PROG) {
 514       printf("execute program %u --------------------\n", program->Id);
 515    }
 516
 517 #if FEATURE_MESA_program_debug
 518    CurrentMachine = machine;
 519 #endif
 520
 521    if (program->Target == GL_VERTEX_PROGRAM_ARB) {
 522       machine->EnvParams = ctx->VertexProgram.Parameters;
 523    }
 524    else {
 525       machine->EnvParams = ctx->FragmentProgram.Parameters;
 526    }
 527
 528    for (pc = 0; pc < numInst; pc++) {
 529       const struct prog_instruction *inst = program->Instructions + pc;
 530
 531 #if FEATURE_MESA_program_debug
 532       if (ctx->FragmentProgram.CallbackEnabled &&
 533           ctx->FragmentProgram.Callback) {
 534          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 535          ctx->FragmentProgram.Callback(program->Target,
 536                                        ctx->FragmentProgram.CallbackData);
 537       }
 538 #endif
 539
 540       if (DEBUG_PROG) {
 541          _mesa_print_instruction(inst);
 542       }
 543
 544       switch (inst->Opcode) {
 545       case OPCODE_ABS:
 546          {
 547             GLfloat a[4], result[4];
 548             fetch_vector4(&inst->SrcReg[0], machine, a);
 549             result[0] = FABSF(a[0]);
 550             result[1] = FABSF(a[1]);
 551             result[2] = FABSF(a[2]);
 552             result[3] = FABSF(a[3]);
 553             store_vector4(inst, machine, result);
 554          }
 555          break;
 556       case OPCODE_ADD:
 557          {
 558             GLfloat a[4], b[4], result[4];
 559             fetch_vector4(&inst->SrcReg[0], machine, a);
 560             fetch_vector4(&inst->SrcReg[1], machine, b);
 561             result[0] = a[0] + b[0];
 562             result[1] = a[1] + b[1];
 563             result[2] = a[2] + b[2];
 564             result[3] = a[3] + b[3];
 565             store_vector4(inst, machine, result);
 566             if (DEBUG_PROG) {
 567                printf("ADD (%g %g %g %g) = (%g %g %g %g) + (%g %g %g %g)\n",
 568                       result[0], result[1], result[2], result[3],
 569                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 570             }
 571          }
 572          break;
 573       case OPCODE_ARL:
 574          {
 575             GLfloat t[4];
 576             fetch_vector4(&inst->SrcReg[0], machine, t);
 577             machine->AddressReg[0][0] = (GLint) FLOORF(t[0]);
 578          }
 579          break;
 580       case OPCODE_BGNLOOP:
 581          /* no-op */
 582          break;
 583       case OPCODE_ENDLOOP:
 584          /* subtract 1 here since pc is incremented by for(pc) loop */
 585          pc = inst->BranchTarget - 1;   /* go to matching BNGLOOP */
 586          break;
 587       case OPCODE_BGNSUB:      /* begin subroutine */
 588          break;
 589       case OPCODE_ENDSUB:      /* end subroutine */
 590          break;
 591       case OPCODE_BRA:         /* branch (conditional) */
 592          /* fall-through */
 593       case OPCODE_BRK:         /* break out of loop (conditional) */
 594          /* fall-through */
 595       case OPCODE_CONT:        /* continue loop (conditional) */
 596          if (eval_condition(machine, inst)) {
 597             /* take branch */
 598             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 599             pc = inst->BranchTarget - 1;
 600          }
 601          break;
 602       case OPCODE_CAL:         /* Call subroutine (conditional) */
 603          if (eval_condition(machine, inst)) {
 604             /* call the subroutine */
 605             if (machine->StackDepth >= MAX_PROGRAM_CALL_DEPTH) {
 606                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
 607             }
 608             machine->CallStack[machine->StackDepth++] = pc + 1; /* next inst */
 609             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 610             pc = inst->BranchTarget - 1;
 611          }
 612          break;
 613       case OPCODE_CMP:
 614          {
 615             GLfloat a[4], b[4], c[4], result[4];
 616             fetch_vector4(&inst->SrcReg[0], machine, a);
 617             fetch_vector4(&inst->SrcReg[1], machine, b);
 618             fetch_vector4(&inst->SrcReg[2], machine, c);
 619             result[0] = a[0] < 0.0F ? b[0] : c[0];
 620             result[1] = a[1] < 0.0F ? b[1] : c[1];
 621             result[2] = a[2] < 0.0F ? b[2] : c[2];
 622             result[3] = a[3] < 0.0F ? b[3] : c[3];
 623             store_vector4(inst, machine, result);
 624          }
 625          break;
 626       case OPCODE_COS:
 627          {
 628             GLfloat a[4], result[4];
 629             fetch_vector1(&inst->SrcReg[0], machine, a);
 630             result[0] = result[1] = result[2] = result[3]
 631                = (GLfloat) _mesa_cos(a[0]);
 632             store_vector4(inst, machine, result);
 633          }
 634          break;
 635       case OPCODE_DDX:         /* Partial derivative with respect to X */
 636          {
 637             GLfloat result[4];
 638             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 639                                 'X', result);
 640             store_vector4(inst, machine, result);
 641          }
 642          break;
 643       case OPCODE_DDY:         /* Partial derivative with respect to Y */
 644          {
 645             GLfloat result[4];
 646             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 647                                 'Y', result);
 648             store_vector4(inst, machine, result);
 649          }
 650          break;
 651       case OPCODE_DP3:
 652          {
 653             GLfloat a[4], b[4], result[4];
 654             fetch_vector4(&inst->SrcReg[0], machine, a);
 655             fetch_vector4(&inst->SrcReg[1], machine, b);
 656             result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
 657             store_vector4(inst, machine, result);
 658             if (DEBUG_PROG) {
 659                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 660                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 661             }
 662          }
 663          break;
 664       case OPCODE_DP4:
 665          {
 666             GLfloat a[4], b[4], result[4];
 667             fetch_vector4(&inst->SrcReg[0], machine, a);
 668             fetch_vector4(&inst->SrcReg[1], machine, b);
 669             result[0] = result[1] = result[2] = result[3] = DOT4(a, b);
 670             store_vector4(inst, machine, result);
 671             if (DEBUG_PROG) {
 672                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 673                       result[0], a[0], a[1], a[2], a[3],
 674                       b[0], b[1], b[2], b[3]);
 675             }
 676          }
 677          break;
 678       case OPCODE_DPH:
 679          {
 680             GLfloat a[4], b[4], result[4];
 681             fetch_vector4(&inst->SrcReg[0], machine, a);
 682             fetch_vector4(&inst->SrcReg[1], machine, b);
 683             result[0] = result[1] = result[2] = result[3] =
 684                a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
 685             store_vector4(inst, machine, result);
 686          }
 687          break;
 688       case OPCODE_DST:         /* Distance vector */
 689          {
 690             GLfloat a[4], b[4], result[4];
 691             fetch_vector4(&inst->SrcReg[0], machine, a);
 692             fetch_vector4(&inst->SrcReg[1], machine, b);
 693             result[0] = 1.0F;
 694             result[1] = a[1] * b[1];
 695             result[2] = a[2];
 696             result[3] = b[3];
 697             store_vector4(inst, machine, result);
 698          }
 699          break;
 700       case OPCODE_EXP:
 701          {
 702             GLfloat t[4], q[4], floor_t0;
 703             fetch_vector1(&inst->SrcReg[0], machine, t);
 704             floor_t0 = FLOORF(t[0]);
 705             if (floor_t0 > FLT_MAX_EXP) {
 706                SET_POS_INFINITY(q[0]);
 707                SET_POS_INFINITY(q[2]);
 708             }
 709             else if (floor_t0 < FLT_MIN_EXP) {
 710                q[0] = 0.0F;
 711                q[2] = 0.0F;
 712             }
 713             else {
 714                q[0] = LDEXPF(1.0, (int) floor_t0);
 715                /* Note: GL_NV_vertex_program expects
 716                 * result.z = result.x * APPX(result.y)
 717                 * We do what the ARB extension says.
 718                 */
 719                q[2] = pow(2.0, t[0]);
 720             }
 721             q[1] = t[0] - floor_t0;
 722             q[3] = 1.0F;
 723             store_vector4( inst, machine, q );
 724          }
 725          break;
 726       case OPCODE_EX2:         /* Exponential base 2 */
 727          {
 728             GLfloat a[4], result[4];
 729             fetch_vector1(&inst->SrcReg[0], machine, a);
 730             result[0] = result[1] = result[2] = result[3] =
 731                (GLfloat) _mesa_pow(2.0, a[0]);
 732             store_vector4(inst, machine, result);
 733          }
 734          break;
 735       case OPCODE_FLR:
 736          {
 737             GLfloat a[4], result[4];
 738             fetch_vector4(&inst->SrcReg[0], machine, a);
 739             result[0] = FLOORF(a[0]);
 740             result[1] = FLOORF(a[1]);
 741             result[2] = FLOORF(a[2]);
 742             result[3] = FLOORF(a[3]);
 743             store_vector4(inst, machine, result);
 744          }
 745          break;
 746       case OPCODE_FRC:
 747          {
 748             GLfloat a[4], result[4];
 749             fetch_vector4(&inst->SrcReg[0], machine, a);
 750             result[0] = a[0] - FLOORF(a[0]);
 751             result[1] = a[1] - FLOORF(a[1]);
 752             result[2] = a[2] - FLOORF(a[2]);
 753             result[3] = a[3] - FLOORF(a[3]);
 754             store_vector4(inst, machine, result);
 755          }
 756          break;
 757       case OPCODE_IF:
 758          {
 759             GLboolean cond;
 760             /* eval condition */
 761             if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
 762                GLfloat a[4];
 763                fetch_vector1(&inst->SrcReg[0], machine, a);
 764                cond = (a[0] != 0.0);
 765             }
 766             else {
 767                cond = eval_condition(machine, inst);
 768             }
 769             if (DEBUG_PROG) {
 770                printf("IF: %d\n", cond);
 771             }
 772             /* do if/else */
 773             if (cond) {
 774                /* do if-clause (just continue execution) */
 775             }
 776             else {
 777                /* go to the instruction after ELSE or ENDIF */
 778                assert(inst->BranchTarget >= 0);
 779                pc = inst->BranchTarget - 1;
 780             }
 781          }
 782          break;
 783       case OPCODE_ELSE:
 784          /* goto ENDIF */
 785          assert(inst->BranchTarget >= 0);
 786          pc = inst->BranchTarget - 1;
 787          break;
 788       case OPCODE_ENDIF:
 789          /* nothing */
 790          break;
 791       case OPCODE_INT:         /* float to int */
 792          {
 793             GLfloat a[4], result[4];
 794             fetch_vector4(&inst->SrcReg[0], machine, a);
 795             result[0] = (GLfloat) (GLint) a[0];
 796             result[1] = (GLfloat) (GLint) a[1];
 797             result[2] = (GLfloat) (GLint) a[2];
 798             result[3] = (GLfloat) (GLint) a[3];
 799             store_vector4(inst, machine, result);
 800          }
 801          break;
 802       case OPCODE_KIL_NV:      /* NV_f_p only (conditional) */
 803          if (eval_condition(machine, inst)) {
 804             return GL_FALSE;
 805          }
 806          break;
 807       case OPCODE_KIL:         /* ARB_f_p only */
 808          {
 809             GLfloat a[4];
 810             fetch_vector4(&inst->SrcReg[0], machine, a);
 811             if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 812                return GL_FALSE;
 813             }
 814          }
 815          break;
 816       case OPCODE_LG2:         /* log base 2 */
 817          {
 818             GLfloat a[4], result[4];
 819             fetch_vector1(&inst->SrcReg[0], machine, a);
 820             result[0] = result[1] = result[2] = result[3] = LOG2(a[0]);
 821             store_vector4(inst, machine, result);
 822          }
 823          break;
 824       case OPCODE_LIT:
 825          {
 826             const GLfloat epsilon = 1.0F / 256.0F;      /* from NV VP spec */
 827             GLfloat a[4], result[4];
 828             fetch_vector4(&inst->SrcReg[0], machine, a);
 829             a[0] = MAX2(a[0], 0.0F);
 830             a[1] = MAX2(a[1], 0.0F);
 831             /* XXX ARB version clamps a[3], NV version doesn't */
 832             a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 833             result[0] = 1.0F;
 834             result[1] = a[0];
 835             /* XXX we could probably just use pow() here */
 836             if (a[0] > 0.0F) {
 837                if (a[1] == 0.0 && a[3] == 0.0)
 838                   result[2] = 1.0;
 839                else
 840                   result[2] = EXPF(a[3] * LOGF(a[1]));
 841             }
 842             else {
 843                result[2] = 0.0;
 844             }
 845             result[3] = 1.0F;
 846             store_vector4(inst, machine, result);
 847             if (DEBUG_PROG) {
 848                printf("LIT (%g %g %g %g) : (%g %g %g %g)\n",
 849                       result[0], result[1], result[2], result[3],
 850                       a[0], a[1], a[2], a[3]);
 851             }
 852          }
 853          break;
 854       case OPCODE_LOG:
 855          {
 856             GLfloat t[4], q[4], abs_t0;
 857             fetch_vector1(&inst->SrcReg[0], machine, t);
 858             abs_t0 = FABSF(t[0]);
 859             if (abs_t0 != 0.0F) {
 860                /* Since we really can't handle infinite values on VMS
 861                 * like other OSes we'll use __MAXFLOAT to represent
 862                 * infinity.  This may need some tweaking.
 863                 */
 864 #ifdef VMS
 865                if (abs_t0 == __MAXFLOAT)
 866 #else
 867                if (IS_INF_OR_NAN(abs_t0))
 868 #endif
 869                {
 870                   SET_POS_INFINITY(q[0]);
 871                   q[1] = 1.0F;
 872                   SET_POS_INFINITY(q[2]);
 873                }
 874                else {
 875                   int exponent;
 876                   GLfloat mantissa = FREXPF(t[0], &exponent);
 877                   q[0] = (GLfloat) (exponent - 1);
 878                   q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 879                   q[2] = (GLfloat) (q[0] + LOG2(q[1]));
 880                }
 881             }
 882             else {
 883                SET_NEG_INFINITY(q[0]);
 884                q[1] = 1.0F;
 885                SET_NEG_INFINITY(q[2]);
 886             }
 887             q[3] = 1.0;
 888             store_vector4(inst, machine, q);
 889          }
 890          break;
 891       case OPCODE_LRP:
 892          {
 893             GLfloat a[4], b[4], c[4], result[4];
 894             fetch_vector4(&inst->SrcReg[0], machine, a);
 895             fetch_vector4(&inst->SrcReg[1], machine, b);
 896             fetch_vector4(&inst->SrcReg[2], machine, c);
 897             result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 898             result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 899             result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 900             result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 901             store_vector4(inst, machine, result);
 902             if (DEBUG_PROG) {
 903                printf("LRP (%g %g %g %g) = (%g %g %g %g), "
 904                       "(%g %g %g %g), (%g %g %g %g)\n",
 905                       result[0], result[1], result[2], result[3],
 906                       a[0], a[1], a[2], a[3],
 907                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
 908             }
 909          }
 910          break;
 911       case OPCODE_MAD:
 912          {
 913             GLfloat a[4], b[4], c[4], result[4];
 914             fetch_vector4(&inst->SrcReg[0], machine, a);
 915             fetch_vector4(&inst->SrcReg[1], machine, b);
 916             fetch_vector4(&inst->SrcReg[2], machine, c);
 917             result[0] = a[0] * b[0] + c[0];
 918             result[1] = a[1] * b[1] + c[1];
 919             result[2] = a[2] * b[2] + c[2];
 920             result[3] = a[3] * b[3] + c[3];
 921             store_vector4(inst, machine, result);
 922             if (DEBUG_PROG) {
 923                printf("MAD (%g %g %g %g) = (%g %g %g %g) * "
 924                       "(%g %g %g %g) + (%g %g %g %g)\n",
 925                       result[0], result[1], result[2], result[3],
 926                       a[0], a[1], a[2], a[3],
 927                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
 928             }
 929          }
 930          break;
 931       case OPCODE_MAX:
 932          {
 933             GLfloat a[4], b[4], result[4];
 934             fetch_vector4(&inst->SrcReg[0], machine, a);
 935             fetch_vector4(&inst->SrcReg[1], machine, b);
 936             result[0] = MAX2(a[0], b[0]);
 937             result[1] = MAX2(a[1], b[1]);
 938             result[2] = MAX2(a[2], b[2]);
 939             result[3] = MAX2(a[3], b[3]);
 940             store_vector4(inst, machine, result);
 941             if (DEBUG_PROG) {
 942                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
 943                       result[0], result[1], result[2], result[3],
 944                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 945             }
 946          }
 947          break;
 948       case OPCODE_MIN:
 949          {
 950             GLfloat a[4], b[4], result[4];
 951             fetch_vector4(&inst->SrcReg[0], machine, a);
 952             fetch_vector4(&inst->SrcReg[1], machine, b);
 953             result[0] = MIN2(a[0], b[0]);
 954             result[1] = MIN2(a[1], b[1]);
 955             result[2] = MIN2(a[2], b[2]);
 956             result[3] = MIN2(a[3], b[3]);
 957             store_vector4(inst, machine, result);
 958          }
 959          break;
 960       case OPCODE_MOV:
 961          {
 962             GLfloat result[4];
 963             fetch_vector4(&inst->SrcReg[0], machine, result);
 964             store_vector4(inst, machine, result);
 965             if (DEBUG_PROG) {
 966                printf("MOV (%g %g %g %g)\n",
 967                       result[0], result[1], result[2], result[3]);
 968             }
 969          }
 970          break;
 971       case OPCODE_MUL:
 972          {
 973             GLfloat a[4], b[4], result[4];
 974             fetch_vector4(&inst->SrcReg[0], machine, a);
 975             fetch_vector4(&inst->SrcReg[1], machine, b);
 976             result[0] = a[0] * b[0];
 977             result[1] = a[1] * b[1];
 978             result[2] = a[2] * b[2];
 979             result[3] = a[3] * b[3];
 980             store_vector4(inst, machine, result);
 981             if (DEBUG_PROG) {
 982                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 983                       result[0], result[1], result[2], result[3],
 984                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 985             }
 986          }
 987          break;
 988       case OPCODE_NOISE1:
 989          {
 990             GLfloat a[4], result[4];
 991             fetch_vector1(&inst->SrcReg[0], machine, a);
 992             result[0] =
 993                result[1] =
 994                result[2] = result[3] = _slang_library_noise1(a[0]);
 995             store_vector4(inst, machine, result);
 996          }
 997          break;
 998       case OPCODE_NOISE2:
 999          {
1000             GLfloat a[4], result[4];
1001             fetch_vector4(&inst->SrcReg[0], machine, a);
1002             result[0] =
1003                result[1] =
1004                result[2] = result[3] = _slang_library_noise2(a[0], a[1]);
1005             store_vector4(inst, machine, result);
1006          }
1007          break;
1008       case OPCODE_NOISE3:
1009          {
1010             GLfloat a[4], result[4];
1011             fetch_vector4(&inst->SrcReg[0], machine, a);
1012             result[0] =
1013                result[1] =
1014                result[2] =
1015                result[3] = _slang_library_noise3(a[0], a[1], a[2]);
1016             store_vector4(inst, machine, result);
1017          }
1018          break;
1019       case OPCODE_NOISE4:
1020          {
1021             GLfloat a[4], result[4];
1022             fetch_vector4(&inst->SrcReg[0], machine, a);
1023             result[0] =
1024                result[1] =
1025                result[2] =
1026                result[3] = _slang_library_noise4(a[0], a[1], a[2], a[3]);
1027             store_vector4(inst, machine, result);
1028          }
1029          break;
1030       case OPCODE_NOP:
1031          break;
1032       case OPCODE_PK2H:        /* pack two 16-bit floats in one 32-bit float */
1033          {
1034             GLfloat a[4], result[4];
1035             GLhalfNV hx, hy;
1036             GLuint *rawResult = (GLuint *) result;
1037             GLuint twoHalves;
1038             fetch_vector4(&inst->SrcReg[0], machine, a);
1039             hx = _mesa_float_to_half(a[0]);
1040             hy = _mesa_float_to_half(a[1]);
1041             twoHalves = hx | (hy << 16);
1042             rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1043                = twoHalves;
1044             store_vector4(inst, machine, result);
1045          }
1046          break;
1047       case OPCODE_PK2US:       /* pack two GLushorts into one 32-bit float */
1048          {
1049             GLfloat a[4], result[4];
1050             GLuint usx, usy, *rawResult = (GLuint *) result;
1051             fetch_vector4(&inst->SrcReg[0], machine, a);
1052             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1053             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1054             usx = IROUND(a[0] * 65535.0F);
1055             usy = IROUND(a[1] * 65535.0F);
1056             rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1057                = usx | (usy << 16);
1058             store_vector4(inst, machine, result);
1059          }
1060          break;
1061       case OPCODE_PK4B:        /* pack four GLbytes into one 32-bit float */
1062          {
1063             GLfloat a[4], result[4];
1064             GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1065             fetch_vector4(&inst->SrcReg[0], machine, a);
1066             a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
1067             a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
1068             a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
1069             a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
1070             ubx = IROUND(127.0F * a[0] + 128.0F);
1071             uby = IROUND(127.0F * a[1] + 128.0F);
1072             ubz = IROUND(127.0F * a[2] + 128.0F);
1073             ubw = IROUND(127.0F * a[3] + 128.0F);
1074             rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1075                = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1076             store_vector4(inst, machine, result);
1077          }
1078          break;
1079       case OPCODE_PK4UB:       /* pack four GLubytes into one 32-bit float */
1080          {
1081             GLfloat a[4], result[4];
1082             GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1083             fetch_vector4(&inst->SrcReg[0], machine, a);
1084             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1085             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1086             a[2] = CLAMP(a[2], 0.0F, 1.0F);
1087             a[3] = CLAMP(a[3], 0.0F, 1.0F);
1088             ubx = IROUND(255.0F * a[0]);
1089             uby = IROUND(255.0F * a[1]);
1090             ubz = IROUND(255.0F * a[2]);
1091             ubw = IROUND(255.0F * a[3]);
1092             rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1093                = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1094             store_vector4(inst, machine, result);
1095          }
1096          break;
1097       case OPCODE_POW:
1098          {
1099             GLfloat a[4], b[4], result[4];
1100             fetch_vector1(&inst->SrcReg[0], machine, a);
1101             fetch_vector1(&inst->SrcReg[1], machine, b);
1102             result[0] = result[1] = result[2] = result[3]
1103                = (GLfloat) _mesa_pow(a[0], b[0]);
1104             store_vector4(inst, machine, result);
1105          }
1106          break;
1107       case OPCODE_RCP:
1108          {
1109             GLfloat a[4], result[4];
1110             fetch_vector1(&inst->SrcReg[0], machine, a);
1111             if (DEBUG_PROG) {
1112                if (a[0] == 0)
1113                   printf("RCP(0)\n");
1114                else if (IS_INF_OR_NAN(a[0]))
1115                   printf("RCP(inf)\n");
1116             }
1117             result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
1118             store_vector4(inst, machine, result);
1119          }
1120          break;
1121       case OPCODE_RET:         /* return from subroutine (conditional) */
1122          if (eval_condition(machine, inst)) {
1123             if (machine->StackDepth == 0) {
1124                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
1125             }
1126             /* subtract one because of pc++ in the for loop */
1127             pc = machine->CallStack[--machine->StackDepth] - 1;
1128          }
1129          break;
1130       case OPCODE_RFL:         /* reflection vector */
1131          {
1132             GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
1133             fetch_vector4(&inst->SrcReg[0], machine, axis);
1134             fetch_vector4(&inst->SrcReg[1], machine, dir);
1135             tmpW = DOT3(axis, axis);
1136             tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
1137             result[0] = tmpX * axis[0] - dir[0];
1138             result[1] = tmpX * axis[1] - dir[1];
1139             result[2] = tmpX * axis[2] - dir[2];
1140             /* result[3] is never written! XXX enforce in parser! */
1141             store_vector4(inst, machine, result);
1142          }
1143          break;
1144       case OPCODE_RSQ:         /* 1 / sqrt() */
1145          {
1146             GLfloat a[4], result[4];
1147             fetch_vector1(&inst->SrcReg[0], machine, a);
1148             a[0] = FABSF(a[0]);
1149             result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1150             store_vector4(inst, machine, result);
1151             if (DEBUG_PROG) {
1152                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1153             }
1154          }
1155          break;
1156       case OPCODE_SCS:         /* sine and cos */
1157          {
1158             GLfloat a[4], result[4];
1159             fetch_vector1(&inst->SrcReg[0], machine, a);
1160             result[0] = (GLfloat) _mesa_cos(a[0]);
1161             result[1] = (GLfloat) _mesa_sin(a[0]);
1162             result[2] = 0.0;    /* undefined! */
1163             result[3] = 0.0;    /* undefined! */
1164             store_vector4(inst, machine, result);
1165          }
1166          break;
1167       case OPCODE_SEQ:         /* set on equal */
1168          {
1169             GLfloat a[4], b[4], result[4];
1170             fetch_vector4(&inst->SrcReg[0], machine, a);
1171             fetch_vector4(&inst->SrcReg[1], machine, b);
1172             result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1173             result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1174             result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1175             result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1176             store_vector4(inst, machine, result);
1177             if (DEBUG_PROG) {
1178                printf("SEQ (%g %g %g %g) = (%g %g %g %g) == (%g %g %g %g)\n",
1179                       result[0], result[1], result[2], result[3],
1180                       a[0], a[1], a[2], a[3],
1181                       b[0], b[1], b[2], b[3]);
1182             }
1183          }
1184          break;
1185       case OPCODE_SFL:         /* set false, operands ignored */
1186          {
1187             static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1188             store_vector4(inst, machine, result);
1189          }
1190          break;
1191       case OPCODE_SGE:         /* set on greater or equal */
1192          {
1193             GLfloat a[4], b[4], result[4];
1194             fetch_vector4(&inst->SrcReg[0], machine, a);
1195             fetch_vector4(&inst->SrcReg[1], machine, b);
1196             result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1197             result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1198             result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1199             result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1200             store_vector4(inst, machine, result);
1201             if (DEBUG_PROG) {
1202                printf("SGE (%g %g %g %g) = (%g %g %g %g) >= (%g %g %g %g)\n",
1203                       result[0], result[1], result[2], result[3],
1204                       a[0], a[1], a[2], a[3],
1205                       b[0], b[1], b[2], b[3]);
1206             }
1207          }
1208          break;
1209       case OPCODE_SGT:         /* set on greater */
1210          {
1211             GLfloat a[4], b[4], result[4];
1212             fetch_vector4(&inst->SrcReg[0], machine, a);
1213             fetch_vector4(&inst->SrcReg[1], machine, b);
1214             result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1215             result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1216             result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1217             result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1218             store_vector4(inst, machine, result);
1219             if (DEBUG_PROG) {
1220                printf("SGT (%g %g %g %g) = (%g %g %g %g) > (%g %g %g %g)\n",
1221                       result[0], result[1], result[2], result[3],
1222                       a[0], a[1], a[2], a[3],
1223                       b[0], b[1], b[2], b[3]);
1224             }
1225          }
1226          break;
1227       case OPCODE_SIN:
1228          {
1229             GLfloat a[4], result[4];
1230             fetch_vector1(&inst->SrcReg[0], machine, a);
1231             result[0] = result[1] = result[2] = result[3]
1232                = (GLfloat) _mesa_sin(a[0]);
1233             store_vector4(inst, machine, result);
1234          }
1235          break;
1236       case OPCODE_SLE:         /* set on less or equal */
1237          {
1238             GLfloat a[4], b[4], result[4];
1239             fetch_vector4(&inst->SrcReg[0], machine, a);
1240             fetch_vector4(&inst->SrcReg[1], machine, b);
1241             result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1242             result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1243             result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1244             result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1245             store_vector4(inst, machine, result);
1246             if (DEBUG_PROG) {
1247                printf("SLE (%g %g %g %g) = (%g %g %g %g) <= (%g %g %g %g)\n",
1248                       result[0], result[1], result[2], result[3],
1249                       a[0], a[1], a[2], a[3],
1250                       b[0], b[1], b[2], b[3]);
1251             }
1252          }
1253          break;
1254       case OPCODE_SLT:         /* set on less */
1255          {
1256             GLfloat a[4], b[4], result[4];
1257             fetch_vector4(&inst->SrcReg[0], machine, a);
1258             fetch_vector4(&inst->SrcReg[1], machine, b);
1259             result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1260             result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1261             result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1262             result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1263             store_vector4(inst, machine, result);
1264             if (DEBUG_PROG) {
1265                printf("SLT (%g %g %g %g) = (%g %g %g %g) < (%g %g %g %g)\n",
1266                       result[0], result[1], result[2], result[3],
1267                       a[0], a[1], a[2], a[3],
1268                       b[0], b[1], b[2], b[3]);
1269             }
1270          }
1271          break;
1272       case OPCODE_SNE:         /* set on not equal */
1273          {
1274             GLfloat a[4], b[4], result[4];
1275             fetch_vector4(&inst->SrcReg[0], machine, a);
1276             fetch_vector4(&inst->SrcReg[1], machine, b);
1277             result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1278             result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1279             result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1280             result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1281             store_vector4(inst, machine, result);
1282             if (DEBUG_PROG) {
1283                printf("SNE (%g %g %g %g) = (%g %g %g %g) != (%g %g %g %g)\n",
1284                       result[0], result[1], result[2], result[3],
1285                       a[0], a[1], a[2], a[3],
1286                       b[0], b[1], b[2], b[3]);
1287             }
1288          }
1289          break;
1290       case OPCODE_STR:         /* set true, operands ignored */
1291          {
1292             static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1293             store_vector4(inst, machine, result);
1294          }
1295          break;
1296       case OPCODE_SUB:
1297          {
1298             GLfloat a[4], b[4], result[4];
1299             fetch_vector4(&inst->SrcReg[0], machine, a);
1300             fetch_vector4(&inst->SrcReg[1], machine, b);
1301             result[0] = a[0] - b[0];
1302             result[1] = a[1] - b[1];
1303             result[2] = a[2] - b[2];
1304             result[3] = a[3] - b[3];
1305             store_vector4(inst, machine, result);
1306             if (DEBUG_PROG) {
1307                printf("SUB (%g %g %g %g) = (%g %g %g %g) - (%g %g %g %g)\n",
1308                       result[0], result[1], result[2], result[3],
1309                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1310             }
1311          }
1312          break;
1313       case OPCODE_SWZ:         /* extended swizzle */
1314          {
1315             const struct prog_src_register *source = &inst->SrcReg[0];
1316             const GLfloat *src = get_register_pointer(source, machine);
1317             GLfloat result[4];
1318             GLuint i;
1319             for (i = 0; i < 4; i++) {
1320                const GLuint swz = GET_SWZ(source->Swizzle, i);
1321                if (swz == SWIZZLE_ZERO)
1322                   result[i] = 0.0;
1323                else if (swz == SWIZZLE_ONE)
1324                   result[i] = 1.0;
1325                else {
1326                   ASSERT(swz >= 0);
1327                   ASSERT(swz <= 3);
1328                   result[i] = src[swz];
1329                }
1330                if (source->NegateBase & (1 << i))
1331                   result[i] = -result[i];
1332             }
1333             store_vector4(inst, machine, result);
1334          }
1335          break;
1336       case OPCODE_TEX:         /* Both ARB and NV frag prog */
1337          /* Simple texel lookup */
1338          {
1339             GLfloat texcoord[4], color[4];
1340             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1341
1342             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1343
1344             if (DEBUG_PROG) {
1345                printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g]\n",
1346                       color[0], color[1], color[2], color[3],
1347                       inst->TexSrcUnit,
1348                       texcoord[0], texcoord[1], texcoord[2], texcoord[3]);
1349             }
1350             store_vector4(inst, machine, color);
1351          }
1352          break;
1353       case OPCODE_TXB:         /* GL_ARB_fragment_program only */
1354          /* Texel lookup with LOD bias */
1355          {
1356             const struct gl_texture_unit *texUnit
1357                = &ctx->Texture.Unit[inst->TexSrcUnit];
1358             GLfloat texcoord[4], color[4], lodBias;
1359
1360             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1361
1362             /* texcoord[3] is the bias to add to lambda */
1363             lodBias = texUnit->LodBias + texcoord[3];
1364             if (texUnit->_Current) {
1365                lodBias += texUnit->_Current->LodBias;
1366             }
1367
1368             fetch_texel(ctx, machine, inst, texcoord, lodBias, color);
1369
1370             store_vector4(inst, machine, color);
1371          }
1372          break;
1373       case OPCODE_TXD:         /* GL_NV_fragment_program only */
1374          /* Texture lookup w/ partial derivatives for LOD */
1375          {
1376             GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1377             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1378             fetch_vector4(&inst->SrcReg[1], machine, dtdx);
1379             fetch_vector4(&inst->SrcReg[2], machine, dtdy);
1380             machine->FetchTexelDeriv(ctx, texcoord, dtdx, dtdy,
1381                                      0.0, /* lodBias */
1382                                      inst->TexSrcUnit, color);
1383             store_vector4(inst, machine, color);
1384          }
1385          break;
1386       case OPCODE_TXP:         /* GL_ARB_fragment_program only */
1387          /* Texture lookup w/ projective divide */
1388          {
1389             GLfloat texcoord[4], color[4];
1390
1391             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1392             /* Not so sure about this test - if texcoord[3] is
1393              * zero, we'd probably be fine except for an ASSERT in
1394              * IROUND_POS() which gets triggered by the inf values created.
1395              */
1396             if (texcoord[3] != 0.0) {
1397                texcoord[0] /= texcoord[3];
1398                texcoord[1] /= texcoord[3];
1399                texcoord[2] /= texcoord[3];
1400             }
1401
1402             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1403
1404             store_vector4(inst, machine, color);
1405          }
1406          break;
1407       case OPCODE_TXP_NV:      /* GL_NV_fragment_program only */
1408          /* Texture lookup w/ projective divide, as above, but do not
1409           * do the divide by w if sampling from a cube map.
1410           */
1411          {
1412             GLfloat texcoord[4], color[4];
1413
1414             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1415             if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1416                 texcoord[3] != 0.0) {
1417                texcoord[0] /= texcoord[3];
1418                texcoord[1] /= texcoord[3];
1419                texcoord[2] /= texcoord[3];
1420             }
1421
1422             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1423
1424             store_vector4(inst, machine, color);
1425          }
1426          break;
1427       case OPCODE_UP2H:        /* unpack two 16-bit floats */
1428          {
1429             GLfloat a[4], result[4];
1430             const GLuint *rawBits = (const GLuint *) a;
1431             GLhalfNV hx, hy;
1432             fetch_vector1(&inst->SrcReg[0], machine, a);
1433             hx = rawBits[0] & 0xffff;
1434             hy = rawBits[0] >> 16;
1435             result[0] = result[2] = _mesa_half_to_float(hx);
1436             result[1] = result[3] = _mesa_half_to_float(hy);
1437             store_vector4(inst, machine, result);
1438          }
1439          break;
1440       case OPCODE_UP2US:       /* unpack two GLushorts */
1441          {
1442             GLfloat a[4], result[4];
1443             const GLuint *rawBits = (const GLuint *) a;
1444             GLushort usx, usy;
1445             fetch_vector1(&inst->SrcReg[0], machine, a);
1446             usx = rawBits[0] & 0xffff;
1447             usy = rawBits[0] >> 16;
1448             result[0] = result[2] = usx * (1.0f / 65535.0f);
1449             result[1] = result[3] = usy * (1.0f / 65535.0f);
1450             store_vector4(inst, machine, result);
1451          }
1452          break;
1453       case OPCODE_UP4B:        /* unpack four GLbytes */
1454          {
1455             GLfloat a[4], result[4];
1456             const GLuint *rawBits = (const GLuint *) a;
1457             fetch_vector1(&inst->SrcReg[0], machine, a);
1458             result[0] = (((rawBits[0] >> 0) & 0xff) - 128) / 127.0F;
1459             result[1] = (((rawBits[0] >> 8) & 0xff) - 128) / 127.0F;
1460             result[2] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1461             result[3] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1462             store_vector4(inst, machine, result);
1463          }
1464          break;
1465       case OPCODE_UP4UB:       /* unpack four GLubytes */
1466          {
1467             GLfloat a[4], result[4];
1468             const GLuint *rawBits = (const GLuint *) a;
1469             fetch_vector1(&inst->SrcReg[0], machine, a);
1470             result[0] = ((rawBits[0] >> 0) & 0xff) / 255.0F;
1471             result[1] = ((rawBits[0] >> 8) & 0xff) / 255.0F;
1472             result[2] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1473             result[3] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1474             store_vector4(inst, machine, result);
1475          }
1476          break;
1477       case OPCODE_XPD:         /* cross product */
1478          {
1479             GLfloat a[4], b[4], result[4];
1480             fetch_vector4(&inst->SrcReg[0], machine, a);
1481             fetch_vector4(&inst->SrcReg[1], machine, b);
1482             result[0] = a[1] * b[2] - a[2] * b[1];
1483             result[1] = a[2] * b[0] - a[0] * b[2];
1484             result[2] = a[0] * b[1] - a[1] * b[0];
1485             result[3] = 1.0;
1486             store_vector4(inst, machine, result);
1487             if (DEBUG_PROG) {
1488                printf("XPD (%g %g %g %g) = (%g %g %g) X (%g %g %g)\n",
1489                       result[0], result[1], result[2], result[3],
1490                       a[0], a[1], a[2], b[0], b[1], b[2]);
1491             }
1492          }
1493          break;
1494       case OPCODE_X2D:         /* 2-D matrix transform */
1495          {
1496             GLfloat a[4], b[4], c[4], result[4];
1497             fetch_vector4(&inst->SrcReg[0], machine, a);
1498             fetch_vector4(&inst->SrcReg[1], machine, b);
1499             fetch_vector4(&inst->SrcReg[2], machine, c);
1500             result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1501             result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1502             result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1503             result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1504             store_vector4(inst, machine, result);
1505          }
1506          break;
1507       case OPCODE_PRINT:
1508          {
1509             if (inst->SrcReg[0].File != -1) {
1510                GLfloat a[4];
1511                fetch_vector4(&inst->SrcReg[0], machine, a);
1512                _mesa_printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
1513                             a[0], a[1], a[2], a[3]);
1514             }
1515             else {
1516                _mesa_printf("%s\n", (const char *) inst->Data);
1517             }
1518          }
1519          break;
1520       case OPCODE_END:
1521          return GL_TRUE;
1522       default:
1523          _mesa_problem(ctx, "Bad opcode %d in _mesa_execute_program",
1524                        inst->Opcode);
1525                        assert(0);
1526          return GL_TRUE;        /* return value doesn't matter */
1527
1528       }
1529
1530       numExec++;
1531       if (numExec > maxExec) {
1532          _mesa_problem(ctx, "Infinite loop detected in fragment program");
1533          return GL_TRUE;
1534       }
1535
1536    } /* for pc */
1537
1538 #if FEATURE_MESA_program_debug
1539    CurrentMachine = NULL;
1540 #endif
1541
1542    return GL_TRUE;
1543 }