src/mesa/shader/prog_execute.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  7.0.3
   4  *
   5  * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file prog_execute.c
  27  * Software interpreter for vertex/fragment programs.
  28  * \author Brian Paul
  29  */
  30
  31 /*
  32  * NOTE: we do everything in single-precision floating point; we don't
  33  * currently observe the single/half/fixed-precision qualifiers.
  34  *
  35  */
  36
  37
  38 #include "glheader.h"
  39 #include "colormac.h"
  40 #include "context.h"
  41 #include "program.h"
  42 #include "prog_execute.h"
  43 #include "prog_instruction.h"
  44 #include "prog_parameter.h"
  45 #include "prog_print.h"
  46 #include "shader/slang/slang_library_noise.h"
  47
  48
  49 /* debug predicate */
  50 #define DEBUG_PROG 0
  51
  52
  53 /**
  54  * Set x to positive or negative infinity.
  55  */
  56 #if defined(USE_IEEE) || defined(_WIN32)
  57 #define SET_POS_INFINITY(x)  ( *((GLuint *) (void *)&x) = 0x7F800000 )
  58 #define SET_NEG_INFINITY(x)  ( *((GLuint *) (void *)&x) = 0xFF800000 )
  59 #elif defined(VMS)
  60 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
  61 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
  62 #else
  63 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
  64 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
  65 #endif
  66
  67 #define SET_FLOAT_BITS(x, bits) ((fi_type *) (void *) &(x))->i = bits
  68
  69
  70 static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
  71
  72
  73
  74 /**
  75  * Return a pointer to the 4-element float vector specified by the given
  76  * source register.
  77  */
  78 static INLINE const GLfloat *
  79 get_register_pointer(const struct prog_src_register *source,
  80                      const struct gl_program_machine *machine)
  81 {
  82    if (source->RelAddr) {
  83       const GLint reg = source->Index + machine->AddressReg[0][0];
  84       if (source->File == PROGRAM_ENV_PARAM)
  85          if (reg < 0 || reg >= MAX_PROGRAM_ENV_PARAMS)
  86             return ZeroVec;
  87          else
  88             return machine->EnvParams[reg];
  89       else {
  90          const struct gl_program_parameter_list *params;
  91          ASSERT(source->File == PROGRAM_LOCAL_PARAM ||
  92                 source->File == PROGRAM_STATE_VAR);
  93          params = machine->CurProgram->Parameters;
  94          if (reg < 0 || reg >= params->NumParameters)
  95             return ZeroVec;
  96          else
  97             return params->ParameterValues[reg];
  98       }
  99    }
 100
 101    switch (source->File) {
 102    case PROGRAM_TEMPORARY:
 103       ASSERT(source->Index < MAX_PROGRAM_TEMPS);
 104       return machine->Temporaries[source->Index];
 105
 106    case PROGRAM_INPUT:
 107       if (machine->CurProgram->Target == GL_VERTEX_PROGRAM_ARB) {
 108          ASSERT(source->Index < VERT_ATTRIB_MAX);
 109          return machine->VertAttribs[source->Index];
 110       }
 111       else {
 112          ASSERT(source->Index < FRAG_ATTRIB_MAX);
 113          return machine->Attribs[source->Index][machine->CurElement];
 114       }
 115
 116    case PROGRAM_OUTPUT:
 117       ASSERT(source->Index < MAX_PROGRAM_OUTPUTS);
 118       return machine->Outputs[source->Index];
 119
 120    case PROGRAM_LOCAL_PARAM:
 121       ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 122       return machine->CurProgram->LocalParams[source->Index];
 123
 124    case PROGRAM_ENV_PARAM:
 125       ASSERT(source->Index < MAX_PROGRAM_ENV_PARAMS);
 126       return machine->EnvParams[source->Index];
 127
 128    case PROGRAM_STATE_VAR:
 129       /* Fallthrough */
 130    case PROGRAM_CONSTANT:
 131       /* Fallthrough */
 132    case PROGRAM_UNIFORM:
 133       /* Fallthrough */
 134    case PROGRAM_NAMED_PARAM:
 135       ASSERT(source->Index <
 136              (GLint) machine->CurProgram->Parameters->NumParameters);
 137       return machine->CurProgram->Parameters->ParameterValues[source->Index];
 138
 139    default:
 140       _mesa_problem(NULL,
 141                     "Invalid input register file %d in get_register_pointer()",
 142                     source->File);
 143       return NULL;
 144    }
 145 }
 146
 147
 148 #if FEATURE_MESA_program_debug
 149 static struct gl_program_machine *CurrentMachine = NULL;
 150
 151 /**
 152  * For GL_MESA_program_debug.
 153  * Return current value (4*GLfloat) of a program register.
 154  * Called via ctx->Driver.GetProgramRegister().
 155  */
 156 void
 157 _mesa_get_program_register(GLcontext *ctx, enum register_file file,
 158                            GLuint index, GLfloat val[4])
 159 {
 160    if (CurrentMachine) {
 161       struct prog_src_register src;
 162       const GLfloat *reg;
 163       src.File = file;
 164       src.Index = index;
 165       reg = get_register_pointer(&src, CurrentMachine);
 166       COPY_4V(val, reg);
 167    }
 168 }
 169 #endif /* FEATURE_MESA_program_debug */
 170
 171
 172 /**
 173  * Fetch a 4-element float vector from the given source register.
 174  * Apply swizzling and negating as needed.
 175  */
 176 static void
 177 fetch_vector4(const struct prog_src_register *source,
 178               const struct gl_program_machine *machine, GLfloat result[4])
 179 {
 180    const GLfloat *src = get_register_pointer(source, machine);
 181    ASSERT(src);
 182
 183    if (source->Swizzle == SWIZZLE_NOOP) {
 184       /* no swizzling */
 185       COPY_4V(result, src);
 186    }
 187    else {
 188       ASSERT(GET_SWZ(source->Swizzle, 0) <= 3);
 189       ASSERT(GET_SWZ(source->Swizzle, 1) <= 3);
 190       ASSERT(GET_SWZ(source->Swizzle, 2) <= 3);
 191       ASSERT(GET_SWZ(source->Swizzle, 3) <= 3);
 192       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 193       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 194       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 195       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 196    }
 197
 198    if (source->NegateBase) {
 199       result[0] = -result[0];
 200       result[1] = -result[1];
 201       result[2] = -result[2];
 202       result[3] = -result[3];
 203    }
 204    if (source->Abs) {
 205       result[0] = FABSF(result[0]);
 206       result[1] = FABSF(result[1]);
 207       result[2] = FABSF(result[2]);
 208       result[3] = FABSF(result[3]);
 209    }
 210    if (source->NegateAbs) {
 211       result[0] = -result[0];
 212       result[1] = -result[1];
 213       result[2] = -result[2];
 214       result[3] = -result[3];
 215    }
 216 }
 217
 218
 219 /**
 220  * Fetch the derivative with respect to X or Y for the given register.
 221  * XXX this currently only works for fragment program input attribs.
 222  */
 223 static void
 224 fetch_vector4_deriv(GLcontext * ctx,
 225                     const struct prog_src_register *source,
 226                     const struct gl_program_machine *machine,
 227                     char xOrY, GLfloat result[4])
 228 {
 229    if (source->File == PROGRAM_INPUT && source->Index < machine->NumDeriv) {
 230       const GLint col = machine->CurElement;
 231       const GLfloat w = machine->Attribs[FRAG_ATTRIB_WPOS][col][3];
 232       const GLfloat invQ = 1.0f / w;
 233       GLfloat deriv[4];
 234
 235       if (xOrY == 'X') {
 236          deriv[0] = machine->DerivX[source->Index][0] * invQ;
 237          deriv[1] = machine->DerivX[source->Index][1] * invQ;
 238          deriv[2] = machine->DerivX[source->Index][2] * invQ;
 239          deriv[3] = machine->DerivX[source->Index][3] * invQ;
 240       }
 241       else {
 242          deriv[0] = machine->DerivY[source->Index][0] * invQ;
 243          deriv[1] = machine->DerivY[source->Index][1] * invQ;
 244          deriv[2] = machine->DerivY[source->Index][2] * invQ;
 245          deriv[3] = machine->DerivY[source->Index][3] * invQ;
 246       }
 247
 248       result[0] = deriv[GET_SWZ(source->Swizzle, 0)];
 249       result[1] = deriv[GET_SWZ(source->Swizzle, 1)];
 250       result[2] = deriv[GET_SWZ(source->Swizzle, 2)];
 251       result[3] = deriv[GET_SWZ(source->Swizzle, 3)];
 252
 253       if (source->NegateBase) {
 254          result[0] = -result[0];
 255          result[1] = -result[1];
 256          result[2] = -result[2];
 257          result[3] = -result[3];
 258       }
 259       if (source->Abs) {
 260          result[0] = FABSF(result[0]);
 261          result[1] = FABSF(result[1]);
 262          result[2] = FABSF(result[2]);
 263          result[3] = FABSF(result[3]);
 264       }
 265       if (source->NegateAbs) {
 266          result[0] = -result[0];
 267          result[1] = -result[1];
 268          result[2] = -result[2];
 269          result[3] = -result[3];
 270       }
 271    }
 272    else {
 273       ASSIGN_4V(result, 0.0, 0.0, 0.0, 0.0);
 274    }
 275 }
 276
 277
 278 /**
 279  * As above, but only return result[0] element.
 280  */
 281 static void
 282 fetch_vector1(const struct prog_src_register *source,
 283               const struct gl_program_machine *machine, GLfloat result[4])
 284 {
 285    const GLfloat *src = get_register_pointer(source, machine);
 286    ASSERT(src);
 287
 288    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 289
 290    if (source->NegateBase) {
 291       result[0] = -result[0];
 292    }
 293    if (source->Abs) {
 294       result[0] = FABSF(result[0]);
 295    }
 296    if (source->NegateAbs) {
 297       result[0] = -result[0];
 298    }
 299 }
 300
 301
 302 /**
 303  * Fetch texel from texture.  Use partial derivatives when possible.
 304  */
 305 static INLINE void
 306 fetch_texel(GLcontext *ctx,
 307             const struct gl_program_machine *machine,
 308             const struct prog_instruction *inst,
 309             const GLfloat texcoord[4], GLfloat lodBias,
 310             GLfloat color[4])
 311 {
 312    /* Note: we only have the right derivatives for fragment input attribs.
 313     */
 314    if (machine->NumDeriv > 0 &&
 315        inst->SrcReg[0].File == PROGRAM_INPUT &&
 316        inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0 + inst->TexSrcUnit) {
 317       /* simple texture fetch for which we should have derivatives */
 318       GLuint attr = inst->SrcReg[0].Index;
 319       machine->FetchTexelDeriv(ctx, texcoord,
 320                                machine->DerivX[attr],
 321                                machine->DerivY[attr],
 322                                lodBias,
 323                                inst->TexSrcUnit, color);
 324    }
 325    else {
 326       machine->FetchTexelLod(ctx, texcoord, lodBias,
 327                              inst->TexSrcUnit, color);
 328    }
 329 }
 330
 331
 332 /**
 333  * Test value against zero and return GT, LT, EQ or UN if NaN.
 334  */
 335 static INLINE GLuint
 336 generate_cc(float value)
 337 {
 338    if (value != value)
 339       return COND_UN;           /* NaN */
 340    if (value > 0.0F)
 341       return COND_GT;
 342    if (value < 0.0F)
 343       return COND_LT;
 344    return COND_EQ;
 345 }
 346
 347
 348 /**
 349  * Test if the ccMaskRule is satisfied by the given condition code.
 350  * Used to mask destination writes according to the current condition code.
 351  */
 352 static INLINE GLboolean
 353 test_cc(GLuint condCode, GLuint ccMaskRule)
 354 {
 355    switch (ccMaskRule) {
 356    case COND_EQ: return (condCode == COND_EQ);
 357    case COND_NE: return (condCode != COND_EQ);
 358    case COND_LT: return (condCode == COND_LT);
 359    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 360    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 361    case COND_GT: return (condCode == COND_GT);
 362    case COND_TR: return GL_TRUE;
 363    case COND_FL: return GL_FALSE;
 364    default:      return GL_TRUE;
 365    }
 366 }
 367
 368
 369 /**
 370  * Evaluate the 4 condition codes against a predicate and return GL_TRUE
 371  * or GL_FALSE to indicate result.
 372  */
 373 static INLINE GLboolean
 374 eval_condition(const struct gl_program_machine *machine,
 375                const struct prog_instruction *inst)
 376 {
 377    const GLuint swizzle = inst->DstReg.CondSwizzle;
 378    const GLuint condMask = inst->DstReg.CondMask;
 379    if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 380        test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 381        test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 382        test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 383       return GL_TRUE;
 384    }
 385    else {
 386       return GL_FALSE;
 387    }
 388 }
 389
 390
 391
 392 /**
 393  * Store 4 floats into a register.  Observe the instructions saturate and
 394  * set-condition-code flags.
 395  */
 396 static void
 397 store_vector4(const struct prog_instruction *inst,
 398               struct gl_program_machine *machine, const GLfloat value[4])
 399 {
 400    const struct prog_dst_register *dest = &(inst->DstReg);
 401    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 402    GLfloat *dstReg;
 403    GLfloat dummyReg[4];
 404    GLfloat clampedValue[4];
 405    GLuint writeMask = dest->WriteMask;
 406
 407    switch (dest->File) {
 408    case PROGRAM_OUTPUT:
 409       ASSERT(dest->Index < MAX_PROGRAM_OUTPUTS);
 410       dstReg = machine->Outputs[dest->Index];
 411       break;
 412    case PROGRAM_TEMPORARY:
 413       ASSERT(dest->Index < MAX_PROGRAM_TEMPS);
 414       dstReg = machine->Temporaries[dest->Index];
 415       break;
 416    case PROGRAM_WRITE_ONLY:
 417       dstReg = dummyReg;
 418       return;
 419    default:
 420       _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 421       return;
 422    }
 423
 424 #if 0
 425    if (value[0] > 1.0e10 ||
 426        IS_INF_OR_NAN(value[0]) ||
 427        IS_INF_OR_NAN(value[1]) ||
 428        IS_INF_OR_NAN(value[2]) || IS_INF_OR_NAN(value[3]))
 429       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 430 #endif
 431
 432    if (clamp) {
 433       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 434       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 435       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 436       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 437       value = clampedValue;
 438    }
 439
 440    if (dest->CondMask != COND_TR) {
 441       /* condition codes may turn off some writes */
 442       if (writeMask & WRITEMASK_X) {
 443          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 0)],
 444                       dest->CondMask))
 445             writeMask &= ~WRITEMASK_X;
 446       }
 447       if (writeMask & WRITEMASK_Y) {
 448          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 1)],
 449                       dest->CondMask))
 450             writeMask &= ~WRITEMASK_Y;
 451       }
 452       if (writeMask & WRITEMASK_Z) {
 453          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 2)],
 454                       dest->CondMask))
 455             writeMask &= ~WRITEMASK_Z;
 456       }
 457       if (writeMask & WRITEMASK_W) {
 458          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 3)],
 459                       dest->CondMask))
 460             writeMask &= ~WRITEMASK_W;
 461       }
 462    }
 463
 464    if (writeMask & WRITEMASK_X)
 465       dstReg[0] = value[0];
 466    if (writeMask & WRITEMASK_Y)
 467       dstReg[1] = value[1];
 468    if (writeMask & WRITEMASK_Z)
 469       dstReg[2] = value[2];
 470    if (writeMask & WRITEMASK_W)
 471       dstReg[3] = value[3];
 472
 473    if (inst->CondUpdate) {
 474       if (writeMask & WRITEMASK_X)
 475          machine->CondCodes[0] = generate_cc(value[0]);
 476       if (writeMask & WRITEMASK_Y)
 477          machine->CondCodes[1] = generate_cc(value[1]);
 478       if (writeMask & WRITEMASK_Z)
 479          machine->CondCodes[2] = generate_cc(value[2]);
 480       if (writeMask & WRITEMASK_W)
 481          machine->CondCodes[3] = generate_cc(value[3]);
 482 #if DEBUG_PROG
 483       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 484              _mesa_condcode_string(machine->CondCodes[0]),
 485              _mesa_condcode_string(machine->CondCodes[1]),
 486              _mesa_condcode_string(machine->CondCodes[2]),
 487              _mesa_condcode_string(machine->CondCodes[3]));
 488 #endif
 489    }
 490 }
 491
 492
 493 /**
 494  * Execute the given vertex/fragment program.
 495  *
 496  * \param ctx  rendering context
 497  * \param program  the program to execute
 498  * \param machine  machine state (must be initialized)
 499  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 500  */
 501 GLboolean
 502 _mesa_execute_program(GLcontext * ctx,
 503                       const struct gl_program *program,
 504                       struct gl_program_machine *machine)
 505 {
 506    const GLuint numInst = program->NumInstructions;
 507    const GLuint maxExec = 10000;
 508    GLint pc, numExec = 0;
 509
 510    machine->CurProgram = program;
 511
 512    if (DEBUG_PROG) {
 513       printf("execute program %u --------------------\n", program->Id);
 514    }
 515
 516 #if FEATURE_MESA_program_debug
 517    CurrentMachine = machine;
 518 #endif
 519
 520    if (program->Target == GL_VERTEX_PROGRAM_ARB) {
 521       machine->EnvParams = ctx->VertexProgram.Parameters;
 522    }
 523    else {
 524       machine->EnvParams = ctx->FragmentProgram.Parameters;
 525    }
 526
 527    for (pc = 0; pc < numInst; pc++) {
 528       const struct prog_instruction *inst = program->Instructions + pc;
 529
 530 #if FEATURE_MESA_program_debug
 531       if (ctx->FragmentProgram.CallbackEnabled &&
 532           ctx->FragmentProgram.Callback) {
 533          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 534          ctx->FragmentProgram.Callback(program->Target,
 535                                        ctx->FragmentProgram.CallbackData);
 536       }
 537 #endif
 538
 539       if (DEBUG_PROG) {
 540          _mesa_print_instruction(inst);
 541       }
 542
 543       switch (inst->Opcode) {
 544       case OPCODE_ABS:
 545          {
 546             GLfloat a[4], result[4];
 547             fetch_vector4(&inst->SrcReg[0], machine, a);
 548             result[0] = FABSF(a[0]);
 549             result[1] = FABSF(a[1]);
 550             result[2] = FABSF(a[2]);
 551             result[3] = FABSF(a[3]);
 552             store_vector4(inst, machine, result);
 553          }
 554          break;
 555       case OPCODE_ADD:
 556          {
 557             GLfloat a[4], b[4], result[4];
 558             fetch_vector4(&inst->SrcReg[0], machine, a);
 559             fetch_vector4(&inst->SrcReg[1], machine, b);
 560             result[0] = a[0] + b[0];
 561             result[1] = a[1] + b[1];
 562             result[2] = a[2] + b[2];
 563             result[3] = a[3] + b[3];
 564             store_vector4(inst, machine, result);
 565             if (DEBUG_PROG) {
 566                printf("ADD (%g %g %g %g) = (%g %g %g %g) + (%g %g %g %g)\n",
 567                       result[0], result[1], result[2], result[3],
 568                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 569             }
 570          }
 571          break;
 572       case OPCODE_ARL:
 573          {
 574             GLfloat t[4];
 575             fetch_vector4(&inst->SrcReg[0], machine, t);
 576             machine->AddressReg[0][0] = (GLint) FLOORF(t[0]);
 577          }
 578          break;
 579       case OPCODE_BGNLOOP:
 580          /* no-op */
 581          break;
 582       case OPCODE_ENDLOOP:
 583          /* subtract 1 here since pc is incremented by for(pc) loop */
 584          pc = inst->BranchTarget - 1;   /* go to matching BNGLOOP */
 585          break;
 586       case OPCODE_BGNSUB:      /* begin subroutine */
 587          break;
 588       case OPCODE_ENDSUB:      /* end subroutine */
 589          break;
 590       case OPCODE_BRA:         /* branch (conditional) */
 591          /* fall-through */
 592       case OPCODE_BRK:         /* break out of loop (conditional) */
 593          /* fall-through */
 594       case OPCODE_CONT:        /* continue loop (conditional) */
 595          if (eval_condition(machine, inst)) {
 596             /* take branch */
 597             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 598             pc = inst->BranchTarget - 1;
 599          }
 600          break;
 601       case OPCODE_CAL:         /* Call subroutine (conditional) */
 602          if (eval_condition(machine, inst)) {
 603             /* call the subroutine */
 604             if (machine->StackDepth >= MAX_PROGRAM_CALL_DEPTH) {
 605                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
 606             }
 607             machine->CallStack[machine->StackDepth++] = pc + 1; /* next inst */
 608             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 609             pc = inst->BranchTarget - 1;
 610          }
 611          break;
 612       case OPCODE_CMP:
 613          {
 614             GLfloat a[4], b[4], c[4], result[4];
 615             fetch_vector4(&inst->SrcReg[0], machine, a);
 616             fetch_vector4(&inst->SrcReg[1], machine, b);
 617             fetch_vector4(&inst->SrcReg[2], machine, c);
 618             result[0] = a[0] < 0.0F ? b[0] : c[0];
 619             result[1] = a[1] < 0.0F ? b[1] : c[1];
 620             result[2] = a[2] < 0.0F ? b[2] : c[2];
 621             result[3] = a[3] < 0.0F ? b[3] : c[3];
 622             store_vector4(inst, machine, result);
 623          }
 624          break;
 625       case OPCODE_COS:
 626          {
 627             GLfloat a[4], result[4];
 628             fetch_vector1(&inst->SrcReg[0], machine, a);
 629             result[0] = result[1] = result[2] = result[3]
 630                = (GLfloat) _mesa_cos(a[0]);
 631             store_vector4(inst, machine, result);
 632          }
 633          break;
 634       case OPCODE_DDX:         /* Partial derivative with respect to X */
 635          {
 636             GLfloat result[4];
 637             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 638                                 'X', result);
 639             store_vector4(inst, machine, result);
 640          }
 641          break;
 642       case OPCODE_DDY:         /* Partial derivative with respect to Y */
 643          {
 644             GLfloat result[4];
 645             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 646                                 'Y', result);
 647             store_vector4(inst, machine, result);
 648          }
 649          break;
 650       case OPCODE_DP3:
 651          {
 652             GLfloat a[4], b[4], result[4];
 653             fetch_vector4(&inst->SrcReg[0], machine, a);
 654             fetch_vector4(&inst->SrcReg[1], machine, b);
 655             result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
 656             store_vector4(inst, machine, result);
 657             if (DEBUG_PROG) {
 658                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 659                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 660             }
 661          }
 662          break;
 663       case OPCODE_DP4:
 664          {
 665             GLfloat a[4], b[4], result[4];
 666             fetch_vector4(&inst->SrcReg[0], machine, a);
 667             fetch_vector4(&inst->SrcReg[1], machine, b);
 668             result[0] = result[1] = result[2] = result[3] = DOT4(a, b);
 669             store_vector4(inst, machine, result);
 670             if (DEBUG_PROG) {
 671                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 672                       result[0], a[0], a[1], a[2], a[3],
 673                       b[0], b[1], b[2], b[3]);
 674             }
 675          }
 676          break;
 677       case OPCODE_DPH:
 678          {
 679             GLfloat a[4], b[4], result[4];
 680             fetch_vector4(&inst->SrcReg[0], machine, a);
 681             fetch_vector4(&inst->SrcReg[1], machine, b);
 682             result[0] = result[1] = result[2] = result[3] =
 683                a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
 684             store_vector4(inst, machine, result);
 685          }
 686          break;
 687       case OPCODE_DST:         /* Distance vector */
 688          {
 689             GLfloat a[4], b[4], result[4];
 690             fetch_vector4(&inst->SrcReg[0], machine, a);
 691             fetch_vector4(&inst->SrcReg[1], machine, b);
 692             result[0] = 1.0F;
 693             result[1] = a[1] * b[1];
 694             result[2] = a[2];
 695             result[3] = b[3];
 696             store_vector4(inst, machine, result);
 697          }
 698          break;
 699       case OPCODE_EXP:
 700          {
 701             GLfloat t[4], q[4], floor_t0;
 702             fetch_vector1(&inst->SrcReg[0], machine, t);
 703             floor_t0 = FLOORF(t[0]);
 704             if (floor_t0 > FLT_MAX_EXP) {
 705                SET_POS_INFINITY(q[0]);
 706                SET_POS_INFINITY(q[2]);
 707             }
 708             else if (floor_t0 < FLT_MIN_EXP) {
 709                q[0] = 0.0F;
 710                q[2] = 0.0F;
 711             }
 712             else {
 713                q[0] = LDEXPF(1.0, (int) floor_t0);
 714                /* Note: GL_NV_vertex_program expects
 715                 * result.z = result.x * APPX(result.y)
 716                 * We do what the ARB extension says.
 717                 */
 718                q[2] = pow(2.0, t[0]);
 719             }
 720             q[1] = t[0] - floor_t0;
 721             q[3] = 1.0F;
 722             store_vector4( inst, machine, q );
 723          }
 724          break;
 725       case OPCODE_EX2:         /* Exponential base 2 */
 726          {
 727             GLfloat a[4], result[4];
 728             fetch_vector1(&inst->SrcReg[0], machine, a);
 729             result[0] = result[1] = result[2] = result[3] =
 730                (GLfloat) _mesa_pow(2.0, a[0]);
 731             store_vector4(inst, machine, result);
 732          }
 733          break;
 734       case OPCODE_FLR:
 735          {
 736             GLfloat a[4], result[4];
 737             fetch_vector4(&inst->SrcReg[0], machine, a);
 738             result[0] = FLOORF(a[0]);
 739             result[1] = FLOORF(a[1]);
 740             result[2] = FLOORF(a[2]);
 741             result[3] = FLOORF(a[3]);
 742             store_vector4(inst, machine, result);
 743          }
 744          break;
 745       case OPCODE_FRC:
 746          {
 747             GLfloat a[4], result[4];
 748             fetch_vector4(&inst->SrcReg[0], machine, a);
 749             result[0] = a[0] - FLOORF(a[0]);
 750             result[1] = a[1] - FLOORF(a[1]);
 751             result[2] = a[2] - FLOORF(a[2]);
 752             result[3] = a[3] - FLOORF(a[3]);
 753             store_vector4(inst, machine, result);
 754          }
 755          break;
 756       case OPCODE_IF:
 757          {
 758             GLboolean cond;
 759             /* eval condition */
 760             if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
 761                GLfloat a[4];
 762                fetch_vector1(&inst->SrcReg[0], machine, a);
 763                cond = (a[0] != 0.0);
 764             }
 765             else {
 766                cond = eval_condition(machine, inst);
 767             }
 768             if (DEBUG_PROG) {
 769                printf("IF: %d\n", cond);
 770             }
 771             /* do if/else */
 772             if (cond) {
 773                /* do if-clause (just continue execution) */
 774             }
 775             else {
 776                /* go to the instruction after ELSE or ENDIF */
 777                assert(inst->BranchTarget >= 0);
 778                pc = inst->BranchTarget - 1;
 779             }
 780          }
 781          break;
 782       case OPCODE_ELSE:
 783          /* goto ENDIF */
 784          assert(inst->BranchTarget >= 0);
 785          pc = inst->BranchTarget - 1;
 786          break;
 787       case OPCODE_ENDIF:
 788          /* nothing */
 789          break;
 790       case OPCODE_INT:         /* float to int */
 791          {
 792             GLfloat a[4], result[4];
 793             fetch_vector4(&inst->SrcReg[0], machine, a);
 794             result[0] = (GLfloat) (GLint) a[0];
 795             result[1] = (GLfloat) (GLint) a[1];
 796             result[2] = (GLfloat) (GLint) a[2];
 797             result[3] = (GLfloat) (GLint) a[3];
 798             store_vector4(inst, machine, result);
 799          }
 800          break;
 801       case OPCODE_KIL_NV:      /* NV_f_p only (conditional) */
 802          if (eval_condition(machine, inst)) {
 803             return GL_FALSE;
 804          }
 805          break;
 806       case OPCODE_KIL:         /* ARB_f_p only */
 807          {
 808             GLfloat a[4];
 809             fetch_vector4(&inst->SrcReg[0], machine, a);
 810             if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 811                return GL_FALSE;
 812             }
 813          }
 814          break;
 815       case OPCODE_LG2:         /* log base 2 */
 816          {
 817             GLfloat a[4], result[4];
 818             fetch_vector1(&inst->SrcReg[0], machine, a);
 819             result[0] = result[1] = result[2] = result[3] = LOG2(a[0]);
 820             store_vector4(inst, machine, result);
 821          }
 822          break;
 823       case OPCODE_LIT:
 824          {
 825             const GLfloat epsilon = 1.0F / 256.0F;      /* from NV VP spec */
 826             GLfloat a[4], result[4];
 827             fetch_vector4(&inst->SrcReg[0], machine, a);
 828             a[0] = MAX2(a[0], 0.0F);
 829             a[1] = MAX2(a[1], 0.0F);
 830             /* XXX ARB version clamps a[3], NV version doesn't */
 831             a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 832             result[0] = 1.0F;
 833             result[1] = a[0];
 834             /* XXX we could probably just use pow() here */
 835             if (a[0] > 0.0F) {
 836                if (a[1] == 0.0 && a[3] == 0.0)
 837                   result[2] = 1.0;
 838                else
 839                   result[2] = EXPF(a[3] * LOGF(a[1]));
 840             }
 841             else {
 842                result[2] = 0.0;
 843             }
 844             result[3] = 1.0F;
 845             store_vector4(inst, machine, result);
 846             if (DEBUG_PROG) {
 847                printf("LIT (%g %g %g %g) : (%g %g %g %g)\n",
 848                       result[0], result[1], result[2], result[3],
 849                       a[0], a[1], a[2], a[3]);
 850             }
 851          }
 852          break;
 853       case OPCODE_LOG:
 854          {
 855             GLfloat t[4], q[4], abs_t0;
 856             fetch_vector1(&inst->SrcReg[0], machine, t);
 857             abs_t0 = FABSF(t[0]);
 858             if (abs_t0 != 0.0F) {
 859                /* Since we really can't handle infinite values on VMS
 860                 * like other OSes we'll use __MAXFLOAT to represent
 861                 * infinity.  This may need some tweaking.
 862                 */
 863 #ifdef VMS
 864                if (abs_t0 == __MAXFLOAT)
 865 #else
 866                if (IS_INF_OR_NAN(abs_t0))
 867 #endif
 868                {
 869                   SET_POS_INFINITY(q[0]);
 870                   q[1] = 1.0F;
 871                   SET_POS_INFINITY(q[2]);
 872                }
 873                else {
 874                   int exponent;
 875                   GLfloat mantissa = FREXPF(t[0], &exponent);
 876                   q[0] = (GLfloat) (exponent - 1);
 877                   q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 878                   q[2] = (GLfloat) (q[0] + LOG2(q[1]));
 879                }
 880             }
 881             else {
 882                SET_NEG_INFINITY(q[0]);
 883                q[1] = 1.0F;
 884                SET_NEG_INFINITY(q[2]);
 885             }
 886             q[3] = 1.0;
 887             store_vector4(inst, machine, q);
 888          }
 889          break;
 890       case OPCODE_LRP:
 891          {
 892             GLfloat a[4], b[4], c[4], result[4];
 893             fetch_vector4(&inst->SrcReg[0], machine, a);
 894             fetch_vector4(&inst->SrcReg[1], machine, b);
 895             fetch_vector4(&inst->SrcReg[2], machine, c);
 896             result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 897             result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 898             result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 899             result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 900             store_vector4(inst, machine, result);
 901             if (DEBUG_PROG) {
 902                printf("LRP (%g %g %g %g) = (%g %g %g %g), "
 903                       "(%g %g %g %g), (%g %g %g %g)\n",
 904                       result[0], result[1], result[2], result[3],
 905                       a[0], a[1], a[2], a[3],
 906                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
 907             }
 908          }
 909          break;
 910       case OPCODE_MAD:
 911          {
 912             GLfloat a[4], b[4], c[4], result[4];
 913             fetch_vector4(&inst->SrcReg[0], machine, a);
 914             fetch_vector4(&inst->SrcReg[1], machine, b);
 915             fetch_vector4(&inst->SrcReg[2], machine, c);
 916             result[0] = a[0] * b[0] + c[0];
 917             result[1] = a[1] * b[1] + c[1];
 918             result[2] = a[2] * b[2] + c[2];
 919             result[3] = a[3] * b[3] + c[3];
 920             store_vector4(inst, machine, result);
 921             if (DEBUG_PROG) {
 922                printf("MAD (%g %g %g %g) = (%g %g %g %g) * "
 923                       "(%g %g %g %g) + (%g %g %g %g)\n",
 924                       result[0], result[1], result[2], result[3],
 925                       a[0], a[1], a[2], a[3],
 926                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
 927             }
 928          }
 929          break;
 930       case OPCODE_MAX:
 931          {
 932             GLfloat a[4], b[4], result[4];
 933             fetch_vector4(&inst->SrcReg[0], machine, a);
 934             fetch_vector4(&inst->SrcReg[1], machine, b);
 935             result[0] = MAX2(a[0], b[0]);
 936             result[1] = MAX2(a[1], b[1]);
 937             result[2] = MAX2(a[2], b[2]);
 938             result[3] = MAX2(a[3], b[3]);
 939             store_vector4(inst, machine, result);
 940             if (DEBUG_PROG) {
 941                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
 942                       result[0], result[1], result[2], result[3],
 943                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 944             }
 945          }
 946          break;
 947       case OPCODE_MIN:
 948          {
 949             GLfloat a[4], b[4], result[4];
 950             fetch_vector4(&inst->SrcReg[0], machine, a);
 951             fetch_vector4(&inst->SrcReg[1], machine, b);
 952             result[0] = MIN2(a[0], b[0]);
 953             result[1] = MIN2(a[1], b[1]);
 954             result[2] = MIN2(a[2], b[2]);
 955             result[3] = MIN2(a[3], b[3]);
 956             store_vector4(inst, machine, result);
 957          }
 958          break;
 959       case OPCODE_MOV:
 960          {
 961             GLfloat result[4];
 962             fetch_vector4(&inst->SrcReg[0], machine, result);
 963             store_vector4(inst, machine, result);
 964             if (DEBUG_PROG) {
 965                printf("MOV (%g %g %g %g)\n",
 966                       result[0], result[1], result[2], result[3]);
 967             }
 968          }
 969          break;
 970       case OPCODE_MUL:
 971          {
 972             GLfloat a[4], b[4], result[4];
 973             fetch_vector4(&inst->SrcReg[0], machine, a);
 974             fetch_vector4(&inst->SrcReg[1], machine, b);
 975             result[0] = a[0] * b[0];
 976             result[1] = a[1] * b[1];
 977             result[2] = a[2] * b[2];
 978             result[3] = a[3] * b[3];
 979             store_vector4(inst, machine, result);
 980             if (DEBUG_PROG) {
 981                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 982                       result[0], result[1], result[2], result[3],
 983                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 984             }
 985          }
 986          break;
 987       case OPCODE_NOISE1:
 988          {
 989             GLfloat a[4], result[4];
 990             fetch_vector1(&inst->SrcReg[0], machine, a);
 991             result[0] =
 992                result[1] =
 993                result[2] = result[3] = _slang_library_noise1(a[0]);
 994             store_vector4(inst, machine, result);
 995          }
 996          break;
 997       case OPCODE_NOISE2:
 998          {
 999             GLfloat a[4], result[4];
1000             fetch_vector4(&inst->SrcReg[0], machine, a);
1001             result[0] =
1002                result[1] =
1003                result[2] = result[3] = _slang_library_noise2(a[0], a[1]);
1004             store_vector4(inst, machine, result);
1005          }
1006          break;
1007       case OPCODE_NOISE3:
1008          {
1009             GLfloat a[4], result[4];
1010             fetch_vector4(&inst->SrcReg[0], machine, a);
1011             result[0] =
1012                result[1] =
1013                result[2] =
1014                result[3] = _slang_library_noise3(a[0], a[1], a[2]);
1015             store_vector4(inst, machine, result);
1016          }
1017          break;
1018       case OPCODE_NOISE4:
1019          {
1020             GLfloat a[4], result[4];
1021             fetch_vector4(&inst->SrcReg[0], machine, a);
1022             result[0] =
1023                result[1] =
1024                result[2] =
1025                result[3] = _slang_library_noise4(a[0], a[1], a[2], a[3]);
1026             store_vector4(inst, machine, result);
1027          }
1028          break;
1029       case OPCODE_NOP:
1030          break;
1031       case OPCODE_PK2H:        /* pack two 16-bit floats in one 32-bit float */
1032          {
1033             GLfloat a[4], result[4];
1034             GLhalfNV hx, hy;
1035             GLuint *rawResult = (GLuint *) result;
1036             GLuint twoHalves;
1037             fetch_vector4(&inst->SrcReg[0], machine, a);
1038             hx = _mesa_float_to_half(a[0]);
1039             hy = _mesa_float_to_half(a[1]);
1040             twoHalves = hx | (hy << 16);
1041             rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1042                = twoHalves;
1043             store_vector4(inst, machine, result);
1044          }
1045          break;
1046       case OPCODE_PK2US:       /* pack two GLushorts into one 32-bit float */
1047          {
1048             GLfloat a[4], result[4];
1049             GLuint usx, usy, *rawResult = (GLuint *) result;
1050             fetch_vector4(&inst->SrcReg[0], machine, a);
1051             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1052             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1053             usx = IROUND(a[0] * 65535.0F);
1054             usy = IROUND(a[1] * 65535.0F);
1055             rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1056                = usx | (usy << 16);
1057             store_vector4(inst, machine, result);
1058          }
1059          break;
1060       case OPCODE_PK4B:        /* pack four GLbytes into one 32-bit float */
1061          {
1062             GLfloat a[4], result[4];
1063             GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1064             fetch_vector4(&inst->SrcReg[0], machine, a);
1065             a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
1066             a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
1067             a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
1068             a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
1069             ubx = IROUND(127.0F * a[0] + 128.0F);
1070             uby = IROUND(127.0F * a[1] + 128.0F);
1071             ubz = IROUND(127.0F * a[2] + 128.0F);
1072             ubw = IROUND(127.0F * a[3] + 128.0F);
1073             rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1074                = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1075             store_vector4(inst, machine, result);
1076          }
1077          break;
1078       case OPCODE_PK4UB:       /* pack four GLubytes into one 32-bit float */
1079          {
1080             GLfloat a[4], result[4];
1081             GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1082             fetch_vector4(&inst->SrcReg[0], machine, a);
1083             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1084             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1085             a[2] = CLAMP(a[2], 0.0F, 1.0F);
1086             a[3] = CLAMP(a[3], 0.0F, 1.0F);
1087             ubx = IROUND(255.0F * a[0]);
1088             uby = IROUND(255.0F * a[1]);
1089             ubz = IROUND(255.0F * a[2]);
1090             ubw = IROUND(255.0F * a[3]);
1091             rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1092                = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1093             store_vector4(inst, machine, result);
1094          }
1095          break;
1096       case OPCODE_POW:
1097          {
1098             GLfloat a[4], b[4], result[4];
1099             fetch_vector1(&inst->SrcReg[0], machine, a);
1100             fetch_vector1(&inst->SrcReg[1], machine, b);
1101             result[0] = result[1] = result[2] = result[3]
1102                = (GLfloat) _mesa_pow(a[0], b[0]);
1103             store_vector4(inst, machine, result);
1104          }
1105          break;
1106       case OPCODE_RCP:
1107          {
1108             GLfloat a[4], result[4];
1109             fetch_vector1(&inst->SrcReg[0], machine, a);
1110             if (DEBUG_PROG) {
1111                if (a[0] == 0)
1112                   printf("RCP(0)\n");
1113                else if (IS_INF_OR_NAN(a[0]))
1114                   printf("RCP(inf)\n");
1115             }
1116             result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
1117             store_vector4(inst, machine, result);
1118          }
1119          break;
1120       case OPCODE_RET:         /* return from subroutine (conditional) */
1121          if (eval_condition(machine, inst)) {
1122             if (machine->StackDepth == 0) {
1123                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
1124             }
1125             /* subtract one because of pc++ in the for loop */
1126             pc = machine->CallStack[--machine->StackDepth] - 1;
1127          }
1128          break;
1129       case OPCODE_RFL:         /* reflection vector */
1130          {
1131             GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
1132             fetch_vector4(&inst->SrcReg[0], machine, axis);
1133             fetch_vector4(&inst->SrcReg[1], machine, dir);
1134             tmpW = DOT3(axis, axis);
1135             tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
1136             result[0] = tmpX * axis[0] - dir[0];
1137             result[1] = tmpX * axis[1] - dir[1];
1138             result[2] = tmpX * axis[2] - dir[2];
1139             /* result[3] is never written! XXX enforce in parser! */
1140             store_vector4(inst, machine, result);
1141          }
1142          break;
1143       case OPCODE_RSQ:         /* 1 / sqrt() */
1144          {
1145             GLfloat a[4], result[4];
1146             fetch_vector1(&inst->SrcReg[0], machine, a);
1147             a[0] = FABSF(a[0]);
1148             result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1149             store_vector4(inst, machine, result);
1150             if (DEBUG_PROG) {
1151                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1152             }
1153          }
1154          break;
1155       case OPCODE_SCS:         /* sine and cos */
1156          {
1157             GLfloat a[4], result[4];
1158             fetch_vector1(&inst->SrcReg[0], machine, a);
1159             result[0] = (GLfloat) _mesa_cos(a[0]);
1160             result[1] = (GLfloat) _mesa_sin(a[0]);
1161             result[2] = 0.0;    /* undefined! */
1162             result[3] = 0.0;    /* undefined! */
1163             store_vector4(inst, machine, result);
1164          }
1165          break;
1166       case OPCODE_SEQ:         /* set on equal */
1167          {
1168             GLfloat a[4], b[4], result[4];
1169             fetch_vector4(&inst->SrcReg[0], machine, a);
1170             fetch_vector4(&inst->SrcReg[1], machine, b);
1171             result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1172             result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1173             result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1174             result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1175             store_vector4(inst, machine, result);
1176             if (DEBUG_PROG) {
1177                printf("SEQ (%g %g %g %g) = (%g %g %g %g) == (%g %g %g %g)\n",
1178                       result[0], result[1], result[2], result[3],
1179                       a[0], a[1], a[2], a[3],
1180                       b[0], b[1], b[2], b[3]);
1181             }
1182          }
1183          break;
1184       case OPCODE_SFL:         /* set false, operands ignored */
1185          {
1186             static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1187             store_vector4(inst, machine, result);
1188          }
1189          break;
1190       case OPCODE_SGE:         /* set on greater or equal */
1191          {
1192             GLfloat a[4], b[4], result[4];
1193             fetch_vector4(&inst->SrcReg[0], machine, a);
1194             fetch_vector4(&inst->SrcReg[1], machine, b);
1195             result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1196             result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1197             result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1198             result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1199             store_vector4(inst, machine, result);
1200             if (DEBUG_PROG) {
1201                printf("SGE (%g %g %g %g) = (%g %g %g %g) >= (%g %g %g %g)\n",
1202                       result[0], result[1], result[2], result[3],
1203                       a[0], a[1], a[2], a[3],
1204                       b[0], b[1], b[2], b[3]);
1205             }
1206          }
1207          break;
1208       case OPCODE_SGT:         /* set on greater */
1209          {
1210             GLfloat a[4], b[4], result[4];
1211             fetch_vector4(&inst->SrcReg[0], machine, a);
1212             fetch_vector4(&inst->SrcReg[1], machine, b);
1213             result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1214             result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1215             result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1216             result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1217             store_vector4(inst, machine, result);
1218             if (DEBUG_PROG) {
1219                printf("SGT (%g %g %g %g) = (%g %g %g %g) > (%g %g %g %g)\n",
1220                       result[0], result[1], result[2], result[3],
1221                       a[0], a[1], a[2], a[3],
1222                       b[0], b[1], b[2], b[3]);
1223             }
1224          }
1225          break;
1226       case OPCODE_SIN:
1227          {
1228             GLfloat a[4], result[4];
1229             fetch_vector1(&inst->SrcReg[0], machine, a);
1230             result[0] = result[1] = result[2] = result[3]
1231                = (GLfloat) _mesa_sin(a[0]);
1232             store_vector4(inst, machine, result);
1233          }
1234          break;
1235       case OPCODE_SLE:         /* set on less or equal */
1236          {
1237             GLfloat a[4], b[4], result[4];
1238             fetch_vector4(&inst->SrcReg[0], machine, a);
1239             fetch_vector4(&inst->SrcReg[1], machine, b);
1240             result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1241             result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1242             result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1243             result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1244             store_vector4(inst, machine, result);
1245             if (DEBUG_PROG) {
1246                printf("SLE (%g %g %g %g) = (%g %g %g %g) <= (%g %g %g %g)\n",
1247                       result[0], result[1], result[2], result[3],
1248                       a[0], a[1], a[2], a[3],
1249                       b[0], b[1], b[2], b[3]);
1250             }
1251          }
1252          break;
1253       case OPCODE_SLT:         /* set on less */
1254          {
1255             GLfloat a[4], b[4], result[4];
1256             fetch_vector4(&inst->SrcReg[0], machine, a);
1257             fetch_vector4(&inst->SrcReg[1], machine, b);
1258             result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1259             result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1260             result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1261             result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1262             store_vector4(inst, machine, result);
1263             if (DEBUG_PROG) {
1264                printf("SLT (%g %g %g %g) = (%g %g %g %g) < (%g %g %g %g)\n",
1265                       result[0], result[1], result[2], result[3],
1266                       a[0], a[1], a[2], a[3],
1267                       b[0], b[1], b[2], b[3]);
1268             }
1269          }
1270          break;
1271       case OPCODE_SNE:         /* set on not equal */
1272          {
1273             GLfloat a[4], b[4], result[4];
1274             fetch_vector4(&inst->SrcReg[0], machine, a);
1275             fetch_vector4(&inst->SrcReg[1], machine, b);
1276             result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1277             result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1278             result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1279             result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1280             store_vector4(inst, machine, result);
1281             if (DEBUG_PROG) {
1282                printf("SNE (%g %g %g %g) = (%g %g %g %g) != (%g %g %g %g)\n",
1283                       result[0], result[1], result[2], result[3],
1284                       a[0], a[1], a[2], a[3],
1285                       b[0], b[1], b[2], b[3]);
1286             }
1287          }
1288          break;
1289       case OPCODE_STR:         /* set true, operands ignored */
1290          {
1291             static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1292             store_vector4(inst, machine, result);
1293          }
1294          break;
1295       case OPCODE_SUB:
1296          {
1297             GLfloat a[4], b[4], result[4];
1298             fetch_vector4(&inst->SrcReg[0], machine, a);
1299             fetch_vector4(&inst->SrcReg[1], machine, b);
1300             result[0] = a[0] - b[0];
1301             result[1] = a[1] - b[1];
1302             result[2] = a[2] - b[2];
1303             result[3] = a[3] - b[3];
1304             store_vector4(inst, machine, result);
1305             if (DEBUG_PROG) {
1306                printf("SUB (%g %g %g %g) = (%g %g %g %g) - (%g %g %g %g)\n",
1307                       result[0], result[1], result[2], result[3],
1308                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1309             }
1310          }
1311          break;
1312       case OPCODE_SWZ:         /* extended swizzle */
1313          {
1314             const struct prog_src_register *source = &inst->SrcReg[0];
1315             const GLfloat *src = get_register_pointer(source, machine);
1316             GLfloat result[4];
1317             GLuint i;
1318             for (i = 0; i < 4; i++) {
1319                const GLuint swz = GET_SWZ(source->Swizzle, i);
1320                if (swz == SWIZZLE_ZERO)
1321                   result[i] = 0.0;
1322                else if (swz == SWIZZLE_ONE)
1323                   result[i] = 1.0;
1324                else {
1325                   ASSERT(swz >= 0);
1326                   ASSERT(swz <= 3);
1327                   result[i] = src[swz];
1328                }
1329                if (source->NegateBase & (1 << i))
1330                   result[i] = -result[i];
1331             }
1332             store_vector4(inst, machine, result);
1333          }
1334          break;
1335       case OPCODE_TEX:         /* Both ARB and NV frag prog */
1336          /* Simple texel lookup */
1337          {
1338             GLfloat texcoord[4], color[4];
1339             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1340
1341             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1342
1343             if (DEBUG_PROG) {
1344                printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g]\n",
1345                       color[0], color[1], color[2], color[3],
1346                       inst->TexSrcUnit,
1347                       texcoord[0], texcoord[1], texcoord[2], texcoord[3]);
1348             }
1349             store_vector4(inst, machine, color);
1350          }
1351          break;
1352       case OPCODE_TXB:         /* GL_ARB_fragment_program only */
1353          /* Texel lookup with LOD bias */
1354          {
1355             const struct gl_texture_unit *texUnit
1356                = &ctx->Texture.Unit[inst->TexSrcUnit];
1357             GLfloat texcoord[4], color[4], lodBias;
1358
1359             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1360
1361             /* texcoord[3] is the bias to add to lambda */
1362             lodBias = texUnit->LodBias + texcoord[3];
1363             if (texUnit->_Current) {
1364                lodBias += texUnit->_Current->LodBias;
1365             }
1366
1367             fetch_texel(ctx, machine, inst, texcoord, lodBias, color);
1368
1369             store_vector4(inst, machine, color);
1370          }
1371          break;
1372       case OPCODE_TXD:         /* GL_NV_fragment_program only */
1373          /* Texture lookup w/ partial derivatives for LOD */
1374          {
1375             GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1376             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1377             fetch_vector4(&inst->SrcReg[1], machine, dtdx);
1378             fetch_vector4(&inst->SrcReg[2], machine, dtdy);
1379             machine->FetchTexelDeriv(ctx, texcoord, dtdx, dtdy,
1380                                      0.0, /* lodBias */
1381                                      inst->TexSrcUnit, color);
1382             store_vector4(inst, machine, color);
1383          }
1384          break;
1385       case OPCODE_TXP:         /* GL_ARB_fragment_program only */
1386          /* Texture lookup w/ projective divide */
1387          {
1388             GLfloat texcoord[4], color[4];
1389
1390             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1391             /* Not so sure about this test - if texcoord[3] is
1392              * zero, we'd probably be fine except for an ASSERT in
1393              * IROUND_POS() which gets triggered by the inf values created.
1394              */
1395             if (texcoord[3] != 0.0) {
1396                texcoord[0] /= texcoord[3];
1397                texcoord[1] /= texcoord[3];
1398                texcoord[2] /= texcoord[3];
1399             }
1400
1401             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1402
1403             store_vector4(inst, machine, color);
1404          }
1405          break;
1406       case OPCODE_TXP_NV:      /* GL_NV_fragment_program only */
1407          /* Texture lookup w/ projective divide, as above, but do not
1408           * do the divide by w if sampling from a cube map.
1409           */
1410          {
1411             GLfloat texcoord[4], color[4];
1412
1413             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1414             if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1415                 texcoord[3] != 0.0) {
1416                texcoord[0] /= texcoord[3];
1417                texcoord[1] /= texcoord[3];
1418                texcoord[2] /= texcoord[3];
1419             }
1420
1421             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1422
1423             store_vector4(inst, machine, color);
1424          }
1425          break;
1426       case OPCODE_UP2H:        /* unpack two 16-bit floats */
1427          {
1428             GLfloat a[4], result[4];
1429             const GLuint *rawBits = (const GLuint *) a;
1430             GLhalfNV hx, hy;
1431             fetch_vector1(&inst->SrcReg[0], machine, a);
1432             hx = rawBits[0] & 0xffff;
1433             hy = rawBits[0] >> 16;
1434             result[0] = result[2] = _mesa_half_to_float(hx);
1435             result[1] = result[3] = _mesa_half_to_float(hy);
1436             store_vector4(inst, machine, result);
1437          }
1438          break;
1439       case OPCODE_UP2US:       /* unpack two GLushorts */
1440          {
1441             GLfloat a[4], result[4];
1442             const GLuint *rawBits = (const GLuint *) a;
1443             GLushort usx, usy;
1444             fetch_vector1(&inst->SrcReg[0], machine, a);
1445             usx = rawBits[0] & 0xffff;
1446             usy = rawBits[0] >> 16;
1447             result[0] = result[2] = usx * (1.0f / 65535.0f);
1448             result[1] = result[3] = usy * (1.0f / 65535.0f);
1449             store_vector4(inst, machine, result);
1450          }
1451          break;
1452       case OPCODE_UP4B:        /* unpack four GLbytes */
1453          {
1454             GLfloat a[4], result[4];
1455             const GLuint *rawBits = (const GLuint *) a;
1456             fetch_vector1(&inst->SrcReg[0], machine, a);
1457             result[0] = (((rawBits[0] >> 0) & 0xff) - 128) / 127.0F;
1458             result[1] = (((rawBits[0] >> 8) & 0xff) - 128) / 127.0F;
1459             result[2] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1460             result[3] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1461             store_vector4(inst, machine, result);
1462          }
1463          break;
1464       case OPCODE_UP4UB:       /* unpack four GLubytes */
1465          {
1466             GLfloat a[4], result[4];
1467             const GLuint *rawBits = (const GLuint *) a;
1468             fetch_vector1(&inst->SrcReg[0], machine, a);
1469             result[0] = ((rawBits[0] >> 0) & 0xff) / 255.0F;
1470             result[1] = ((rawBits[0] >> 8) & 0xff) / 255.0F;
1471             result[2] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1472             result[3] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1473             store_vector4(inst, machine, result);
1474          }
1475          break;
1476       case OPCODE_XPD:         /* cross product */
1477          {
1478             GLfloat a[4], b[4], result[4];
1479             fetch_vector4(&inst->SrcReg[0], machine, a);
1480             fetch_vector4(&inst->SrcReg[1], machine, b);
1481             result[0] = a[1] * b[2] - a[2] * b[1];
1482             result[1] = a[2] * b[0] - a[0] * b[2];
1483             result[2] = a[0] * b[1] - a[1] * b[0];
1484             result[3] = 1.0;
1485             store_vector4(inst, machine, result);
1486             if (DEBUG_PROG) {
1487                printf("XPD (%g %g %g %g) = (%g %g %g) X (%g %g %g)\n",
1488                       result[0], result[1], result[2], result[3],
1489                       a[0], a[1], a[2], b[0], b[1], b[2]);
1490             }
1491          }
1492          break;
1493       case OPCODE_X2D:         /* 2-D matrix transform */
1494          {
1495             GLfloat a[4], b[4], c[4], result[4];
1496             fetch_vector4(&inst->SrcReg[0], machine, a);
1497             fetch_vector4(&inst->SrcReg[1], machine, b);
1498             fetch_vector4(&inst->SrcReg[2], machine, c);
1499             result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1500             result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1501             result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1502             result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1503             store_vector4(inst, machine, result);
1504          }
1505          break;
1506       case OPCODE_PRINT:
1507          {
1508             if (inst->SrcReg[0].File != -1) {
1509                GLfloat a[4];
1510                fetch_vector4(&inst->SrcReg[0], machine, a);
1511                _mesa_printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
1512                             a[0], a[1], a[2], a[3]);
1513             }
1514             else {
1515                _mesa_printf("%s\n", (const char *) inst->Data);
1516             }
1517          }
1518          break;
1519       case OPCODE_END:
1520          return GL_TRUE;
1521       default:
1522          _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1523                        inst->Opcode);
1524          return GL_TRUE;        /* return value doesn't matter */
1525
1526       }
1527
1528       numExec++;
1529       if (numExec > maxExec) {
1530          _mesa_problem(ctx, "Infinite loop detected in fragment program");
1531          return GL_TRUE;
1532       }
1533
1534    } /* for pc */
1535
1536 #if FEATURE_MESA_program_debug
1537    CurrentMachine = NULL;
1538 #endif
1539
1540    return GL_TRUE;
1541 }