src/mesa/shader/prog_execute.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.5.3
   4  *
   5  * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file prog_execute.c
  27  * Software interpreter for vertex/fragment programs.
  28  * \author Brian Paul
  29  */
  30
  31 /*
  32  * NOTE: we do everything in single-precision floating point; we don't
  33  * currently observe the single/half/fixed-precision qualifiers.
  34  *
  35  */
  36
  37
  38 #include "glheader.h"
  39 #include "colormac.h"
  40 #include "context.h"
  41 #include "program.h"
  42 #include "prog_execute.h"
  43 #include "prog_instruction.h"
  44 #include "prog_parameter.h"
  45 #include "prog_print.h"
  46 #include "shader/slang/slang_library_noise.h"
  47
  48
  49 /* See comments below for info about this */
  50 #define LAMBDA_ZERO 1
  51
  52 /* debug predicate */
  53 #define DEBUG_PROG 0
  54
  55
  56 /**
  57  * Set x to positive or negative infinity.
  58  */
  59 #if defined(USE_IEEE) || defined(_WIN32)
  60 #define SET_POS_INFINITY(x)  ( *((GLuint *) (void *)&x) = 0x7F800000 )
  61 #define SET_NEG_INFINITY(x)  ( *((GLuint *) (void *)&x) = 0xFF800000 )
  62 #elif defined(VMS)
  63 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
  64 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
  65 #else
  66 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
  67 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
  68 #endif
  69
  70 #define SET_FLOAT_BITS(x, bits) ((fi_type *) (void *) &(x))->i = bits
  71
  72
  73 static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
  74
  75
  76
  77 /**
  78  * Return a pointer to the 4-element float vector specified by the given
  79  * source register.
  80  */
  81 static INLINE const GLfloat *
  82 get_register_pointer(const struct prog_src_register *source,
  83                      const struct gl_program_machine *machine)
  84 {
  85    if (source->RelAddr) {
  86       const GLint reg = source->Index + machine->AddressReg[0][0];
  87       if (source->File == PROGRAM_ENV_PARAM)
  88          if (reg < 0 || reg >= MAX_PROGRAM_ENV_PARAMS)
  89             return ZeroVec;
  90          else
  91             return machine->EnvParams[reg];
  92       else {
  93          const struct gl_program_parameter_list *params;
  94          ASSERT(source->File == PROGRAM_LOCAL_PARAM ||
  95                 source->File == PROGRAM_STATE_VAR);
  96          params = machine->CurProgram->Parameters;
  97          if (reg < 0 || reg >= params->NumParameters)
  98             return ZeroVec;
  99          else
 100             return params->ParameterValues[reg];
 101       }
 102    }
 103
 104    switch (source->File) {
 105    case PROGRAM_TEMPORARY:
 106       ASSERT(source->Index < MAX_PROGRAM_TEMPS);
 107       return machine->Temporaries[source->Index];
 108
 109    case PROGRAM_INPUT:
 110       if (machine->CurProgram->Target == GL_VERTEX_PROGRAM_ARB) {
 111          ASSERT(source->Index < VERT_ATTRIB_MAX);
 112          return machine->VertAttribs[source->Index];
 113       }
 114       else {
 115          ASSERT(source->Index < FRAG_ATTRIB_MAX);
 116          return machine->Attribs[source->Index][machine->CurElement];
 117       }
 118
 119    case PROGRAM_OUTPUT:
 120       ASSERT(source->Index < MAX_PROGRAM_OUTPUTS);
 121       return machine->Outputs[source->Index];
 122
 123    case PROGRAM_LOCAL_PARAM:
 124       ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 125       return machine->CurProgram->LocalParams[source->Index];
 126
 127    case PROGRAM_ENV_PARAM:
 128       ASSERT(source->Index < MAX_PROGRAM_ENV_PARAMS);
 129       return machine->EnvParams[source->Index];
 130
 131    case PROGRAM_STATE_VAR:
 132       /* Fallthrough */
 133    case PROGRAM_CONSTANT:
 134       /* Fallthrough */
 135    case PROGRAM_UNIFORM:
 136       /* Fallthrough */
 137    case PROGRAM_NAMED_PARAM:
 138       ASSERT(source->Index <
 139              (GLint) machine->CurProgram->Parameters->NumParameters);
 140       return machine->CurProgram->Parameters->ParameterValues[source->Index];
 141
 142    default:
 143       _mesa_problem(NULL,
 144                     "Invalid input register file %d in get_register_pointer()",
 145                     source->File);
 146       return NULL;
 147    }
 148 }
 149
 150
 151 #if FEATURE_MESA_program_debug
 152 static struct gl_program_machine *CurrentMachine = NULL;
 153
 154 /**
 155  * For GL_MESA_program_debug.
 156  * Return current value (4*GLfloat) of a program register.
 157  * Called via ctx->Driver.GetProgramRegister().
 158  */
 159 void
 160 _mesa_get_program_register(GLcontext *ctx, enum register_file file,
 161                            GLuint index, GLfloat val[4])
 162 {
 163    if (CurrentMachine) {
 164       struct prog_src_register src;
 165       const GLfloat *reg;
 166       src.File = file;
 167       src.Index = index;
 168       reg = get_register_pointer(&src, CurrentMachine);
 169       COPY_4V(val, reg);
 170    }
 171 }
 172 #endif /* FEATURE_MESA_program_debug */
 173
 174
 175 /**
 176  * Fetch a 4-element float vector from the given source register.
 177  * Apply swizzling and negating as needed.
 178  */
 179 static void
 180 fetch_vector4(const struct prog_src_register *source,
 181               const struct gl_program_machine *machine, GLfloat result[4])
 182 {
 183    const GLfloat *src = get_register_pointer(source, machine);
 184    ASSERT(src);
 185
 186    if (source->Swizzle == SWIZZLE_NOOP) {
 187       /* no swizzling */
 188       COPY_4V(result, src);
 189    }
 190    else {
 191       ASSERT(GET_SWZ(source->Swizzle, 0) <= 3);
 192       ASSERT(GET_SWZ(source->Swizzle, 1) <= 3);
 193       ASSERT(GET_SWZ(source->Swizzle, 2) <= 3);
 194       ASSERT(GET_SWZ(source->Swizzle, 3) <= 3);
 195       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 196       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 197       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 198       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 199    }
 200
 201    if (source->NegateBase) {
 202       result[0] = -result[0];
 203       result[1] = -result[1];
 204       result[2] = -result[2];
 205       result[3] = -result[3];
 206    }
 207    if (source->Abs) {
 208       result[0] = FABSF(result[0]);
 209       result[1] = FABSF(result[1]);
 210       result[2] = FABSF(result[2]);
 211       result[3] = FABSF(result[3]);
 212    }
 213    if (source->NegateAbs) {
 214       result[0] = -result[0];
 215       result[1] = -result[1];
 216       result[2] = -result[2];
 217       result[3] = -result[3];
 218    }
 219 }
 220
 221
 222 /**
 223  * Fetch the derivative with respect to X or Y for the given register.
 224  * XXX this currently only works for fragment program input attribs.
 225  */
 226 static void
 227 fetch_vector4_deriv(GLcontext * ctx,
 228                     const struct prog_src_register *source,
 229                     const struct gl_program_machine *machine,
 230                     char xOrY, GLfloat result[4])
 231 {
 232    if (source->File == PROGRAM_INPUT && source->Index < machine->NumDeriv) {
 233       const GLint col = machine->CurElement;
 234       const GLfloat w = machine->Attribs[FRAG_ATTRIB_WPOS][col][3];
 235       const GLfloat invQ = 1.0f / w;
 236       GLfloat deriv[4];
 237
 238       if (xOrY == 'X') {
 239          deriv[0] = machine->DerivX[source->Index][0] * invQ;
 240          deriv[1] = machine->DerivX[source->Index][1] * invQ;
 241          deriv[2] = machine->DerivX[source->Index][2] * invQ;
 242          deriv[3] = machine->DerivX[source->Index][3] * invQ;
 243       }
 244       else {
 245          deriv[0] = machine->DerivY[source->Index][0] * invQ;
 246          deriv[1] = machine->DerivY[source->Index][1] * invQ;
 247          deriv[2] = machine->DerivY[source->Index][2] * invQ;
 248          deriv[3] = machine->DerivY[source->Index][3] * invQ;
 249       }
 250
 251       result[0] = deriv[GET_SWZ(source->Swizzle, 0)];
 252       result[1] = deriv[GET_SWZ(source->Swizzle, 1)];
 253       result[2] = deriv[GET_SWZ(source->Swizzle, 2)];
 254       result[3] = deriv[GET_SWZ(source->Swizzle, 3)];
 255
 256       if (source->NegateBase) {
 257          result[0] = -result[0];
 258          result[1] = -result[1];
 259          result[2] = -result[2];
 260          result[3] = -result[3];
 261       }
 262       if (source->Abs) {
 263          result[0] = FABSF(result[0]);
 264          result[1] = FABSF(result[1]);
 265          result[2] = FABSF(result[2]);
 266          result[3] = FABSF(result[3]);
 267       }
 268       if (source->NegateAbs) {
 269          result[0] = -result[0];
 270          result[1] = -result[1];
 271          result[2] = -result[2];
 272          result[3] = -result[3];
 273       }
 274    }
 275    else {
 276       ASSIGN_4V(result, 0.0, 0.0, 0.0, 0.0);
 277    }
 278 }
 279
 280
 281 /**
 282  * As above, but only return result[0] element.
 283  */
 284 static void
 285 fetch_vector1(const struct prog_src_register *source,
 286               const struct gl_program_machine *machine, GLfloat result[4])
 287 {
 288    const GLfloat *src = get_register_pointer(source, machine);
 289    ASSERT(src);
 290
 291    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 292
 293    if (source->NegateBase) {
 294       result[0] = -result[0];
 295    }
 296    if (source->Abs) {
 297       result[0] = FABSF(result[0]);
 298    }
 299    if (source->NegateAbs) {
 300       result[0] = -result[0];
 301    }
 302 }
 303
 304
 305 /**
 306  * Test value against zero and return GT, LT, EQ or UN if NaN.
 307  */
 308 static INLINE GLuint
 309 generate_cc(float value)
 310 {
 311    if (value != value)
 312       return COND_UN;           /* NaN */
 313    if (value > 0.0F)
 314       return COND_GT;
 315    if (value < 0.0F)
 316       return COND_LT;
 317    return COND_EQ;
 318 }
 319
 320
 321 /**
 322  * Test if the ccMaskRule is satisfied by the given condition code.
 323  * Used to mask destination writes according to the current condition code.
 324  */
 325 static INLINE GLboolean
 326 test_cc(GLuint condCode, GLuint ccMaskRule)
 327 {
 328    switch (ccMaskRule) {
 329    case COND_EQ: return (condCode == COND_EQ);
 330    case COND_NE: return (condCode != COND_EQ);
 331    case COND_LT: return (condCode == COND_LT);
 332    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 333    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 334    case COND_GT: return (condCode == COND_GT);
 335    case COND_TR: return GL_TRUE;
 336    case COND_FL: return GL_FALSE;
 337    default:      return GL_TRUE;
 338    }
 339 }
 340
 341
 342 /**
 343  * Evaluate the 4 condition codes against a predicate and return GL_TRUE
 344  * or GL_FALSE to indicate result.
 345  */
 346 static INLINE GLboolean
 347 eval_condition(const struct gl_program_machine *machine,
 348                const struct prog_instruction *inst)
 349 {
 350    const GLuint swizzle = inst->DstReg.CondSwizzle;
 351    const GLuint condMask = inst->DstReg.CondMask;
 352    if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 353        test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 354        test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 355        test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 356       return GL_TRUE;
 357    }
 358    else {
 359       return GL_FALSE;
 360    }
 361 }
 362
 363
 364
 365 /**
 366  * Store 4 floats into a register.  Observe the instructions saturate and
 367  * set-condition-code flags.
 368  */
 369 static void
 370 store_vector4(const struct prog_instruction *inst,
 371               struct gl_program_machine *machine, const GLfloat value[4])
 372 {
 373    const struct prog_dst_register *dest = &(inst->DstReg);
 374    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 375    GLfloat *dstReg;
 376    GLfloat dummyReg[4];
 377    GLfloat clampedValue[4];
 378    GLuint writeMask = dest->WriteMask;
 379
 380    switch (dest->File) {
 381    case PROGRAM_OUTPUT:
 382       ASSERT(dest->Index < MAX_PROGRAM_OUTPUTS);
 383       dstReg = machine->Outputs[dest->Index];
 384       break;
 385    case PROGRAM_TEMPORARY:
 386       ASSERT(dest->Index < MAX_PROGRAM_TEMPS);
 387       dstReg = machine->Temporaries[dest->Index];
 388       break;
 389    case PROGRAM_WRITE_ONLY:
 390       dstReg = dummyReg;
 391       return;
 392    default:
 393       _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 394       return;
 395    }
 396
 397 #if 0
 398    if (value[0] > 1.0e10 ||
 399        IS_INF_OR_NAN(value[0]) ||
 400        IS_INF_OR_NAN(value[1]) ||
 401        IS_INF_OR_NAN(value[2]) || IS_INF_OR_NAN(value[3]))
 402       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 403 #endif
 404
 405    if (clamp) {
 406       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 407       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 408       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 409       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 410       value = clampedValue;
 411    }
 412
 413    if (dest->CondMask != COND_TR) {
 414       /* condition codes may turn off some writes */
 415       if (writeMask & WRITEMASK_X) {
 416          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 0)],
 417                       dest->CondMask))
 418             writeMask &= ~WRITEMASK_X;
 419       }
 420       if (writeMask & WRITEMASK_Y) {
 421          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 1)],
 422                       dest->CondMask))
 423             writeMask &= ~WRITEMASK_Y;
 424       }
 425       if (writeMask & WRITEMASK_Z) {
 426          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 2)],
 427                       dest->CondMask))
 428             writeMask &= ~WRITEMASK_Z;
 429       }
 430       if (writeMask & WRITEMASK_W) {
 431          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 3)],
 432                       dest->CondMask))
 433             writeMask &= ~WRITEMASK_W;
 434       }
 435    }
 436
 437    if (writeMask & WRITEMASK_X)
 438       dstReg[0] = value[0];
 439    if (writeMask & WRITEMASK_Y)
 440       dstReg[1] = value[1];
 441    if (writeMask & WRITEMASK_Z)
 442       dstReg[2] = value[2];
 443    if (writeMask & WRITEMASK_W)
 444       dstReg[3] = value[3];
 445
 446    if (inst->CondUpdate) {
 447       if (writeMask & WRITEMASK_X)
 448          machine->CondCodes[0] = generate_cc(value[0]);
 449       if (writeMask & WRITEMASK_Y)
 450          machine->CondCodes[1] = generate_cc(value[1]);
 451       if (writeMask & WRITEMASK_Z)
 452          machine->CondCodes[2] = generate_cc(value[2]);
 453       if (writeMask & WRITEMASK_W)
 454          machine->CondCodes[3] = generate_cc(value[3]);
 455 #if DEBUG_PROG
 456       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 457              _mesa_condcode_string(machine->CondCodes[0]),
 458              _mesa_condcode_string(machine->CondCodes[1]),
 459              _mesa_condcode_string(machine->CondCodes[2]),
 460              _mesa_condcode_string(machine->CondCodes[3]));
 461 #endif
 462    }
 463 }
 464
 465
 466 /**
 467  * Execute the given vertex/fragment program.
 468  *
 469  * \param ctx  rendering context
 470  * \param program  the program to execute
 471  * \param machine  machine state (must be initialized)
 472  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 473  */
 474 GLboolean
 475 _mesa_execute_program(GLcontext * ctx,
 476                       const struct gl_program *program,
 477                       struct gl_program_machine *machine)
 478 {
 479    const GLuint numInst = program->NumInstructions;
 480    const GLuint maxExec = 10000;
 481    GLint pc, numExec = 0;
 482
 483    machine->CurProgram = program;
 484
 485    if (DEBUG_PROG) {
 486       printf("execute program %u --------------------\n", program->Id);
 487    }
 488
 489 #if FEATURE_MESA_program_debug
 490    CurrentMachine = machine;
 491 #endif
 492
 493    if (program->Target == GL_VERTEX_PROGRAM_ARB) {
 494       machine->EnvParams = ctx->VertexProgram.Parameters;
 495    }
 496    else {
 497       machine->EnvParams = ctx->FragmentProgram.Parameters;
 498    }
 499
 500    for (pc = 0; pc < numInst; pc++) {
 501       const struct prog_instruction *inst = program->Instructions + pc;
 502
 503 #if FEATURE_MESA_program_debug
 504       if (ctx->FragmentProgram.CallbackEnabled &&
 505           ctx->FragmentProgram.Callback) {
 506          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 507          ctx->FragmentProgram.Callback(program->Target,
 508                                        ctx->FragmentProgram.CallbackData);
 509       }
 510 #endif
 511
 512       if (DEBUG_PROG) {
 513          _mesa_print_instruction(inst);
 514       }
 515
 516       switch (inst->Opcode) {
 517       case OPCODE_ABS:
 518          {
 519             GLfloat a[4], result[4];
 520             fetch_vector4(&inst->SrcReg[0], machine, a);
 521             result[0] = FABSF(a[0]);
 522             result[1] = FABSF(a[1]);
 523             result[2] = FABSF(a[2]);
 524             result[3] = FABSF(a[3]);
 525             store_vector4(inst, machine, result);
 526          }
 527          break;
 528       case OPCODE_ADD:
 529          {
 530             GLfloat a[4], b[4], result[4];
 531             fetch_vector4(&inst->SrcReg[0], machine, a);
 532             fetch_vector4(&inst->SrcReg[1], machine, b);
 533             result[0] = a[0] + b[0];
 534             result[1] = a[1] + b[1];
 535             result[2] = a[2] + b[2];
 536             result[3] = a[3] + b[3];
 537             store_vector4(inst, machine, result);
 538             if (DEBUG_PROG) {
 539                printf("ADD (%g %g %g %g) = (%g %g %g %g) + (%g %g %g %g)\n",
 540                       result[0], result[1], result[2], result[3],
 541                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 542             }
 543          }
 544          break;
 545       case OPCODE_ARL:
 546          {
 547             GLfloat t[4];
 548             fetch_vector4(&inst->SrcReg[0], machine, t);
 549             machine->AddressReg[0][0] = (GLint) FLOORF(t[0]);
 550          }
 551          break;
 552       case OPCODE_BGNLOOP:
 553          /* no-op */
 554          break;
 555       case OPCODE_ENDLOOP:
 556          /* subtract 1 here since pc is incremented by for(pc) loop */
 557          pc = inst->BranchTarget - 1;   /* go to matching BNGLOOP */
 558          break;
 559       case OPCODE_BGNSUB:      /* begin subroutine */
 560          break;
 561       case OPCODE_ENDSUB:      /* end subroutine */
 562          break;
 563       case OPCODE_BRA:         /* branch (conditional) */
 564          /* fall-through */
 565       case OPCODE_BRK:         /* break out of loop (conditional) */
 566          /* fall-through */
 567       case OPCODE_CONT:        /* continue loop (conditional) */
 568          if (eval_condition(machine, inst)) {
 569             /* take branch */
 570             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 571             pc = inst->BranchTarget - 1;
 572          }
 573          break;
 574       case OPCODE_CAL:         /* Call subroutine (conditional) */
 575          if (eval_condition(machine, inst)) {
 576             /* call the subroutine */
 577             if (machine->StackDepth >= MAX_PROGRAM_CALL_DEPTH) {
 578                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
 579             }
 580             machine->CallStack[machine->StackDepth++] = pc + 1; /* next inst */
 581             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 582             pc = inst->BranchTarget - 1;
 583          }
 584          break;
 585       case OPCODE_CMP:
 586          {
 587             GLfloat a[4], b[4], c[4], result[4];
 588             fetch_vector4(&inst->SrcReg[0], machine, a);
 589             fetch_vector4(&inst->SrcReg[1], machine, b);
 590             fetch_vector4(&inst->SrcReg[2], machine, c);
 591             result[0] = a[0] < 0.0F ? b[0] : c[0];
 592             result[1] = a[1] < 0.0F ? b[1] : c[1];
 593             result[2] = a[2] < 0.0F ? b[2] : c[2];
 594             result[3] = a[3] < 0.0F ? b[3] : c[3];
 595             store_vector4(inst, machine, result);
 596          }
 597          break;
 598       case OPCODE_COS:
 599          {
 600             GLfloat a[4], result[4];
 601             fetch_vector1(&inst->SrcReg[0], machine, a);
 602             result[0] = result[1] = result[2] = result[3]
 603                = (GLfloat) _mesa_cos(a[0]);
 604             store_vector4(inst, machine, result);
 605          }
 606          break;
 607       case OPCODE_DDX:         /* Partial derivative with respect to X */
 608          {
 609             GLfloat result[4];
 610             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 611                                 'X', result);
 612             store_vector4(inst, machine, result);
 613          }
 614          break;
 615       case OPCODE_DDY:         /* Partial derivative with respect to Y */
 616          {
 617             GLfloat result[4];
 618             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 619                                 'Y', result);
 620             store_vector4(inst, machine, result);
 621          }
 622          break;
 623       case OPCODE_DP3:
 624          {
 625             GLfloat a[4], b[4], result[4];
 626             fetch_vector4(&inst->SrcReg[0], machine, a);
 627             fetch_vector4(&inst->SrcReg[1], machine, b);
 628             result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
 629             store_vector4(inst, machine, result);
 630             if (DEBUG_PROG) {
 631                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 632                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 633             }
 634          }
 635          break;
 636       case OPCODE_DP4:
 637          {
 638             GLfloat a[4], b[4], result[4];
 639             fetch_vector4(&inst->SrcReg[0], machine, a);
 640             fetch_vector4(&inst->SrcReg[1], machine, b);
 641             result[0] = result[1] = result[2] = result[3] = DOT4(a, b);
 642             store_vector4(inst, machine, result);
 643             if (DEBUG_PROG) {
 644                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 645                       result[0], a[0], a[1], a[2], a[3],
 646                       b[0], b[1], b[2], b[3]);
 647             }
 648          }
 649          break;
 650       case OPCODE_DPH:
 651          {
 652             GLfloat a[4], b[4], result[4];
 653             fetch_vector4(&inst->SrcReg[0], machine, a);
 654             fetch_vector4(&inst->SrcReg[1], machine, b);
 655             result[0] = result[1] = result[2] = result[3] =
 656                a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
 657             store_vector4(inst, machine, result);
 658          }
 659          break;
 660       case OPCODE_DST:         /* Distance vector */
 661          {
 662             GLfloat a[4], b[4], result[4];
 663             fetch_vector4(&inst->SrcReg[0], machine, a);
 664             fetch_vector4(&inst->SrcReg[1], machine, b);
 665             result[0] = 1.0F;
 666             result[1] = a[1] * b[1];
 667             result[2] = a[2];
 668             result[3] = b[3];
 669             store_vector4(inst, machine, result);
 670          }
 671          break;
 672       case OPCODE_EXP:
 673          {
 674             GLfloat t[4], q[4], floor_t0;
 675             fetch_vector1(&inst->SrcReg[0], machine, t);
 676             floor_t0 = FLOORF(t[0]);
 677             if (floor_t0 > FLT_MAX_EXP) {
 678                SET_POS_INFINITY(q[0]);
 679                SET_POS_INFINITY(q[2]);
 680             }
 681             else if (floor_t0 < FLT_MIN_EXP) {
 682                q[0] = 0.0F;
 683                q[2] = 0.0F;
 684             }
 685             else {
 686                q[0] = LDEXPF(1.0, (int) floor_t0);
 687                /* Note: GL_NV_vertex_program expects
 688                 * result.z = result.x * APPX(result.y)
 689                 * We do what the ARB extension says.
 690                 */
 691                q[2] = pow(2.0, t[0]);
 692             }
 693             q[1] = t[0] - floor_t0;
 694             q[3] = 1.0F;
 695             store_vector4( inst, machine, q );
 696          }
 697          break;
 698       case OPCODE_EX2:         /* Exponential base 2 */
 699          {
 700             GLfloat a[4], result[4];
 701             fetch_vector1(&inst->SrcReg[0], machine, a);
 702             result[0] = result[1] = result[2] = result[3] =
 703                (GLfloat) _mesa_pow(2.0, a[0]);
 704             store_vector4(inst, machine, result);
 705          }
 706          break;
 707       case OPCODE_FLR:
 708          {
 709             GLfloat a[4], result[4];
 710             fetch_vector4(&inst->SrcReg[0], machine, a);
 711             result[0] = FLOORF(a[0]);
 712             result[1] = FLOORF(a[1]);
 713             result[2] = FLOORF(a[2]);
 714             result[3] = FLOORF(a[3]);
 715             store_vector4(inst, machine, result);
 716          }
 717          break;
 718       case OPCODE_FRC:
 719          {
 720             GLfloat a[4], result[4];
 721             fetch_vector4(&inst->SrcReg[0], machine, a);
 722             result[0] = a[0] - FLOORF(a[0]);
 723             result[1] = a[1] - FLOORF(a[1]);
 724             result[2] = a[2] - FLOORF(a[2]);
 725             result[3] = a[3] - FLOORF(a[3]);
 726             store_vector4(inst, machine, result);
 727          }
 728          break;
 729       case OPCODE_IF:
 730          {
 731             GLboolean cond;
 732             /* eval condition */
 733             if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
 734                GLfloat a[4];
 735                fetch_vector1(&inst->SrcReg[0], machine, a);
 736                cond = (a[0] != 0.0);
 737             }
 738             else {
 739                cond = eval_condition(machine, inst);
 740             }
 741             if (DEBUG_PROG) {
 742                printf("IF: %d\n", cond);
 743             }
 744             /* do if/else */
 745             if (cond) {
 746                /* do if-clause (just continue execution) */
 747             }
 748             else {
 749                /* go to the instruction after ELSE or ENDIF */
 750                assert(inst->BranchTarget >= 0);
 751                pc = inst->BranchTarget - 1;
 752             }
 753          }
 754          break;
 755       case OPCODE_ELSE:
 756          /* goto ENDIF */
 757          assert(inst->BranchTarget >= 0);
 758          pc = inst->BranchTarget - 1;
 759          break;
 760       case OPCODE_ENDIF:
 761          /* nothing */
 762          break;
 763       case OPCODE_INT:         /* float to int */
 764          {
 765             GLfloat a[4], result[4];
 766             fetch_vector4(&inst->SrcReg[0], machine, a);
 767             result[0] = (GLfloat) (GLint) a[0];
 768             result[1] = (GLfloat) (GLint) a[1];
 769             result[2] = (GLfloat) (GLint) a[2];
 770             result[3] = (GLfloat) (GLint) a[3];
 771             store_vector4(inst, machine, result);
 772          }
 773          break;
 774       case OPCODE_KIL_NV:      /* NV_f_p only (conditional) */
 775          if (eval_condition(machine, inst)) {
 776             return GL_FALSE;
 777          }
 778          break;
 779       case OPCODE_KIL:         /* ARB_f_p only */
 780          {
 781             GLfloat a[4];
 782             fetch_vector4(&inst->SrcReg[0], machine, a);
 783             if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 784                return GL_FALSE;
 785             }
 786          }
 787          break;
 788       case OPCODE_LG2:         /* log base 2 */
 789          {
 790             GLfloat a[4], result[4];
 791             fetch_vector1(&inst->SrcReg[0], machine, a);
 792             result[0] = result[1] = result[2] = result[3] = LOG2(a[0]);
 793             store_vector4(inst, machine, result);
 794          }
 795          break;
 796       case OPCODE_LIT:
 797          {
 798             const GLfloat epsilon = 1.0F / 256.0F;      /* from NV VP spec */
 799             GLfloat a[4], result[4];
 800             fetch_vector4(&inst->SrcReg[0], machine, a);
 801             a[0] = MAX2(a[0], 0.0F);
 802             a[1] = MAX2(a[1], 0.0F);
 803             /* XXX ARB version clamps a[3], NV version doesn't */
 804             a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 805             result[0] = 1.0F;
 806             result[1] = a[0];
 807             /* XXX we could probably just use pow() here */
 808             if (a[0] > 0.0F) {
 809                if (a[1] == 0.0 && a[3] == 0.0)
 810                   result[2] = 1.0;
 811                else
 812                   result[2] = EXPF(a[3] * LOGF(a[1]));
 813             }
 814             else {
 815                result[2] = 0.0;
 816             }
 817             result[3] = 1.0F;
 818             store_vector4(inst, machine, result);
 819             if (DEBUG_PROG) {
 820                printf("LIT (%g %g %g %g) : (%g %g %g %g)\n",
 821                       result[0], result[1], result[2], result[3],
 822                       a[0], a[1], a[2], a[3]);
 823             }
 824          }
 825          break;
 826       case OPCODE_LOG:
 827          {
 828             GLfloat t[4], q[4], abs_t0;
 829             fetch_vector1(&inst->SrcReg[0], machine, t);
 830             abs_t0 = FABSF(t[0]);
 831             if (abs_t0 != 0.0F) {
 832                /* Since we really can't handle infinite values on VMS
 833                 * like other OSes we'll use __MAXFLOAT to represent
 834                 * infinity.  This may need some tweaking.
 835                 */
 836 #ifdef VMS
 837                if (abs_t0 == __MAXFLOAT)
 838 #else
 839                if (IS_INF_OR_NAN(abs_t0))
 840 #endif
 841                {
 842                   SET_POS_INFINITY(q[0]);
 843                   q[1] = 1.0F;
 844                   SET_POS_INFINITY(q[2]);
 845                }
 846                else {
 847                   int exponent;
 848                   GLfloat mantissa = FREXPF(t[0], &exponent);
 849                   q[0] = (GLfloat) (exponent - 1);
 850                   q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 851                   q[2] = (GLfloat) (q[0] + LOG2(q[1]));
 852                }
 853             }
 854             else {
 855                SET_NEG_INFINITY(q[0]);
 856                q[1] = 1.0F;
 857                SET_NEG_INFINITY(q[2]);
 858             }
 859             q[3] = 1.0;
 860             store_vector4(inst, machine, q);
 861          }
 862          break;
 863       case OPCODE_LRP:
 864          {
 865             GLfloat a[4], b[4], c[4], result[4];
 866             fetch_vector4(&inst->SrcReg[0], machine, a);
 867             fetch_vector4(&inst->SrcReg[1], machine, b);
 868             fetch_vector4(&inst->SrcReg[2], machine, c);
 869             result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 870             result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 871             result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 872             result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 873             store_vector4(inst, machine, result);
 874             if (DEBUG_PROG) {
 875                printf("LRP (%g %g %g %g) = (%g %g %g %g), "
 876                       "(%g %g %g %g), (%g %g %g %g)\n",
 877                       result[0], result[1], result[2], result[3],
 878                       a[0], a[1], a[2], a[3],
 879                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
 880             }
 881          }
 882          break;
 883       case OPCODE_MAD:
 884          {
 885             GLfloat a[4], b[4], c[4], result[4];
 886             fetch_vector4(&inst->SrcReg[0], machine, a);
 887             fetch_vector4(&inst->SrcReg[1], machine, b);
 888             fetch_vector4(&inst->SrcReg[2], machine, c);
 889             result[0] = a[0] * b[0] + c[0];
 890             result[1] = a[1] * b[1] + c[1];
 891             result[2] = a[2] * b[2] + c[2];
 892             result[3] = a[3] * b[3] + c[3];
 893             store_vector4(inst, machine, result);
 894             if (DEBUG_PROG) {
 895                printf("MAD (%g %g %g %g) = (%g %g %g %g) * "
 896                       "(%g %g %g %g) + (%g %g %g %g)\n",
 897                       result[0], result[1], result[2], result[3],
 898                       a[0], a[1], a[2], a[3],
 899                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
 900             }
 901          }
 902          break;
 903       case OPCODE_MAX:
 904          {
 905             GLfloat a[4], b[4], result[4];
 906             fetch_vector4(&inst->SrcReg[0], machine, a);
 907             fetch_vector4(&inst->SrcReg[1], machine, b);
 908             result[0] = MAX2(a[0], b[0]);
 909             result[1] = MAX2(a[1], b[1]);
 910             result[2] = MAX2(a[2], b[2]);
 911             result[3] = MAX2(a[3], b[3]);
 912             store_vector4(inst, machine, result);
 913             if (DEBUG_PROG) {
 914                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
 915                       result[0], result[1], result[2], result[3],
 916                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 917             }
 918          }
 919          break;
 920       case OPCODE_MIN:
 921          {
 922             GLfloat a[4], b[4], result[4];
 923             fetch_vector4(&inst->SrcReg[0], machine, a);
 924             fetch_vector4(&inst->SrcReg[1], machine, b);
 925             result[0] = MIN2(a[0], b[0]);
 926             result[1] = MIN2(a[1], b[1]);
 927             result[2] = MIN2(a[2], b[2]);
 928             result[3] = MIN2(a[3], b[3]);
 929             store_vector4(inst, machine, result);
 930          }
 931          break;
 932       case OPCODE_MOV:
 933          {
 934             GLfloat result[4];
 935             fetch_vector4(&inst->SrcReg[0], machine, result);
 936             store_vector4(inst, machine, result);
 937             if (DEBUG_PROG) {
 938                printf("MOV (%g %g %g %g)\n",
 939                       result[0], result[1], result[2], result[3]);
 940             }
 941          }
 942          break;
 943       case OPCODE_MUL:
 944          {
 945             GLfloat a[4], b[4], result[4];
 946             fetch_vector4(&inst->SrcReg[0], machine, a);
 947             fetch_vector4(&inst->SrcReg[1], machine, b);
 948             result[0] = a[0] * b[0];
 949             result[1] = a[1] * b[1];
 950             result[2] = a[2] * b[2];
 951             result[3] = a[3] * b[3];
 952             store_vector4(inst, machine, result);
 953             if (DEBUG_PROG) {
 954                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 955                       result[0], result[1], result[2], result[3],
 956                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 957             }
 958          }
 959          break;
 960       case OPCODE_NOISE1:
 961          {
 962             GLfloat a[4], result[4];
 963             fetch_vector1(&inst->SrcReg[0], machine, a);
 964             result[0] =
 965                result[1] =
 966                result[2] = result[3] = _slang_library_noise1(a[0]);
 967             store_vector4(inst, machine, result);
 968          }
 969          break;
 970       case OPCODE_NOISE2:
 971          {
 972             GLfloat a[4], result[4];
 973             fetch_vector4(&inst->SrcReg[0], machine, a);
 974             result[0] =
 975                result[1] =
 976                result[2] = result[3] = _slang_library_noise2(a[0], a[1]);
 977             store_vector4(inst, machine, result);
 978          }
 979          break;
 980       case OPCODE_NOISE3:
 981          {
 982             GLfloat a[4], result[4];
 983             fetch_vector4(&inst->SrcReg[0], machine, a);
 984             result[0] =
 985                result[1] =
 986                result[2] =
 987                result[3] = _slang_library_noise3(a[0], a[1], a[2]);
 988             store_vector4(inst, machine, result);
 989          }
 990          break;
 991       case OPCODE_NOISE4:
 992          {
 993             GLfloat a[4], result[4];
 994             fetch_vector4(&inst->SrcReg[0], machine, a);
 995             result[0] =
 996                result[1] =
 997                result[2] =
 998                result[3] = _slang_library_noise4(a[0], a[1], a[2], a[3]);
 999             store_vector4(inst, machine, result);
1000          }
1001          break;
1002       case OPCODE_NOP:
1003          break;
1004       case OPCODE_PK2H:        /* pack two 16-bit floats in one 32-bit float */
1005          {
1006             GLfloat a[4], result[4];
1007             GLhalfNV hx, hy;
1008             GLuint *rawResult = (GLuint *) result;
1009             GLuint twoHalves;
1010             fetch_vector4(&inst->SrcReg[0], machine, a);
1011             hx = _mesa_float_to_half(a[0]);
1012             hy = _mesa_float_to_half(a[1]);
1013             twoHalves = hx | (hy << 16);
1014             rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1015                = twoHalves;
1016             store_vector4(inst, machine, result);
1017          }
1018          break;
1019       case OPCODE_PK2US:       /* pack two GLushorts into one 32-bit float */
1020          {
1021             GLfloat a[4], result[4];
1022             GLuint usx, usy, *rawResult = (GLuint *) result;
1023             fetch_vector4(&inst->SrcReg[0], machine, a);
1024             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1025             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1026             usx = IROUND(a[0] * 65535.0F);
1027             usy = IROUND(a[1] * 65535.0F);
1028             rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1029                = usx | (usy << 16);
1030             store_vector4(inst, machine, result);
1031          }
1032          break;
1033       case OPCODE_PK4B:        /* pack four GLbytes into one 32-bit float */
1034          {
1035             GLfloat a[4], result[4];
1036             GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1037             fetch_vector4(&inst->SrcReg[0], machine, a);
1038             a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
1039             a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
1040             a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
1041             a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
1042             ubx = IROUND(127.0F * a[0] + 128.0F);
1043             uby = IROUND(127.0F * a[1] + 128.0F);
1044             ubz = IROUND(127.0F * a[2] + 128.0F);
1045             ubw = IROUND(127.0F * a[3] + 128.0F);
1046             rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1047                = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1048             store_vector4(inst, machine, result);
1049          }
1050          break;
1051       case OPCODE_PK4UB:       /* pack four GLubytes into one 32-bit float */
1052          {
1053             GLfloat a[4], result[4];
1054             GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1055             fetch_vector4(&inst->SrcReg[0], machine, a);
1056             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1057             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1058             a[2] = CLAMP(a[2], 0.0F, 1.0F);
1059             a[3] = CLAMP(a[3], 0.0F, 1.0F);
1060             ubx = IROUND(255.0F * a[0]);
1061             uby = IROUND(255.0F * a[1]);
1062             ubz = IROUND(255.0F * a[2]);
1063             ubw = IROUND(255.0F * a[3]);
1064             rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1065                = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1066             store_vector4(inst, machine, result);
1067          }
1068          break;
1069       case OPCODE_POW:
1070          {
1071             GLfloat a[4], b[4], result[4];
1072             fetch_vector1(&inst->SrcReg[0], machine, a);
1073             fetch_vector1(&inst->SrcReg[1], machine, b);
1074             result[0] = result[1] = result[2] = result[3]
1075                = (GLfloat) _mesa_pow(a[0], b[0]);
1076             store_vector4(inst, machine, result);
1077          }
1078          break;
1079       case OPCODE_RCP:
1080          {
1081             GLfloat a[4], result[4];
1082             fetch_vector1(&inst->SrcReg[0], machine, a);
1083             if (DEBUG_PROG) {
1084                if (a[0] == 0)
1085                   printf("RCP(0)\n");
1086                else if (IS_INF_OR_NAN(a[0]))
1087                   printf("RCP(inf)\n");
1088             }
1089             result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
1090             store_vector4(inst, machine, result);
1091          }
1092          break;
1093       case OPCODE_RET:         /* return from subroutine (conditional) */
1094          if (eval_condition(machine, inst)) {
1095             if (machine->StackDepth == 0) {
1096                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
1097             }
1098             /* subtract one because of pc++ in the for loop */
1099             pc = machine->CallStack[--machine->StackDepth] - 1;
1100          }
1101          break;
1102       case OPCODE_RFL:         /* reflection vector */
1103          {
1104             GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
1105             fetch_vector4(&inst->SrcReg[0], machine, axis);
1106             fetch_vector4(&inst->SrcReg[1], machine, dir);
1107             tmpW = DOT3(axis, axis);
1108             tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
1109             result[0] = tmpX * axis[0] - dir[0];
1110             result[1] = tmpX * axis[1] - dir[1];
1111             result[2] = tmpX * axis[2] - dir[2];
1112             /* result[3] is never written! XXX enforce in parser! */
1113             store_vector4(inst, machine, result);
1114          }
1115          break;
1116       case OPCODE_RSQ:         /* 1 / sqrt() */
1117          {
1118             GLfloat a[4], result[4];
1119             fetch_vector1(&inst->SrcReg[0], machine, a);
1120             a[0] = FABSF(a[0]);
1121             result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1122             store_vector4(inst, machine, result);
1123             if (DEBUG_PROG) {
1124                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1125             }
1126          }
1127          break;
1128       case OPCODE_SCS:         /* sine and cos */
1129          {
1130             GLfloat a[4], result[4];
1131             fetch_vector1(&inst->SrcReg[0], machine, a);
1132             result[0] = (GLfloat) _mesa_cos(a[0]);
1133             result[1] = (GLfloat) _mesa_sin(a[0]);
1134             result[2] = 0.0;    /* undefined! */
1135             result[3] = 0.0;    /* undefined! */
1136             store_vector4(inst, machine, result);
1137          }
1138          break;
1139       case OPCODE_SEQ:         /* set on equal */
1140          {
1141             GLfloat a[4], b[4], result[4];
1142             fetch_vector4(&inst->SrcReg[0], machine, a);
1143             fetch_vector4(&inst->SrcReg[1], machine, b);
1144             result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1145             result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1146             result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1147             result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1148             store_vector4(inst, machine, result);
1149             if (DEBUG_PROG) {
1150                printf("SEQ (%g %g %g %g) = (%g %g %g %g) == (%g %g %g %g)\n",
1151                       result[0], result[1], result[2], result[3],
1152                       a[0], a[1], a[2], a[3],
1153                       b[0], b[1], b[2], b[3]);
1154             }
1155          }
1156          break;
1157       case OPCODE_SFL:         /* set false, operands ignored */
1158          {
1159             static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1160             store_vector4(inst, machine, result);
1161          }
1162          break;
1163       case OPCODE_SGE:         /* set on greater or equal */
1164          {
1165             GLfloat a[4], b[4], result[4];
1166             fetch_vector4(&inst->SrcReg[0], machine, a);
1167             fetch_vector4(&inst->SrcReg[1], machine, b);
1168             result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1169             result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1170             result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1171             result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1172             store_vector4(inst, machine, result);
1173             if (DEBUG_PROG) {
1174                printf("SGE (%g %g %g %g) = (%g %g %g %g) >= (%g %g %g %g)\n",
1175                       result[0], result[1], result[2], result[3],
1176                       a[0], a[1], a[2], a[3],
1177                       b[0], b[1], b[2], b[3]);
1178             }
1179          }
1180          break;
1181       case OPCODE_SGT:         /* set on greater */
1182          {
1183             GLfloat a[4], b[4], result[4];
1184             fetch_vector4(&inst->SrcReg[0], machine, a);
1185             fetch_vector4(&inst->SrcReg[1], machine, b);
1186             result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1187             result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1188             result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1189             result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1190             store_vector4(inst, machine, result);
1191             if (DEBUG_PROG) {
1192                printf("SGT (%g %g %g %g) = (%g %g %g %g) > (%g %g %g %g)\n",
1193                       result[0], result[1], result[2], result[3],
1194                       a[0], a[1], a[2], a[3],
1195                       b[0], b[1], b[2], b[3]);
1196             }
1197          }
1198          break;
1199       case OPCODE_SIN:
1200          {
1201             GLfloat a[4], result[4];
1202             fetch_vector1(&inst->SrcReg[0], machine, a);
1203             result[0] = result[1] = result[2] = result[3]
1204                = (GLfloat) _mesa_sin(a[0]);
1205             store_vector4(inst, machine, result);
1206          }
1207          break;
1208       case OPCODE_SLE:         /* set on less or equal */
1209          {
1210             GLfloat a[4], b[4], result[4];
1211             fetch_vector4(&inst->SrcReg[0], machine, a);
1212             fetch_vector4(&inst->SrcReg[1], machine, b);
1213             result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1214             result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1215             result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1216             result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1217             store_vector4(inst, machine, result);
1218             if (DEBUG_PROG) {
1219                printf("SLE (%g %g %g %g) = (%g %g %g %g) <= (%g %g %g %g)\n",
1220                       result[0], result[1], result[2], result[3],
1221                       a[0], a[1], a[2], a[3],
1222                       b[0], b[1], b[2], b[3]);
1223             }
1224          }
1225          break;
1226       case OPCODE_SLT:         /* set on less */
1227          {
1228             GLfloat a[4], b[4], result[4];
1229             fetch_vector4(&inst->SrcReg[0], machine, a);
1230             fetch_vector4(&inst->SrcReg[1], machine, b);
1231             result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1232             result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1233             result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1234             result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1235             store_vector4(inst, machine, result);
1236             if (DEBUG_PROG) {
1237                printf("SLT (%g %g %g %g) = (%g %g %g %g) < (%g %g %g %g)\n",
1238                       result[0], result[1], result[2], result[3],
1239                       a[0], a[1], a[2], a[3],
1240                       b[0], b[1], b[2], b[3]);
1241             }
1242          }
1243          break;
1244       case OPCODE_SNE:         /* set on not equal */
1245          {
1246             GLfloat a[4], b[4], result[4];
1247             fetch_vector4(&inst->SrcReg[0], machine, a);
1248             fetch_vector4(&inst->SrcReg[1], machine, b);
1249             result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1250             result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1251             result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1252             result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1253             store_vector4(inst, machine, result);
1254             if (DEBUG_PROG) {
1255                printf("SNE (%g %g %g %g) = (%g %g %g %g) != (%g %g %g %g)\n",
1256                       result[0], result[1], result[2], result[3],
1257                       a[0], a[1], a[2], a[3],
1258                       b[0], b[1], b[2], b[3]);
1259             }
1260          }
1261          break;
1262       case OPCODE_STR:         /* set true, operands ignored */
1263          {
1264             static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1265             store_vector4(inst, machine, result);
1266          }
1267          break;
1268       case OPCODE_SUB:
1269          {
1270             GLfloat a[4], b[4], result[4];
1271             fetch_vector4(&inst->SrcReg[0], machine, a);
1272             fetch_vector4(&inst->SrcReg[1], machine, b);
1273             result[0] = a[0] - b[0];
1274             result[1] = a[1] - b[1];
1275             result[2] = a[2] - b[2];
1276             result[3] = a[3] - b[3];
1277             store_vector4(inst, machine, result);
1278             if (DEBUG_PROG) {
1279                printf("SUB (%g %g %g %g) = (%g %g %g %g) - (%g %g %g %g)\n",
1280                       result[0], result[1], result[2], result[3],
1281                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1282             }
1283          }
1284          break;
1285       case OPCODE_SWZ:         /* extended swizzle */
1286          {
1287             const struct prog_src_register *source = &inst->SrcReg[0];
1288             const GLfloat *src = get_register_pointer(source, machine);
1289             GLfloat result[4];
1290             GLuint i;
1291             for (i = 0; i < 4; i++) {
1292                const GLuint swz = GET_SWZ(source->Swizzle, i);
1293                if (swz == SWIZZLE_ZERO)
1294                   result[i] = 0.0;
1295                else if (swz == SWIZZLE_ONE)
1296                   result[i] = 1.0;
1297                else {
1298                   ASSERT(swz >= 0);
1299                   ASSERT(swz <= 3);
1300                   result[i] = src[swz];
1301                }
1302                if (source->NegateBase & (1 << i))
1303                   result[i] = -result[i];
1304             }
1305             store_vector4(inst, machine, result);
1306          }
1307          break;
1308       case OPCODE_TEX:         /* Both ARB and NV frag prog */
1309          /* Texel lookup */
1310          {
1311             /* Note: only use the precomputed lambda value when we're
1312              * sampling texture unit [K] with texcoord[K].
1313              * Otherwise, the lambda value may have no relation to the
1314              * instruction's texcoord or texture image.  Using the wrong
1315              * lambda is usually bad news.
1316              * The rest of the time, just use zero (until we get a more
1317              * sophisticated way of computing lambda).
1318              */
1319             GLfloat coord[4], color[4], lambda;
1320 #if 0
1321             if (inst->SrcReg[0].File == PROGRAM_INPUT &&
1322                 inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0 + inst->TexSrcUnit)
1323                lambda = span->array->lambda[inst->TexSrcUnit][column];
1324             else
1325 #endif
1326                lambda = 0.0;
1327             fetch_vector4(&inst->SrcReg[0], machine, coord);
1328             machine->FetchTexelLod(ctx, coord, lambda, inst->TexSrcUnit,
1329                                    color);
1330             if (DEBUG_PROG) {
1331                printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g], "
1332                       "lod %f\n",
1333                       color[0], color[1], color[2], color[3],
1334                       inst->TexSrcUnit,
1335                       coord[0], coord[1], coord[2], coord[3], lambda);
1336             }
1337             store_vector4(inst, machine, color);
1338          }
1339          break;
1340       case OPCODE_TXB:         /* GL_ARB_fragment_program only */
1341          /* Texel lookup with LOD bias */
1342          {
1343             const struct gl_texture_unit *texUnit
1344                = &ctx->Texture.Unit[inst->TexSrcUnit];
1345             GLfloat coord[4], color[4], lambda, bias;
1346 #if 0
1347             if (inst->SrcReg[0].File == PROGRAM_INPUT &&
1348                 inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0 + inst->TexSrcUnit)
1349                lambda = span->array->lambda[inst->TexSrcUnit][column];
1350             else
1351 #endif
1352                lambda = 0.0;
1353             fetch_vector4(&inst->SrcReg[0], machine, coord);
1354             /* coord[3] is the bias to add to lambda */
1355             bias = texUnit->LodBias + coord[3];
1356             if (texUnit->_Current)
1357                bias += texUnit->_Current->LodBias;
1358             machine->FetchTexelLod(ctx, coord, lambda + bias,
1359                                    inst->TexSrcUnit, color);
1360             store_vector4(inst, machine, color);
1361          }
1362          break;
1363       case OPCODE_TXD:         /* GL_NV_fragment_program only */
1364          /* Texture lookup w/ partial derivatives for LOD */
1365          {
1366             GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1367             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1368             fetch_vector4(&inst->SrcReg[1], machine, dtdx);
1369             fetch_vector4(&inst->SrcReg[2], machine, dtdy);
1370             machine->FetchTexelDeriv(ctx, texcoord, dtdx, dtdy,
1371                                      inst->TexSrcUnit, color);
1372             store_vector4(inst, machine, color);
1373          }
1374          break;
1375       case OPCODE_TXP:         /* GL_ARB_fragment_program only */
1376          /* Texture lookup w/ projective divide */
1377          {
1378             GLfloat texcoord[4], color[4], lambda;
1379 #if 0
1380             if (inst->SrcReg[0].File == PROGRAM_INPUT &&
1381                 inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0 + inst->TexSrcUnit)
1382                lambda = span->array->lambda[inst->TexSrcUnit][column];
1383             else
1384 #endif
1385                lambda = 0.0;
1386             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1387             /* Not so sure about this test - if texcoord[3] is
1388              * zero, we'd probably be fine except for an ASSERT in
1389              * IROUND_POS() which gets triggered by the inf values created.
1390              */
1391             if (texcoord[3] != 0.0) {
1392                texcoord[0] /= texcoord[3];
1393                texcoord[1] /= texcoord[3];
1394                texcoord[2] /= texcoord[3];
1395             }
1396             machine->FetchTexelLod(ctx, texcoord, lambda,
1397                                    inst->TexSrcUnit, color);
1398             store_vector4(inst, machine, color);
1399          }
1400          break;
1401       case OPCODE_TXP_NV:      /* GL_NV_fragment_program only */
1402          /* Texture lookup w/ projective divide */
1403          {
1404             GLfloat texcoord[4], color[4], lambda;
1405 #if 0
1406             if (inst->SrcReg[0].File == PROGRAM_INPUT &&
1407                 inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0 + inst->TexSrcUnit)
1408                lambda = span->array->lambda[inst->TexSrcUnit][column];
1409             else
1410 #endif
1411                lambda = 0.0;
1412             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1413             if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1414                 texcoord[3] != 0.0) {
1415                texcoord[0] /= texcoord[3];
1416                texcoord[1] /= texcoord[3];
1417                texcoord[2] /= texcoord[3];
1418             }
1419             machine->FetchTexelLod(ctx, texcoord, lambda,
1420                                    inst->TexSrcUnit, color);
1421             store_vector4(inst, machine, color);
1422          }
1423          break;
1424       case OPCODE_UP2H:        /* unpack two 16-bit floats */
1425          {
1426             GLfloat a[4], result[4];
1427             const GLuint *rawBits = (const GLuint *) a;
1428             GLhalfNV hx, hy;
1429             fetch_vector1(&inst->SrcReg[0], machine, a);
1430             hx = rawBits[0] & 0xffff;
1431             hy = rawBits[0] >> 16;
1432             result[0] = result[2] = _mesa_half_to_float(hx);
1433             result[1] = result[3] = _mesa_half_to_float(hy);
1434             store_vector4(inst, machine, result);
1435          }
1436          break;
1437       case OPCODE_UP2US:       /* unpack two GLushorts */
1438          {
1439             GLfloat a[4], result[4];
1440             const GLuint *rawBits = (const GLuint *) a;
1441             GLushort usx, usy;
1442             fetch_vector1(&inst->SrcReg[0], machine, a);
1443             usx = rawBits[0] & 0xffff;
1444             usy = rawBits[0] >> 16;
1445             result[0] = result[2] = usx * (1.0f / 65535.0f);
1446             result[1] = result[3] = usy * (1.0f / 65535.0f);
1447             store_vector4(inst, machine, result);
1448          }
1449          break;
1450       case OPCODE_UP4B:        /* unpack four GLbytes */
1451          {
1452             GLfloat a[4], result[4];
1453             const GLuint *rawBits = (const GLuint *) a;
1454             fetch_vector1(&inst->SrcReg[0], machine, a);
1455             result[0] = (((rawBits[0] >> 0) & 0xff) - 128) / 127.0F;
1456             result[1] = (((rawBits[0] >> 8) & 0xff) - 128) / 127.0F;
1457             result[2] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1458             result[3] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1459             store_vector4(inst, machine, result);
1460          }
1461          break;
1462       case OPCODE_UP4UB:       /* unpack four GLubytes */
1463          {
1464             GLfloat a[4], result[4];
1465             const GLuint *rawBits = (const GLuint *) a;
1466             fetch_vector1(&inst->SrcReg[0], machine, a);
1467             result[0] = ((rawBits[0] >> 0) & 0xff) / 255.0F;
1468             result[1] = ((rawBits[0] >> 8) & 0xff) / 255.0F;
1469             result[2] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1470             result[3] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1471             store_vector4(inst, machine, result);
1472          }
1473          break;
1474       case OPCODE_XPD:         /* cross product */
1475          {
1476             GLfloat a[4], b[4], result[4];
1477             fetch_vector4(&inst->SrcReg[0], machine, a);
1478             fetch_vector4(&inst->SrcReg[1], machine, b);
1479             result[0] = a[1] * b[2] - a[2] * b[1];
1480             result[1] = a[2] * b[0] - a[0] * b[2];
1481             result[2] = a[0] * b[1] - a[1] * b[0];
1482             result[3] = 1.0;
1483             store_vector4(inst, machine, result);
1484             if (DEBUG_PROG) {
1485                printf("XPD (%g %g %g %g) = (%g %g %g) X (%g %g %g)\n",
1486                       result[0], result[1], result[2], result[3],
1487                       a[0], a[1], a[2], b[0], b[1], b[2]);
1488             }
1489          }
1490          break;
1491       case OPCODE_X2D:         /* 2-D matrix transform */
1492          {
1493             GLfloat a[4], b[4], c[4], result[4];
1494             fetch_vector4(&inst->SrcReg[0], machine, a);
1495             fetch_vector4(&inst->SrcReg[1], machine, b);
1496             fetch_vector4(&inst->SrcReg[2], machine, c);
1497             result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1498             result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1499             result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1500             result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1501             store_vector4(inst, machine, result);
1502          }
1503          break;
1504       case OPCODE_PRINT:
1505          {
1506             if (inst->SrcReg[0].File != -1) {
1507                GLfloat a[4];
1508                fetch_vector4(&inst->SrcReg[0], machine, a);
1509                _mesa_printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
1510                             a[0], a[1], a[2], a[3]);
1511             }
1512             else {
1513                _mesa_printf("%s\n", (const char *) inst->Data);
1514             }
1515          }
1516          break;
1517       case OPCODE_END:
1518          return GL_TRUE;
1519       default:
1520          _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1521                        inst->Opcode);
1522          return GL_TRUE;        /* return value doesn't matter */
1523
1524       }
1525
1526       numExec++;
1527       if (numExec > maxExec) {
1528          _mesa_problem(ctx, "Infinite loop detected in fragment program");
1529          return GL_TRUE;
1530       }
1531
1532    } /* for pc */
1533
1534 #if FEATURE_MESA_program_debug
1535    CurrentMachine = NULL;
1536 #endif
1537
1538    return GL_TRUE;
1539 }