Bug #13492: Only call ProgramStringNotify if program parsing succeeded.

[mesa.git] / src / mesa / shader / prog_execute.c
diff --git a/src/mesa/shader/prog_execute.c b/src/mesa/shader/prog_execute.c

index 562650e8ca9a4d23c0ce911d7a7b60306887d784..00a375078b44e0b59173a87065e4a77b7d65bdb7 100644 (file)
--- a/src/mesa/shader/prog_execute.c
+++ b/src/mesa/shader/prog_execute.c
@@ -1,6 +1,6 @@
  /*
   * Mesa 3-D graphics library
- * Version:  6.5.3
+ * Version:  7.0.3
   *
   * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
   *
@@ -43,12 +43,9 @@
  #include "prog_instruction.h"
  #include "prog_parameter.h"
  #include "prog_print.h"
-#include "slang_library_noise.h"
+#include "shader/slang/slang_library_noise.h"
  
  
-/* See comments below for info about this */
-#define LAMBDA_ZERO 1
-
  /* debug predicate */
  #define DEBUG_PROG 0
  
@@ -79,22 +76,25 @@ static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
   * source register.
   */
  static INLINE const GLfloat *
-get_register_pointer(GLcontext * ctx,
-                     const struct prog_src_register *source,
+get_register_pointer(const struct prog_src_register *source,
                       const struct gl_program_machine *machine)
  {
     if (source->RelAddr) {
        const GLint reg = source->Index + machine->AddressReg[0][0];
-      ASSERT( (source->File == PROGRAM_ENV_PARAM) || 
-        (source->File == PROGRAM_STATE_VAR) );
-      if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
-         return ZeroVec;
-      else if (source->File == PROGRAM_ENV_PARAM)
-         return ctx->VertexProgram.Parameters[reg];
+      if (source->File == PROGRAM_ENV_PARAM)
+         if (reg < 0 || reg >= MAX_PROGRAM_ENV_PARAMS)
+            return ZeroVec;
+         else
+            return machine->EnvParams[reg];
        else {
+         const struct gl_program_parameter_list *params;
           ASSERT(source->File == PROGRAM_LOCAL_PARAM ||
                  source->File == PROGRAM_STATE_VAR);
-         return machine->CurProgram->Parameters->ParameterValues[reg];
+         params = machine->CurProgram->Parameters;
+         if (reg < 0 || reg >= params->NumParameters)
+            return ZeroVec;
+         else
+            return params->ParameterValues[reg];
        }
     }
  
@@ -123,10 +123,7 @@ get_register_pointer(GLcontext * ctx,
  
     case PROGRAM_ENV_PARAM:
        ASSERT(source->Index < MAX_PROGRAM_ENV_PARAMS);
-      if (machine->CurProgram->Target == GL_VERTEX_PROGRAM_ARB)
-         return ctx->VertexProgram.Parameters[source->Index];
-      else
-         return ctx->FragmentProgram.Parameters[source->Index];
+      return machine->EnvParams[source->Index];
  
     case PROGRAM_STATE_VAR:
        /* Fallthrough */
@@ -140,7 +137,7 @@ get_register_pointer(GLcontext * ctx,
        return machine->CurProgram->Parameters->ParameterValues[source->Index];
  
     default:
-      _mesa_problem(ctx,
+      _mesa_problem(NULL,
                      "Invalid input register file %d in get_register_pointer()",
                      source->File);
        return NULL;
@@ -165,7 +162,7 @@ _mesa_get_program_register(GLcontext *ctx, enum register_file file,
        const GLfloat *reg;
        src.File = file;
        src.Index = index;
-      reg = get_register_pointer(ctx, &src, CurrentMachine);
+      reg = get_register_pointer(&src, CurrentMachine);
        COPY_4V(val, reg);
     }
  }
@@ -177,11 +174,10 @@ _mesa_get_program_register(GLcontext *ctx, enum register_file file,
   * Apply swizzling and negating as needed.
   */
  static void
-fetch_vector4(GLcontext * ctx,
-              const struct prog_src_register *source,
+fetch_vector4(const struct prog_src_register *source,
                const struct gl_program_machine *machine, GLfloat result[4])
  {
-   const GLfloat *src = get_register_pointer(ctx, source, machine);
+   const GLfloat *src = get_register_pointer(source, machine);
     ASSERT(src);
  
     if (source->Swizzle == SWIZZLE_NOOP) {
@@ -219,131 +215,74 @@ fetch_vector4(GLcontext * ctx,
     }
  }
  
-#if 0
+
  /**
- * Fetch the derivative with respect to X for the given register.
- * \return GL_TRUE if it was easily computed or GL_FALSE if we
- * need to execute another instance of the program (ugh)!
+ * Fetch the X or Y derivative (selected by xOrY) of the given register.
+ * XXX only valid for fragment inputs below NumDeriv; others yield zero.
   */
-static GLboolean
+static void
  fetch_vector4_deriv(GLcontext * ctx,
                      const struct prog_src_register *source,
-                    const SWspan * span,
-                    char xOrY, GLint column, GLfloat result[4])
+                    const struct gl_program_machine *machine,
+                    char xOrY, GLfloat result[4])
  {
-   GLfloat src[4];
-
-   ASSERT(xOrY == 'X' || xOrY == 'Y');
+   if (source->File == PROGRAM_INPUT && source->Index < machine->NumDeriv) {
+      const GLint col = machine->CurElement;
+      const GLfloat w = machine->Attribs[FRAG_ATTRIB_WPOS][col][3];
+      const GLfloat invQ = 1.0f / w;
+      GLfloat deriv[4];
  
-   switch (source->Index) {
-   case FRAG_ATTRIB_WPOS:
        if (xOrY == 'X') {
-         src[0] = 1.0;
-         src[1] = 0.0;
-         src[2] = span->attrStepX[FRAG_ATTRIB_WPOS][2]
-            / ctx->DrawBuffer->_DepthMaxF;
-         src[3] = span->attrStepX[FRAG_ATTRIB_WPOS][3];
+         deriv[0] = machine->DerivX[source->Index][0] * invQ;
+         deriv[1] = machine->DerivX[source->Index][1] * invQ;
+         deriv[2] = machine->DerivX[source->Index][2] * invQ;
+         deriv[3] = machine->DerivX[source->Index][3] * invQ;
        }
        else {
-         src[0] = 0.0;
-         src[1] = 1.0;
-         src[2] = span->attrStepY[FRAG_ATTRIB_WPOS][2]
-            / ctx->DrawBuffer->_DepthMaxF;
-         src[3] = span->attrStepY[FRAG_ATTRIB_WPOS][3];
-      }
-      break;
-   case FRAG_ATTRIB_COL0:
-   case FRAG_ATTRIB_COL1:
-      if (xOrY == 'X') {
-         src[0] = span->attrStepX[source->Index][0] * (1.0F / CHAN_MAXF);
-         src[1] = span->attrStepX[source->Index][1] * (1.0F / CHAN_MAXF);
-         src[2] = span->attrStepX[source->Index][2] * (1.0F / CHAN_MAXF);
-         src[3] = span->attrStepX[source->Index][3] * (1.0F / CHAN_MAXF);
-      }
-      else {
-         src[0] = span->attrStepY[source->Index][0] * (1.0F / CHAN_MAXF);
-         src[1] = span->attrStepY[source->Index][1] * (1.0F / CHAN_MAXF);
-         src[2] = span->attrStepY[source->Index][2] * (1.0F / CHAN_MAXF);
-         src[3] = span->attrStepY[source->Index][3] * (1.0F / CHAN_MAXF);
-      }
-      break;
-   case FRAG_ATTRIB_FOGC:
-      if (xOrY == 'X') {
-         src[0] = span->attrStepX[FRAG_ATTRIB_FOGC][0] * (1.0F / CHAN_MAXF);
-         src[1] = 0.0;
-         src[2] = 0.0;
-         src[3] = 0.0;
+         deriv[0] = machine->DerivY[source->Index][0] * invQ;
+         deriv[1] = machine->DerivY[source->Index][1] * invQ;
+         deriv[2] = machine->DerivY[source->Index][2] * invQ;
+         deriv[3] = machine->DerivY[source->Index][3] * invQ;
        }
-      else {
-         src[0] = span->attrStepY[FRAG_ATTRIB_FOGC][0] * (1.0F / CHAN_MAXF);
-         src[1] = 0.0;
-         src[2] = 0.0;
-         src[3] = 0.0;
+
+      result[0] = deriv[GET_SWZ(source->Swizzle, 0)];
+      result[1] = deriv[GET_SWZ(source->Swizzle, 1)];
+      result[2] = deriv[GET_SWZ(source->Swizzle, 2)];
+      result[3] = deriv[GET_SWZ(source->Swizzle, 3)];
+      
+      if (source->NegateBase) {
+         result[0] = -result[0];
+         result[1] = -result[1];
+         result[2] = -result[2];
+         result[3] = -result[3];
        }
-      break;
-   default:
-      assert(source->Index < FRAG_ATTRIB_MAX);
-      /* texcoord or varying */
-      if (xOrY == 'X') {
-         /* this is a little tricky - I think I've got it right */
-         const GLfloat invQ = 1.0f / (span->attrStart[source->Index][3]
-                                      +
-                                      span->attrStepX[source->Index][3] *
-                                      column);
-         src[0] = span->attrStepX[source->Index][0] * invQ;
-         src[1] = span->attrStepX[source->Index][1] * invQ;
-         src[2] = span->attrStepX[source->Index][2] * invQ;
-         src[3] = span->attrStepX[source->Index][3] * invQ;
+      if (source->Abs) {
+         result[0] = FABSF(result[0]);
+         result[1] = FABSF(result[1]);
+         result[2] = FABSF(result[2]);
+         result[3] = FABSF(result[3]);
        }
-      else {
-         /* Tricky, as above, but in Y direction */
-         const GLfloat invQ = 1.0f / (span->attrStart[source->Index][3]
-                                      + span->attrStepY[source->Index][3]);
-         src[0] = span->attrStepY[source->Index][0] * invQ;
-         src[1] = span->attrStepY[source->Index][1] * invQ;
-         src[2] = span->attrStepY[source->Index][2] * invQ;
-         src[3] = span->attrStepY[source->Index][3] * invQ;
+      if (source->NegateAbs) {
+         result[0] = -result[0];
+         result[1] = -result[1];
+         result[2] = -result[2];
+         result[3] = -result[3];
        }
-      break;
     }
-
-   result[0] = src[GET_SWZ(source->Swizzle, 0)];
-   result[1] = src[GET_SWZ(source->Swizzle, 1)];
-   result[2] = src[GET_SWZ(source->Swizzle, 2)];
-   result[3] = src[GET_SWZ(source->Swizzle, 3)];
-
-   if (source->NegateBase) {
-      result[0] = -result[0];
-      result[1] = -result[1];
-      result[2] = -result[2];
-      result[3] = -result[3];
-   }
-   if (source->Abs) {
-      result[0] = FABSF(result[0]);
-      result[1] = FABSF(result[1]);
-      result[2] = FABSF(result[2]);
-      result[3] = FABSF(result[3]);
-   }
-   if (source->NegateAbs) {
-      result[0] = -result[0];
-      result[1] = -result[1];
-      result[2] = -result[2];
-      result[3] = -result[3];
+   else {
+      ASSIGN_4V(result, 0.0, 0.0, 0.0, 0.0);
     }
-   return GL_TRUE;
  }
-#endif
  
  
  /**
   * As above, but only return result[0] element.
   */
  static void
-fetch_vector1(GLcontext * ctx,
-              const struct prog_src_register *source,
+fetch_vector1(const struct prog_src_register *source,
                const struct gl_program_machine *machine, GLfloat result[4])
  {
-   const GLfloat *src = get_register_pointer(ctx, source, machine);
+   const GLfloat *src = get_register_pointer(source, machine);
     ASSERT(src);
  
     result[0] = src[GET_SWZ(source->Swizzle, 0)];
@@ -360,6 +299,36 @@ fetch_vector1(GLcontext * ctx,
  }
  
  
+/** Fetch a texel for texture instruction 'inst' (result presumably in color[4]).
+ * Samples with the attribute's X/Y partial derivatives through
+ * FetchTexelDeriv() when they are usable, otherwise FetchTexelLod(). */
+static INLINE void
+fetch_texel(GLcontext *ctx,
+            const struct gl_program_machine *machine,
+            const struct prog_instruction *inst,
+            const GLfloat texcoord[4], GLfloat lodBias,
+            GLfloat color[4])
+{
+   /* Note: we only have the right derivatives for fragment input attribs,
+    * so require the coordinate source to be this unit's own texcoord input. */
+   if (machine->NumDeriv > 0 &&
+       inst->SrcReg[0].File == PROGRAM_INPUT &&
+       inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0 + inst->TexSrcUnit) {
+      /* simple texture fetch for which we should have derivatives */
+      GLuint attr = inst->SrcReg[0].Index;  /* row in DerivX / DerivY */
+      machine->FetchTexelDeriv(ctx, texcoord,
+                               machine->DerivX[attr],
+                               machine->DerivY[attr],
+                               lodBias,
+                               inst->TexSrcUnit, color);
+   }
+   else {  /* derivatives not usable: fall back to FetchTexelLod() */
+      machine->FetchTexelLod(ctx, texcoord, lodBias,
+                             inst->TexSrcUnit, color);
+   }
+}
+
+
  /**
   * Test value against zero and return GT, LT, EQ or UN if NaN.
   */
@@ -510,116 +479,23 @@ store_vector4(const struct prog_instruction *inst,
           machine->CondCodes[2] = generate_cc(value[2]);
        if (writeMask & WRITEMASK_W)
           machine->CondCodes[3] = generate_cc(value[3]);
+#if DEBUG_PROG
+      printf("CondCodes=(%s,%s,%s,%s) for:\n",
+             _mesa_condcode_string(machine->CondCodes[0]),
+             _mesa_condcode_string(machine->CondCodes[1]),
+             _mesa_condcode_string(machine->CondCodes[2]),
+             _mesa_condcode_string(machine->CondCodes[3]));
+#endif
     }
  }
  
  
-#if 0
-/**
- * Initialize a new machine state instance from an existing one, adding
- * the partial derivatives onto the input registers.
- * Used to implement DDX and DDY instructions in non-trivial cases.
- */
-static void
-init_machine_deriv(GLcontext * ctx,
-                   const struct gl_program_machine *machine,
-                   const struct gl_fragment_program *program,
-                   const SWspan * span, char xOrY,
-                   struct gl_program_machine *dMachine)
-{
-   GLuint attr;
-
-   ASSERT(xOrY == 'X' || xOrY == 'Y');
-
-   /* copy existing machine */
-   _mesa_memcpy(dMachine, machine, sizeof(struct gl_program_machine));
-
-   if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
-      /* XXX also need to do this when using valgrind */
-      /* Clear temporary registers (undefined for ARB_f_p) */
-      _mesa_bzero((void *) machine->Temporaries,
-                  MAX_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
-   }
-
-   /* Add derivatives */
-   if (program->Base.InputsRead & FRAG_BIT_WPOS) {
-      GLfloat *wpos = machine->Attribs[FRAG_ATTRIB_WPOS][machine->CurElement];
-      if (xOrY == 'X') {
-         wpos[0] += 1.0F;
-         wpos[1] += 0.0F;
-         wpos[2] += span->attrStepX[FRAG_ATTRIB_WPOS][2];
-         wpos[3] += span->attrStepX[FRAG_ATTRIB_WPOS][3];
-      }
-      else {
-         wpos[0] += 0.0F;
-         wpos[1] += 1.0F;
-         wpos[2] += span->attrStepY[FRAG_ATTRIB_WPOS][2];
-         wpos[3] += span->attrStepY[FRAG_ATTRIB_WPOS][3];
-      }
-   }
-
-   /* primary, secondary colors */
-   for (attr = FRAG_ATTRIB_COL0; attr <= FRAG_ATTRIB_COL1; attr++) {
-      if (program->Base.InputsRead & (1 << attr)) {
-         GLfloat *col = machine->Attribs[attr][machine->CurElement];
-         if (xOrY == 'X') {
-            col[0] += span->attrStepX[attr][0] * (1.0F / CHAN_MAXF);
-            col[1] += span->attrStepX[attr][1] * (1.0F / CHAN_MAXF);
-            col[2] += span->attrStepX[attr][2] * (1.0F / CHAN_MAXF);
-            col[3] += span->attrStepX[attr][3] * (1.0F / CHAN_MAXF);
-         }
-         else {
-            col[0] += span->attrStepY[attr][0] * (1.0F / CHAN_MAXF);
-            col[1] += span->attrStepY[attr][1] * (1.0F / CHAN_MAXF);
-            col[2] += span->attrStepY[attr][2] * (1.0F / CHAN_MAXF);
-            col[3] += span->attrStepY[attr][3] * (1.0F / CHAN_MAXF);
-         }
-      }
-   }
-   if (program->Base.InputsRead & FRAG_BIT_FOGC) {
-      GLfloat *fogc = machine->Attribs[FRAG_ATTRIB_FOGC][machine->CurElement];
-      if (xOrY == 'X') {
-         fogc[0] += span->attrStepX[FRAG_ATTRIB_FOGC][0];
-      }
-      else {
-         fogc[0] += span->attrStepY[FRAG_ATTRIB_FOGC][0];
-      }
-   }
-   /* texcoord and varying vars */
-   for (attr = FRAG_ATTRIB_TEX0; attr < FRAG_ATTRIB_MAX; attr++) {
-      if (program->Base.InputsRead & (1 << attr)) {
-         GLfloat *val = machine->Attribs[attr][machine->CurElement];
-         /* XXX perspective-correct interpolation */
-         if (xOrY == 'X') {
-            val[0] += span->attrStepX[attr][0];
-            val[1] += span->attrStepX[attr][1];
-            val[2] += span->attrStepX[attr][2];
-            val[3] += span->attrStepX[attr][3];
-         }
-         else {
-            val[0] += span->attrStepY[attr][0];
-            val[1] += span->attrStepY[attr][1];
-            val[2] += span->attrStepY[attr][2];
-            val[3] += span->attrStepY[attr][3];
-         }
-      }
-   }
-
-   /* init condition codes */
-   dMachine->CondCodes[0] = COND_EQ;
-   dMachine->CondCodes[1] = COND_EQ;
-   dMachine->CondCodes[2] = COND_EQ;
-   dMachine->CondCodes[3] = COND_EQ;
-}
-#endif
-
-
  /**
   * Execute the given vertex/fragment program.
   *
- * \param ctx - rendering context
- * \param program - the fragment program to execute
- * \param machine - machine state (register file)
+ * \param ctx  rendering context
+ * \param program  the program to execute
+ * \param machine  machine state (must be initialized)
   * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
   */
  GLboolean
@@ -641,6 +517,13 @@ _mesa_execute_program(GLcontext * ctx,
     CurrentMachine = machine;
  #endif
  
+   if (program->Target == GL_VERTEX_PROGRAM_ARB) {
+      machine->EnvParams = ctx->VertexProgram.Parameters;
+   }
+   else {
+      machine->EnvParams = ctx->FragmentProgram.Parameters;
+   }
+
     for (pc = 0; pc < numInst; pc++) {
        const struct prog_instruction *inst = program->Instructions + pc;
  
@@ -661,7 +544,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_ABS:
           {
              GLfloat a[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              result[0] = FABSF(a[0]);
              result[1] = FABSF(a[1]);
              result[2] = FABSF(a[2]);
@@ -672,8 +555,8 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_ADD:
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = a[0] + b[0];
              result[1] = a[1] + b[1];
              result[2] = a[2] + b[2];
@@ -689,7 +572,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_ARL:
           {
              GLfloat t[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, t);
+            fetch_vector4(&inst->SrcReg[0], machine, t);
              machine->AddressReg[0][0] = (GLint) FLOORF(t[0]);
           }
           break;
@@ -721,16 +604,17 @@ _mesa_execute_program(GLcontext * ctx,
              if (machine->StackDepth >= MAX_PROGRAM_CALL_DEPTH) {
                 return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
              }
-            machine->CallStack[machine->StackDepth++] = pc + 1;
-            pc = inst->BranchTarget;    /* XXX - 1 ??? */
+            machine->CallStack[machine->StackDepth++] = pc + 1; /* next inst */
+            /* Subtract 1 here since we'll do pc++ at end of for-loop */
+            pc = inst->BranchTarget - 1;
           }
           break;
        case OPCODE_CMP:
           {
              GLfloat a[4], b[4], c[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
-            fetch_vector4(ctx, &inst->SrcReg[2], machine, c);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[2], machine, c);
              result[0] = a[0] < 0.0F ? b[0] : c[0];
              result[1] = a[1] < 0.0F ? b[1] : c[1];
              result[2] = a[2] < 0.0F ? b[2] : c[2];
@@ -741,7 +625,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_COS:
           {
              GLfloat a[4], result[4];
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
              result[0] = result[1] = result[2] = result[3]
                 = (GLfloat) _mesa_cos(a[0]);
              store_vector4(inst, machine, result);
@@ -749,64 +633,25 @@ _mesa_execute_program(GLcontext * ctx,
           break;
        case OPCODE_DDX:         /* Partial derivative with respect to X */
           {
-#if 0
-            GLfloat a[4], aNext[4], result[4];
-            struct gl_program_machine dMachine;
-            if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'X',
-                                     column, result)) {
-               /* This is tricky.  Make a copy of the current machine state,
-                * increment the input registers by the dx or dy partial
-                * derivatives, then re-execute the program up to the
-                * preceeding instruction, then fetch the source register.
-                * Finally, find the difference in the register values for
-                * the original and derivative runs.
-                */
-               fetch_vector4(ctx, &inst->SrcReg[0], machine, program, a);
-               init_machine_deriv(ctx, machine, program, span,
-                                  'X', &dMachine);
-               execute_program(ctx, program, pc, &dMachine, span, column);
-               fetch_vector4(ctx, &inst->SrcReg[0], &dMachine, program,
-                             aNext);
-               result[0] = aNext[0] - a[0];
-               result[1] = aNext[1] - a[1];
-               result[2] = aNext[2] - a[2];
-               result[3] = aNext[3] - a[3];
-            }
+            GLfloat result[4];
+            fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
+                                'X', result);
              store_vector4(inst, machine, result);
-#else
-            store_vector4(inst, machine, ZeroVec);
-#endif
           }
           break;
        case OPCODE_DDY:         /* Partial derivative with respect to Y */
           {
-#if 0
-            GLfloat a[4], aNext[4], result[4];
-            struct gl_program_machine dMachine;
-            if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'Y',
-                                     column, result)) {
-               init_machine_deriv(ctx, machine, program, span,
-                                  'Y', &dMachine);
-               fetch_vector4(ctx, &inst->SrcReg[0], machine, program, a);
-               execute_program(ctx, program, pc, &dMachine, span, column);
-               fetch_vector4(ctx, &inst->SrcReg[0], &dMachine, program,
-                             aNext);
-               result[0] = aNext[0] - a[0];
-               result[1] = aNext[1] - a[1];
-               result[2] = aNext[2] - a[2];
-               result[3] = aNext[3] - a[3];
-            }
+            GLfloat result[4];
+            fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
+                                'Y', result);
              store_vector4(inst, machine, result);
-#else
-            store_vector4(inst, machine, ZeroVec);
-#endif
           }
           break;
        case OPCODE_DP3:
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
              store_vector4(inst, machine, result);
              if (DEBUG_PROG) {
@@ -818,8 +663,8 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_DP4:
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = result[1] = result[2] = result[3] = DOT4(a, b);
              store_vector4(inst, machine, result);
              if (DEBUG_PROG) {
@@ -832,8 +677,8 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_DPH:
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = result[1] = result[2] = result[3] =
                 a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
              store_vector4(inst, machine, result);
@@ -842,8 +687,8 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_DST:         /* Distance vector */
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = 1.0F;
              result[1] = a[1] * b[1];
              result[2] = a[2];
@@ -854,7 +699,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_EXP:
           {
              GLfloat t[4], q[4], floor_t0;
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, t);
+            fetch_vector1(&inst->SrcReg[0], machine, t);
              floor_t0 = FLOORF(t[0]);
              if (floor_t0 > FLT_MAX_EXP) {
                 SET_POS_INFINITY(q[0]);
@@ -880,7 +725,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_EX2:         /* Exponential base 2 */
           {
              GLfloat a[4], result[4];
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
              result[0] = result[1] = result[2] = result[3] =
                 (GLfloat) _mesa_pow(2.0, a[0]);
              store_vector4(inst, machine, result);
@@ -889,7 +734,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_FLR:
           {
              GLfloat a[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              result[0] = FLOORF(a[0]);
              result[1] = FLOORF(a[1]);
              result[2] = FLOORF(a[2]);
@@ -900,7 +745,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_FRC:
           {
              GLfloat a[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              result[0] = a[0] - FLOORF(a[0]);
              result[1] = a[1] - FLOORF(a[1]);
              result[2] = a[2] - FLOORF(a[2]);
@@ -909,13 +754,29 @@ _mesa_execute_program(GLcontext * ctx,
           }
           break;
        case OPCODE_IF:
-         if (eval_condition(machine, inst)) {
-            /* do if-clause (just continue execution) */
-         }
-         else {
-            /* go to the instruction after ELSE or ENDIF */
-            assert(inst->BranchTarget >= 0);
-            pc = inst->BranchTarget - 1;
+         {
+            GLboolean cond;
+            /* eval condition */
+            if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
+               GLfloat a[4];
+               fetch_vector1(&inst->SrcReg[0], machine, a);
+               cond = (a[0] != 0.0);
+            }
+            else {
+               cond = eval_condition(machine, inst);
+            }
+            if (DEBUG_PROG) {
+               printf("IF: %d\n", cond);
+            }
+            /* do if/else */
+            if (cond) {
+               /* do if-clause (just continue execution) */
+            }
+            else {
+               /* go to the instruction after ELSE or ENDIF */
+               assert(inst->BranchTarget >= 0);
+               pc = inst->BranchTarget - 1;
+            }
           }
           break;
        case OPCODE_ELSE:
@@ -929,7 +790,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_INT:         /* float to int */
           {
              GLfloat a[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              result[0] = (GLfloat) (GLint) a[0];
              result[1] = (GLfloat) (GLint) a[1];
              result[2] = (GLfloat) (GLint) a[2];
@@ -945,7 +806,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_KIL:         /* ARB_f_p only */
           {
              GLfloat a[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
                 return GL_FALSE;
              }
@@ -954,7 +815,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_LG2:         /* log base 2 */
           {
              GLfloat a[4], result[4];
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
              result[0] = result[1] = result[2] = result[3] = LOG2(a[0]);
              store_vector4(inst, machine, result);
           }
@@ -963,7 +824,7 @@ _mesa_execute_program(GLcontext * ctx,
           {
              const GLfloat epsilon = 1.0F / 256.0F;      /* from NV VP spec */
              GLfloat a[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              a[0] = MAX2(a[0], 0.0F);
              a[1] = MAX2(a[1], 0.0F);
              /* XXX ARB version clamps a[3], NV version doesn't */
@@ -992,7 +853,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_LOG:
           {
              GLfloat t[4], q[4], abs_t0;
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, t);
+            fetch_vector1(&inst->SrcReg[0], machine, t);
              abs_t0 = FABSF(t[0]);
              if (abs_t0 != 0.0F) {
                 /* Since we really can't handle infinite values on VMS
@@ -1029,9 +890,9 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_LRP:
           {
              GLfloat a[4], b[4], c[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
-            fetch_vector4(ctx, &inst->SrcReg[2], machine, c);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[2], machine, c);
              result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
              result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
              result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
@@ -1049,9 +910,9 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_MAD:
           {
              GLfloat a[4], b[4], c[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
-            fetch_vector4(ctx, &inst->SrcReg[2], machine, c);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[2], machine, c);
              result[0] = a[0] * b[0] + c[0];
              result[1] = a[1] * b[1] + c[1];
              result[2] = a[2] * b[2] + c[2];
@@ -1069,8 +930,8 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_MAX:
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = MAX2(a[0], b[0]);
              result[1] = MAX2(a[1], b[1]);
              result[2] = MAX2(a[2], b[2]);
@@ -1086,8 +947,8 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_MIN:
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = MIN2(a[0], b[0]);
              result[1] = MIN2(a[1], b[1]);
              result[2] = MIN2(a[2], b[2]);
@@ -1098,7 +959,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_MOV:
           {
              GLfloat result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, result);
+            fetch_vector4(&inst->SrcReg[0], machine, result);
              store_vector4(inst, machine, result);
              if (DEBUG_PROG) {
                 printf("MOV (%g %g %g %g)\n",
@@ -1109,8 +970,8 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_MUL:
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = a[0] * b[0];
              result[1] = a[1] * b[1];
              result[2] = a[2] * b[2];
@@ -1126,7 +987,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_NOISE1:
           {
              GLfloat a[4], result[4];
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
              result[0] =
                 result[1] =
                 result[2] = result[3] = _slang_library_noise1(a[0]);
@@ -1136,7 +997,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_NOISE2:
           {
              GLfloat a[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              result[0] =
                 result[1] =
                 result[2] = result[3] = _slang_library_noise2(a[0], a[1]);
@@ -1146,7 +1007,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_NOISE3:
           {
              GLfloat a[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              result[0] =
                 result[1] =
                 result[2] =
@@ -1157,7 +1018,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_NOISE4:
           {
              GLfloat a[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              result[0] =
                 result[1] =
                 result[2] =
@@ -1173,7 +1034,7 @@ _mesa_execute_program(GLcontext * ctx,
              GLhalfNV hx, hy;
              GLuint *rawResult = (GLuint *) result;
              GLuint twoHalves;
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              hx = _mesa_float_to_half(a[0]);
              hy = _mesa_float_to_half(a[1]);
              twoHalves = hx | (hy << 16);
@@ -1186,7 +1047,7 @@ _mesa_execute_program(GLcontext * ctx,
           {
              GLfloat a[4], result[4];
              GLuint usx, usy, *rawResult = (GLuint *) result;
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              a[0] = CLAMP(a[0], 0.0F, 1.0F);
              a[1] = CLAMP(a[1], 0.0F, 1.0F);
              usx = IROUND(a[0] * 65535.0F);
@@ -1200,7 +1061,7 @@ _mesa_execute_program(GLcontext * ctx,
           {
              GLfloat a[4], result[4];
              GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
              a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
              a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
@@ -1218,7 +1079,7 @@ _mesa_execute_program(GLcontext * ctx,
           {
              GLfloat a[4], result[4];
              GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
              a[0] = CLAMP(a[0], 0.0F, 1.0F);
              a[1] = CLAMP(a[1], 0.0F, 1.0F);
              a[2] = CLAMP(a[2], 0.0F, 1.0F);
@@ -1235,8 +1096,8 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_POW:
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector1(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[1], machine, b);
              result[0] = result[1] = result[2] = result[3]
                 = (GLfloat) _mesa_pow(a[0], b[0]);
              store_vector4(inst, machine, result);
@@ -1245,7 +1106,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_RCP:
           {
              GLfloat a[4], result[4];
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
              if (DEBUG_PROG) {
                 if (a[0] == 0)
                    printf("RCP(0)\n");
@@ -1261,14 +1122,15 @@ _mesa_execute_program(GLcontext * ctx,
              if (machine->StackDepth == 0) {
                 return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
              }
-            pc = machine->CallStack[--machine->StackDepth];
+            /* subtract one because of pc++ in the for loop */
+            pc = machine->CallStack[--machine->StackDepth] - 1;
           }
           break;
        case OPCODE_RFL:         /* reflection vector */
           {
              GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, axis);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, dir);
+            fetch_vector4(&inst->SrcReg[0], machine, axis);
+            fetch_vector4(&inst->SrcReg[1], machine, dir);
              tmpW = DOT3(axis, axis);
              tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
              result[0] = tmpX * axis[0] - dir[0];
@@ -1281,7 +1143,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_RSQ:         /* 1 / sqrt() */
           {
              GLfloat a[4], result[4];
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
              a[0] = FABSF(a[0]);
              result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
              store_vector4(inst, machine, result);
@@ -1293,7 +1155,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_SCS:         /* sine and cos */
           {
              GLfloat a[4], result[4];
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
              result[0] = (GLfloat) _mesa_cos(a[0]);
              result[1] = (GLfloat) _mesa_sin(a[0]);
              result[2] = 0.0;    /* undefined! */
@@ -1304,13 +1166,19 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_SEQ:         /* set on equal */
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
              result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
              result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
              result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
              store_vector4(inst, machine, result);
+            if (DEBUG_PROG) {
+               printf("SEQ (%g %g %g %g) = (%g %g %g %g) == (%g %g %g %g)\n",
+                      result[0], result[1], result[2], result[3],
+                      a[0], a[1], a[2], a[3],
+                      b[0], b[1], b[2], b[3]);
+            }
           }
           break;
        case OPCODE_SFL:         /* set false, operands ignored */
@@ -1322,35 +1190,43 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_SGE:         /* set on greater or equal */
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
              result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
              result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
              result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
              store_vector4(inst, machine, result);
+            if (DEBUG_PROG) {
+               printf("SGE (%g %g %g %g) = (%g %g %g %g) >= (%g %g %g %g)\n",
+                      result[0], result[1], result[2], result[3],
+                      a[0], a[1], a[2], a[3],
+                      b[0], b[1], b[2], b[3]);
+            }
           }
           break;
        case OPCODE_SGT:         /* set on greater */
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
              result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
              result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
              result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
              store_vector4(inst, machine, result);
              if (DEBUG_PROG) {
-               printf("SGT %g %g %g %g\n",
-                      result[0], result[1], result[2], result[3]);
+               printf("SGT (%g %g %g %g) = (%g %g %g %g) > (%g %g %g %g)\n",
+                      result[0], result[1], result[2], result[3],
+                      a[0], a[1], a[2], a[3],
+                      b[0], b[1], b[2], b[3]);
              }
           }
           break;
        case OPCODE_SIN:
           {
              GLfloat a[4], result[4];
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
              result[0] = result[1] = result[2] = result[3]
                 = (GLfloat) _mesa_sin(a[0]);
              store_vector4(inst, machine, result);
@@ -1359,37 +1235,55 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_SLE:         /* set on less or equal */
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
              result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
              result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
              result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
              store_vector4(inst, machine, result);
+            if (DEBUG_PROG) {
+               printf("SLE (%g %g %g %g) = (%g %g %g %g) <= (%g %g %g %g)\n",
+                      result[0], result[1], result[2], result[3],
+                      a[0], a[1], a[2], a[3],
+                      b[0], b[1], b[2], b[3]);
+            }
           }
           break;
        case OPCODE_SLT:         /* set on less */
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
              result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
              result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
              result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
              store_vector4(inst, machine, result);
+            if (DEBUG_PROG) {
+               printf("SLT (%g %g %g %g) = (%g %g %g %g) < (%g %g %g %g)\n",
+                      result[0], result[1], result[2], result[3],
+                      a[0], a[1], a[2], a[3],
+                      b[0], b[1], b[2], b[3]);
+            }
           }
           break;
        case OPCODE_SNE:         /* set on not equal */
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
              result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
              result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
              result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
              store_vector4(inst, machine, result);
+            if (DEBUG_PROG) {
+               printf("SNE (%g %g %g %g) = (%g %g %g %g) != (%g %g %g %g)\n",
+                      result[0], result[1], result[2], result[3],
+                      a[0], a[1], a[2], a[3],
+                      b[0], b[1], b[2], b[3]);
+            }
           }
           break;
        case OPCODE_STR:         /* set true, operands ignored */
@@ -1401,8 +1295,8 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_SUB:
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = a[0] - b[0];
              result[1] = a[1] - b[1];
              result[2] = a[2] - b[2];
@@ -1418,7 +1312,7 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_SWZ:         /* extended swizzle */
           {
              const struct prog_src_register *source = &inst->SrcReg[0];
-            const GLfloat *src = get_register_pointer(ctx, source, machine);
+            const GLfloat *src = get_register_pointer(source, machine);
              GLfloat result[4];
              GLuint i;
              for (i = 0; i < 4; i++) {
@@ -1439,33 +1333,18 @@ _mesa_execute_program(GLcontext * ctx,
           }
           break;
        case OPCODE_TEX:         /* Both ARB and NV frag prog */
-         /* Texel lookup */
+         /* Simple texel lookup */
           {
-            /* Note: only use the precomputed lambda value when we're
-             * sampling texture unit [K] with texcoord[K].
-             * Otherwise, the lambda value may have no relation to the
-             * instruction's texcoord or texture image.  Using the wrong
-             * lambda is usually bad news.
-             * The rest of the time, just use zero (until we get a more
-             * sophisticated way of computing lambda).
-             */
-            GLfloat coord[4], color[4], lambda;
-#if 0
-            if (inst->SrcReg[0].File == PROGRAM_INPUT &&
-                inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0 + inst->TexSrcUnit)
-               lambda = span->array->lambda[inst->TexSrcUnit][column];
-            else
-#endif
-               lambda = 0.0;
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, coord);
-            machine->FetchTexelLod(ctx, coord, lambda, inst->TexSrcUnit,
-                                   color);
+            GLfloat texcoord[4], color[4];
+            fetch_vector4(&inst->SrcReg[0], machine, texcoord);
+
+            fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
+
              if (DEBUG_PROG) {
-               printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g], "
-                      "lod %f\n",
+               printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g]\n",
                        color[0], color[1], color[2], color[3],
                        inst->TexSrcUnit,
-                      coord[0], coord[1], coord[2], coord[3], lambda);
+                      texcoord[0], texcoord[1], texcoord[2], texcoord[3]);
              }
              store_vector4(inst, machine, color);
           }
@@ -1475,21 +1354,18 @@ _mesa_execute_program(GLcontext * ctx,
           {
              const struct gl_texture_unit *texUnit
                 = &ctx->Texture.Unit[inst->TexSrcUnit];
-            GLfloat coord[4], color[4], lambda, bias;
-#if 0
-            if (inst->SrcReg[0].File == PROGRAM_INPUT &&
-                inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0 + inst->TexSrcUnit)
-               lambda = span->array->lambda[inst->TexSrcUnit][column];
-            else
-#endif
-               lambda = 0.0;
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, coord);
-            /* coord[3] is the bias to add to lambda */
-            bias = texUnit->LodBias + coord[3];
-            if (texUnit->_Current)
-               bias += texUnit->_Current->LodBias;
-            machine->FetchTexelLod(ctx, coord, lambda + bias,
-                                   inst->TexSrcUnit, color);
+            GLfloat texcoord[4], color[4], lodBias;
+
+            fetch_vector4(&inst->SrcReg[0], machine, texcoord);
+
+            /* texcoord[3] is the bias to add to lambda */
+            lodBias = texUnit->LodBias + texcoord[3];
+            if (texUnit->_Current) {
+               lodBias += texUnit->_Current->LodBias;
+            }
+
+            fetch_texel(ctx, machine, inst, texcoord, lodBias, color);
+
              store_vector4(inst, machine, color);
           }
           break;
@@ -1497,10 +1373,11 @@ _mesa_execute_program(GLcontext * ctx,
           /* Texture lookup w/ partial derivatives for LOD */
           {
              GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, texcoord);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, dtdx);
-            fetch_vector4(ctx, &inst->SrcReg[2], machine, dtdy);
+            fetch_vector4(&inst->SrcReg[0], machine, texcoord);
+            fetch_vector4(&inst->SrcReg[1], machine, dtdx);
+            fetch_vector4(&inst->SrcReg[2], machine, dtdy);
              machine->FetchTexelDeriv(ctx, texcoord, dtdx, dtdy,
+                                     0.0, /* lodBias */
                                       inst->TexSrcUnit, color);
              store_vector4(inst, machine, color);
           }
@@ -1508,15 +1385,9 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_TXP:         /* GL_ARB_fragment_program only */
           /* Texture lookup w/ projective divide */
           {
-            GLfloat texcoord[4], color[4], lambda;
-#if 0
-            if (inst->SrcReg[0].File == PROGRAM_INPUT &&
-                inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0 + inst->TexSrcUnit)
-               lambda = span->array->lambda[inst->TexSrcUnit][column];
-            else
-#endif
-               lambda = 0.0;
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, texcoord);
+            GLfloat texcoord[4], color[4];
+
+            fetch_vector4(&inst->SrcReg[0], machine, texcoord);
              /* Not so sure about this test - if texcoord[3] is
               * zero, we'd probably be fine except for an ASSERT in
               * IROUND_POS() which gets triggered by the inf values created.
@@ -1526,31 +1397,29 @@ _mesa_execute_program(GLcontext * ctx,
                 texcoord[1] /= texcoord[3];
                 texcoord[2] /= texcoord[3];
              }
-            machine->FetchTexelLod(ctx, texcoord, lambda,
-                                   inst->TexSrcUnit, color);
+
+            fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
+
              store_vector4(inst, machine, color);
           }
           break;
        case OPCODE_TXP_NV:      /* GL_NV_fragment_program only */
-         /* Texture lookup w/ projective divide */
+         /* Texture lookup w/ projective divide, as above, but do not
+          * do the divide by w if sampling from a cube map.
+          */
           {
-            GLfloat texcoord[4], color[4], lambda;
-#if 0
-            if (inst->SrcReg[0].File == PROGRAM_INPUT &&
-                inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0 + inst->TexSrcUnit)
-               lambda = span->array->lambda[inst->TexSrcUnit][column];
-            else
-#endif
-               lambda = 0.0;
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, texcoord);
+            GLfloat texcoord[4], color[4];
+
+            fetch_vector4(&inst->SrcReg[0], machine, texcoord);
              if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
                  texcoord[3] != 0.0) {
                 texcoord[0] /= texcoord[3];
                 texcoord[1] /= texcoord[3];
                 texcoord[2] /= texcoord[3];
              }
-            machine->FetchTexelLod(ctx, texcoord, lambda,
-                                   inst->TexSrcUnit, color);
+
+            fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
+
              store_vector4(inst, machine, color);
           }
           break;
@@ -1559,7 +1428,7 @@ _mesa_execute_program(GLcontext * ctx,
              GLfloat a[4], result[4];
              const GLuint *rawBits = (const GLuint *) a;
              GLhalfNV hx, hy;
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
              hx = rawBits[0] & 0xffff;
              hy = rawBits[0] >> 16;
              result[0] = result[2] = _mesa_half_to_float(hx);
@@ -1572,7 +1441,7 @@ _mesa_execute_program(GLcontext * ctx,
              GLfloat a[4], result[4];
              const GLuint *rawBits = (const GLuint *) a;
              GLushort usx, usy;
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
              usx = rawBits[0] & 0xffff;
              usy = rawBits[0] >> 16;
              result[0] = result[2] = usx * (1.0f / 65535.0f);
@@ -1584,7 +1453,7 @@ _mesa_execute_program(GLcontext * ctx,
           {
              GLfloat a[4], result[4];
              const GLuint *rawBits = (const GLuint *) a;
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
              result[0] = (((rawBits[0] >> 0) & 0xff) - 128) / 127.0F;
              result[1] = (((rawBits[0] >> 8) & 0xff) - 128) / 127.0F;
              result[2] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
@@ -1596,7 +1465,7 @@ _mesa_execute_program(GLcontext * ctx,
           {
              GLfloat a[4], result[4];
              const GLuint *rawBits = (const GLuint *) a;
-            fetch_vector1(ctx, &inst->SrcReg[0], machine, a);
+            fetch_vector1(&inst->SrcReg[0], machine, a);
              result[0] = ((rawBits[0] >> 0) & 0xff) / 255.0F;
              result[1] = ((rawBits[0] >> 8) & 0xff) / 255.0F;
              result[2] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
@@ -1607,21 +1476,26 @@ _mesa_execute_program(GLcontext * ctx,
        case OPCODE_XPD:         /* cross product */
           {
              GLfloat a[4], b[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
              result[0] = a[1] * b[2] - a[2] * b[1];
              result[1] = a[2] * b[0] - a[0] * b[2];
              result[2] = a[0] * b[1] - a[1] * b[0];
              result[3] = 1.0;
              store_vector4(inst, machine, result);
+            if (DEBUG_PROG) {
+               printf("XPD (%g %g %g %g) = (%g %g %g) X (%g %g %g)\n",
+                      result[0], result[1], result[2], result[3],
+                      a[0], a[1], a[2], b[0], b[1], b[2]);
+            }
           }
           break;
        case OPCODE_X2D:         /* 2-D matrix transform */
           {
              GLfloat a[4], b[4], c[4], result[4];
-            fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
-            fetch_vector4(ctx, &inst->SrcReg[1], machine, b);
-            fetch_vector4(ctx, &inst->SrcReg[2], machine, c);
+            fetch_vector4(&inst->SrcReg[0], machine, a);
+            fetch_vector4(&inst->SrcReg[1], machine, b);
+            fetch_vector4(&inst->SrcReg[2], machine, c);
              result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
              result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
              result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
@@ -1633,7 +1507,7 @@ _mesa_execute_program(GLcontext * ctx,
           {
              if (inst->SrcReg[0].File != -1) {
                 GLfloat a[4];
-               fetch_vector4(ctx, &inst->SrcReg[0], machine, a);
+               fetch_vector4(&inst->SrcReg[0], machine, a);
                 _mesa_printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
                              a[0], a[1], a[2], a[3]);
              }