src/mesa/shader/prog_execute.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.5.3
   4  *
   5  * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file prog_execute.c
  27  * Software interpreter for vertex/fragment programs.
  28  * \author Brian Paul
  29  */
  30
  31 /*
  32  * NOTE: we do everything in single-precision floating point; we don't
  33  * currently observe the single/half/fixed-precision qualifiers.
  34  *
  35  */
  36
  37
  38 #include "glheader.h"
  39 #include "colormac.h"
  40 #include "context.h"
  41 #include "program.h"
  42 #include "prog_execute.h"
  43 #include "prog_instruction.h"
  44 #include "prog_parameter.h"
  45 #include "prog_print.h"
  46 #include "slang_library_noise.h"
  47
  48
  49 /* See comments below for info about this */
  50 #define LAMBDA_ZERO 1
  51
/* Set to non-zero to print debugging output for each executed instruction */
  53 #define DEBUG_PROG 0
  54
  55
  56 #if FEATURE_MESA_program_debug
  57 static struct gl_program_machine *CurrentMachine = NULL;
  58
  59 /**
  60  * For GL_MESA_program_debug.
  61  * Return current value (4*GLfloat) of a program register.
  62  * Called via ctx->Driver.GetFragmentProgramRegister().
  63  */
  64 void
  65 _mesa_get_program_register(GLcontext *ctx, enum register_file file,
  66                            GLuint index, GLfloat val[4])
  67 {
  68    if (CurrentMachine) {
  69       switch (file) {
  70       case PROGRAM_INPUT:
  71          if (CurrentMachine->CurProgram->Target == GL_VERTEX_PROGRAM_ARB) {
  72             COPY_4V(val, CurrentMachine->VertAttribs[index]);
  73          }
  74          else {
  75             COPY_4V(val,
  76                     CurrentMachine->Attribs[index][CurrentMachine->CurElement]);
  77          }
  78          break;
  79       case PROGRAM_OUTPUT:
  80          COPY_4V(val, CurrentMachine->Outputs[index]);
  81          break;
  82       case PROGRAM_TEMPORARY:
  83          COPY_4V(val, CurrentMachine->Temporaries[index]);
  84          break;
  85       default:
  86          _mesa_problem(NULL,
  87                        "bad register file in _swrast_get_program_register");
  88       }
  89    }
  90 }
  91 #endif /* FEATURE_MESA_program_debug */
  92
  93
  94
  95 /**
  96  * Return a pointer to the 4-element float vector specified by the given
  97  * source register.
  98  */
  99 static INLINE const GLfloat *
 100 get_register_pointer( GLcontext *ctx,
 101                       const struct prog_src_register *source,
 102                       const struct gl_program_machine *machine)
 103 {
 104    /* XXX relative addressing... */
 105    switch (source->File) {
 106    case PROGRAM_TEMPORARY:
 107       ASSERT(source->Index < MAX_PROGRAM_TEMPS);
 108       return machine->Temporaries[source->Index];
 109
 110    case PROGRAM_INPUT:
 111       if (machine->CurProgram->Target == GL_VERTEX_PROGRAM_ARB) {
 112          ASSERT(source->Index < VERT_ATTRIB_MAX);
 113          return machine->VertAttribs[source->Index];
 114       }
 115       else {
 116          ASSERT(source->Index < FRAG_ATTRIB_MAX);
 117          return machine->Attribs[source->Index][machine->CurElement];
 118       }
 119
 120    case PROGRAM_OUTPUT:
 121       /* This is only for PRINT */
 122       ASSERT(source->Index < FRAG_RESULT_MAX);
 123       return machine->Outputs[source->Index];
 124
 125    case PROGRAM_LOCAL_PARAM:
 126       ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 127       return machine->CurProgram->LocalParams[source->Index];
 128
 129    case PROGRAM_ENV_PARAM:
 130       ASSERT(source->Index < MAX_PROGRAM_ENV_PARAMS);
 131       if (machine->CurProgram->Target == GL_VERTEX_PROGRAM_ARB)
 132          return ctx->VertexProgram.Parameters[source->Index];
 133       else
 134          return ctx->FragmentProgram.Parameters[source->Index];
 135
 136    case PROGRAM_STATE_VAR:
 137       /* Fallthrough */
 138    case PROGRAM_CONSTANT:
 139       /* Fallthrough */
 140    case PROGRAM_UNIFORM:
 141       /* Fallthrough */
 142    case PROGRAM_NAMED_PARAM:
 143       ASSERT(source->Index <
 144              (GLint) machine->CurProgram->Parameters->NumParameters);
 145       return machine->CurProgram->Parameters->ParameterValues[source->Index];
 146
 147    default:
 148       _mesa_problem(ctx,
 149                     "Invalid input register file %d in get_register_pointer()",
 150                     source->File);
 151       return NULL;
 152    }
 153 }
 154
 155
 156 /**
 157  * Fetch a 4-element float vector from the given source register.
 158  * Apply swizzling and negating as needed.
 159  */
 160 static void
 161 fetch_vector4( GLcontext *ctx,
 162                const struct prog_src_register *source,
 163                const struct gl_program_machine *machine,
 164                GLfloat result[4] )
 165 {
 166    const GLfloat *src = get_register_pointer(ctx, source, machine);
 167    ASSERT(src);
 168
 169    if (source->Swizzle == SWIZZLE_NOOP) {
 170       /* no swizzling */
 171       COPY_4V(result, src);
 172    }
 173    else {
 174       ASSERT(GET_SWZ(source->Swizzle, 0) <= 3);
 175       ASSERT(GET_SWZ(source->Swizzle, 1) <= 3);
 176       ASSERT(GET_SWZ(source->Swizzle, 2) <= 3);
 177       ASSERT(GET_SWZ(source->Swizzle, 3) <= 3);
 178       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 179       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 180       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 181       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 182    }
 183
 184    if (source->NegateBase) {
 185       result[0] = -result[0];
 186       result[1] = -result[1];
 187       result[2] = -result[2];
 188       result[3] = -result[3];
 189    }
 190    if (source->Abs) {
 191       result[0] = FABSF(result[0]);
 192       result[1] = FABSF(result[1]);
 193       result[2] = FABSF(result[2]);
 194       result[3] = FABSF(result[3]);
 195    }
 196    if (source->NegateAbs) {
 197       result[0] = -result[0];
 198       result[1] = -result[1];
 199       result[2] = -result[2];
 200       result[3] = -result[3];
 201    }
 202 }
 203
 204 #if 0
 205 /**
 206  * Fetch the derivative with respect to X for the given register.
 207  * \return GL_TRUE if it was easily computed or GL_FALSE if we
 208  * need to execute another instance of the program (ugh)!
 209  */
 210 static GLboolean
 211 fetch_vector4_deriv( GLcontext *ctx,
 212                      const struct prog_src_register *source,
 213                      const SWspan *span,
 214                      char xOrY, GLint column, GLfloat result[4] )
 215 {
 216    GLfloat src[4];
 217
 218    ASSERT(xOrY == 'X' || xOrY == 'Y');
 219
 220    switch (source->Index) {
 221    case FRAG_ATTRIB_WPOS:
 222       if (xOrY == 'X') {
 223          src[0] = 1.0;
 224          src[1] = 0.0;
 225          src[2] = span->attrStepX[FRAG_ATTRIB_WPOS][2]
 226                 / ctx->DrawBuffer->_DepthMaxF;
 227          src[3] = span->attrStepX[FRAG_ATTRIB_WPOS][3];
 228       }
 229       else {
 230          src[0] = 0.0;
 231          src[1] = 1.0;
 232          src[2] = span->attrStepY[FRAG_ATTRIB_WPOS][2]
 233                 / ctx->DrawBuffer->_DepthMaxF;
 234          src[3] = span->attrStepY[FRAG_ATTRIB_WPOS][3];
 235       }
 236       break;
 237    case FRAG_ATTRIB_COL0:
 238    case FRAG_ATTRIB_COL1:
 239       if (xOrY == 'X') {
 240          src[0] = span->attrStepX[source->Index][0] * (1.0F / CHAN_MAXF);
 241          src[1] = span->attrStepX[source->Index][1] * (1.0F / CHAN_MAXF);
 242          src[2] = span->attrStepX[source->Index][2] * (1.0F / CHAN_MAXF);
 243          src[3] = span->attrStepX[source->Index][3] * (1.0F / CHAN_MAXF);
 244       }
 245       else {
 246          src[0] = span->attrStepY[source->Index][0] * (1.0F / CHAN_MAXF);
 247          src[1] = span->attrStepY[source->Index][1] * (1.0F / CHAN_MAXF);
 248          src[2] = span->attrStepY[source->Index][2] * (1.0F / CHAN_MAXF);
 249          src[3] = span->attrStepY[source->Index][3] * (1.0F / CHAN_MAXF);
 250       }
 251       break;
 252    case FRAG_ATTRIB_FOGC:
 253       if (xOrY == 'X') {
 254          src[0] = span->attrStepX[FRAG_ATTRIB_FOGC][0] * (1.0F / CHAN_MAXF);
 255          src[1] = 0.0;
 256          src[2] = 0.0;
 257          src[3] = 0.0;
 258       }
 259       else {
 260          src[0] = span->attrStepY[FRAG_ATTRIB_FOGC][0] * (1.0F / CHAN_MAXF);
 261          src[1] = 0.0;
 262          src[2] = 0.0;
 263          src[3] = 0.0;
 264       }
 265       break;
 266    default:
 267       assert(source->Index < FRAG_ATTRIB_MAX);
 268       /* texcoord or varying */
 269       if (xOrY == 'X') {
 270          /* this is a little tricky - I think I've got it right */
 271          const GLfloat invQ = 1.0f / (span->attrStart[source->Index][3]
 272                                + span->attrStepX[source->Index][3] * column);
 273          src[0] = span->attrStepX[source->Index][0] * invQ;
 274          src[1] = span->attrStepX[source->Index][1] * invQ;
 275          src[2] = span->attrStepX[source->Index][2] * invQ;
 276          src[3] = span->attrStepX[source->Index][3] * invQ;
 277       }
 278       else {
 279          /* Tricky, as above, but in Y direction */
 280          const GLfloat invQ = 1.0f / (span->attrStart[source->Index][3]
 281                                       + span->attrStepY[source->Index][3]);
 282          src[0] = span->attrStepY[source->Index][0] * invQ;
 283          src[1] = span->attrStepY[source->Index][1] * invQ;
 284          src[2] = span->attrStepY[source->Index][2] * invQ;
 285          src[3] = span->attrStepY[source->Index][3] * invQ;
 286       }
 287       break;
 288    }
 289
 290    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 291    result[1] = src[GET_SWZ(source->Swizzle, 1)];
 292    result[2] = src[GET_SWZ(source->Swizzle, 2)];
 293    result[3] = src[GET_SWZ(source->Swizzle, 3)];
 294
 295    if (source->NegateBase) {
 296       result[0] = -result[0];
 297       result[1] = -result[1];
 298       result[2] = -result[2];
 299       result[3] = -result[3];
 300    }
 301    if (source->Abs) {
 302       result[0] = FABSF(result[0]);
 303       result[1] = FABSF(result[1]);
 304       result[2] = FABSF(result[2]);
 305       result[3] = FABSF(result[3]);
 306    }
 307    if (source->NegateAbs) {
 308       result[0] = -result[0];
 309       result[1] = -result[1];
 310       result[2] = -result[2];
 311       result[3] = -result[3];
 312    }
 313    return GL_TRUE;
 314 }
 315 #endif
 316
 317
 318 /**
 319  * As above, but only return result[0] element.
 320  */
 321 static void
 322 fetch_vector1( GLcontext *ctx,
 323                const struct prog_src_register *source,
 324                const struct gl_program_machine *machine,
 325                GLfloat result[4] )
 326 {
 327    const GLfloat *src = get_register_pointer(ctx, source, machine);
 328    ASSERT(src);
 329
 330    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 331
 332    if (source->NegateBase) {
 333       result[0] = -result[0];
 334    }
 335    if (source->Abs) {
 336       result[0] = FABSF(result[0]);
 337    }
 338    if (source->NegateAbs) {
 339       result[0] = -result[0];
 340    }
 341 }
 342
 343
 344 /**
 345  * Test value against zero and return GT, LT, EQ or UN if NaN.
 346  */
 347 static INLINE GLuint
 348 generate_cc( float value )
 349 {
 350    if (value != value)
 351       return COND_UN;  /* NaN */
 352    if (value > 0.0F)
 353       return COND_GT;
 354    if (value < 0.0F)
 355       return COND_LT;
 356    return COND_EQ;
 357 }
 358
 359
 360 /**
 361  * Test if the ccMaskRule is satisfied by the given condition code.
 362  * Used to mask destination writes according to the current condition code.
 363  */
 364 static INLINE GLboolean
 365 test_cc(GLuint condCode, GLuint ccMaskRule)
 366 {
 367    switch (ccMaskRule) {
 368    case COND_EQ: return (condCode == COND_EQ);
 369    case COND_NE: return (condCode != COND_EQ);
 370    case COND_LT: return (condCode == COND_LT);
 371    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 372    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 373    case COND_GT: return (condCode == COND_GT);
 374    case COND_TR: return GL_TRUE;
 375    case COND_FL: return GL_FALSE;
 376    default:      return GL_TRUE;
 377    }
 378 }
 379
 380
 381 /**
 382  * Evaluate the 4 condition codes against a predicate and return GL_TRUE
 383  * or GL_FALSE to indicate result.
 384  */
 385 static INLINE GLboolean
 386 eval_condition(const struct gl_program_machine *machine,
 387                const struct prog_instruction *inst)
 388 {
 389    const GLuint swizzle = inst->DstReg.CondSwizzle;
 390    const GLuint condMask = inst->DstReg.CondMask;
 391    if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 392        test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 393        test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 394        test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 395       return GL_TRUE;
 396    }
 397    else {
 398       return GL_FALSE;
 399    }
 400 }
 401
 402
 403
 404 /**
 405  * Store 4 floats into a register.  Observe the instructions saturate and
 406  * set-condition-code flags.
 407  */
 408 static void
 409 store_vector4( const struct prog_instruction *inst,
 410                struct gl_program_machine *machine,
 411                const GLfloat value[4] )
 412 {
 413    const struct prog_dst_register *dest = &(inst->DstReg);
 414    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 415    GLfloat *dstReg;
 416    GLfloat dummyReg[4];
 417    GLfloat clampedValue[4];
 418    GLuint writeMask = dest->WriteMask;
 419
 420    switch (dest->File) {
 421       case PROGRAM_OUTPUT:
 422          dstReg = machine->Outputs[dest->Index];
 423          break;
 424       case PROGRAM_TEMPORARY:
 425          dstReg = machine->Temporaries[dest->Index];
 426          break;
 427       case PROGRAM_WRITE_ONLY:
 428          dstReg = dummyReg;
 429          return;
 430       default:
 431          _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 432          return;
 433    }
 434
 435 #if 0
 436    if (value[0] > 1.0e10 ||
 437        IS_INF_OR_NAN(value[0]) ||
 438        IS_INF_OR_NAN(value[1]) ||
 439        IS_INF_OR_NAN(value[2]) ||
 440        IS_INF_OR_NAN(value[3])  )
 441       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 442 #endif
 443
 444    if (clamp) {
 445       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 446       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 447       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 448       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 449       value = clampedValue;
 450    }
 451
 452    if (dest->CondMask != COND_TR) {
 453       /* condition codes may turn off some writes */
 454       if (writeMask & WRITEMASK_X) {
 455          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 0)],
 456                       dest->CondMask))
 457             writeMask &= ~WRITEMASK_X;
 458       }
 459       if (writeMask & WRITEMASK_Y) {
 460          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 1)],
 461                       dest->CondMask))
 462             writeMask &= ~WRITEMASK_Y;
 463       }
 464       if (writeMask & WRITEMASK_Z) {
 465          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 2)],
 466                       dest->CondMask))
 467             writeMask &= ~WRITEMASK_Z;
 468       }
 469       if (writeMask & WRITEMASK_W) {
 470          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 3)],
 471                       dest->CondMask))
 472             writeMask &= ~WRITEMASK_W;
 473       }
 474    }
 475
 476    if (writeMask & WRITEMASK_X)
 477       dstReg[0] = value[0];
 478    if (writeMask & WRITEMASK_Y)
 479       dstReg[1] = value[1];
 480    if (writeMask & WRITEMASK_Z)
 481       dstReg[2] = value[2];
 482    if (writeMask & WRITEMASK_W)
 483       dstReg[3] = value[3];
 484
 485    if (inst->CondUpdate) {
 486       if (writeMask & WRITEMASK_X)
 487          machine->CondCodes[0] = generate_cc(value[0]);
 488       if (writeMask & WRITEMASK_Y)
 489          machine->CondCodes[1] = generate_cc(value[1]);
 490       if (writeMask & WRITEMASK_Z)
 491          machine->CondCodes[2] = generate_cc(value[2]);
 492       if (writeMask & WRITEMASK_W)
 493          machine->CondCodes[3] = generate_cc(value[3]);
 494    }
 495 }
 496
 497
 498 #if 0
 499 /**
 500  * Initialize a new machine state instance from an existing one, adding
 501  * the partial derivatives onto the input registers.
 502  * Used to implement DDX and DDY instructions in non-trivial cases.
 503  */
 504 static void
 505 init_machine_deriv( GLcontext *ctx,
 506                     const struct gl_program_machine *machine,
 507                     const struct gl_fragment_program *program,
 508                     const SWspan *span, char xOrY,
 509                     struct gl_program_machine *dMachine )
 510 {
 511    GLuint attr;
 512
 513    ASSERT(xOrY == 'X' || xOrY == 'Y');
 514
 515    /* copy existing machine */
 516    _mesa_memcpy(dMachine, machine, sizeof(struct gl_program_machine));
 517
 518    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
 519       /* XXX also need to do this when using valgrind */
 520       /* Clear temporary registers (undefined for ARB_f_p) */
 521       _mesa_bzero( (void*) machine->Temporaries,
 522                    MAX_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
 523    }
 524
 525    /* Add derivatives */
 526    if (program->Base.InputsRead & FRAG_BIT_WPOS) {
 527       GLfloat *wpos = machine->Attribs[FRAG_ATTRIB_WPOS][machine->CurElement];
 528       if (xOrY == 'X') {
 529          wpos[0] += 1.0F;
 530          wpos[1] += 0.0F;
 531          wpos[2] += span->attrStepX[FRAG_ATTRIB_WPOS][2];
 532          wpos[3] += span->attrStepX[FRAG_ATTRIB_WPOS][3];
 533       }
 534       else {
 535          wpos[0] += 0.0F;
 536          wpos[1] += 1.0F;
 537          wpos[2] += span->attrStepY[FRAG_ATTRIB_WPOS][2];
 538          wpos[3] += span->attrStepY[FRAG_ATTRIB_WPOS][3];
 539       }
 540    }
 541
 542    /* primary, secondary colors */
 543    for (attr = FRAG_ATTRIB_COL0; attr <= FRAG_ATTRIB_COL1; attr++) {
 544       if (program->Base.InputsRead & (1 << attr)) {
 545          GLfloat *col = machine->Attribs[attr][machine->CurElement];
 546          if (xOrY == 'X') {
 547             col[0] += span->attrStepX[attr][0] * (1.0F / CHAN_MAXF);
 548             col[1] += span->attrStepX[attr][1] * (1.0F / CHAN_MAXF);
 549             col[2] += span->attrStepX[attr][2] * (1.0F / CHAN_MAXF);
 550             col[3] += span->attrStepX[attr][3] * (1.0F / CHAN_MAXF);
 551          }
 552          else {
 553             col[0] += span->attrStepY[attr][0] * (1.0F / CHAN_MAXF);
 554             col[1] += span->attrStepY[attr][1] * (1.0F / CHAN_MAXF);
 555             col[2] += span->attrStepY[attr][2] * (1.0F / CHAN_MAXF);
 556             col[3] += span->attrStepY[attr][3] * (1.0F / CHAN_MAXF);
 557          }
 558       }
 559    }
 560    if (program->Base.InputsRead & FRAG_BIT_FOGC) {
 561       GLfloat *fogc = machine->Attribs[FRAG_ATTRIB_FOGC][machine->CurElement];
 562       if (xOrY == 'X') {
 563          fogc[0] += span->attrStepX[FRAG_ATTRIB_FOGC][0];
 564       }
 565       else {
 566          fogc[0] += span->attrStepY[FRAG_ATTRIB_FOGC][0];
 567       }
 568    }
 569    /* texcoord and varying vars */
 570    for (attr = FRAG_ATTRIB_TEX0; attr < FRAG_ATTRIB_MAX; attr++) {
 571       if (program->Base.InputsRead & (1 << attr)) {
 572          GLfloat *val = machine->Attribs[attr][machine->CurElement];
 573          /* XXX perspective-correct interpolation */
 574          if (xOrY == 'X') {
 575             val[0] += span->attrStepX[attr][0];
 576             val[1] += span->attrStepX[attr][1];
 577             val[2] += span->attrStepX[attr][2];
 578             val[3] += span->attrStepX[attr][3];
 579          }
 580          else {
 581             val[0] += span->attrStepY[attr][0];
 582             val[1] += span->attrStepY[attr][1];
 583             val[2] += span->attrStepY[attr][2];
 584             val[3] += span->attrStepY[attr][3];
 585          }
 586       }
 587    }
 588
 589    /* init condition codes */
 590    dMachine->CondCodes[0] = COND_EQ;
 591    dMachine->CondCodes[1] = COND_EQ;
 592    dMachine->CondCodes[2] = COND_EQ;
 593    dMachine->CondCodes[3] = COND_EQ;
 594 }
 595 #endif
 596
 597
/**
 * Execute the given vertex/fragment program.
 *
 * \param ctx - rendering context
 * \param program - the vertex or fragment program to execute
 * \param maxInst - max number of instructions to execute
 * \param machine - machine state (register file)
 * \param element - NOTE(review): not described in the original comment;
 *                  presumably the index of the vertex/fragment being
 *                  processed - confirm against callers
 * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 */
 607 GLboolean
 608 _mesa_execute_program(GLcontext *ctx,
 609                       const struct gl_program *program, GLuint maxInst,
 610                       struct gl_program_machine *machine, GLuint element)
 611 {
 612    const GLuint MAX_EXEC = 10000;
 613    GLint pc, total = 0;
 614
 615    machine->CurProgram = program;
 616
 617    if (DEBUG_PROG) {
 618       printf("execute program %u --------------------\n", program->Id);
 619    }
 620
 621 #if FEATURE_MESA_program_debug
 622    CurrentMachine = machine;
 623 #endif
 624
 625    for (pc = 0; pc < maxInst; pc++) {
 626       const struct prog_instruction *inst = program->Instructions + pc;
 627
 628 #if FEATURE_MESA_program_debug
 629       if (ctx->FragmentProgram.CallbackEnabled &&
 630           ctx->FragmentProgram.Callback) {
 631          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 632          ctx->FragmentProgram.Callback(program->Target,
 633                                        ctx->FragmentProgram.CallbackData);
 634       }
 635 #endif
 636
 637       if (DEBUG_PROG) {
 638          _mesa_print_instruction(inst);
 639       }
 640
 641       switch (inst->Opcode) {
 642          case OPCODE_ABS:
 643             {
 644                GLfloat a[4], result[4];
 645                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 646                result[0] = FABSF(a[0]);
 647                result[1] = FABSF(a[1]);
 648                result[2] = FABSF(a[2]);
 649                result[3] = FABSF(a[3]);
 650                store_vector4( inst, machine, result );
 651             }
 652             break;
 653          case OPCODE_ADD:
 654             {
 655                GLfloat a[4], b[4], result[4];
 656                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 657                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
 658                result[0] = a[0] + b[0];
 659                result[1] = a[1] + b[1];
 660                result[2] = a[2] + b[2];
 661                result[3] = a[3] + b[3];
 662                store_vector4( inst, machine, result );
 663                if (DEBUG_PROG) {
 664                   printf("ADD (%g %g %g %g) = (%g %g %g %g) + (%g %g %g %g)\n",
 665                          result[0], result[1], result[2], result[3],
 666                          a[0], a[1], a[2], a[3],
 667                          b[0], b[1], b[2], b[3]);
 668                }
 669             }
 670             break;
 671          case OPCODE_BGNLOOP:
 672             /* no-op */
 673             break;
 674          case OPCODE_ENDLOOP:
 675             /* subtract 1 here since pc is incremented by for(pc) loop */
 676             pc = inst->BranchTarget - 1; /* go to matching BNGLOOP */
 677             break;
 678          case OPCODE_BGNSUB: /* begin subroutine */
 679             break;
 680          case OPCODE_ENDSUB: /* end subroutine */
 681             break;
 682          case OPCODE_BRA: /* branch (conditional) */
 683             /* fall-through */
 684          case OPCODE_BRK: /* break out of loop (conditional) */
 685             /* fall-through */
 686          case OPCODE_CONT: /* continue loop (conditional) */
 687             if (eval_condition(machine, inst)) {
 688                /* take branch */
 689                /* Subtract 1 here since we'll do pc++ at end of for-loop */
 690                pc = inst->BranchTarget - 1;
 691             }
 692             break;
 693          case OPCODE_CAL: /* Call subroutine (conditional) */
 694             if (eval_condition(machine, inst)) {
 695                /* call the subroutine */
 696                if (machine->StackDepth >= MAX_PROGRAM_CALL_DEPTH) {
 697                   return GL_TRUE; /* Per GL_NV_vertex_program2 spec */
 698                }
 699                machine->CallStack[machine->StackDepth++] = pc + 1;
 700                pc = inst->BranchTarget; /* XXX - 1 ??? */
 701             }
 702             break;
 703          case OPCODE_CMP:
 704             {
 705                GLfloat a[4], b[4], c[4], result[4];
 706                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 707                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
 708                fetch_vector4( ctx, &inst->SrcReg[2], machine, c );
 709                result[0] = a[0] < 0.0F ? b[0] : c[0];
 710                result[1] = a[1] < 0.0F ? b[1] : c[1];
 711                result[2] = a[2] < 0.0F ? b[2] : c[2];
 712                result[3] = a[3] < 0.0F ? b[3] : c[3];
 713                store_vector4( inst, machine, result );
 714             }
 715             break;
 716          case OPCODE_COS:
 717             {
 718                GLfloat a[4], result[4];
 719                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
 720                result[0] = result[1] = result[2] = result[3]
 721                   = (GLfloat) _mesa_cos(a[0]);
 722                store_vector4( inst, machine, result );
 723             }
 724             break;
 725          case OPCODE_DDX: /* Partial derivative with respect to X */
 726             {
 727 #if 0
 728                GLfloat a[4], aNext[4], result[4];
 729                struct gl_program_machine dMachine;
 730                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'X',
 731                                         column, result)) {
 732                   /* This is tricky.  Make a copy of the current machine state,
 733                    * increment the input registers by the dx or dy partial
 734                    * derivatives, then re-execute the program up to the
 735                    * preceeding instruction, then fetch the source register.
 736                    * Finally, find the difference in the register values for
 737                    * the original and derivative runs.
 738                    */
 739                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 740                   init_machine_deriv(ctx, machine, program, span,
 741                                      'X', &dMachine);
 742                   execute_program(ctx, program, pc, &dMachine, span, column);
 743                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 744                   result[0] = aNext[0] - a[0];
 745                   result[1] = aNext[1] - a[1];
 746                   result[2] = aNext[2] - a[2];
 747                   result[3] = aNext[3] - a[3];
 748                }
 749                store_vector4( inst, machine, result );
 750 #else
 751                static const GLfloat result[4] = { 0, 0, 0, 0 };
 752                store_vector4( inst, machine, result );
 753 #endif
 754             }
 755             break;
 756          case OPCODE_DDY: /* Partial derivative with respect to Y */
 757             {
 758 #if 0
 759                GLfloat a[4], aNext[4], result[4];
 760                struct gl_program_machine dMachine;
 761                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'Y',
 762                                         column, result)) {
 763                   init_machine_deriv(ctx, machine, program, span,
 764                                      'Y', &dMachine);
 765                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 766                   execute_program(ctx, program, pc, &dMachine, span, column);
 767                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 768                   result[0] = aNext[0] - a[0];
 769                   result[1] = aNext[1] - a[1];
 770                   result[2] = aNext[2] - a[2];
 771                   result[3] = aNext[3] - a[3];
 772                }
 773                store_vector4( inst, machine, result );
 774 #else
 775                static const GLfloat result[4] = { 0, 0, 0, 0 };
 776                store_vector4( inst, machine, result );
 777 #endif
 778             }
 779             break;
 780          case OPCODE_DP3:
 781             {
 782                GLfloat a[4], b[4], result[4];
 783                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 784                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
 785                result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
 786                store_vector4( inst, machine, result );
 787                if (DEBUG_PROG) {
 788                   printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 789                          result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 790                }
 791             }
 792             break;
 793          case OPCODE_DP4:
 794             {
 795                GLfloat a[4], b[4], result[4];
 796                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 797                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
 798                result[0] = result[1] = result[2] = result[3] = DOT4(a,b);
 799                store_vector4( inst, machine, result );
 800                if (DEBUG_PROG) {
 801                   printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 802                          result[0], a[0], a[1], a[2], a[3],
 803                          b[0], b[1], b[2], b[3]);
 804                }
 805             }
 806             break;
 807          case OPCODE_DPH:
 808             {
 809                GLfloat a[4], b[4], result[4];
 810                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 811                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
 812                result[0] = result[1] = result[2] = result[3] =
 813                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
 814                store_vector4( inst, machine, result );
 815             }
 816             break;
 817          case OPCODE_DST: /* Distance vector */
 818             {
 819                GLfloat a[4], b[4], result[4];
 820                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 821                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
 822                result[0] = 1.0F;
 823                result[1] = a[1] * b[1];
 824                result[2] = a[2];
 825                result[3] = b[3];
 826                store_vector4( inst, machine, result );
 827             }
 828             break;
 829          case OPCODE_EX2: /* Exponential base 2 */
 830             {
 831                GLfloat a[4], result[4];
 832                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
 833                result[0] = result[1] = result[2] = result[3] =
 834                   (GLfloat) _mesa_pow(2.0, a[0]);
 835                store_vector4( inst, machine, result );
 836             }
 837             break;
 838          case OPCODE_FLR:
 839             {
 840                GLfloat a[4], result[4];
 841                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 842                result[0] = FLOORF(a[0]);
 843                result[1] = FLOORF(a[1]);
 844                result[2] = FLOORF(a[2]);
 845                result[3] = FLOORF(a[3]);
 846                store_vector4( inst, machine, result );
 847             }
 848             break;
 849          case OPCODE_FRC:
 850             {
 851                GLfloat a[4], result[4];
 852                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 853                result[0] = a[0] - FLOORF(a[0]);
 854                result[1] = a[1] - FLOORF(a[1]);
 855                result[2] = a[2] - FLOORF(a[2]);
 856                result[3] = a[3] - FLOORF(a[3]);
 857                store_vector4( inst, machine, result );
 858             }
 859             break;
 860          case OPCODE_IF:
 861             if (eval_condition(machine, inst)) {
 862                /* do if-clause (just continue execution) */
 863             }
 864             else {
 865                /* go to the instruction after ELSE or ENDIF */
 866                assert(inst->BranchTarget >= 0);
 867                pc = inst->BranchTarget - 1;
 868             }
 869             break;
 870          case OPCODE_ELSE:
 871             /* goto ENDIF */
 872             assert(inst->BranchTarget >= 0);
 873             pc = inst->BranchTarget - 1;
 874             break;
 875          case OPCODE_ENDIF:
 876             /* nothing */
 877             break;
 878          case OPCODE_INT: /* float to int */
 879             {
 880                GLfloat a[4], result[4];
 881                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 882                result[0] = (GLfloat) (GLint) a[0];
 883                result[1] = (GLfloat) (GLint) a[1];
 884                result[2] = (GLfloat) (GLint) a[2];
 885                result[3] = (GLfloat) (GLint) a[3];
 886                store_vector4( inst, machine, result );
 887             }
 888             break;
 889          case OPCODE_KIL_NV: /* NV_f_p only (conditional) */
 890             if (eval_condition(machine, inst)) {
 891                return GL_FALSE;
 892             }
 893             break;
 894          case OPCODE_KIL: /* ARB_f_p only */
 895             {
 896                GLfloat a[4];
 897                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 898                if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 899                   return GL_FALSE;
 900                }
 901             }
 902             break;
 903          case OPCODE_LG2:  /* log base 2 */
 904             {
 905                GLfloat a[4], result[4];
 906                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
 907                result[0] = result[1] = result[2] = result[3] = LOG2(a[0]);
 908                store_vector4( inst, machine, result );
 909             }
 910             break;
 911          case OPCODE_LIT:
 912             {
 913                const GLfloat epsilon = 1.0F / 256.0F; /* from NV VP spec */
 914                GLfloat a[4], result[4];
 915                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 916                a[0] = MAX2(a[0], 0.0F);
 917                a[1] = MAX2(a[1], 0.0F);
 918                /* XXX ARB version clamps a[3], NV version doesn't */
 919                a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 920                result[0] = 1.0F;
 921                result[1] = a[0];
 922                /* XXX we could probably just use pow() here */
 923                if (a[0] > 0.0F) {
 924                   if (a[1] == 0.0 && a[3] == 0.0)
 925                      result[2] = 1.0;
 926                   else
 927                      result[2] = EXPF(a[3] * LOGF(a[1]));
 928                }
 929                else {
 930                   result[2] = 0.0;
 931                }
 932                result[3] = 1.0F;
 933                store_vector4( inst, machine, result );
 934                if (DEBUG_PROG) {
 935                   printf("LIT (%g %g %g %g) : (%g %g %g %g)\n",
 936                          result[0], result[1], result[2], result[3],
 937                          a[0], a[1], a[2], a[3]);
 938                }
 939             }
 940             break;
 941          case OPCODE_LRP:
 942             {
 943                GLfloat a[4], b[4], c[4], result[4];
 944                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 945                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
 946                fetch_vector4( ctx, &inst->SrcReg[2], machine, c );
 947                result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 948                result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 949                result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 950                result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 951                store_vector4( inst, machine, result );
 952                if (DEBUG_PROG) {
 953                   printf("LRP (%g %g %g %g) = (%g %g %g %g), "
 954                          "(%g %g %g %g), (%g %g %g %g)\n",
 955                          result[0], result[1], result[2], result[3],
 956                          a[0], a[1], a[2], a[3],
 957                          b[0], b[1], b[2], b[3],
 958                          c[0], c[1], c[2], c[3]);
 959                }
 960             }
 961             break;
 962          case OPCODE_MAD:
 963             {
 964                GLfloat a[4], b[4], c[4], result[4];
 965                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 966                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
 967                fetch_vector4( ctx, &inst->SrcReg[2], machine, c );
 968                result[0] = a[0] * b[0] + c[0];
 969                result[1] = a[1] * b[1] + c[1];
 970                result[2] = a[2] * b[2] + c[2];
 971                result[3] = a[3] * b[3] + c[3];
 972                store_vector4( inst, machine, result );
 973                if (DEBUG_PROG) {
 974                   printf("MAD (%g %g %g %g) = (%g %g %g %g) * "
 975                          "(%g %g %g %g) + (%g %g %g %g)\n",
 976                          result[0], result[1], result[2], result[3],
 977                          a[0], a[1], a[2], a[3],
 978                          b[0], b[1], b[2], b[3],
 979                          c[0], c[1], c[2], c[3]);
 980                }
 981             }
 982             break;
 983          case OPCODE_MAX:
 984             {
 985                GLfloat a[4], b[4], result[4];
 986                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
 987                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
 988                result[0] = MAX2(a[0], b[0]);
 989                result[1] = MAX2(a[1], b[1]);
 990                result[2] = MAX2(a[2], b[2]);
 991                result[3] = MAX2(a[3], b[3]);
 992                store_vector4( inst, machine, result );
 993                if (DEBUG_PROG) {
 994                   printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
 995                          result[0], result[1], result[2], result[3],
 996                          a[0], a[1], a[2], a[3],
 997                          b[0], b[1], b[2], b[3]);
 998                }
 999             }
1000             break;
1001          case OPCODE_MIN:
1002             {
1003                GLfloat a[4], b[4], result[4];
1004                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1005                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
1006                result[0] = MIN2(a[0], b[0]);
1007                result[1] = MIN2(a[1], b[1]);
1008                result[2] = MIN2(a[2], b[2]);
1009                result[3] = MIN2(a[3], b[3]);
1010                store_vector4( inst, machine, result );
1011             }
1012             break;
1013          case OPCODE_MOV:
1014             {
1015                GLfloat result[4];
1016                fetch_vector4( ctx, &inst->SrcReg[0], machine, result );
1017                store_vector4( inst, machine, result );
1018                if (DEBUG_PROG) {
1019                   printf("MOV (%g %g %g %g)\n",
1020                          result[0], result[1], result[2], result[3]);
1021                }
1022             }
1023             break;
1024          case OPCODE_MUL:
1025             {
1026                GLfloat a[4], b[4], result[4];
1027                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1028                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
1029                result[0] = a[0] * b[0];
1030                result[1] = a[1] * b[1];
1031                result[2] = a[2] * b[2];
1032                result[3] = a[3] * b[3];
1033                store_vector4( inst, machine, result );
1034                if (DEBUG_PROG) {
1035                   printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
1036                          result[0], result[1], result[2], result[3],
1037                          a[0], a[1], a[2], a[3],
1038                          b[0], b[1], b[2], b[3]);
1039                }
1040             }
1041             break;
1042          case OPCODE_NOISE1:
1043             {
1044                GLfloat a[4], result[4];
1045                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
1046                result[0] =
1047                result[1] =
1048                result[2] =
1049                result[3] = _slang_library_noise1(a[0]);
1050                store_vector4( inst, machine, result );
1051             }
1052             break;
1053          case OPCODE_NOISE2:
1054             {
1055                GLfloat a[4], result[4];
1056                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1057                result[0] =
1058                result[1] =
1059                result[2] =
1060                result[3] = _slang_library_noise2(a[0], a[1]);
1061                store_vector4( inst, machine, result );
1062             }
1063             break;
1064          case OPCODE_NOISE3:
1065             {
1066                GLfloat a[4], result[4];
1067                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1068                result[0] =
1069                result[1] =
1070                result[2] =
1071                result[3] = _slang_library_noise3(a[0], a[1], a[2]);
1072                store_vector4( inst, machine, result );
1073             }
1074             break;
1075          case OPCODE_NOISE4:
1076             {
1077                GLfloat a[4], result[4];
1078                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1079                result[0] =
1080                result[1] =
1081                result[2] =
1082                result[3] = _slang_library_noise4(a[0], a[1], a[2], a[3]);
1083                store_vector4( inst, machine, result );
1084             }
1085             break;
1086          case OPCODE_NOP:
1087             break;
1088          case OPCODE_PK2H: /* pack two 16-bit floats in one 32-bit float */
1089             {
1090                GLfloat a[4], result[4];
1091                GLhalfNV hx, hy;
1092                GLuint *rawResult = (GLuint *) result;
1093                GLuint twoHalves;
1094                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1095                hx = _mesa_float_to_half(a[0]);
1096                hy = _mesa_float_to_half(a[1]);
1097                twoHalves = hx | (hy << 16);
1098                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1099                   = twoHalves;
1100                store_vector4( inst, machine, result );
1101             }
1102             break;
1103          case OPCODE_PK2US: /* pack two GLushorts into one 32-bit float */
1104             {
1105                GLfloat a[4], result[4];
1106                GLuint usx, usy, *rawResult = (GLuint *) result;
1107                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1108                a[0] = CLAMP(a[0], 0.0F, 1.0F);
1109                a[1] = CLAMP(a[1], 0.0F, 1.0F);
1110                usx = IROUND(a[0] * 65535.0F);
1111                usy = IROUND(a[1] * 65535.0F);
1112                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1113                   = usx | (usy << 16);
1114                store_vector4( inst, machine, result );
1115             }
1116             break;
1117          case OPCODE_PK4B: /* pack four GLbytes into one 32-bit float */
1118             {
1119                GLfloat a[4], result[4];
1120                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1121                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1122                a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
1123                a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
1124                a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
1125                a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
1126                ubx = IROUND(127.0F * a[0] + 128.0F);
1127                uby = IROUND(127.0F * a[1] + 128.0F);
1128                ubz = IROUND(127.0F * a[2] + 128.0F);
1129                ubw = IROUND(127.0F * a[3] + 128.0F);
1130                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1131                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1132                store_vector4( inst, machine, result );
1133             }
1134             break;
1135          case OPCODE_PK4UB: /* pack four GLubytes into one 32-bit float */
1136             {
1137                GLfloat a[4], result[4];
1138                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1139                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1140                a[0] = CLAMP(a[0], 0.0F, 1.0F);
1141                a[1] = CLAMP(a[1], 0.0F, 1.0F);
1142                a[2] = CLAMP(a[2], 0.0F, 1.0F);
1143                a[3] = CLAMP(a[3], 0.0F, 1.0F);
1144                ubx = IROUND(255.0F * a[0]);
1145                uby = IROUND(255.0F * a[1]);
1146                ubz = IROUND(255.0F * a[2]);
1147                ubw = IROUND(255.0F * a[3]);
1148                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1149                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1150                store_vector4( inst, machine, result );
1151             }
1152             break;
1153          case OPCODE_POW:
1154             {
1155                GLfloat a[4], b[4], result[4];
1156                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
1157                fetch_vector1( ctx, &inst->SrcReg[1], machine, b );
1158                result[0] = result[1] = result[2] = result[3]
1159                   = (GLfloat)_mesa_pow(a[0], b[0]);
1160                store_vector4( inst, machine, result );
1161             }
1162             break;
1163          case OPCODE_RCP:
1164             {
1165                GLfloat a[4], result[4];
1166                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
1167                if (DEBUG_PROG) {
1168                   if (a[0] == 0)
1169                      printf("RCP(0)\n");
1170                   else if (IS_INF_OR_NAN(a[0]))
1171                      printf("RCP(inf)\n");
1172                }
1173                result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
1174                store_vector4( inst, machine, result );
1175             }
1176             break;
1177          case OPCODE_RET: /* return from subroutine (conditional) */
1178             if (eval_condition(machine, inst)) {
1179                if (machine->StackDepth == 0) {
1180                   return GL_TRUE; /* Per GL_NV_vertex_program2 spec */
1181                }
1182                pc = machine->CallStack[--machine->StackDepth];
1183             }
1184             break;
1185          case OPCODE_RFL: /* reflection vector */
1186             {
1187                GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
1188                fetch_vector4( ctx, &inst->SrcReg[0], machine, axis );
1189                fetch_vector4( ctx, &inst->SrcReg[1], machine, dir );
1190                tmpW = DOT3(axis, axis);
1191                tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
1192                result[0] = tmpX * axis[0] - dir[0];
1193                result[1] = tmpX * axis[1] - dir[1];
1194                result[2] = tmpX * axis[2] - dir[2];
1195                /* result[3] is never written! XXX enforce in parser! */
1196                store_vector4( inst, machine, result );
1197             }
1198             break;
1199          case OPCODE_RSQ: /* 1 / sqrt() */
1200             {
1201                GLfloat a[4], result[4];
1202                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
1203                a[0] = FABSF(a[0]);
1204                result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1205                store_vector4( inst, machine, result );
1206                if (DEBUG_PROG) {
1207                   printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1208                }
1209             }
1210             break;
1211          case OPCODE_SCS: /* sine and cos */
1212             {
1213                GLfloat a[4], result[4];
1214                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
1215                result[0] = (GLfloat) _mesa_cos(a[0]);
1216                result[1] = (GLfloat) _mesa_sin(a[0]);
1217                result[2] = 0.0;  /* undefined! */
1218                result[3] = 0.0;  /* undefined! */
1219                store_vector4( inst, machine, result );
1220             }
1221             break;
1222          case OPCODE_SEQ: /* set on equal */
1223             {
1224                GLfloat a[4], b[4], result[4];
1225                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1226                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
1227                result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1228                result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1229                result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1230                result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1231                store_vector4( inst, machine, result );
1232             }
1233             break;
1234          case OPCODE_SFL: /* set false, operands ignored */
1235             {
1236                static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1237                store_vector4( inst, machine, result );
1238             }
1239             break;
1240          case OPCODE_SGE: /* set on greater or equal */
1241             {
1242                GLfloat a[4], b[4], result[4];
1243                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1244                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
1245                result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1246                result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1247                result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1248                result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1249                store_vector4( inst, machine, result );
1250             }
1251             break;
1252          case OPCODE_SGT: /* set on greater */
1253             {
1254                GLfloat a[4], b[4], result[4];
1255                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1256                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
1257                result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1258                result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1259                result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1260                result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1261                store_vector4( inst, machine, result );
1262                if (DEBUG_PROG) {
1263                   printf("SGT %g %g %g %g\n",
1264                          result[0], result[1], result[2], result[3]);
1265                }
1266             }
1267             break;
1268          case OPCODE_SIN:
1269             {
1270                GLfloat a[4], result[4];
1271                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
1272                result[0] = result[1] = result[2] = result[3]
1273                   = (GLfloat) _mesa_sin(a[0]);
1274                store_vector4( inst, machine, result );
1275             }
1276             break;
1277          case OPCODE_SLE: /* set on less or equal */
1278             {
1279                GLfloat a[4], b[4], result[4];
1280                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1281                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
1282                result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1283                result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1284                result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1285                result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1286                store_vector4( inst, machine, result );
1287             }
1288             break;
1289          case OPCODE_SLT: /* set on less */
1290             {
1291                GLfloat a[4], b[4], result[4];
1292                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1293                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
1294                result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1295                result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1296                result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1297                result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1298                store_vector4( inst, machine, result );
1299             }
1300             break;
1301          case OPCODE_SNE: /* set on not equal */
1302             {
1303                GLfloat a[4], b[4], result[4];
1304                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1305                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
1306                result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1307                result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1308                result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1309                result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1310                store_vector4( inst, machine, result );
1311             }
1312             break;
1313          case OPCODE_STR: /* set true, operands ignored */
1314             {
1315                static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1316                store_vector4( inst, machine, result );
1317             }
1318             break;
1319          case OPCODE_SUB:
1320             {
1321                GLfloat a[4], b[4], result[4];
1322                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1323                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
1324                result[0] = a[0] - b[0];
1325                result[1] = a[1] - b[1];
1326                result[2] = a[2] - b[2];
1327                result[3] = a[3] - b[3];
1328                store_vector4( inst, machine, result );
1329                if (DEBUG_PROG) {
1330                   printf("SUB (%g %g %g %g) = (%g %g %g %g) - (%g %g %g %g)\n",
1331                          result[0], result[1], result[2], result[3],
1332                          a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1333                }
1334             }
1335             break;
1336          case OPCODE_SWZ: /* extended swizzle */
1337             {
1338                const struct prog_src_register *source = &inst->SrcReg[0];
1339                const GLfloat *src = get_register_pointer(ctx, source, machine);
1340                GLfloat result[4];
1341                GLuint i;
1342                for (i = 0; i < 4; i++) {
1343                   const GLuint swz = GET_SWZ(source->Swizzle, i);
1344                   if (swz == SWIZZLE_ZERO)
1345                      result[i] = 0.0;
1346                   else if (swz == SWIZZLE_ONE)
1347                      result[i] = 1.0;
1348                   else {
1349                      ASSERT(swz >= 0);
1350                      ASSERT(swz <= 3);
1351                      result[i] = src[swz];
1352                   }
1353                   if (source->NegateBase & (1 << i))
1354                      result[i] = -result[i];
1355                }
1356                store_vector4( inst, machine, result );
1357             }
1358             break;
1359          case OPCODE_TEX: /* Both ARB and NV frag prog */
1360             /* Texel lookup */
1361             {
1362                /* Note: only use the precomputed lambda value when we're
1363                 * sampling texture unit [K] with texcoord[K].
1364                 * Otherwise, the lambda value may have no relation to the
1365                 * instruction's texcoord or texture image.  Using the wrong
1366                 * lambda is usually bad news.
1367                 * The rest of the time, just use zero (until we get a more
1368                 * sophisticated way of computing lambda).
1369                 */
1370                GLfloat coord[4], color[4], lambda;
1371 #if 0
1372                if (inst->SrcReg[0].File == PROGRAM_INPUT &&
1373                    inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0+inst->TexSrcUnit)
1374                   lambda = span->array->lambda[inst->TexSrcUnit][column];
1375                else
1376 #endif
1377                   lambda = 0.0;
1378                fetch_vector4(ctx, &inst->SrcReg[0], machine, coord);
1379                machine->FetchTexelLod(ctx, coord, lambda, inst->TexSrcUnit, color);
1380                if (DEBUG_PROG) {
1381                   printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g], "
1382                          "lod %f\n",
1383                          color[0], color[1], color[2], color[3],
1384                          inst->TexSrcUnit,
1385                          coord[0], coord[1], coord[2], coord[3], lambda);
1386                }
1387                store_vector4( inst, machine, color );
1388             }
1389             break;
1390          case OPCODE_TXB: /* GL_ARB_fragment_program only */
1391             /* Texel lookup with LOD bias */
1392             {
1393                const struct gl_texture_unit *texUnit
1394                   = &ctx->Texture.Unit[inst->TexSrcUnit];
1395                GLfloat coord[4], color[4], lambda, bias;
1396 #if 0
1397                if (inst->SrcReg[0].File == PROGRAM_INPUT &&
1398                    inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0+inst->TexSrcUnit)
1399                   lambda = span->array->lambda[inst->TexSrcUnit][column];
1400                else
1401 #endif
1402                   lambda = 0.0;
1403                fetch_vector4(ctx, &inst->SrcReg[0], machine, coord);
1404                /* coord[3] is the bias to add to lambda */
1405                bias = texUnit->LodBias + coord[3];
1406                if (texUnit->_Current)
1407                   bias += texUnit->_Current->LodBias;
1408                machine->FetchTexelLod(ctx, coord, lambda + bias,
1409                                       inst->TexSrcUnit, color);
1410                store_vector4( inst, machine, color );
1411             }
1412             break;
1413          case OPCODE_TXD: /* GL_NV_fragment_program only */
1414             /* Texture lookup w/ partial derivatives for LOD */
1415             {
1416                GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1417                fetch_vector4( ctx, &inst->SrcReg[0], machine, texcoord );
1418                fetch_vector4( ctx, &inst->SrcReg[1], machine, dtdx );
1419                fetch_vector4( ctx, &inst->SrcReg[2], machine, dtdy );
1420                machine->FetchTexelDeriv(ctx, texcoord, dtdx, dtdy,
1421                                         inst->TexSrcUnit, color );
1422                store_vector4( inst, machine, color );
1423             }
1424             break;
1425          case OPCODE_TXP: /* GL_ARB_fragment_program only */
1426             /* Texture lookup w/ projective divide */
1427             {
1428                GLfloat texcoord[4], color[4], lambda;
1429 #if 0
1430                if (inst->SrcReg[0].File == PROGRAM_INPUT &&
1431                    inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0+inst->TexSrcUnit)
1432                   lambda = span->array->lambda[inst->TexSrcUnit][column];
1433                else
1434 #endif
1435                   lambda = 0.0;
1436                fetch_vector4(ctx, &inst->SrcReg[0], machine, texcoord);
1437                /* Not so sure about this test - if texcoord[3] is
1438                 * zero, we'd probably be fine except for an ASSERT in
1439                 * IROUND_POS() which gets triggered by the inf values created.
1440                 */
1441                if (texcoord[3] != 0.0) {
1442                   texcoord[0] /= texcoord[3];
1443                   texcoord[1] /= texcoord[3];
1444                   texcoord[2] /= texcoord[3];
1445                }
1446                machine->FetchTexelLod(ctx, texcoord, lambda,
1447                                       inst->TexSrcUnit, color);
1448                store_vector4( inst, machine, color );
1449             }
1450             break;
1451          case OPCODE_TXP_NV: /* GL_NV_fragment_program only */
1452             /* Texture lookup w/ projective divide */
1453             {
1454                GLfloat texcoord[4], color[4], lambda;
1455 #if 0
1456                if (inst->SrcReg[0].File == PROGRAM_INPUT &&
1457                    inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0+inst->TexSrcUnit)
1458                   lambda = span->array->lambda[inst->TexSrcUnit][column];
1459                else
1460 #endif
1461                   lambda = 0.0;
1462                fetch_vector4(ctx, &inst->SrcReg[0], machine, texcoord);
1463                if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1464                    texcoord[3] != 0.0) {
1465                   texcoord[0] /= texcoord[3];
1466                   texcoord[1] /= texcoord[3];
1467                   texcoord[2] /= texcoord[3];
1468                }
1469                machine->FetchTexelLod(ctx, texcoord, lambda,
1470                                       inst->TexSrcUnit, color);
1471                store_vector4( inst, machine, color );
1472             }
1473             break;
1474          case OPCODE_UP2H: /* unpack two 16-bit floats */
1475             {
1476                GLfloat a[4], result[4];
1477                const GLuint *rawBits = (const GLuint *) a;
1478                GLhalfNV hx, hy;
1479                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
1480                hx = rawBits[0] & 0xffff;
1481                hy = rawBits[0] >> 16;
1482                result[0] = result[2] = _mesa_half_to_float(hx);
1483                result[1] = result[3] = _mesa_half_to_float(hy);
1484                store_vector4( inst, machine, result );
1485             }
1486             break;
1487          case OPCODE_UP2US: /* unpack two GLushorts */
1488             {
1489                GLfloat a[4], result[4];
1490                const GLuint *rawBits = (const GLuint *) a;
1491                GLushort usx, usy;
1492                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
1493                usx = rawBits[0] & 0xffff;
1494                usy = rawBits[0] >> 16;
1495                result[0] = result[2] = usx * (1.0f / 65535.0f);
1496                result[1] = result[3] = usy * (1.0f / 65535.0f);
1497                store_vector4( inst, machine, result );
1498             }
1499             break;
1500          case OPCODE_UP4B: /* unpack four GLbytes */
1501             {
1502                GLfloat a[4], result[4];
1503                const GLuint *rawBits = (const GLuint *) a;
1504                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
1505                result[0] = (((rawBits[0] >>  0) & 0xff) - 128) / 127.0F;
1506                result[1] = (((rawBits[0] >>  8) & 0xff) - 128) / 127.0F;
1507                result[2] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1508                result[3] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1509                store_vector4( inst, machine, result );
1510             }
1511             break;
1512          case OPCODE_UP4UB: /* unpack four GLubytes */
1513             {
1514                GLfloat a[4], result[4];
1515                const GLuint *rawBits = (const GLuint *) a;
1516                fetch_vector1( ctx, &inst->SrcReg[0], machine, a );
1517                result[0] = ((rawBits[0] >>  0) & 0xff) / 255.0F;
1518                result[1] = ((rawBits[0] >>  8) & 0xff) / 255.0F;
1519                result[2] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1520                result[3] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1521                store_vector4( inst, machine, result );
1522             }
1523             break;
1524          case OPCODE_XPD: /* cross product */
1525             {
1526                GLfloat a[4], b[4], result[4];
1527                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1528                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
1529                result[0] = a[1] * b[2] - a[2] * b[1];
1530                result[1] = a[2] * b[0] - a[0] * b[2];
1531                result[2] = a[0] * b[1] - a[1] * b[0];
1532                result[3] = 1.0;
1533                store_vector4( inst, machine, result );
1534             }
1535             break;
1536          case OPCODE_X2D: /* 2-D matrix transform */
1537             {
1538                GLfloat a[4], b[4], c[4], result[4];
1539                fetch_vector4( ctx, &inst->SrcReg[0], machine, a );
1540                fetch_vector4( ctx, &inst->SrcReg[1], machine, b );
1541                fetch_vector4( ctx, &inst->SrcReg[2], machine, c );
1542                result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1543                result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1544                result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1545                result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1546                store_vector4( inst, machine, result );
1547             }
1548             break;
1549          case OPCODE_PRINT:
1550             {
1551                if (inst->SrcReg[0].File != -1) {
1552                   GLfloat a[4];
1553                   fetch_vector4( ctx, &inst->SrcReg[0], machine, a);
1554                   _mesa_printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
1555                                a[0], a[1], a[2], a[3]);
1556                }
1557                else {
1558                   _mesa_printf("%s\n", (const char *) inst->Data);
1559                }
1560             }
1561             break;
1562          case OPCODE_END:
1563             return GL_TRUE;
1564          default:
1565             _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1566                           inst->Opcode);
1567             return GL_TRUE; /* return value doesn't matter */
1568
1569       }
1570       total++;
1571       if (total > MAX_EXEC) {
1572          _mesa_problem(ctx, "Infinite loop detected in fragment program");
1573          return GL_TRUE;
1574          abort();
1575       }
1576    }
1577
1578 #if FEATURE_MESA_program_debug
1579    CurrentMachine = NULL;
1580 #endif
1581
1582    return GL_TRUE;
1583 }