src/mesa/swrast/s_nvfragprog.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  5.1
   4  *
   5  * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25
  26 #include "glheader.h"
  27 #include "colormac.h"
  28 #include "context.h"
  29 #include "nvfragprog.h"
  30 #include "macros.h"
  31 #include "program.h"
  32
  33 #include "s_nvfragprog.h"
  34 #include "s_span.h"
  35 #include "s_texture.h"
  36
  37
  38 /* if 1, print some debugging info */
  39 #define DEBUG_FRAG 0
  40
  41
  42 /**
  43  * Fetch a texel.
  44  */
  45 static void
  46 fetch_texel( GLcontext *ctx, const GLfloat texcoord[4], GLfloat lambda,
  47              GLuint unit, GLfloat color[4] )
  48 {
  49    GLchan rgba[4];
  50    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  51
  52    swrast->TextureSample[unit](ctx, unit, ctx->Texture.Unit[unit]._Current,
  53                                1, (const GLfloat (*)[4]) texcoord,
  54                                &lambda, &rgba);
  55    color[0] = CHAN_TO_FLOAT(rgba[0]);
  56    color[1] = CHAN_TO_FLOAT(rgba[1]);
  57    color[2] = CHAN_TO_FLOAT(rgba[2]);
  58    color[3] = CHAN_TO_FLOAT(rgba[3]);
  59 }
  60
  61
  62 /**
  63  * Fetch a texel with the given partial derivatives to compute a level
  64  * of detail in the mipmap.
  65  */
  66 static void
  67 fetch_texel_deriv( GLcontext *ctx, const GLfloat texcoord[4],
  68                    const GLfloat texdx[4], const GLfloat texdy[4],
  69                    GLuint unit, GLfloat color[4] )
  70 {
  71    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  72    const struct gl_texture_object *texObj = ctx->Texture.Unit[unit]._Current;
  73    const struct gl_texture_image *texImg = texObj->Image[texObj->BaseLevel];
  74    const GLfloat texW = (GLfloat) texImg->WidthScale;
  75    const GLfloat texH = (GLfloat) texImg->HeightScale;
  76    GLchan rgba[4];
  77
  78    GLfloat lambda = _swrast_compute_lambda(texdx[0], texdy[0], /* ds/dx, ds/dy */
  79                                          texdx[1], texdy[1], /* dt/dx, dt/dy */
  80                                          texdx[3], texdy[2], /* dq/dx, dq/dy */
  81                                          texW, texH,
  82                                          texcoord[0], texcoord[1], texcoord[3],
  83                                          1.0F / texcoord[3]);
  84
  85    swrast->TextureSample[unit](ctx, unit, ctx->Texture.Unit[unit]._Current,
  86                                1, (const GLfloat (*)[4]) texcoord,
  87                                &lambda, &rgba);
  88    color[0] = CHAN_TO_FLOAT(rgba[0]);
  89    color[1] = CHAN_TO_FLOAT(rgba[1]);
  90    color[2] = CHAN_TO_FLOAT(rgba[2]);
  91    color[3] = CHAN_TO_FLOAT(rgba[3]);
  92 }
  93
  94
  95
  96 /**
  97  * Fetch a 4-element float vector from the given source register.
  98  * Apply swizzling and negating as needed.
  99  */
 100 static void
 101 fetch_vector4( GLcontext *ctx,
 102                const struct fp_src_register *source,
 103                struct fp_machine *machine,
 104                const struct fragment_program *program,
 105                GLfloat result[4] )
 106 {
 107    const GLfloat *src;
 108
 109    switch (source->File) {
 110       case PROGRAM_TEMPORARY:
 111          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_TEMPS);
 112          src = machine->Temporaries[source->Index];
 113          break;
 114       case PROGRAM_INPUT:
 115          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_INPUTS);
 116          src = machine->Inputs[source->Index];
 117          break;
 118       case PROGRAM_LOCAL_PARAM:
 119          ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 120          src = program->Base.LocalParams[source->Index];
 121          break;
 122       case PROGRAM_ENV_PARAM:
 123          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_PARAMS);
 124          src = ctx->FragmentProgram.Parameters[source->Index];
 125          break;
 126       case PROGRAM_NAMED_PARAM:
 127          ASSERT(source->Index < program->Parameters->NumParameters);
 128          src = program->Parameters->Parameters[source->Index].Values;
 129          break;
 130       case PROGRAM_STATE_VAR:
 131          abort();
 132       default:
 133          _mesa_problem(ctx, "Invalid input register file in fetch_vector4");
 134          return;
 135    }
 136
 137    result[0] = src[source->Swizzle[0]];
 138    result[1] = src[source->Swizzle[1]];
 139    result[2] = src[source->Swizzle[2]];
 140    result[3] = src[source->Swizzle[3]];
 141
 142    if (source->NegateBase) {
 143       result[0] = -result[0];
 144       result[1] = -result[1];
 145       result[2] = -result[2];
 146       result[3] = -result[3];
 147    }
 148    if (source->Abs) {
 149       result[0] = FABSF(result[0]);
 150       result[1] = FABSF(result[1]);
 151       result[2] = FABSF(result[2]);
 152       result[3] = FABSF(result[3]);
 153    }
 154    if (source->NegateAbs) {
 155       result[0] = -result[0];
 156       result[1] = -result[1];
 157       result[2] = -result[2];
 158       result[3] = -result[3];
 159    }
 160 }
 161
 162
 163 /**
 164  * Fetch the derivative with respect to X for the given register.
 165  * \return GL_TRUE if it was easily computed or GL_FALSE if we
 166  * need to execute another instance of the program (ugh)!
 167  */
 168 static GLboolean
 169 fetch_vector4_deriv( const struct fp_src_register *source,
 170                      const struct sw_span *span,
 171                      char xOrY, GLfloat result[4] )
 172 {
 173    GLfloat src[4];
 174
 175    ASSERT(xOrY == 'X' || xOrY == 'Y');
 176
 177    assert(source->File == PROGRAM_INPUT);
 178
 179    switch (source->Index) {
 180    case FRAG_ATTRIB_WPOS:
 181       if (xOrY == 'X') {
 182          src[0] = 1.0;
 183          src[1] = 0.0;
 184          src[2] = span->dzdx;
 185          src[3] = span->dwdx;
 186       }
 187       else {
 188          src[0] = 0.0;
 189          src[1] = 1.0;
 190          src[2] = span->dzdy;
 191          src[3] = span->dwdy;
 192       }
 193       break;
 194    case FRAG_ATTRIB_COL0:
 195       if (xOrY == 'X') {
 196          src[0] = span->drdx * (1.0F / CHAN_MAXF);
 197          src[1] = span->dgdx * (1.0F / CHAN_MAXF);
 198          src[2] = span->dbdx * (1.0F / CHAN_MAXF);
 199          src[3] = span->dadx * (1.0F / CHAN_MAXF);
 200       }
 201       else {
 202          src[0] = span->drdy * (1.0F / CHAN_MAXF);
 203          src[1] = span->dgdy * (1.0F / CHAN_MAXF);
 204          src[2] = span->dbdy * (1.0F / CHAN_MAXF);
 205          src[3] = span->dady * (1.0F / CHAN_MAXF);
 206       }
 207       break;
 208    case FRAG_ATTRIB_COL1:
 209       if (xOrY == 'X') {
 210          src[0] = span->dsrdx * (1.0F / CHAN_MAXF);
 211          src[1] = span->dsgdx * (1.0F / CHAN_MAXF);
 212          src[2] = span->dsbdx * (1.0F / CHAN_MAXF);
 213          src[3] = 0.0; /* XXX need this */
 214       }
 215       else {
 216          src[0] = span->dsrdy * (1.0F / CHAN_MAXF);
 217          src[1] = span->dsgdy * (1.0F / CHAN_MAXF);
 218          src[2] = span->dsbdy * (1.0F / CHAN_MAXF);
 219          src[3] = 0.0; /* XXX need this */
 220       }
 221       break;
 222    case FRAG_ATTRIB_FOGC:
 223       if (xOrY == 'X') {
 224          src[0] = span->dfogdx;
 225          src[1] = 0.0;
 226          src[2] = 0.0;
 227          src[3] = 0.0;
 228       }
 229       else {
 230          src[0] = span->dfogdy;
 231          src[1] = 0.0;
 232          src[2] = 0.0;
 233          src[3] = 0.0;
 234       }
 235       break;
 236    case FRAG_ATTRIB_TEX0:
 237    case FRAG_ATTRIB_TEX1:
 238    case FRAG_ATTRIB_TEX2:
 239    case FRAG_ATTRIB_TEX3:
 240    case FRAG_ATTRIB_TEX4:
 241    case FRAG_ATTRIB_TEX5:
 242    case FRAG_ATTRIB_TEX6:
 243    case FRAG_ATTRIB_TEX7:
 244       if (xOrY == 'X') {
 245          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 246          src[0] = span->texStepX[u][0] * (1.0F / CHAN_MAXF);
 247          src[1] = span->texStepX[u][1] * (1.0F / CHAN_MAXF);
 248          src[2] = span->texStepX[u][2] * (1.0F / CHAN_MAXF);
 249          src[3] = span->texStepX[u][3] * (1.0F / CHAN_MAXF);
 250       }
 251       else {
 252          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 253          src[0] = span->texStepY[u][0] * (1.0F / CHAN_MAXF);
 254          src[1] = span->texStepY[u][1] * (1.0F / CHAN_MAXF);
 255          src[2] = span->texStepY[u][2] * (1.0F / CHAN_MAXF);
 256          src[3] = span->texStepY[u][3] * (1.0F / CHAN_MAXF);
 257       }
 258       break;
 259    default:
 260       return GL_FALSE;
 261    }
 262
 263    result[0] = src[source->Swizzle[0]];
 264    result[1] = src[source->Swizzle[1]];
 265    result[2] = src[source->Swizzle[2]];
 266    result[3] = src[source->Swizzle[3]];
 267
 268    if (source->NegateBase) {
 269       result[0] = -result[0];
 270       result[1] = -result[1];
 271       result[2] = -result[2];
 272       result[3] = -result[3];
 273    }
 274    if (source->Abs) {
 275       result[0] = FABSF(result[0]);
 276       result[1] = FABSF(result[1]);
 277       result[2] = FABSF(result[2]);
 278       result[3] = FABSF(result[3]);
 279    }
 280    if (source->NegateAbs) {
 281       result[0] = -result[0];
 282       result[1] = -result[1];
 283       result[2] = -result[2];
 284       result[3] = -result[3];
 285    }
 286    return GL_TRUE;
 287 }
 288
 289
 290 /**
 291  * As above, but only return result[0] element.
 292  */
 293 static void
 294 fetch_vector1( GLcontext *ctx,
 295                const struct fp_src_register *source,
 296                const struct fp_machine *machine,
 297                const struct fragment_program *program,
 298                GLfloat result[4] )
 299 {
 300    const GLfloat *src;
 301
 302    switch (source->File) {
 303       case PROGRAM_TEMPORARY:
 304          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_TEMPS);
 305          src = machine->Temporaries[source->Index];
 306          break;
 307       case PROGRAM_INPUT:
 308          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_INPUTS);
 309          src = machine->Inputs[source->Index];
 310          break;
 311       case PROGRAM_LOCAL_PARAM:
 312          ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 313          src = program->Base.LocalParams[source->Index];
 314          break;
 315       case PROGRAM_ENV_PARAM:
 316          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_PARAMS);
 317          src = ctx->FragmentProgram.Parameters[source->Index];
 318          break;
 319       case PROGRAM_NAMED_PARAM:
 320          ASSERT(source->Index < program->Parameters->NumParameters);
 321          src = program->Parameters->Parameters[source->Index].Values;
 322          break;
 323       case PROGRAM_STATE_VAR:
 324          abort();
 325       default:
 326          _mesa_problem(ctx, "Invalid input register file in fetch_vector1");
 327          return;
 328    }
 329
 330    result[0] = src[source->Swizzle[0]];
 331
 332    if (source->NegateBase) {
 333       result[0] = -result[0];
 334    }
 335    if (source->Abs) {
 336       result[0] = FABSF(result[0]);
 337    }
 338    if (source->NegateAbs) {
 339       result[0] = -result[0];
 340    }
 341 }
 342
 343
 344 /*
 345  * Test value against zero and return GT, LT, EQ or UN if NaN.
 346  */
 347 static INLINE GLuint
 348 generate_cc( float value )
 349 {
 350    if (value != value)
 351       return COND_UN;  /* NaN */
 352    if (value > 0.0F)
 353       return COND_GT;
 354    if (value < 0.0F)
 355       return COND_LT;
 356    return COND_EQ;
 357 }
 358
 359 /*
 360  * Test if the ccMaskRule is satisfied by the given condition code.
 361  * Used to mask destination writes according to the current condition codee.
 362  */
 363 static INLINE GLboolean
 364 test_cc(GLuint condCode, GLuint ccMaskRule)
 365 {
 366    switch (ccMaskRule) {
 367    case COND_EQ: return (condCode == COND_EQ);
 368    case COND_NE: return (condCode != COND_EQ);
 369    case COND_LT: return (condCode == COND_LT);
 370    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 371    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 372    case COND_GT: return (condCode == COND_GT);
 373    case COND_TR: return GL_TRUE;
 374    case COND_FL: return GL_FALSE;
 375    default:      return GL_TRUE;
 376    }
 377 }
 378
 379
 380 /**
 381  * Store 4 floats into a register.  Observe the instructions saturate and
 382  * set-condition-code flags.
 383  */
 384 static void
 385 store_vector4( const struct fp_instruction *inst,
 386                struct fp_machine *machine,
 387                const GLfloat value[4] )
 388 {
 389    const struct fp_dst_register *dest = &(inst->DstReg);
 390    const GLboolean clamp = inst->Saturate;
 391    const GLboolean updateCC = inst->UpdateCondRegister;
 392    GLfloat *dstReg;
 393    GLfloat clampedValue[4];
 394    const GLboolean *writeMask = dest->WriteMask;
 395    GLboolean condWriteMask[4];
 396
 397    switch (dest->File) {
 398       case PROGRAM_OUTPUT:
 399          dstReg = machine->Outputs[dest->Index];
 400          break;
 401       case PROGRAM_TEMPORARY:
 402          dstReg = machine->Temporaries[dest->Index];
 403          break;
 404       default:
 405          _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 406          return;
 407    }
 408
 409 #if DEBUG_FRAG
 410    if (value[0] > 1.0e10 ||
 411        IS_INF_OR_NAN(value[0]) ||
 412        IS_INF_OR_NAN(value[1]) ||
 413        IS_INF_OR_NAN(value[2]) ||
 414        IS_INF_OR_NAN(value[3])  )
 415       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 416 #endif
 417
 418    if (clamp) {
 419       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 420       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 421       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 422       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 423       value = clampedValue;
 424    }
 425
 426    if (dest->CondMask != COND_TR) {
 427       condWriteMask[0] = writeMask[0]
 428          && test_cc(machine->CondCodes[dest->CondSwizzle[0]], dest->CondMask);
 429       condWriteMask[1] = writeMask[1]
 430          && test_cc(machine->CondCodes[dest->CondSwizzle[1]], dest->CondMask);
 431       condWriteMask[2] = writeMask[2]
 432          && test_cc(machine->CondCodes[dest->CondSwizzle[2]], dest->CondMask);
 433       condWriteMask[3] = writeMask[3]
 434          && test_cc(machine->CondCodes[dest->CondSwizzle[3]], dest->CondMask);
 435       writeMask = condWriteMask;
 436    }
 437
 438    if (writeMask[0]) {
 439       dstReg[0] = value[0];
 440       if (updateCC)
 441          machine->CondCodes[0] = generate_cc(value[0]);
 442    }
 443    if (writeMask[1]) {
 444       dstReg[1] = value[1];
 445       if (updateCC)
 446          machine->CondCodes[1] = generate_cc(value[1]);
 447    }
 448    if (writeMask[2]) {
 449       dstReg[2] = value[2];
 450       if (updateCC)
 451          machine->CondCodes[2] = generate_cc(value[2]);
 452    }
 453    if (writeMask[3]) {
 454       dstReg[3] = value[3];
 455       if (updateCC)
 456          machine->CondCodes[3] = generate_cc(value[3]);
 457    }
 458 }
 459
 460
 461 /**
 462  * Initialize a new machine state instance from an existing one, adding
 463  * the partial derivatives onto the input registers.
 464  * Used to implement DDX and DDY instructions in non-trivial cases.
 465  */
 466 static void
 467 init_machine_deriv( GLcontext *ctx,
 468                     const struct fp_machine *machine,
 469                     const struct fragment_program *program,
 470                     const struct sw_span *span, char xOrY,
 471                     struct fp_machine *dMachine )
 472 {
 473    GLuint u;
 474
 475    ASSERT(xOrY == 'X' || xOrY == 'Y');
 476
 477    /* copy existing machine */
 478    _mesa_memcpy(dMachine, machine, sizeof(struct fp_machine));
 479
 480    /* Clear temporary registers */
 481    _mesa_bzero( (void*) machine->Temporaries,
 482                MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
 483
 484    /* Add derivatives */
 485    if (program->InputsRead & (1 << FRAG_ATTRIB_WPOS)) {
 486       GLfloat *wpos = (GLfloat*) machine->Inputs[FRAG_ATTRIB_WPOS];
 487       if (xOrY == 'X') {
 488          wpos[0] += 1.0F;
 489          wpos[1] += 0.0F;
 490          wpos[2] += span->dzdx;
 491          wpos[3] += span->dwdx;
 492       }
 493       else {
 494          wpos[0] += 0.0F;
 495          wpos[1] += 1.0F;
 496          wpos[2] += span->dzdy;
 497          wpos[3] += span->dwdy;
 498       }
 499    }
 500    if (program->InputsRead & (1 << FRAG_ATTRIB_COL0)) {
 501       GLfloat *col0 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL0];
 502       if (xOrY == 'X') {
 503          col0[0] += span->drdx * (1.0F / CHAN_MAXF);
 504          col0[1] += span->dgdx * (1.0F / CHAN_MAXF);
 505          col0[2] += span->dbdx * (1.0F / CHAN_MAXF);
 506          col0[3] += span->dadx * (1.0F / CHAN_MAXF);
 507       }
 508       else {
 509          col0[0] += span->drdy * (1.0F / CHAN_MAXF);
 510          col0[1] += span->dgdy * (1.0F / CHAN_MAXF);
 511          col0[2] += span->dbdy * (1.0F / CHAN_MAXF);
 512          col0[3] += span->dady * (1.0F / CHAN_MAXF);
 513       }
 514    }
 515    if (program->InputsRead & (1 << FRAG_ATTRIB_COL1)) {
 516       GLfloat *col1 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL1];
 517       if (xOrY == 'X') {
 518          col1[0] += span->dsrdx * (1.0F / CHAN_MAXF);
 519          col1[1] += span->dsgdx * (1.0F / CHAN_MAXF);
 520          col1[2] += span->dsbdx * (1.0F / CHAN_MAXF);
 521          col1[3] += 0.0; /*XXX fix */
 522       }
 523       else {
 524          col1[0] += span->dsrdy * (1.0F / CHAN_MAXF);
 525          col1[1] += span->dsgdy * (1.0F / CHAN_MAXF);
 526          col1[2] += span->dsbdy * (1.0F / CHAN_MAXF);
 527          col1[3] += 0.0; /*XXX fix */
 528       }
 529    }
 530    if (program->InputsRead & (1 << FRAG_ATTRIB_FOGC)) {
 531       GLfloat *fogc = (GLfloat*) machine->Inputs[FRAG_ATTRIB_FOGC];
 532       if (xOrY == 'X') {
 533          fogc[0] += span->dfogdx;
 534       }
 535       else {
 536          fogc[0] += span->dfogdy;
 537       }
 538    }
 539    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
 540       if (program->InputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
 541          GLfloat *tex = (GLfloat*) machine->Inputs[FRAG_ATTRIB_TEX0 + u];
 542          if (xOrY == 'X') {
 543             tex[0] += span->texStepX[u][0];
 544             tex[1] += span->texStepX[u][1];
 545             tex[2] += span->texStepX[u][2];
 546             tex[3] += span->texStepX[u][3];
 547          }
 548          else {
 549             tex[0] += span->texStepY[u][0];
 550             tex[1] += span->texStepY[u][1];
 551             tex[2] += span->texStepY[u][2];
 552             tex[3] += span->texStepY[u][3];
 553          }
 554       }
 555    }
 556
 557    /* init condition codes */
 558    dMachine->CondCodes[0] = COND_EQ;
 559    dMachine->CondCodes[1] = COND_EQ;
 560    dMachine->CondCodes[2] = COND_EQ;
 561    dMachine->CondCodes[3] = COND_EQ;
 562 }
 563
 564
 565 /**
 566  * Execute the given fragment program.
 567  * NOTE: we do everything in single-precision floating point; we don't
 568  * currently observe the single/half/fixed-precision qualifiers.
 569  * \param ctx - rendering context
 570  * \param program - the fragment program to execute
 571  * \param machine - machine state (register file)
 572  * \param maxInst - max number of instructions to execute
 573  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 574  */
 575 static GLboolean
 576 execute_program( GLcontext *ctx,
 577                  const struct fragment_program *program, GLuint maxInst,
 578                  struct fp_machine *machine, const struct sw_span *span,
 579                  GLuint column )
 580 {
 581    GLuint pc;
 582
 583 #if DEBUG_FRAG
 584    printf("execute fragment program --------------------\n");
 585 #endif
 586
 587    for (pc = 0; pc < maxInst; pc++) {
 588       const struct fp_instruction *inst = program->Instructions + pc;
 589
 590       if (ctx->FragmentProgram.CallbackEnabled &&
 591           ctx->FragmentProgram.Callback) {
 592          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 593          ctx->FragmentProgram.Callback(program->Base.Target,
 594                                        ctx->FragmentProgram.CallbackData);
 595       }
 596
 597       switch (inst->Opcode) {
 598          case FP_OPCODE_ADD:
 599             {
 600                GLfloat a[4], b[4], result[4];
 601                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 602                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 603                result[0] = a[0] + b[0];
 604                result[1] = a[1] + b[1];
 605                result[2] = a[2] + b[2];
 606                result[3] = a[3] + b[3];
 607                store_vector4( inst, machine, result );
 608             }
 609             break;
 610          case FP_OPCODE_COS:
 611             {
 612                GLfloat a[4], result[4];
 613                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 614                result[0] = result[1] = result[2] = result[3] = (GLfloat)_mesa_cos(a[0]);
 615                store_vector4( inst, machine, result );
 616             }
 617             break;
 618          case FP_OPCODE_DDX: /* Partial derivative with respect to X */
 619             {
 620                GLfloat a[4], aNext[4], result[4];
 621                struct fp_machine dMachine;
 622                if (!fetch_vector4_deriv(&inst->SrcReg[0], span, 'X', result)) {
 623                   /* This is tricky.  Make a copy of the current machine state,
 624                    * increment the input registers by the dx or dy partial
 625                    * derivatives, then re-execute the program up to the
 626                    * preceeding instruction, then fetch the source register.
 627                    * Finally, find the difference in the register values for
 628                    * the original and derivative runs.
 629                    */
 630                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 631                   init_machine_deriv(ctx, machine, program, span,
 632                                      'X', &dMachine);
 633                   execute_program(ctx, program, pc, &dMachine, span, column);
 634                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 635                   result[0] = aNext[0] - a[0];
 636                   result[1] = aNext[1] - a[1];
 637                   result[2] = aNext[2] - a[2];
 638                   result[3] = aNext[3] - a[3];
 639                }
 640                store_vector4( inst, machine, result );
 641             }
 642             break;
 643          case FP_OPCODE_DDY: /* Partial derivative with respect to Y */
 644             {
 645                GLfloat a[4], aNext[4], result[4];
 646                struct fp_machine dMachine;
 647                if (!fetch_vector4_deriv(&inst->SrcReg[0], span, 'Y', result)) {
 648                   init_machine_deriv(ctx, machine, program, span,
 649                                      'Y', &dMachine);
 650                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 651                   execute_program(ctx, program, pc, &dMachine, span, column);
 652                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 653                   result[0] = aNext[0] - a[0];
 654                   result[1] = aNext[1] - a[1];
 655                   result[2] = aNext[2] - a[2];
 656                   result[3] = aNext[3] - a[3];
 657                }
 658                store_vector4( inst, machine, result );
 659             }
 660             break;
 661          case FP_OPCODE_DP3:
 662             {
 663                GLfloat a[4], b[4], result[4];
 664                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 665                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 666                result[0] = result[1] = result[2] = result[3] =
 667                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
 668                store_vector4( inst, machine, result );
 669 #if DEBUG_FRAG
 670                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 671                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 672 #endif
 673             }
 674             break;
 675          case FP_OPCODE_DP4:
 676             {
 677                GLfloat a[4], b[4], result[4];
 678                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 679                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 680                result[0] = result[1] = result[2] = result[3] =
 681                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 682                store_vector4( inst, machine, result );
 683             }
 684             break;
 685          case FP_OPCODE_DST: /* Distance vector */
 686             {
 687                GLfloat a[4], b[4], result[4];
 688                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 689                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 690                result[0] = 1.0F;
 691                result[1] = a[1] * b[1];
 692                result[2] = a[2];
 693                result[3] = b[3];
 694                store_vector4( inst, machine, result );
 695             }
 696             break;
 697          case FP_OPCODE_EX2: /* Exponential base 2 */
 698             {
 699                GLfloat a[4], result[4];
 700                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 701                result[0] = result[1] = result[2] = result[3] =
 702                   (GLfloat) _mesa_pow(2.0, a[0]);
 703                store_vector4( inst, machine, result );
 704             }
 705             break;
 706          case FP_OPCODE_FLR:
 707             {
 708                GLfloat a[4], result[4];
 709                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 710                result[0] = FLOORF(a[0]);
 711                result[1] = FLOORF(a[1]);
 712                result[2] = FLOORF(a[2]);
 713                result[3] = FLOORF(a[3]);
 714                store_vector4( inst, machine, result );
 715             }
 716             break;
 717          case FP_OPCODE_FRC:
 718             {
 719                GLfloat a[4], result[4];
 720                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 721                result[0] = a[0] - FLOORF(a[0]);
 722                result[1] = a[1] - FLOORF(a[1]);
 723                result[2] = a[2] - FLOORF(a[2]);
 724                result[3] = a[3] - FLOORF(a[3]);
 725                store_vector4( inst, machine, result );
 726             }
 727             break;
 728          case FP_OPCODE_KIL:
 729             {
 730                const GLuint *swizzle = inst->DstReg.CondSwizzle;
 731                const GLuint condMask = inst->DstReg.CondMask;
 732                if (test_cc(machine->CondCodes[swizzle[0]], condMask) ||
 733                    test_cc(machine->CondCodes[swizzle[1]], condMask) ||
 734                    test_cc(machine->CondCodes[swizzle[2]], condMask) ||
 735                    test_cc(machine->CondCodes[swizzle[3]], condMask)) {
 736                   return GL_FALSE;
 737                }
 738             }
 739             break;
 740          case FP_OPCODE_LG2:  /* log base 2 */
 741             {
 742                GLfloat a[4], result[4];
 743                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 744                result[0] = result[1] = result[2] = result[3]
 745                   = LOG2(a[0]);
 746                store_vector4( inst, machine, result );
 747             }
 748             break;
 749          case FP_OPCODE_LIT:
 750             {
 751                GLfloat a[4], result[4];
 752                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 753                if (a[0] < 0.0F)
 754                   a[0] = 0.0F;
 755                if (a[1] < 0.0F)
 756                   a[1] = 0.0F;
 757                result[0] = 1.0F;
 758                result[1] = a[0];
 759                result[2] = (a[0] > 0.0F) ? (GLfloat)_mesa_pow(2.0, a[3]) : 0.0F;
 760                result[3] = 1.0F;
 761                store_vector4( inst, machine, result );
 762             }
 763             break;
 764          case FP_OPCODE_LRP:
 765             {
 766                GLfloat a[4], b[4], c[4], result[4];
 767                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 768                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 769                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 770                result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 771                result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 772                result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 773                result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 774                store_vector4( inst, machine, result );
 775             }
 776             break;
 777          case FP_OPCODE_MAD:
 778             {
 779                GLfloat a[4], b[4], c[4], result[4];
 780                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 781                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 782                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 783                result[0] = a[0] * b[0] + c[0];
 784                result[1] = a[1] * b[1] + c[1];
 785                result[2] = a[2] * b[2] + c[2];
 786                result[3] = a[3] * b[3] + c[3];
 787                store_vector4( inst, machine, result );
 788             }
 789             break;
 790          case FP_OPCODE_MAX:
 791             {
 792                GLfloat a[4], b[4], result[4];
 793                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 794                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 795                result[0] = MAX2(a[0], b[0]);
 796                result[1] = MAX2(a[1], b[1]);
 797                result[2] = MAX2(a[2], b[2]);
 798                result[3] = MAX2(a[3], b[3]);
 799                store_vector4( inst, machine, result );
 800             }
 801             break;
 802          case FP_OPCODE_MIN:
 803             {
 804                GLfloat a[4], b[4], result[4];
 805                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 806                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 807                result[0] = MIN2(a[0], b[0]);
 808                result[1] = MIN2(a[1], b[1]);
 809                result[2] = MIN2(a[2], b[2]);
 810                result[3] = MIN2(a[3], b[3]);
 811                store_vector4( inst, machine, result );
 812             }
 813             break;
 814          case FP_OPCODE_MOV:
 815             {
 816                GLfloat result[4];
 817                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, result );
 818                store_vector4( inst, machine, result );
 819             }
 820             break;
 821          case FP_OPCODE_MUL:
 822             {
 823                GLfloat a[4], b[4], result[4];
 824                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 825                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 826                result[0] = a[0] * b[0];
 827                result[1] = a[1] * b[1];
 828                result[2] = a[2] * b[2];
 829                result[3] = a[3] * b[3];
 830                store_vector4( inst, machine, result );
 831 #if DEBUG_FRAG
 832                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 833                       result[0], result[1], result[2], result[3],
 834                       a[0], a[1], a[2], a[3],
 835                       b[0], b[1], b[2], b[3]);
 836 #endif
 837             }
 838             break;
 839          case FP_OPCODE_PK2H: /* pack two 16-bit floats */
 840             /* XXX this is probably wrong */
 841             {
 842                GLfloat a[4], result[4];
 843                const GLuint *rawBits = (const GLuint *) a;
 844                GLuint *rawResult = (GLuint *) result;
 845                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 846                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 847                   = rawBits[0] | (rawBits[1] << 16);
 848                store_vector4( inst, machine, result );
 849             }
 850             break;
 851          case FP_OPCODE_PK2US: /* pack two GLushorts */
 852             {
 853                GLfloat a[4], result[4];
 854                GLuint usx, usy, *rawResult = (GLuint *) result;
 855                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 856                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 857                a[1] = CLAMP(a[0], 0.0F, 1.0F);
 858                usx = IROUND(a[0] * 65535.0F);
 859                usy = IROUND(a[1] * 65535.0F);
 860                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 861                   = usx | (usy << 16);
 862                store_vector4( inst, machine, result );
 863             }
 864             break;
 865          case FP_OPCODE_PK4B: /* pack four GLbytes */
 866             {
 867                GLfloat a[4], result[4];
 868                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 869                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 870                a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
 871                a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
 872                a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
 873                a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
 874                ubx = IROUND(127.0F * a[0] + 128.0F);
 875                uby = IROUND(127.0F * a[1] + 128.0F);
 876                ubz = IROUND(127.0F * a[2] + 128.0F);
 877                ubw = IROUND(127.0F * a[3] + 128.0F);
 878                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 879                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 880                store_vector4( inst, machine, result );
 881             }
 882             break;
 883          case FP_OPCODE_PK4UB: /* pack four GLubytes */
 884             {
 885                GLfloat a[4], result[4];
 886                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 887                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 888                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 889                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 890                a[2] = CLAMP(a[2], 0.0F, 1.0F);
 891                a[3] = CLAMP(a[3], 0.0F, 1.0F);
 892                ubx = IROUND(255.0F * a[0]);
 893                uby = IROUND(255.0F * a[1]);
 894                ubz = IROUND(255.0F * a[2]);
 895                ubw = IROUND(255.0F * a[3]);
 896                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 897                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 898                store_vector4( inst, machine, result );
 899             }
 900             break;
 901          case FP_OPCODE_POW:
 902             {
 903                GLfloat a[4], b[4], result[4];
 904                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 905                fetch_vector1( ctx, &inst->SrcReg[1], machine, program, b );
 906                result[0] = result[1] = result[2] = result[3]
 907                   = (GLfloat)_mesa_pow(a[0], b[0]);
 908                store_vector4( inst, machine, result );
 909             }
 910             break;
 911          case FP_OPCODE_RCP:
 912             {
 913                GLfloat a[4], result[4];
 914                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 915 #if DEBUG_FRAG
 916                if (a[0] == 0)
 917                   printf("RCP(0)\n");
 918                else if (IS_INF_OR_NAN(a[0]))
 919                   printf("RCP(inf)\n");
 920 #endif
 921                result[0] = result[1] = result[2] = result[3]
 922                   = 1.0F / a[0];
 923                store_vector4( inst, machine, result );
 924             }
 925             break;
 926          case FP_OPCODE_RFL:
 927             {
 928                GLfloat axis[4], dir[4], result[4], tmp[4];
 929                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, axis );
 930                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dir );
 931                tmp[3] = axis[0] * axis[0]
 932                       + axis[1] * axis[1]
 933                       + axis[2] * axis[2];
 934                tmp[0] = (2.0F * (axis[0] * dir[0] +
 935                                  axis[1] * dir[1] +
 936                                  axis[2] * dir[2])) / tmp[3];
 937                result[0] = tmp[0] * axis[0] - dir[0];
 938                result[1] = tmp[0] * axis[1] - dir[1];
 939                result[2] = tmp[0] * axis[2] - dir[2];
 940                /* result[3] is never written! XXX enforce in parser! */
 941                store_vector4( inst, machine, result );
 942             }
 943             break;
 944          case FP_OPCODE_RSQ: /* 1 / sqrt() */
 945             {
 946                GLfloat a[4], result[4];
 947                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 948                result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
 949                store_vector4( inst, machine, result );
 950 #if DEBUG_FRAG
 951                printf("RSQ %g = 1/sqrt(%g)\n", result[0], a[0]);
 952 #endif
 953             }
 954             break;
 955          case FP_OPCODE_SEQ: /* set on equal */
 956             {
 957                GLfloat a[4], b[4], result[4];
 958                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 959                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 960                result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
 961                result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
 962                result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
 963                result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
 964                store_vector4( inst, machine, result );
 965             }
 966             break;
 967          case FP_OPCODE_SFL: /* set false, operands ignored */
 968             {
 969                static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
 970                store_vector4( inst, machine, result );
 971             }
 972             break;
 973          case FP_OPCODE_SGE: /* set on greater or equal */
 974             {
 975                GLfloat a[4], b[4], result[4];
 976                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 977                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 978                result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
 979                result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
 980                result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
 981                result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
 982                store_vector4( inst, machine, result );
 983             }
 984             break;
 985          case FP_OPCODE_SGT: /* set on greater */
 986             {
 987                GLfloat a[4], b[4], result[4];
 988                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 989                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 990                result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
 991                result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
 992                result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
 993                result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
 994                store_vector4( inst, machine, result );
 995             }
 996             break;
 997          case FP_OPCODE_SIN:
 998             {
 999                GLfloat a[4], result[4];
1000                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1001                result[0] = result[1] = result[2] =
1002                        result[3] = (GLfloat)_mesa_sin(a[0]);
1003                store_vector4( inst, machine, result );
1004             }
1005             break;
1006          case FP_OPCODE_SLE: /* set on less or equal */
1007             {
1008                GLfloat a[4], b[4], result[4];
1009                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1010                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1011                result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1012                result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1013                result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1014                result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1015                store_vector4( inst, machine, result );
1016             }
1017             break;
1018          case FP_OPCODE_SLT: /* set on less */
1019             {
1020                GLfloat a[4], b[4], result[4];
1021                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1022                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1023                result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1024                result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1025                result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1026                result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1027                store_vector4( inst, machine, result );
1028             }
1029             break;
1030          case FP_OPCODE_SNE: /* set on not equal */
1031             {
1032                GLfloat a[4], b[4], result[4];
1033                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1034                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1035                result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1036                result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1037                result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1038                result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1039                store_vector4( inst, machine, result );
1040             }
1041             break;
1042          case FP_OPCODE_STR: /* set true, operands ignored */
1043             {
1044                static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1045                store_vector4( inst, machine, result );
1046             }
1047             break;
1048          case FP_OPCODE_SUB:
1049             {
1050                GLfloat a[4], b[4], result[4];
1051                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1052                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1053                result[0] = a[0] - b[0];
1054                result[1] = a[1] - b[1];
1055                result[2] = a[2] - b[2];
1056                result[3] = a[3] - b[3];
1057                store_vector4( inst, machine, result );
1058             }
1059             break;
1060          case FP_OPCODE_TEX:
1061             /* Texel lookup */
1062             {
1063                GLfloat texcoord[4], color[4];
1064                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1065                /* XXX: Undo perspective divide from interpolate_texcoords() */
1066                fetch_texel( ctx, texcoord,
1067                             span->array->lambda[inst->TexSrcUnit][column],
1068                             inst->TexSrcUnit, color );
1069                store_vector4( inst, machine, color );
1070             }
1071             break;
1072          case FP_OPCODE_TXD:
1073             /* Texture lookup w/ partial derivatives for LOD */
1074             {
1075                GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1076                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1077                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dtdx );
1078                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, dtdy );
1079                fetch_texel_deriv( ctx, texcoord, dtdx, dtdy, inst->TexSrcUnit,
1080                                   color );
1081                store_vector4( inst, machine, color );
1082             }
1083             break;
1084          case FP_OPCODE_TXP:
1085             /* Texture lookup w/ perspective divide */
1086             {
1087                GLfloat texcoord[4], color[4];
1088                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1089                /* Already did perspective divide in interpolate_texcoords() */
1090                fetch_texel( ctx, texcoord,
1091                             span->array->lambda[inst->TexSrcUnit][column],
1092                             inst->TexSrcUnit, color );
1093                store_vector4( inst, machine, color );
1094             }
1095             break;
1096          case FP_OPCODE_UP2H: /* unpack two 16-bit floats */
1097             /* XXX this is probably wrong */
1098             {
1099                GLfloat a[4], result[4];
1100                const GLuint *rawBits = (const GLuint *) a;
1101                GLuint *rawResult = (GLuint *) result;
1102                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1103                rawResult[0] = rawBits[0] & 0xffff;
1104                rawResult[1] = (rawBits[0] >> 16) & 0xffff;
1105                rawResult[2] = rawBits[0] & 0xffff;
1106                rawResult[3] = (rawBits[0] >> 16) & 0xffff;
1107                store_vector4( inst, machine, result );
1108             }
1109             break;
1110          case FP_OPCODE_UP2US: /* unpack two GLushorts */
1111             {
1112                GLfloat a[4], result[4];
1113                const GLuint *rawBits = (const GLuint *) a;
1114                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1115                result[0] = (GLfloat) ((rawBits[0] >>  0) & 0xffff) / 65535.0F;
1116                result[1] = (GLfloat) ((rawBits[0] >> 16) & 0xffff) / 65535.0F;
1117                result[2] = result[0];
1118                result[3] = result[1];
1119                store_vector4( inst, machine, result );
1120             }
1121             break;
1122          case FP_OPCODE_UP4B: /* unpack four GLbytes */
1123             {
1124                GLfloat a[4], result[4];
1125                const GLuint *rawBits = (const GLuint *) a;
1126                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1127                result[0] = (((rawBits[0] >>  0) & 0xff) - 128) / 127.0F;
1128                result[0] = (((rawBits[0] >>  8) & 0xff) - 128) / 127.0F;
1129                result[0] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1130                result[0] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1131                store_vector4( inst, machine, result );
1132             }
1133             break;
1134          case FP_OPCODE_UP4UB: /* unpack four GLubytes */
1135             {
1136                GLfloat a[4], result[4];
1137                const GLuint *rawBits = (const GLuint *) a;
1138                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1139                result[0] = ((rawBits[0] >>  0) & 0xff) / 255.0F;
1140                result[0] = ((rawBits[0] >>  8) & 0xff) / 255.0F;
1141                result[0] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1142                result[0] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1143                store_vector4( inst, machine, result );
1144             }
1145             break;
1146          case FP_OPCODE_X2D: /* 2-D matrix transform */
1147             {
1148                GLfloat a[4], b[4], c[4], result[4];
1149                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1150                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1151                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
1152                result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1153                result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1154                result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1155                result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1156                store_vector4( inst, machine, result );
1157             }
1158             break;
1159          case FP_OPCODE_END:
1160             return GL_TRUE;
1161          default:
1162             _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1163                           inst->Opcode);
1164             return GL_TRUE; /* return value doesn't matter */
1165       }
1166    }
1167    return GL_TRUE;
1168 }
1169
1170
1171 static void
1172 init_machine( GLcontext *ctx, struct fp_machine *machine,
1173               const struct fragment_program *program,
1174               const struct sw_span *span, GLuint col )
1175 {
1176    GLuint inputsRead = program->InputsRead;
1177    GLuint u;
1178
1179    if (ctx->FragmentProgram.CallbackEnabled)
1180       inputsRead = ~0;
1181
1182    /* Clear temporary registers */
1183    _mesa_bzero(machine->Temporaries,
1184                MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
1185
1186    /* Load input registers */
1187    if (inputsRead & (1 << FRAG_ATTRIB_WPOS)) {
1188       GLfloat *wpos = machine->Inputs[FRAG_ATTRIB_WPOS];
1189       wpos[0] = (GLfloat) span->x + col;
1190       wpos[1] = (GLfloat) span->y;
1191       wpos[2] = (GLfloat) span->array->z[col] / ctx->DepthMaxF;
1192       wpos[3] = span->w + col * span->dwdx;
1193    }
1194    if (inputsRead & (1 << FRAG_ATTRIB_COL0)) {
1195       GLfloat *col0 = machine->Inputs[FRAG_ATTRIB_COL0];
1196       col0[0] = CHAN_TO_FLOAT(span->array->rgba[col][RCOMP]);
1197       col0[1] = CHAN_TO_FLOAT(span->array->rgba[col][GCOMP]);
1198       col0[2] = CHAN_TO_FLOAT(span->array->rgba[col][BCOMP]);
1199       col0[3] = CHAN_TO_FLOAT(span->array->rgba[col][ACOMP]);
1200    }
1201    if (inputsRead & (1 << FRAG_ATTRIB_COL1)) {
1202       GLfloat *col1 = machine->Inputs[FRAG_ATTRIB_COL1];
1203       col1[0] = CHAN_TO_FLOAT(span->array->spec[col][RCOMP]);
1204       col1[1] = CHAN_TO_FLOAT(span->array->spec[col][GCOMP]);
1205       col1[2] = CHAN_TO_FLOAT(span->array->spec[col][BCOMP]);
1206       col1[3] = CHAN_TO_FLOAT(span->array->spec[col][ACOMP]);
1207    }
1208    if (inputsRead & (1 << FRAG_ATTRIB_FOGC)) {
1209       GLfloat *fogc = machine->Inputs[FRAG_ATTRIB_FOGC];
1210       fogc[0] = span->array->fog[col];
1211       fogc[1] = 0.0F;
1212       fogc[2] = 0.0F;
1213       fogc[3] = 0.0F;
1214    }
1215    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
1216       if (inputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
1217          GLfloat *tex = machine->Inputs[FRAG_ATTRIB_TEX0 + u];
1218          /*ASSERT(ctx->Texture._EnabledCoordUnits & (1 << u));*/
1219          COPY_4V(tex, span->array->texcoords[u][col]);
1220          /*ASSERT(tex[0] != 0 || tex[1] != 0 || tex[2] != 0);*/
1221       }
1222    }
1223
1224    /* init condition codes */
1225    machine->CondCodes[0] = COND_EQ;
1226    machine->CondCodes[1] = COND_EQ;
1227    machine->CondCodes[2] = COND_EQ;
1228    machine->CondCodes[3] = COND_EQ;
1229 }
1230
1231
1232 void
1233 _swrast_exec_nv_fragment_program( GLcontext *ctx, struct sw_span *span )
1234 {
1235    const struct fragment_program *program = ctx->FragmentProgram.Current;
1236    GLuint i;
1237
1238    ctx->_CurrentProgram = GL_FRAGMENT_PROGRAM_ARB; /* or NV, doesn't matter */
1239
1240    for (i = 0; i < span->end; i++) {
1241       if (span->array->mask[i]) {
1242          init_machine(ctx, &ctx->FragmentProgram.Machine,
1243                       ctx->FragmentProgram.Current, span, i);
1244
1245          if (!execute_program(ctx, program, ~0,
1246                               &ctx->FragmentProgram.Machine, span, i)) {
1247             span->array->mask[i] = GL_FALSE;  /* killed fragment */
1248          }
1249
1250          /* Store output registers */
1251          {
1252             const GLfloat *colOut
1253                = ctx->FragmentProgram.Machine.Outputs[FRAG_OUTPUT_COLR];
1254             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][RCOMP], colOut[0]);
1255             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][GCOMP], colOut[1]);
1256             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][BCOMP], colOut[2]);
1257             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][ACOMP], colOut[3]);
1258          }
1259          /* depth value */
1260          if (program->OutputsWritten & (1 << FRAG_OUTPUT_DEPR))
1261             span->array->z[i] = IROUND(ctx->FragmentProgram.Machine.Outputs[FRAG_OUTPUT_DEPR][0] * ctx->DepthMaxF);
1262       }
1263    }
1264
1265    ctx->_CurrentProgram = 0;
1266 }
1267