src/mesa/swrast/s_nvfragprog.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  5.1
   4  *
   5  * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25
  26 #include "glheader.h"
  27 #include "colormac.h"
  28 #include "context.h"
  29 #include "nvfragprog.h"
  30 #include "macros.h"
  31 #include "program.h"
  32
  33 #include "s_nvfragprog.h"
  34 #include "s_span.h"
  35 #include "s_texture.h"
  36
  37
  38 /* if 1, print some debugging info */
  39 #define DEBUG_FRAG 0
  40
  41 /**
  42  * Fetch a texel.
  43  */
  44 static void
  45 fetch_texel( GLcontext *ctx, const GLfloat texcoord[4], GLfloat lambda,
  46              GLuint unit, GLfloat color[4] )
  47 {
  48    GLchan rgba[4];
  49    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  50
  51    swrast->TextureSample[unit](ctx, unit, ctx->Texture.Unit[unit]._Current,
  52                                1, (const GLfloat (*)[4]) texcoord,
  53                                &lambda, &rgba);
  54    color[0] = CHAN_TO_FLOAT(rgba[0]);
  55    color[1] = CHAN_TO_FLOAT(rgba[1]);
  56    color[2] = CHAN_TO_FLOAT(rgba[2]);
  57    color[3] = CHAN_TO_FLOAT(rgba[3]);
  58 }
  59
  60
  61 /**
  62  * Fetch a texel with the given partial derivatives to compute a level
  63  * of detail in the mipmap.
  64  */
  65 static void
  66 fetch_texel_deriv( GLcontext *ctx, const GLfloat texcoord[4],
  67                    const GLfloat texdx[4], const GLfloat texdy[4],
  68                    GLuint unit, GLfloat color[4] )
  69 {
  70    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  71    const struct gl_texture_object *texObj = ctx->Texture.Unit[unit]._Current;
  72    const struct gl_texture_image *texImg = texObj->Image[texObj->BaseLevel];
  73    const GLfloat texW = (GLfloat) texImg->WidthScale;
  74    const GLfloat texH = (GLfloat) texImg->HeightScale;
  75    GLchan rgba[4];
  76
  77    GLfloat lambda = _swrast_compute_lambda(texdx[0], texdy[0], /* ds/dx, ds/dy */
  78                                          texdx[1], texdy[1], /* dt/dx, dt/dy */
  79                                          texdx[3], texdy[2], /* dq/dx, dq/dy */
  80                                          texW, texH,
  81                                          texcoord[0], texcoord[1], texcoord[3],
  82                                          1.0F / texcoord[3]);
  83
  84    swrast->TextureSample[unit](ctx, unit, ctx->Texture.Unit[unit]._Current,
  85                                1, (const GLfloat (*)[4]) texcoord,
  86                                &lambda, &rgba);
  87    color[0] = CHAN_TO_FLOAT(rgba[0]);
  88    color[1] = CHAN_TO_FLOAT(rgba[1]);
  89    color[2] = CHAN_TO_FLOAT(rgba[2]);
  90    color[3] = CHAN_TO_FLOAT(rgba[3]);
  91 }
  92
  93
  94 /**
  95  * Return a pointer to the 4-element float vector specified by the given
  96  * source register.
  97  */
  98 static INLINE const GLfloat *
  99 get_register_pointer( GLcontext *ctx,
 100                       const struct fp_src_register *source,
 101                       const struct fp_machine *machine,
 102                       const struct fragment_program *program )
 103 {
 104    const GLfloat *src;
 105    switch (source->File) {
 106       case PROGRAM_TEMPORARY:
 107          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_TEMPS);
 108          src = machine->Temporaries[source->Index];
 109          break;
 110       case PROGRAM_INPUT:
 111          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_INPUTS);
 112          src = machine->Inputs[source->Index];
 113          break;
 114       case PROGRAM_LOCAL_PARAM:
 115          ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 116          src = program->Base.LocalParams[source->Index];
 117          break;
 118       case PROGRAM_ENV_PARAM:
 119          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_PARAMS);
 120          src = ctx->FragmentProgram.Parameters[source->Index];
 121          break;
 122
 123       case PROGRAM_STATE_VAR:
 124                         /* Fallthrough */
 125
 126       case PROGRAM_NAMED_PARAM:
 127          ASSERT(source->Index < (GLint) program->Parameters->NumParameters);
 128          src = program->Parameters->Parameters[source->Index].Values;
 129          break;
 130       default:
 131          _mesa_problem(ctx, "Invalid input register file in fetch_vector4");
 132          src = NULL;
 133    }
 134    return src;
 135 }
 136
 137
 138 /**
 139  * Fetch a 4-element float vector from the given source register.
 140  * Apply swizzling and negating as needed.
 141  */
 142 static void
 143 fetch_vector4( GLcontext *ctx,
 144                const struct fp_src_register *source,
 145                const struct fp_machine *machine,
 146                const struct fragment_program *program,
 147                GLfloat result[4] )
 148 {
 149    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 150    ASSERT(src);
 151
 152    result[0] = src[source->Swizzle[0]];
 153    result[1] = src[source->Swizzle[1]];
 154    result[2] = src[source->Swizzle[2]];
 155    result[3] = src[source->Swizzle[3]];
 156
 157    if (source->NegateBase) {
 158       result[0] = -result[0];
 159       result[1] = -result[1];
 160       result[2] = -result[2];
 161       result[3] = -result[3];
 162    }
 163    if (source->Abs) {
 164       result[0] = FABSF(result[0]);
 165       result[1] = FABSF(result[1]);
 166       result[2] = FABSF(result[2]);
 167       result[3] = FABSF(result[3]);
 168    }
 169    if (source->NegateAbs) {
 170       result[0] = -result[0];
 171       result[1] = -result[1];
 172       result[2] = -result[2];
 173       result[3] = -result[3];
 174    }
 175 }
 176
 177
 178 /**
 179  * Fetch the derivative with respect to X for the given register.
 180  * \return GL_TRUE if it was easily computed or GL_FALSE if we
 181  * need to execute another instance of the program (ugh)!
 182  */
 183 static GLboolean
 184 fetch_vector4_deriv( const struct fp_src_register *source,
 185                      const struct sw_span *span,
 186                      char xOrY, GLfloat result[4] )
 187 {
 188    GLfloat src[4];
 189
 190    ASSERT(xOrY == 'X' || xOrY == 'Y');
 191
 192    assert(source->File == PROGRAM_INPUT);
 193
 194    switch (source->Index) {
 195    case FRAG_ATTRIB_WPOS:
 196       if (xOrY == 'X') {
 197          src[0] = 1.0;
 198          src[1] = 0.0;
 199          src[2] = span->dzdx;
 200          src[3] = span->dwdx;
 201       }
 202       else {
 203          src[0] = 0.0;
 204          src[1] = 1.0;
 205          src[2] = span->dzdy;
 206          src[3] = span->dwdy;
 207       }
 208       break;
 209    case FRAG_ATTRIB_COL0:
 210       if (xOrY == 'X') {
 211          src[0] = span->drdx * (1.0F / CHAN_MAXF);
 212          src[1] = span->dgdx * (1.0F / CHAN_MAXF);
 213          src[2] = span->dbdx * (1.0F / CHAN_MAXF);
 214          src[3] = span->dadx * (1.0F / CHAN_MAXF);
 215       }
 216       else {
 217          src[0] = span->drdy * (1.0F / CHAN_MAXF);
 218          src[1] = span->dgdy * (1.0F / CHAN_MAXF);
 219          src[2] = span->dbdy * (1.0F / CHAN_MAXF);
 220          src[3] = span->dady * (1.0F / CHAN_MAXF);
 221       }
 222       break;
 223    case FRAG_ATTRIB_COL1:
 224       if (xOrY == 'X') {
 225          src[0] = span->dsrdx * (1.0F / CHAN_MAXF);
 226          src[1] = span->dsgdx * (1.0F / CHAN_MAXF);
 227          src[2] = span->dsbdx * (1.0F / CHAN_MAXF);
 228          src[3] = 0.0; /* XXX need this */
 229       }
 230       else {
 231          src[0] = span->dsrdy * (1.0F / CHAN_MAXF);
 232          src[1] = span->dsgdy * (1.0F / CHAN_MAXF);
 233          src[2] = span->dsbdy * (1.0F / CHAN_MAXF);
 234          src[3] = 0.0; /* XXX need this */
 235       }
 236       break;
 237    case FRAG_ATTRIB_FOGC:
 238       if (xOrY == 'X') {
 239          src[0] = span->dfogdx;
 240          src[1] = 0.0;
 241          src[2] = 0.0;
 242          src[3] = 0.0;
 243       }
 244       else {
 245          src[0] = span->dfogdy;
 246          src[1] = 0.0;
 247          src[2] = 0.0;
 248          src[3] = 0.0;
 249       }
 250       break;
 251    case FRAG_ATTRIB_TEX0:
 252    case FRAG_ATTRIB_TEX1:
 253    case FRAG_ATTRIB_TEX2:
 254    case FRAG_ATTRIB_TEX3:
 255    case FRAG_ATTRIB_TEX4:
 256    case FRAG_ATTRIB_TEX5:
 257    case FRAG_ATTRIB_TEX6:
 258    case FRAG_ATTRIB_TEX7:
 259       if (xOrY == 'X') {
 260          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 261          src[0] = span->texStepX[u][0] * (1.0F / CHAN_MAXF);
 262          src[1] = span->texStepX[u][1] * (1.0F / CHAN_MAXF);
 263          src[2] = span->texStepX[u][2] * (1.0F / CHAN_MAXF);
 264          src[3] = span->texStepX[u][3] * (1.0F / CHAN_MAXF);
 265       }
 266       else {
 267          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 268          src[0] = span->texStepY[u][0] * (1.0F / CHAN_MAXF);
 269          src[1] = span->texStepY[u][1] * (1.0F / CHAN_MAXF);
 270          src[2] = span->texStepY[u][2] * (1.0F / CHAN_MAXF);
 271          src[3] = span->texStepY[u][3] * (1.0F / CHAN_MAXF);
 272       }
 273       break;
 274    default:
 275       return GL_FALSE;
 276    }
 277
 278    result[0] = src[source->Swizzle[0]];
 279    result[1] = src[source->Swizzle[1]];
 280    result[2] = src[source->Swizzle[2]];
 281    result[3] = src[source->Swizzle[3]];
 282
 283    if (source->NegateBase) {
 284       result[0] = -result[0];
 285       result[1] = -result[1];
 286       result[2] = -result[2];
 287       result[3] = -result[3];
 288    }
 289    if (source->Abs) {
 290       result[0] = FABSF(result[0]);
 291       result[1] = FABSF(result[1]);
 292       result[2] = FABSF(result[2]);
 293       result[3] = FABSF(result[3]);
 294    }
 295    if (source->NegateAbs) {
 296       result[0] = -result[0];
 297       result[1] = -result[1];
 298       result[2] = -result[2];
 299       result[3] = -result[3];
 300    }
 301    return GL_TRUE;
 302 }
 303
 304
 305 /**
 306  * As above, but only return result[0] element.
 307  */
 308 static void
 309 fetch_vector1( GLcontext *ctx,
 310                const struct fp_src_register *source,
 311                const struct fp_machine *machine,
 312                const struct fragment_program *program,
 313                GLfloat result[4] )
 314 {
 315    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 316    ASSERT(src);
 317
 318    result[0] = src[source->Swizzle[0]];
 319
 320    if (source->NegateBase) {
 321       result[0] = -result[0];
 322    }
 323    if (source->Abs) {
 324       result[0] = FABSF(result[0]);
 325    }
 326    if (source->NegateAbs) {
 327       result[0] = -result[0];
 328    }
 329 }
 330
 331
 332 /*
 333  * Test value against zero and return GT, LT, EQ or UN if NaN.
 334  */
 335 static INLINE GLuint
 336 generate_cc( float value )
 337 {
 338    if (value != value)
 339       return COND_UN;  /* NaN */
 340    if (value > 0.0F)
 341       return COND_GT;
 342    if (value < 0.0F)
 343       return COND_LT;
 344    return COND_EQ;
 345 }
 346
 347 /*
 348  * Test if the ccMaskRule is satisfied by the given condition code.
 349  * Used to mask destination writes according to the current condition codee.
 350  */
 351 static INLINE GLboolean
 352 test_cc(GLuint condCode, GLuint ccMaskRule)
 353 {
 354    switch (ccMaskRule) {
 355    case COND_EQ: return (condCode == COND_EQ);
 356    case COND_NE: return (condCode != COND_EQ);
 357    case COND_LT: return (condCode == COND_LT);
 358    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 359    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 360    case COND_GT: return (condCode == COND_GT);
 361    case COND_TR: return GL_TRUE;
 362    case COND_FL: return GL_FALSE;
 363    default:      return GL_TRUE;
 364    }
 365 }
 366
 367
 368 /**
 369  * Store 4 floats into a register.  Observe the instructions saturate and
 370  * set-condition-code flags.
 371  */
 372 static void
 373 store_vector4( const struct fp_instruction *inst,
 374                struct fp_machine *machine,
 375                const GLfloat value[4] )
 376 {
 377    const struct fp_dst_register *dest = &(inst->DstReg);
 378    const GLboolean clamp = inst->Saturate;
 379    const GLboolean updateCC = inst->UpdateCondRegister;
 380    GLfloat *dstReg;
 381    GLfloat clampedValue[4];
 382    const GLboolean *writeMask = dest->WriteMask;
 383    GLboolean condWriteMask[4];
 384
 385    switch (dest->File) {
 386       case PROGRAM_OUTPUT:
 387          dstReg = machine->Outputs[dest->Index];
 388          break;
 389       case PROGRAM_TEMPORARY:
 390          dstReg = machine->Temporaries[dest->Index];
 391          break;
 392       default:
 393          _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 394          return;
 395    }
 396
 397 #if DEBUG_FRAG
 398    if (value[0] > 1.0e10 ||
 399        IS_INF_OR_NAN(value[0]) ||
 400        IS_INF_OR_NAN(value[1]) ||
 401        IS_INF_OR_NAN(value[2]) ||
 402        IS_INF_OR_NAN(value[3])  )
 403       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 404 #endif
 405
 406    if (clamp) {
 407       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 408       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 409       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 410       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 411       value = clampedValue;
 412    }
 413
 414    if (dest->CondMask != COND_TR) {
 415       condWriteMask[0] = writeMask[0]
 416          && test_cc(machine->CondCodes[dest->CondSwizzle[0]], dest->CondMask);
 417       condWriteMask[1] = writeMask[1]
 418          && test_cc(machine->CondCodes[dest->CondSwizzle[1]], dest->CondMask);
 419       condWriteMask[2] = writeMask[2]
 420          && test_cc(machine->CondCodes[dest->CondSwizzle[2]], dest->CondMask);
 421       condWriteMask[3] = writeMask[3]
 422          && test_cc(machine->CondCodes[dest->CondSwizzle[3]], dest->CondMask);
 423       writeMask = condWriteMask;
 424    }
 425
 426    if (writeMask[0]) {
 427       dstReg[0] = value[0];
 428       if (updateCC)
 429          machine->CondCodes[0] = generate_cc(value[0]);
 430    }
 431    if (writeMask[1]) {
 432       dstReg[1] = value[1];
 433       if (updateCC)
 434          machine->CondCodes[1] = generate_cc(value[1]);
 435    }
 436    if (writeMask[2]) {
 437       dstReg[2] = value[2];
 438       if (updateCC)
 439          machine->CondCodes[2] = generate_cc(value[2]);
 440    }
 441    if (writeMask[3]) {
 442       dstReg[3] = value[3];
 443       if (updateCC)
 444          machine->CondCodes[3] = generate_cc(value[3]);
 445    }
 446 }
 447
 448
 449 /**
 450  * Initialize a new machine state instance from an existing one, adding
 451  * the partial derivatives onto the input registers.
 452  * Used to implement DDX and DDY instructions in non-trivial cases.
 453  */
 454 static void
 455 init_machine_deriv( GLcontext *ctx,
 456                     const struct fp_machine *machine,
 457                     const struct fragment_program *program,
 458                     const struct sw_span *span, char xOrY,
 459                     struct fp_machine *dMachine )
 460 {
 461    GLuint u;
 462
 463    ASSERT(xOrY == 'X' || xOrY == 'Y');
 464
 465    /* copy existing machine */
 466    _mesa_memcpy(dMachine, machine, sizeof(struct fp_machine));
 467
 468    /* Clear temporary registers */
 469    _mesa_bzero( (void*) machine->Temporaries,
 470                MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
 471
 472    /* Add derivatives */
 473    if (program->InputsRead & (1 << FRAG_ATTRIB_WPOS)) {
 474       GLfloat *wpos = (GLfloat*) machine->Inputs[FRAG_ATTRIB_WPOS];
 475       if (xOrY == 'X') {
 476          wpos[0] += 1.0F;
 477          wpos[1] += 0.0F;
 478          wpos[2] += span->dzdx;
 479          wpos[3] += span->dwdx;
 480       }
 481       else {
 482          wpos[0] += 0.0F;
 483          wpos[1] += 1.0F;
 484          wpos[2] += span->dzdy;
 485          wpos[3] += span->dwdy;
 486       }
 487    }
 488    if (program->InputsRead & (1 << FRAG_ATTRIB_COL0)) {
 489       GLfloat *col0 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL0];
 490       if (xOrY == 'X') {
 491          col0[0] += span->drdx * (1.0F / CHAN_MAXF);
 492          col0[1] += span->dgdx * (1.0F / CHAN_MAXF);
 493          col0[2] += span->dbdx * (1.0F / CHAN_MAXF);
 494          col0[3] += span->dadx * (1.0F / CHAN_MAXF);
 495       }
 496       else {
 497          col0[0] += span->drdy * (1.0F / CHAN_MAXF);
 498          col0[1] += span->dgdy * (1.0F / CHAN_MAXF);
 499          col0[2] += span->dbdy * (1.0F / CHAN_MAXF);
 500          col0[3] += span->dady * (1.0F / CHAN_MAXF);
 501       }
 502    }
 503    if (program->InputsRead & (1 << FRAG_ATTRIB_COL1)) {
 504       GLfloat *col1 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL1];
 505       if (xOrY == 'X') {
 506          col1[0] += span->dsrdx * (1.0F / CHAN_MAXF);
 507          col1[1] += span->dsgdx * (1.0F / CHAN_MAXF);
 508          col1[2] += span->dsbdx * (1.0F / CHAN_MAXF);
 509          col1[3] += 0.0; /*XXX fix */
 510       }
 511       else {
 512          col1[0] += span->dsrdy * (1.0F / CHAN_MAXF);
 513          col1[1] += span->dsgdy * (1.0F / CHAN_MAXF);
 514          col1[2] += span->dsbdy * (1.0F / CHAN_MAXF);
 515          col1[3] += 0.0; /*XXX fix */
 516       }
 517    }
 518    if (program->InputsRead & (1 << FRAG_ATTRIB_FOGC)) {
 519       GLfloat *fogc = (GLfloat*) machine->Inputs[FRAG_ATTRIB_FOGC];
 520       if (xOrY == 'X') {
 521          fogc[0] += span->dfogdx;
 522       }
 523       else {
 524          fogc[0] += span->dfogdy;
 525       }
 526    }
 527    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
 528       if (program->InputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
 529          GLfloat *tex = (GLfloat*) machine->Inputs[FRAG_ATTRIB_TEX0 + u];
 530          if (xOrY == 'X') {
 531             tex[0] += span->texStepX[u][0];
 532             tex[1] += span->texStepX[u][1];
 533             tex[2] += span->texStepX[u][2];
 534             tex[3] += span->texStepX[u][3];
 535          }
 536          else {
 537             tex[0] += span->texStepY[u][0];
 538             tex[1] += span->texStepY[u][1];
 539             tex[2] += span->texStepY[u][2];
 540             tex[3] += span->texStepY[u][3];
 541          }
 542       }
 543    }
 544
 545    /* init condition codes */
 546    dMachine->CondCodes[0] = COND_EQ;
 547    dMachine->CondCodes[1] = COND_EQ;
 548    dMachine->CondCodes[2] = COND_EQ;
 549    dMachine->CondCodes[3] = COND_EQ;
 550 }
 551
 552
 553 /**
 554  * Execute the given fragment program.
 555  * NOTE: we do everything in single-precision floating point; we don't
 556  * currently observe the single/half/fixed-precision qualifiers.
 557  * \param ctx - rendering context
 558  * \param program - the fragment program to execute
 559  * \param machine - machine state (register file)
 560  * \param maxInst - max number of instructions to execute
 561  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 562  */
 563 static GLboolean
 564 execute_program( GLcontext *ctx,
 565                  const struct fragment_program *program, GLuint maxInst,
 566                  struct fp_machine *machine, const struct sw_span *span,
 567                  GLuint column )
 568 {
 569    GLuint pc;
 570
 571 #if DEBUG_FRAG
 572    printf("execute fragment program --------------------\n");
 573 #endif
 574
 575         /* XXX: This should go someplace else, but it is safe here (and slow!)
 576          *        - karl
 577          */
 578    _mesa_load_state_parameters(ctx, program->Parameters);
 579
 580
 581    for (pc = 0; pc < maxInst; pc++) {
 582       const struct fp_instruction *inst = program->Instructions + pc;
 583
 584       if (ctx->FragmentProgram.CallbackEnabled &&
 585           ctx->FragmentProgram.Callback) {
 586          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 587          ctx->FragmentProgram.Callback(program->Base.Target,
 588                                        ctx->FragmentProgram.CallbackData);
 589       }
 590
 591       switch (inst->Opcode) {
 592          case FP_OPCODE_ABS:
 593             {
 594                GLfloat a[4], result[4];
 595                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 596                result[0] = FABSF(a[0]);
 597                result[1] = FABSF(a[1]);
 598                result[2] = FABSF(a[2]);
 599                result[3] = FABSF(a[3]);
 600                store_vector4( inst, machine, result );
 601             }
 602             break;
 603          case FP_OPCODE_ADD:
 604             {
 605                GLfloat a[4], b[4], result[4];
 606                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 607                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 608                result[0] = a[0] + b[0];
 609                result[1] = a[1] + b[1];
 610                result[2] = a[2] + b[2];
 611                result[3] = a[3] + b[3];
 612                store_vector4( inst, machine, result );
 613             }
 614             break;
 615          case FP_OPCODE_CMP:
 616             {
 617                GLfloat a[4], b[4], c[4], result[4];
 618                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 619                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 620                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 621                result[0] = a[0] < 0.0F ? b[0] : c[0];
 622                result[1] = a[1] < 0.0F ? b[1] : c[1];
 623                result[2] = a[2] < 0.0F ? b[2] : c[2];
 624                result[3] = a[3] < 0.0F ? b[3] : c[3];
 625                store_vector4( inst, machine, result );
 626             }
 627             break;
 628          case FP_OPCODE_COS:
 629             {
 630                GLfloat a[4], result[4];
 631                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 632                result[0] = result[1] = result[2] = result[3] = (GLfloat)_mesa_cos(a[0]);
 633                store_vector4( inst, machine, result );
 634             }
 635             break;
 636          case FP_OPCODE_DDX: /* Partial derivative with respect to X */
 637             {
 638                GLfloat a[4], aNext[4], result[4];
 639                struct fp_machine dMachine;
 640                if (!fetch_vector4_deriv(&inst->SrcReg[0], span, 'X', result)) {
 641                   /* This is tricky.  Make a copy of the current machine state,
 642                    * increment the input registers by the dx or dy partial
 643                    * derivatives, then re-execute the program up to the
 644                    * preceeding instruction, then fetch the source register.
 645                    * Finally, find the difference in the register values for
 646                    * the original and derivative runs.
 647                    */
 648                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 649                   init_machine_deriv(ctx, machine, program, span,
 650                                      'X', &dMachine);
 651                   execute_program(ctx, program, pc, &dMachine, span, column);
 652                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 653                   result[0] = aNext[0] - a[0];
 654                   result[1] = aNext[1] - a[1];
 655                   result[2] = aNext[2] - a[2];
 656                   result[3] = aNext[3] - a[3];
 657                }
 658                store_vector4( inst, machine, result );
 659             }
 660             break;
 661          case FP_OPCODE_DDY: /* Partial derivative with respect to Y */
 662             {
 663                GLfloat a[4], aNext[4], result[4];
 664                struct fp_machine dMachine;
 665                if (!fetch_vector4_deriv(&inst->SrcReg[0], span, 'Y', result)) {
 666                   init_machine_deriv(ctx, machine, program, span,
 667                                      'Y', &dMachine);
 668                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 669                   execute_program(ctx, program, pc, &dMachine, span, column);
 670                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 671                   result[0] = aNext[0] - a[0];
 672                   result[1] = aNext[1] - a[1];
 673                   result[2] = aNext[2] - a[2];
 674                   result[3] = aNext[3] - a[3];
 675                }
 676                store_vector4( inst, machine, result );
 677             }
 678             break;
 679          case FP_OPCODE_DP3:
 680             {
 681                GLfloat a[4], b[4], result[4];
 682                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 683                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 684                result[0] = result[1] = result[2] = result[3] =
 685                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
 686                store_vector4( inst, machine, result );
 687 #if DEBUG_FRAG
 688                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 689                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 690 #endif
 691             }
 692             break;
 693          case FP_OPCODE_DP4:
 694             {
 695                GLfloat a[4], b[4], result[4];
 696                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 697                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 698                result[0] = result[1] = result[2] = result[3] =
 699                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 700                store_vector4( inst, machine, result );
 701             }
 702             break;
 703          case FP_OPCODE_DPH:
 704             {
 705                GLfloat a[4], b[4], result[4];
 706                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 707                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 708                result[0] = result[1] = result[2] = result[3] =
 709                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
 710                store_vector4( inst, machine, result );
 711             }
 712             break;
 713          case FP_OPCODE_DST: /* Distance vector */
 714             {
 715                GLfloat a[4], b[4], result[4];
 716                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 717                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 718                result[0] = 1.0F;
 719                result[1] = a[1] * b[1];
 720                result[2] = a[2];
 721                result[3] = b[3];
 722                store_vector4( inst, machine, result );
 723             }
 724             break;
 725          case FP_OPCODE_EX2: /* Exponential base 2 */
 726             {
 727                GLfloat a[4], result[4];
 728                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 729                result[0] = result[1] = result[2] = result[3] =
 730                   (GLfloat) _mesa_pow(2.0, a[0]);
 731                store_vector4( inst, machine, result );
 732             }
 733             break;
 734          case FP_OPCODE_FLR:
 735             {
 736                GLfloat a[4], result[4];
 737                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 738                result[0] = FLOORF(a[0]);
 739                result[1] = FLOORF(a[1]);
 740                result[2] = FLOORF(a[2]);
 741                result[3] = FLOORF(a[3]);
 742                store_vector4( inst, machine, result );
 743             }
 744             break;
 745          case FP_OPCODE_FRC:
 746             {
 747                GLfloat a[4], result[4];
 748                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 749                result[0] = a[0] - FLOORF(a[0]);
 750                result[1] = a[1] - FLOORF(a[1]);
 751                result[2] = a[2] - FLOORF(a[2]);
 752                result[3] = a[3] - FLOORF(a[3]);
 753                store_vector4( inst, machine, result );
 754             }
 755             break;
 756          case FP_OPCODE_KIL:
 757             {
 758                const GLuint *swizzle = inst->DstReg.CondSwizzle;
 759                const GLuint condMask = inst->DstReg.CondMask;
 760                if (test_cc(machine->CondCodes[swizzle[0]], condMask) ||
 761                    test_cc(machine->CondCodes[swizzle[1]], condMask) ||
 762                    test_cc(machine->CondCodes[swizzle[2]], condMask) ||
 763                    test_cc(machine->CondCodes[swizzle[3]], condMask)) {
 764                   return GL_FALSE;
 765                }
 766             }
 767             break;
 768          case FP_OPCODE_LG2:  /* log base 2 */
 769             {
 770                GLfloat a[4], result[4];
 771                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 772                result[0] = result[1] = result[2] = result[3]
 773                   = LOG2(a[0]);
 774                store_vector4( inst, machine, result );
 775             }
 776             break;
 777          case FP_OPCODE_LIT:
 778             {
 779                GLfloat a[4], result[4];
 780                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 781                if (a[0] < 0.0F)
 782                   a[0] = 0.0F;
 783                if (a[1] < 0.0F)
 784                   a[1] = 0.0F;
 785                result[0] = 1.0F;
 786                result[1] = a[0];
 787                result[2] = (a[0] > 0.0F) ? (GLfloat)_mesa_pow(2.0, a[3]) : 0.0F;
 788                result[3] = 1.0F;
 789                store_vector4( inst, machine, result );
 790             }
 791             break;
 792          case FP_OPCODE_LRP:
 793             {
 794                GLfloat a[4], b[4], c[4], result[4];
 795                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 796                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 797                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 798                result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 799                result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 800                result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 801                result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 802                store_vector4( inst, machine, result );
 803             }
 804             break;
 805          case FP_OPCODE_MAD:
 806             {
 807                GLfloat a[4], b[4], c[4], result[4];
 808                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 809                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 810                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 811                result[0] = a[0] * b[0] + c[0];
 812                result[1] = a[1] * b[1] + c[1];
 813                result[2] = a[2] * b[2] + c[2];
 814                result[3] = a[3] * b[3] + c[3];
 815                store_vector4( inst, machine, result );
 816             }
 817             break;
 818          case FP_OPCODE_MAX:
 819             {
 820                GLfloat a[4], b[4], result[4];
 821                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 822                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 823                result[0] = MAX2(a[0], b[0]);
 824                result[1] = MAX2(a[1], b[1]);
 825                result[2] = MAX2(a[2], b[2]);
 826                result[3] = MAX2(a[3], b[3]);
 827                store_vector4( inst, machine, result );
 828             }
 829             break;
 830          case FP_OPCODE_MIN:
 831             {
 832                GLfloat a[4], b[4], result[4];
 833                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 834                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 835                result[0] = MIN2(a[0], b[0]);
 836                result[1] = MIN2(a[1], b[1]);
 837                result[2] = MIN2(a[2], b[2]);
 838                result[3] = MIN2(a[3], b[3]);
 839                store_vector4( inst, machine, result );
 840             }
 841             break;
 842          case FP_OPCODE_MOV:
 843             {
 844                GLfloat result[4];
 845                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, result );
 846                store_vector4( inst, machine, result );
 847             }
 848             break;
 849          case FP_OPCODE_MUL:
 850             {
 851                GLfloat a[4], b[4], result[4];
 852                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 853                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 854                result[0] = a[0] * b[0];
 855                result[1] = a[1] * b[1];
 856                result[2] = a[2] * b[2];
 857                result[3] = a[3] * b[3];
 858                store_vector4( inst, machine, result );
 859 #if DEBUG_FRAG
 860                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 861                       result[0], result[1], result[2], result[3],
 862                       a[0], a[1], a[2], a[3],
 863                       b[0], b[1], b[2], b[3]);
 864 #endif
 865             }
 866             break;
 867          case FP_OPCODE_PK2H: /* pack two 16-bit floats */
 868             /* XXX this is probably wrong */
 869             {
 870                GLfloat a[4], result[4];
 871                const GLuint *rawBits = (const GLuint *) a;
 872                GLuint *rawResult = (GLuint *) result;
 873                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 874                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 875                   = rawBits[0] | (rawBits[1] << 16);
 876                store_vector4( inst, machine, result );
 877             }
 878             break;
 879          case FP_OPCODE_PK2US: /* pack two GLushorts */
 880             {
 881                GLfloat a[4], result[4];
 882                GLuint usx, usy, *rawResult = (GLuint *) result;
 883                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 884                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 885                a[1] = CLAMP(a[0], 0.0F, 1.0F);
 886                usx = IROUND(a[0] * 65535.0F);
 887                usy = IROUND(a[1] * 65535.0F);
 888                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 889                   = usx | (usy << 16);
 890                store_vector4( inst, machine, result );
 891             }
 892             break;
 893          case FP_OPCODE_PK4B: /* pack four GLbytes */
 894             {
 895                GLfloat a[4], result[4];
 896                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 897                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 898                a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
 899                a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
 900                a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
 901                a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
 902                ubx = IROUND(127.0F * a[0] + 128.0F);
 903                uby = IROUND(127.0F * a[1] + 128.0F);
 904                ubz = IROUND(127.0F * a[2] + 128.0F);
 905                ubw = IROUND(127.0F * a[3] + 128.0F);
 906                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 907                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 908                store_vector4( inst, machine, result );
 909             }
 910             break;
 911          case FP_OPCODE_PK4UB: /* pack four GLubytes */
 912             {
 913                GLfloat a[4], result[4];
 914                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 915                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 916                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 917                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 918                a[2] = CLAMP(a[2], 0.0F, 1.0F);
 919                a[3] = CLAMP(a[3], 0.0F, 1.0F);
 920                ubx = IROUND(255.0F * a[0]);
 921                uby = IROUND(255.0F * a[1]);
 922                ubz = IROUND(255.0F * a[2]);
 923                ubw = IROUND(255.0F * a[3]);
 924                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 925                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 926                store_vector4( inst, machine, result );
 927             }
 928             break;
 929          case FP_OPCODE_POW:
 930             {
 931                GLfloat a[4], b[4], result[4];
 932                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 933                fetch_vector1( ctx, &inst->SrcReg[1], machine, program, b );
 934                result[0] = result[1] = result[2] = result[3]
 935                   = (GLfloat)_mesa_pow(a[0], b[0]);
 936                store_vector4( inst, machine, result );
 937             }
 938             break;
 939          case FP_OPCODE_RCP:
 940             {
 941                GLfloat a[4], result[4];
 942                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 943 #if DEBUG_FRAG
 944                if (a[0] == 0)
 945                   printf("RCP(0)\n");
 946                else if (IS_INF_OR_NAN(a[0]))
 947                   printf("RCP(inf)\n");
 948 #endif
 949                result[0] = result[1] = result[2] = result[3]
 950                   = 1.0F / a[0];
 951                store_vector4( inst, machine, result );
 952             }
 953             break;
 954          case FP_OPCODE_RFL:
 955             {
 956                GLfloat axis[4], dir[4], result[4], tmp[4];
 957                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, axis );
 958                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dir );
 959                tmp[3] = axis[0] * axis[0]
 960                       + axis[1] * axis[1]
 961                       + axis[2] * axis[2];
 962                tmp[0] = (2.0F * (axis[0] * dir[0] +
 963                                  axis[1] * dir[1] +
 964                                  axis[2] * dir[2])) / tmp[3];
 965                result[0] = tmp[0] * axis[0] - dir[0];
 966                result[1] = tmp[0] * axis[1] - dir[1];
 967                result[2] = tmp[0] * axis[2] - dir[2];
 968                /* result[3] is never written! XXX enforce in parser! */
 969                store_vector4( inst, machine, result );
 970             }
 971             break;
 972          case FP_OPCODE_RSQ: /* 1 / sqrt() */
 973             {
 974                GLfloat a[4], result[4];
 975                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 976                result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
 977                store_vector4( inst, machine, result );
 978 #if DEBUG_FRAG
 979                printf("RSQ %g = 1/sqrt(%g)\n", result[0], a[0]);
 980 #endif
 981             }
 982             break;
 983          case FP_OPCODE_SCS: /* sine and cos */
 984             {
 985                GLfloat a[4], result[4];
 986                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 987                result[0] = (GLfloat)cos(a[0]);
 988                result[1] = (GLfloat)sin(a[0]);
 989                result[2] = 0.0;  /* undefined! */
 990                result[3] = 0.0;  /* undefined! */
 991                store_vector4( inst, machine, result );
 992             }
 993             break;
 994          case FP_OPCODE_SEQ: /* set on equal */
 995             {
 996                GLfloat a[4], b[4], result[4];
 997                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 998                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 999                result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1000                result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1001                result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1002                result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1003                store_vector4( inst, machine, result );
1004             }
1005             break;
1006          case FP_OPCODE_SFL: /* set false, operands ignored */
1007             {
1008                static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1009                store_vector4( inst, machine, result );
1010             }
1011             break;
1012          case FP_OPCODE_SGE: /* set on greater or equal */
1013             {
1014                GLfloat a[4], b[4], result[4];
1015                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1016                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1017                result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1018                result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1019                result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1020                result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1021                store_vector4( inst, machine, result );
1022             }
1023             break;
1024          case FP_OPCODE_SGT: /* set on greater */
1025             {
1026                GLfloat a[4], b[4], result[4];
1027                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1028                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1029                result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1030                result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1031                result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1032                result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1033                store_vector4( inst, machine, result );
1034             }
1035             break;
1036          case FP_OPCODE_SIN:
1037             {
1038                GLfloat a[4], result[4];
1039                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1040                result[0] = result[1] = result[2] =
1041                        result[3] = (GLfloat)_mesa_sin(a[0]);
1042                store_vector4( inst, machine, result );
1043             }
1044             break;
1045          case FP_OPCODE_SLE: /* set on less or equal */
1046             {
1047                GLfloat a[4], b[4], result[4];
1048                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1049                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1050                result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1051                result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1052                result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1053                result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1054                store_vector4( inst, machine, result );
1055             }
1056             break;
1057          case FP_OPCODE_SLT: /* set on less */
1058             {
1059                GLfloat a[4], b[4], result[4];
1060                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1061                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1062                result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1063                result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1064                result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1065                result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1066                store_vector4( inst, machine, result );
1067             }
1068             break;
1069          case FP_OPCODE_SNE: /* set on not equal */
1070             {
1071                GLfloat a[4], b[4], result[4];
1072                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1073                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1074                result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1075                result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1076                result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1077                result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1078                store_vector4( inst, machine, result );
1079             }
1080             break;
1081          case FP_OPCODE_STR: /* set true, operands ignored */
1082             {
1083                static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1084                store_vector4( inst, machine, result );
1085             }
1086             break;
1087          case FP_OPCODE_SUB:
1088             {
1089                GLfloat a[4], b[4], result[4];
1090                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1091                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1092                result[0] = a[0] - b[0];
1093                result[1] = a[1] - b[1];
1094                result[2] = a[2] - b[2];
1095                result[3] = a[3] - b[3];
1096                store_vector4( inst, machine, result );
1097             }
1098             break;
1099          case FP_OPCODE_SWZ:
1100             {
1101                const struct fp_src_register *source = &inst->SrcReg[0];
1102                const GLfloat *src = get_register_pointer(ctx, source,
1103                                                          machine, program);
1104                GLfloat result[4];
1105                GLuint i;
1106
1107                /* do extended swizzling here */
1108                for (i = 0; i < 3; i++) {
1109                   if (source->Swizzle[i] == SWIZZLE_ZERO)
1110                      result[i] = 0.0;
1111                   else if (source->Swizzle[i] == SWIZZLE_ONE)
1112                      result[i] = -1.0;
1113                   else
1114                      result[i] = -src[source->Swizzle[i]];
1115                   if (source->NegateBase)
1116                      result[i] = -result[i];
1117                }
1118                store_vector4( inst, machine, result );
1119             }
1120             break;
1121          case FP_OPCODE_TEX:
1122             /* Texel lookup */
1123             {
1124                GLfloat texcoord[4], color[4];
1125                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1126                /* XXX: Undo perspective divide from interpolate_texcoords() */
1127                fetch_texel( ctx, texcoord,
1128                             span->array->lambda[inst->TexSrcUnit][column],
1129                             inst->TexSrcUnit, color );
1130                store_vector4( inst, machine, color );
1131             }
1132             break;
1133          case FP_OPCODE_TXB:
1134             /* Texel lookup with LOD bias */
1135             {
1136                GLfloat texcoord[4], color[4], bias, lambda;
1137
1138                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1139                /* texcoord[3] is the bias to add to lambda */
1140                bias = ctx->Texture.Unit[inst->TexSrcUnit].LodBias
1141                     + ctx->Texture.Unit[inst->TexSrcUnit]._Current->LodBias
1142                     + texcoord[3];
1143                lambda = span->array->lambda[inst->TexSrcUnit][column] + bias;
1144                fetch_texel( ctx, texcoord, lambda,
1145                             inst->TexSrcUnit, color );
1146                store_vector4( inst, machine, color );
1147             }
1148             break;
1149          case FP_OPCODE_TXD:
1150             /* Texture lookup w/ partial derivatives for LOD */
1151             {
1152                GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1153                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1154                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dtdx );
1155                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, dtdy );
1156                fetch_texel_deriv( ctx, texcoord, dtdx, dtdy, inst->TexSrcUnit,
1157                                   color );
1158                store_vector4( inst, machine, color );
1159             }
1160             break;
1161          case FP_OPCODE_TXP:
1162             /* Texture lookup w/ perspective divide */
1163             {
1164                GLfloat texcoord[4], color[4];
1165                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1166                /* Already did perspective divide in interpolate_texcoords() */
1167                fetch_texel( ctx, texcoord,
1168                             span->array->lambda[inst->TexSrcUnit][column],
1169                             inst->TexSrcUnit, color );
1170                store_vector4( inst, machine, color );
1171             }
1172             break;
1173          case FP_OPCODE_UP2H: /* unpack two 16-bit floats */
1174             /* XXX this is probably wrong */
1175             {
1176                GLfloat a[4], result[4];
1177                const GLuint *rawBits = (const GLuint *) a;
1178                GLuint *rawResult = (GLuint *) result;
1179                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1180                rawResult[0] = rawBits[0] & 0xffff;
1181                rawResult[1] = (rawBits[0] >> 16) & 0xffff;
1182                rawResult[2] = rawBits[0] & 0xffff;
1183                rawResult[3] = (rawBits[0] >> 16) & 0xffff;
1184                store_vector4( inst, machine, result );
1185             }
1186             break;
1187          case FP_OPCODE_UP2US: /* unpack two GLushorts */
1188             {
1189                GLfloat a[4], result[4];
1190                const GLuint *rawBits = (const GLuint *) a;
1191                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1192                result[0] = (GLfloat) ((rawBits[0] >>  0) & 0xffff) / 65535.0F;
1193                result[1] = (GLfloat) ((rawBits[0] >> 16) & 0xffff) / 65535.0F;
1194                result[2] = result[0];
1195                result[3] = result[1];
1196                store_vector4( inst, machine, result );
1197             }
1198             break;
1199          case FP_OPCODE_UP4B: /* unpack four GLbytes */
1200             {
1201                GLfloat a[4], result[4];
1202                const GLuint *rawBits = (const GLuint *) a;
1203                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1204                result[0] = (((rawBits[0] >>  0) & 0xff) - 128) / 127.0F;
1205                result[0] = (((rawBits[0] >>  8) & 0xff) - 128) / 127.0F;
1206                result[0] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1207                result[0] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1208                store_vector4( inst, machine, result );
1209             }
1210             break;
1211          case FP_OPCODE_UP4UB: /* unpack four GLubytes */
1212             {
1213                GLfloat a[4], result[4];
1214                const GLuint *rawBits = (const GLuint *) a;
1215                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1216                result[0] = ((rawBits[0] >>  0) & 0xff) / 255.0F;
1217                result[0] = ((rawBits[0] >>  8) & 0xff) / 255.0F;
1218                result[0] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1219                result[0] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1220                store_vector4( inst, machine, result );
1221             }
1222             break;
1223          case FP_OPCODE_X2D: /* 2-D matrix transform */
1224             {
1225                GLfloat a[4], b[4], c[4], result[4];
1226                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1227                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1228                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
1229                result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1230                result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1231                result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1232                result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1233                store_vector4( inst, machine, result );
1234             }
1235             break;
1236          case FP_OPCODE_END:
1237             return GL_TRUE;
1238          default:
1239             _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1240                           inst->Opcode);
1241             return GL_TRUE; /* return value doesn't matter */
1242       }
1243    }
1244    return GL_TRUE;
1245 }
1246
1247
1248 static void
1249 init_machine( GLcontext *ctx, struct fp_machine *machine,
1250               const struct fragment_program *program,
1251               const struct sw_span *span, GLuint col )
1252 {
1253    GLuint inputsRead = program->InputsRead;
1254    GLuint u;
1255
1256    if (ctx->FragmentProgram.CallbackEnabled)
1257       inputsRead = ~0;
1258
1259    /* Clear temporary registers */
1260    _mesa_bzero(machine->Temporaries,
1261                MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
1262
1263    /* Load input registers */
1264    if (inputsRead & (1 << FRAG_ATTRIB_WPOS)) {
1265       GLfloat *wpos = machine->Inputs[FRAG_ATTRIB_WPOS];
1266       wpos[0] = (GLfloat) span->x + col;
1267       wpos[1] = (GLfloat) span->y;
1268       wpos[2] = (GLfloat) span->array->z[col] / ctx->DepthMaxF;
1269       wpos[3] = span->w + col * span->dwdx;
1270    }
1271    if (inputsRead & (1 << FRAG_ATTRIB_COL0)) {
1272       GLfloat *col0 = machine->Inputs[FRAG_ATTRIB_COL0];
1273       col0[0] = CHAN_TO_FLOAT(span->array->rgba[col][RCOMP]);
1274       col0[1] = CHAN_TO_FLOAT(span->array->rgba[col][GCOMP]);
1275       col0[2] = CHAN_TO_FLOAT(span->array->rgba[col][BCOMP]);
1276       col0[3] = CHAN_TO_FLOAT(span->array->rgba[col][ACOMP]);
1277    }
1278    if (inputsRead & (1 << FRAG_ATTRIB_COL1)) {
1279       GLfloat *col1 = machine->Inputs[FRAG_ATTRIB_COL1];
1280       col1[0] = CHAN_TO_FLOAT(span->array->spec[col][RCOMP]);
1281       col1[1] = CHAN_TO_FLOAT(span->array->spec[col][GCOMP]);
1282       col1[2] = CHAN_TO_FLOAT(span->array->spec[col][BCOMP]);
1283       col1[3] = CHAN_TO_FLOAT(span->array->spec[col][ACOMP]);
1284    }
1285    if (inputsRead & (1 << FRAG_ATTRIB_FOGC)) {
1286       GLfloat *fogc = machine->Inputs[FRAG_ATTRIB_FOGC];
1287       fogc[0] = span->array->fog[col];
1288       fogc[1] = 0.0F;
1289       fogc[2] = 0.0F;
1290       fogc[3] = 0.0F;
1291    }
1292    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
1293       if (inputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
1294          GLfloat *tex = machine->Inputs[FRAG_ATTRIB_TEX0 + u];
1295          /*ASSERT(ctx->Texture._EnabledCoordUnits & (1 << u));*/
1296          COPY_4V(tex, span->array->texcoords[u][col]);
1297          /*ASSERT(tex[0] != 0 || tex[1] != 0 || tex[2] != 0);*/
1298       }
1299    }
1300
1301    /* init condition codes */
1302    machine->CondCodes[0] = COND_EQ;
1303    machine->CondCodes[1] = COND_EQ;
1304    machine->CondCodes[2] = COND_EQ;
1305    machine->CondCodes[3] = COND_EQ;
1306 }
1307
1308
1309 void
1310 _swrast_exec_nv_fragment_program( GLcontext *ctx, struct sw_span *span )
1311 {
1312    const struct fragment_program *program = ctx->FragmentProgram.Current;
1313    GLuint i;
1314
1315    ctx->_CurrentProgram = GL_FRAGMENT_PROGRAM_ARB; /* or NV, doesn't matter */
1316
1317    for (i = 0; i < span->end; i++) {
1318       if (span->array->mask[i]) {
1319          init_machine(ctx, &ctx->FragmentProgram.Machine,
1320                       ctx->FragmentProgram.Current, span, i);
1321
1322          if (!execute_program(ctx, program, ~0,
1323                               &ctx->FragmentProgram.Machine, span, i)) {
1324             span->array->mask[i] = GL_FALSE;  /* killed fragment */
1325          }
1326
1327          /* Store output registers */
1328          {
1329             const GLfloat *colOut
1330                = ctx->FragmentProgram.Machine.Outputs[FRAG_OUTPUT_COLR];
1331             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][RCOMP], colOut[0]);
1332             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][GCOMP], colOut[1]);
1333             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][BCOMP], colOut[2]);
1334             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][ACOMP], colOut[3]);
1335          }
1336          /* depth value */
1337          if (program->OutputsWritten & (1 << FRAG_OUTPUT_DEPR))
1338             span->array->z[i] = IROUND(ctx->FragmentProgram.Machine.Outputs[FRAG_OUTPUT_DEPR][0] * ctx->DepthMaxF);
1339       }
1340    }
1341
1342    ctx->_CurrentProgram = 0;
1343 }
1344