src/mesa/swrast/s_nvfragprog.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  5.1
   4  *
   5  * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25
  26 #include "glheader.h"
  27 #include "colormac.h"
  28 #include "context.h"
  29 #include "nvfragprog.h"
  30 #include "macros.h"
  31 #include "program.h"
  32
  33 #include "s_nvfragprog.h"
  34 #include "s_span.h"
  35 #include "s_texture.h"
  36
  37
  38 /* if 1, print some debugging info */
  39 #define DEBUG_FRAG 0
  40
  41
  42 /**
  43  * Fetch a texel.
  44  */
  45 static void
  46 fetch_texel( GLcontext *ctx, const GLfloat texcoord[4], GLfloat lambda,
  47              GLuint unit, GLfloat color[4] )
  48 {
  49    GLchan rgba[4];
  50    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  51
  52    swrast->TextureSample[unit](ctx, unit, ctx->Texture.Unit[unit]._Current,
  53                                1, (const GLfloat (*)[4]) texcoord,
  54                                &lambda, &rgba);
  55    color[0] = CHAN_TO_FLOAT(rgba[0]);
  56    color[1] = CHAN_TO_FLOAT(rgba[1]);
  57    color[2] = CHAN_TO_FLOAT(rgba[2]);
  58    color[3] = CHAN_TO_FLOAT(rgba[3]);
  59 }
  60
  61
  62 /**
  63  * Fetch a texel with the given partial derivatives to compute a level
  64  * of detail in the mipmap.
  65  */
  66 static void
  67 fetch_texel_deriv( GLcontext *ctx, const GLfloat texcoord[4],
  68                    const GLfloat texdx[4], const GLfloat texdy[4],
  69                    GLuint unit, GLfloat color[4] )
  70 {
  71    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  72    const struct gl_texture_object *texObj = ctx->Texture.Unit[unit]._Current;
  73    const struct gl_texture_image *texImg = texObj->Image[texObj->BaseLevel];
  74    const GLfloat texW = (GLfloat) texImg->WidthScale;
  75    const GLfloat texH = (GLfloat) texImg->HeightScale;
  76    GLchan rgba[4];
  77
  78    GLfloat lambda = _swrast_compute_lambda(texdx[0], texdy[0], /* ds/dx, ds/dy */
  79                                          texdx[1], texdy[1], /* dt/dx, dt/dy */
  80                                          texdx[3], texdy[2], /* dq/dx, dq/dy */
  81                                          texW, texH,
  82                                          texcoord[0], texcoord[1], texcoord[3],
  83                                          1.0F / texcoord[3]);
  84
  85    swrast->TextureSample[unit](ctx, unit, ctx->Texture.Unit[unit]._Current,
  86                                1, (const GLfloat (*)[4]) texcoord,
  87                                &lambda, &rgba);
  88    color[0] = CHAN_TO_FLOAT(rgba[0]);
  89    color[1] = CHAN_TO_FLOAT(rgba[1]);
  90    color[2] = CHAN_TO_FLOAT(rgba[2]);
  91    color[3] = CHAN_TO_FLOAT(rgba[3]);
  92 }
  93
  94
  95 /**
  96  * Return a pointer to the 4-element float vector specified by the given
  97  * source register.
  98  */
  99 static INLINE const GLfloat *
 100 get_register_pointer( GLcontext *ctx,
 101                       const struct fp_src_register *source,
 102                       const struct fp_machine *machine,
 103                       const struct fragment_program *program )
 104 {
 105    const GLfloat *src;
 106    switch (source->File) {
 107       case PROGRAM_TEMPORARY:
 108          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_TEMPS);
 109          src = machine->Temporaries[source->Index];
 110          break;
 111       case PROGRAM_INPUT:
 112          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_INPUTS);
 113          src = machine->Inputs[source->Index];
 114          break;
 115       case PROGRAM_LOCAL_PARAM:
 116          ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 117          src = program->Base.LocalParams[source->Index];
 118          break;
 119       case PROGRAM_ENV_PARAM:
 120          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_PARAMS);
 121          src = ctx->FragmentProgram.Parameters[source->Index];
 122          break;
 123       case PROGRAM_NAMED_PARAM:
 124          ASSERT(source->Index < (GLint) program->Parameters->NumParameters);
 125          src = program->Parameters->Parameters[source->Index].Values;
 126          break;
 127       case PROGRAM_STATE_VAR:
 128          src = NULL;
 129          break;
 130       default:
 131          _mesa_problem(ctx, "Invalid input register file in fetch_vector4");
 132          src = NULL;
 133    }
 134    return src;
 135 }
 136
 137
 138 /**
 139  * Fetch a 4-element float vector from the given source register.
 140  * Apply swizzling and negating as needed.
 141  */
 142 static void
 143 fetch_vector4( GLcontext *ctx,
 144                const struct fp_src_register *source,
 145                const struct fp_machine *machine,
 146                const struct fragment_program *program,
 147                GLfloat result[4] )
 148 {
 149    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 150    ASSERT(src);
 151
 152    result[0] = src[source->Swizzle[0]];
 153    result[1] = src[source->Swizzle[1]];
 154    result[2] = src[source->Swizzle[2]];
 155    result[3] = src[source->Swizzle[3]];
 156
 157    if (source->NegateBase) {
 158       result[0] = -result[0];
 159       result[1] = -result[1];
 160       result[2] = -result[2];
 161       result[3] = -result[3];
 162    }
 163    if (source->Abs) {
 164       result[0] = FABSF(result[0]);
 165       result[1] = FABSF(result[1]);
 166       result[2] = FABSF(result[2]);
 167       result[3] = FABSF(result[3]);
 168    }
 169    if (source->NegateAbs) {
 170       result[0] = -result[0];
 171       result[1] = -result[1];
 172       result[2] = -result[2];
 173       result[3] = -result[3];
 174    }
 175 }
 176
 177
 178 /**
 179  * Fetch the derivative with respect to X for the given register.
 180  * \return GL_TRUE if it was easily computed or GL_FALSE if we
 181  * need to execute another instance of the program (ugh)!
 182  */
 183 static GLboolean
 184 fetch_vector4_deriv( const struct fp_src_register *source,
 185                      const struct sw_span *span,
 186                      char xOrY, GLfloat result[4] )
 187 {
 188    GLfloat src[4];
 189
 190    ASSERT(xOrY == 'X' || xOrY == 'Y');
 191
 192    assert(source->File == PROGRAM_INPUT);
 193
 194    switch (source->Index) {
 195    case FRAG_ATTRIB_WPOS:
 196       if (xOrY == 'X') {
 197          src[0] = 1.0;
 198          src[1] = 0.0;
 199          src[2] = span->dzdx;
 200          src[3] = span->dwdx;
 201       }
 202       else {
 203          src[0] = 0.0;
 204          src[1] = 1.0;
 205          src[2] = span->dzdy;
 206          src[3] = span->dwdy;
 207       }
 208       break;
 209    case FRAG_ATTRIB_COL0:
 210       if (xOrY == 'X') {
 211          src[0] = span->drdx * (1.0F / CHAN_MAXF);
 212          src[1] = span->dgdx * (1.0F / CHAN_MAXF);
 213          src[2] = span->dbdx * (1.0F / CHAN_MAXF);
 214          src[3] = span->dadx * (1.0F / CHAN_MAXF);
 215       }
 216       else {
 217          src[0] = span->drdy * (1.0F / CHAN_MAXF);
 218          src[1] = span->dgdy * (1.0F / CHAN_MAXF);
 219          src[2] = span->dbdy * (1.0F / CHAN_MAXF);
 220          src[3] = span->dady * (1.0F / CHAN_MAXF);
 221       }
 222       break;
 223    case FRAG_ATTRIB_COL1:
 224       if (xOrY == 'X') {
 225          src[0] = span->dsrdx * (1.0F / CHAN_MAXF);
 226          src[1] = span->dsgdx * (1.0F / CHAN_MAXF);
 227          src[2] = span->dsbdx * (1.0F / CHAN_MAXF);
 228          src[3] = 0.0; /* XXX need this */
 229       }
 230       else {
 231          src[0] = span->dsrdy * (1.0F / CHAN_MAXF);
 232          src[1] = span->dsgdy * (1.0F / CHAN_MAXF);
 233          src[2] = span->dsbdy * (1.0F / CHAN_MAXF);
 234          src[3] = 0.0; /* XXX need this */
 235       }
 236       break;
 237    case FRAG_ATTRIB_FOGC:
 238       if (xOrY == 'X') {
 239          src[0] = span->dfogdx;
 240          src[1] = 0.0;
 241          src[2] = 0.0;
 242          src[3] = 0.0;
 243       }
 244       else {
 245          src[0] = span->dfogdy;
 246          src[1] = 0.0;
 247          src[2] = 0.0;
 248          src[3] = 0.0;
 249       }
 250       break;
 251    case FRAG_ATTRIB_TEX0:
 252    case FRAG_ATTRIB_TEX1:
 253    case FRAG_ATTRIB_TEX2:
 254    case FRAG_ATTRIB_TEX3:
 255    case FRAG_ATTRIB_TEX4:
 256    case FRAG_ATTRIB_TEX5:
 257    case FRAG_ATTRIB_TEX6:
 258    case FRAG_ATTRIB_TEX7:
 259       if (xOrY == 'X') {
 260          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 261          src[0] = span->texStepX[u][0] * (1.0F / CHAN_MAXF);
 262          src[1] = span->texStepX[u][1] * (1.0F / CHAN_MAXF);
 263          src[2] = span->texStepX[u][2] * (1.0F / CHAN_MAXF);
 264          src[3] = span->texStepX[u][3] * (1.0F / CHAN_MAXF);
 265       }
 266       else {
 267          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 268          src[0] = span->texStepY[u][0] * (1.0F / CHAN_MAXF);
 269          src[1] = span->texStepY[u][1] * (1.0F / CHAN_MAXF);
 270          src[2] = span->texStepY[u][2] * (1.0F / CHAN_MAXF);
 271          src[3] = span->texStepY[u][3] * (1.0F / CHAN_MAXF);
 272       }
 273       break;
 274    default:
 275       return GL_FALSE;
 276    }
 277
 278    result[0] = src[source->Swizzle[0]];
 279    result[1] = src[source->Swizzle[1]];
 280    result[2] = src[source->Swizzle[2]];
 281    result[3] = src[source->Swizzle[3]];
 282
 283    if (source->NegateBase) {
 284       result[0] = -result[0];
 285       result[1] = -result[1];
 286       result[2] = -result[2];
 287       result[3] = -result[3];
 288    }
 289    if (source->Abs) {
 290       result[0] = FABSF(result[0]);
 291       result[1] = FABSF(result[1]);
 292       result[2] = FABSF(result[2]);
 293       result[3] = FABSF(result[3]);
 294    }
 295    if (source->NegateAbs) {
 296       result[0] = -result[0];
 297       result[1] = -result[1];
 298       result[2] = -result[2];
 299       result[3] = -result[3];
 300    }
 301    return GL_TRUE;
 302 }
 303
 304
 305 /**
 306  * As above, but only return result[0] element.
 307  */
 308 static void
 309 fetch_vector1( GLcontext *ctx,
 310                const struct fp_src_register *source,
 311                const struct fp_machine *machine,
 312                const struct fragment_program *program,
 313                GLfloat result[4] )
 314 {
 315    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 316    ASSERT(src);
 317
 318    result[0] = src[source->Swizzle[0]];
 319
 320    if (source->NegateBase) {
 321       result[0] = -result[0];
 322    }
 323    if (source->Abs) {
 324       result[0] = FABSF(result[0]);
 325    }
 326    if (source->NegateAbs) {
 327       result[0] = -result[0];
 328    }
 329 }
 330
 331
 332 /*
 333  * Test value against zero and return GT, LT, EQ or UN if NaN.
 334  */
 335 static INLINE GLuint
 336 generate_cc( float value )
 337 {
 338    if (value != value)
 339       return COND_UN;  /* NaN */
 340    if (value > 0.0F)
 341       return COND_GT;
 342    if (value < 0.0F)
 343       return COND_LT;
 344    return COND_EQ;
 345 }
 346
 347 /*
 348  * Test if the ccMaskRule is satisfied by the given condition code.
 349  * Used to mask destination writes according to the current condition codee.
 350  */
 351 static INLINE GLboolean
 352 test_cc(GLuint condCode, GLuint ccMaskRule)
 353 {
 354    switch (ccMaskRule) {
 355    case COND_EQ: return (condCode == COND_EQ);
 356    case COND_NE: return (condCode != COND_EQ);
 357    case COND_LT: return (condCode == COND_LT);
 358    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 359    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 360    case COND_GT: return (condCode == COND_GT);
 361    case COND_TR: return GL_TRUE;
 362    case COND_FL: return GL_FALSE;
 363    default:      return GL_TRUE;
 364    }
 365 }
 366
 367
 368 /**
 369  * Store 4 floats into a register.  Observe the instructions saturate and
 370  * set-condition-code flags.
 371  */
 372 static void
 373 store_vector4( const struct fp_instruction *inst,
 374                struct fp_machine *machine,
 375                const GLfloat value[4] )
 376 {
 377    const struct fp_dst_register *dest = &(inst->DstReg);
 378    const GLboolean clamp = inst->Saturate;
 379    const GLboolean updateCC = inst->UpdateCondRegister;
 380    GLfloat *dstReg;
 381    GLfloat clampedValue[4];
 382    const GLboolean *writeMask = dest->WriteMask;
 383    GLboolean condWriteMask[4];
 384
 385    switch (dest->File) {
 386       case PROGRAM_OUTPUT:
 387          dstReg = machine->Outputs[dest->Index];
 388          break;
 389       case PROGRAM_TEMPORARY:
 390          dstReg = machine->Temporaries[dest->Index];
 391          break;
 392       default:
 393          _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 394          return;
 395    }
 396
 397 #if DEBUG_FRAG
 398    if (value[0] > 1.0e10 ||
 399        IS_INF_OR_NAN(value[0]) ||
 400        IS_INF_OR_NAN(value[1]) ||
 401        IS_INF_OR_NAN(value[2]) ||
 402        IS_INF_OR_NAN(value[3])  )
 403       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 404 #endif
 405
 406    if (clamp) {
 407       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 408       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 409       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 410       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 411       value = clampedValue;
 412    }
 413
 414    if (dest->CondMask != COND_TR) {
 415       condWriteMask[0] = writeMask[0]
 416          && test_cc(machine->CondCodes[dest->CondSwizzle[0]], dest->CondMask);
 417       condWriteMask[1] = writeMask[1]
 418          && test_cc(machine->CondCodes[dest->CondSwizzle[1]], dest->CondMask);
 419       condWriteMask[2] = writeMask[2]
 420          && test_cc(machine->CondCodes[dest->CondSwizzle[2]], dest->CondMask);
 421       condWriteMask[3] = writeMask[3]
 422          && test_cc(machine->CondCodes[dest->CondSwizzle[3]], dest->CondMask);
 423       writeMask = condWriteMask;
 424    }
 425
 426    if (writeMask[0]) {
 427       dstReg[0] = value[0];
 428       if (updateCC)
 429          machine->CondCodes[0] = generate_cc(value[0]);
 430    }
 431    if (writeMask[1]) {
 432       dstReg[1] = value[1];
 433       if (updateCC)
 434          machine->CondCodes[1] = generate_cc(value[1]);
 435    }
 436    if (writeMask[2]) {
 437       dstReg[2] = value[2];
 438       if (updateCC)
 439          machine->CondCodes[2] = generate_cc(value[2]);
 440    }
 441    if (writeMask[3]) {
 442       dstReg[3] = value[3];
 443       if (updateCC)
 444          machine->CondCodes[3] = generate_cc(value[3]);
 445    }
 446 }
 447
 448
 449 /**
 450  * Initialize a new machine state instance from an existing one, adding
 451  * the partial derivatives onto the input registers.
 452  * Used to implement DDX and DDY instructions in non-trivial cases.
 453  */
 454 static void
 455 init_machine_deriv( GLcontext *ctx,
 456                     const struct fp_machine *machine,
 457                     const struct fragment_program *program,
 458                     const struct sw_span *span, char xOrY,
 459                     struct fp_machine *dMachine )
 460 {
 461    GLuint u;
 462
 463    ASSERT(xOrY == 'X' || xOrY == 'Y');
 464
 465    /* copy existing machine */
 466    _mesa_memcpy(dMachine, machine, sizeof(struct fp_machine));
 467
 468    /* Clear temporary registers */
 469    _mesa_bzero( (void*) machine->Temporaries,
 470                MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
 471
 472    /* Add derivatives */
 473    if (program->InputsRead & (1 << FRAG_ATTRIB_WPOS)) {
 474       GLfloat *wpos = (GLfloat*) machine->Inputs[FRAG_ATTRIB_WPOS];
 475       if (xOrY == 'X') {
 476          wpos[0] += 1.0F;
 477          wpos[1] += 0.0F;
 478          wpos[2] += span->dzdx;
 479          wpos[3] += span->dwdx;
 480       }
 481       else {
 482          wpos[0] += 0.0F;
 483          wpos[1] += 1.0F;
 484          wpos[2] += span->dzdy;
 485          wpos[3] += span->dwdy;
 486       }
 487    }
 488    if (program->InputsRead & (1 << FRAG_ATTRIB_COL0)) {
 489       GLfloat *col0 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL0];
 490       if (xOrY == 'X') {
 491          col0[0] += span->drdx * (1.0F / CHAN_MAXF);
 492          col0[1] += span->dgdx * (1.0F / CHAN_MAXF);
 493          col0[2] += span->dbdx * (1.0F / CHAN_MAXF);
 494          col0[3] += span->dadx * (1.0F / CHAN_MAXF);
 495       }
 496       else {
 497          col0[0] += span->drdy * (1.0F / CHAN_MAXF);
 498          col0[1] += span->dgdy * (1.0F / CHAN_MAXF);
 499          col0[2] += span->dbdy * (1.0F / CHAN_MAXF);
 500          col0[3] += span->dady * (1.0F / CHAN_MAXF);
 501       }
 502    }
 503    if (program->InputsRead & (1 << FRAG_ATTRIB_COL1)) {
 504       GLfloat *col1 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL1];
 505       if (xOrY == 'X') {
 506          col1[0] += span->dsrdx * (1.0F / CHAN_MAXF);
 507          col1[1] += span->dsgdx * (1.0F / CHAN_MAXF);
 508          col1[2] += span->dsbdx * (1.0F / CHAN_MAXF);
 509          col1[3] += 0.0; /*XXX fix */
 510       }
 511       else {
 512          col1[0] += span->dsrdy * (1.0F / CHAN_MAXF);
 513          col1[1] += span->dsgdy * (1.0F / CHAN_MAXF);
 514          col1[2] += span->dsbdy * (1.0F / CHAN_MAXF);
 515          col1[3] += 0.0; /*XXX fix */
 516       }
 517    }
 518    if (program->InputsRead & (1 << FRAG_ATTRIB_FOGC)) {
 519       GLfloat *fogc = (GLfloat*) machine->Inputs[FRAG_ATTRIB_FOGC];
 520       if (xOrY == 'X') {
 521          fogc[0] += span->dfogdx;
 522       }
 523       else {
 524          fogc[0] += span->dfogdy;
 525       }
 526    }
 527    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
 528       if (program->InputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
 529          GLfloat *tex = (GLfloat*) machine->Inputs[FRAG_ATTRIB_TEX0 + u];
 530          if (xOrY == 'X') {
 531             tex[0] += span->texStepX[u][0];
 532             tex[1] += span->texStepX[u][1];
 533             tex[2] += span->texStepX[u][2];
 534             tex[3] += span->texStepX[u][3];
 535          }
 536          else {
 537             tex[0] += span->texStepY[u][0];
 538             tex[1] += span->texStepY[u][1];
 539             tex[2] += span->texStepY[u][2];
 540             tex[3] += span->texStepY[u][3];
 541          }
 542       }
 543    }
 544
 545    /* init condition codes */
 546    dMachine->CondCodes[0] = COND_EQ;
 547    dMachine->CondCodes[1] = COND_EQ;
 548    dMachine->CondCodes[2] = COND_EQ;
 549    dMachine->CondCodes[3] = COND_EQ;
 550 }
 551
 552
 553 /**
  554  * Execute the given fragment program.
 555  * NOTE: we do everything in single-precision floating point; we don't
 556  * currently observe the single/half/fixed-precision qualifiers.
 557  * \param ctx - rendering context
 558  * \param program - the fragment program to execute
 559  * \param machine - machine state (register file)
 560  * \param maxInst - max number of instructions to execute
 561  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 562  */
 563 static GLboolean
 564 execute_program( GLcontext *ctx,
 565                  const struct fragment_program *program, GLuint maxInst,
 566                  struct fp_machine *machine, const struct sw_span *span,
 567                  GLuint column )
 568 {
 569    GLuint pc;
 570
 571 #if DEBUG_FRAG
 572    printf("execute fragment program --------------------\n");
 573 #endif
 574
 575    for (pc = 0; pc < maxInst; pc++) {
 576       const struct fp_instruction *inst = program->Instructions + pc;
 577
 578       if (ctx->FragmentProgram.CallbackEnabled &&
 579           ctx->FragmentProgram.Callback) {
 580          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 581          ctx->FragmentProgram.Callback(program->Base.Target,
 582                                        ctx->FragmentProgram.CallbackData);
 583       }
 584
 585       switch (inst->Opcode) {
 586          case FP_OPCODE_ABS:
 587             {
 588                GLfloat a[4], result[4];
 589                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 590                result[0] = FABSF(a[0]);
 591                result[1] = FABSF(a[1]);
 592                result[2] = FABSF(a[2]);
 593                result[3] = FABSF(a[3]);
 594                store_vector4( inst, machine, result );
 595             }
 596             break;
 597          case FP_OPCODE_ADD:
 598             {
 599                GLfloat a[4], b[4], result[4];
 600                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 601                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 602                result[0] = a[0] + b[0];
 603                result[1] = a[1] + b[1];
 604                result[2] = a[2] + b[2];
 605                result[3] = a[3] + b[3];
 606                store_vector4( inst, machine, result );
 607             }
 608             break;
 609          case FP_OPCODE_CMP:
 610             {
 611                GLfloat a[4], b[4], c[4], result[4];
 612                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 613                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 614                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 615                result[0] = a[0] < 0.0F ? b[0] : c[0];
 616                result[1] = a[1] < 0.0F ? b[1] : c[1];
 617                result[2] = a[2] < 0.0F ? b[2] : c[2];
 618                result[3] = a[3] < 0.0F ? b[3] : c[3];
 619                store_vector4( inst, machine, result );
 620             }
 621             break;
 622          case FP_OPCODE_COS:
 623             {
 624                GLfloat a[4], result[4];
 625                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 626                result[0] = result[1] = result[2] = result[3] = (GLfloat)_mesa_cos(a[0]);
 627                store_vector4( inst, machine, result );
 628             }
 629             break;
 630          case FP_OPCODE_DDX: /* Partial derivative with respect to X */
 631             {
 632                GLfloat a[4], aNext[4], result[4];
 633                struct fp_machine dMachine;
 634                if (!fetch_vector4_deriv(&inst->SrcReg[0], span, 'X', result)) {
 635                   /* This is tricky.  Make a copy of the current machine state,
 636                    * increment the input registers by the dx or dy partial
 637                    * derivatives, then re-execute the program up to the
 638                    * preceeding instruction, then fetch the source register.
 639                    * Finally, find the difference in the register values for
 640                    * the original and derivative runs.
 641                    */
 642                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 643                   init_machine_deriv(ctx, machine, program, span,
 644                                      'X', &dMachine);
 645                   execute_program(ctx, program, pc, &dMachine, span, column);
 646                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 647                   result[0] = aNext[0] - a[0];
 648                   result[1] = aNext[1] - a[1];
 649                   result[2] = aNext[2] - a[2];
 650                   result[3] = aNext[3] - a[3];
 651                }
 652                store_vector4( inst, machine, result );
 653             }
 654             break;
 655          case FP_OPCODE_DDY: /* Partial derivative with respect to Y */
 656             {
 657                GLfloat a[4], aNext[4], result[4];
 658                struct fp_machine dMachine;
 659                if (!fetch_vector4_deriv(&inst->SrcReg[0], span, 'Y', result)) {
 660                   init_machine_deriv(ctx, machine, program, span,
 661                                      'Y', &dMachine);
 662                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 663                   execute_program(ctx, program, pc, &dMachine, span, column);
 664                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 665                   result[0] = aNext[0] - a[0];
 666                   result[1] = aNext[1] - a[1];
 667                   result[2] = aNext[2] - a[2];
 668                   result[3] = aNext[3] - a[3];
 669                }
 670                store_vector4( inst, machine, result );
 671             }
 672             break;
 673          case FP_OPCODE_DP3:
 674             {
 675                GLfloat a[4], b[4], result[4];
 676                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 677                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 678                result[0] = result[1] = result[2] = result[3] =
 679                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
 680                store_vector4( inst, machine, result );
 681 #if DEBUG_FRAG
 682                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 683                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 684 #endif
 685             }
 686             break;
 687          case FP_OPCODE_DP4:
 688             {
 689                GLfloat a[4], b[4], result[4];
 690                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 691                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 692                result[0] = result[1] = result[2] = result[3] =
 693                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 694                store_vector4( inst, machine, result );
 695             }
 696             break;
 697          case FP_OPCODE_DPH:
 698             {
 699                GLfloat a[4], b[4], result[4];
 700                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 701                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 702                result[0] = result[1] = result[2] = result[3] =
 703                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
 704                store_vector4( inst, machine, result );
 705             }
 706             break;
 707          case FP_OPCODE_DST: /* Distance vector */
 708             {
 709                GLfloat a[4], b[4], result[4];
 710                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 711                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 712                result[0] = 1.0F;
 713                result[1] = a[1] * b[1];
 714                result[2] = a[2];
 715                result[3] = b[3];
 716                store_vector4( inst, machine, result );
 717             }
 718             break;
 719          case FP_OPCODE_EX2: /* Exponential base 2 */
 720             {
 721                GLfloat a[4], result[4];
 722                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 723                result[0] = result[1] = result[2] = result[3] =
 724                   (GLfloat) _mesa_pow(2.0, a[0]);
 725                store_vector4( inst, machine, result );
 726             }
 727             break;
 728          case FP_OPCODE_FLR:
 729             {
 730                GLfloat a[4], result[4];
 731                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 732                result[0] = FLOORF(a[0]);
 733                result[1] = FLOORF(a[1]);
 734                result[2] = FLOORF(a[2]);
 735                result[3] = FLOORF(a[3]);
 736                store_vector4( inst, machine, result );
 737             }
 738             break;
 739          case FP_OPCODE_FRC:
 740             {
 741                GLfloat a[4], result[4];
 742                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 743                result[0] = a[0] - FLOORF(a[0]);
 744                result[1] = a[1] - FLOORF(a[1]);
 745                result[2] = a[2] - FLOORF(a[2]);
 746                result[3] = a[3] - FLOORF(a[3]);
 747                store_vector4( inst, machine, result );
 748             }
 749             break;
 750          case FP_OPCODE_KIL:
 751             {
 752                const GLuint *swizzle = inst->DstReg.CondSwizzle;
 753                const GLuint condMask = inst->DstReg.CondMask;
 754                if (test_cc(machine->CondCodes[swizzle[0]], condMask) ||
 755                    test_cc(machine->CondCodes[swizzle[1]], condMask) ||
 756                    test_cc(machine->CondCodes[swizzle[2]], condMask) ||
 757                    test_cc(machine->CondCodes[swizzle[3]], condMask)) {
 758                   return GL_FALSE;
 759                }
 760             }
 761             break;
 762          case FP_OPCODE_LG2:  /* log base 2 */
 763             {
 764                GLfloat a[4], result[4];
 765                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 766                result[0] = result[1] = result[2] = result[3]
 767                   = LOG2(a[0]);
 768                store_vector4( inst, machine, result );
 769             }
 770             break;
 771          case FP_OPCODE_LIT:
 772             {
 773                GLfloat a[4], result[4];
 774                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 775                if (a[0] < 0.0F)
 776                   a[0] = 0.0F;
 777                if (a[1] < 0.0F)
 778                   a[1] = 0.0F;
 779                result[0] = 1.0F;
 780                result[1] = a[0];
 781                result[2] = (a[0] > 0.0F) ? (GLfloat)_mesa_pow(2.0, a[3]) : 0.0F;
 782                result[3] = 1.0F;
 783                store_vector4( inst, machine, result );
 784             }
 785             break;
 786          case FP_OPCODE_LRP:
 787             {
 788                GLfloat a[4], b[4], c[4], result[4];
 789                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 790                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 791                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 792                result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 793                result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 794                result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 795                result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 796                store_vector4( inst, machine, result );
 797             }
 798             break;
 799          case FP_OPCODE_MAD:
 800             {
 801                GLfloat a[4], b[4], c[4], result[4];
 802                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 803                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 804                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 805                result[0] = a[0] * b[0] + c[0];
 806                result[1] = a[1] * b[1] + c[1];
 807                result[2] = a[2] * b[2] + c[2];
 808                result[3] = a[3] * b[3] + c[3];
 809                store_vector4( inst, machine, result );
 810             }
 811             break;
 812          case FP_OPCODE_MAX:
 813             {
 814                GLfloat a[4], b[4], result[4];
 815                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 816                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 817                result[0] = MAX2(a[0], b[0]);
 818                result[1] = MAX2(a[1], b[1]);
 819                result[2] = MAX2(a[2], b[2]);
 820                result[3] = MAX2(a[3], b[3]);
 821                store_vector4( inst, machine, result );
 822             }
 823             break;
 824          case FP_OPCODE_MIN:
 825             {
 826                GLfloat a[4], b[4], result[4];
 827                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 828                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 829                result[0] = MIN2(a[0], b[0]);
 830                result[1] = MIN2(a[1], b[1]);
 831                result[2] = MIN2(a[2], b[2]);
 832                result[3] = MIN2(a[3], b[3]);
 833                store_vector4( inst, machine, result );
 834             }
 835             break;
 836          case FP_OPCODE_MOV:
 837             {
 838                GLfloat result[4];
 839                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, result );
 840                store_vector4( inst, machine, result );
 841             }
 842             break;
 843          case FP_OPCODE_MUL:
 844             {
 845                GLfloat a[4], b[4], result[4];
 846                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 847                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 848                result[0] = a[0] * b[0];
 849                result[1] = a[1] * b[1];
 850                result[2] = a[2] * b[2];
 851                result[3] = a[3] * b[3];
 852                store_vector4( inst, machine, result );
 853 #if DEBUG_FRAG
 854                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 855                       result[0], result[1], result[2], result[3],
 856                       a[0], a[1], a[2], a[3],
 857                       b[0], b[1], b[2], b[3]);
 858 #endif
 859             }
 860             break;
 861          case FP_OPCODE_PK2H: /* pack two 16-bit floats */
 862             /* XXX this is probably wrong */
 863             {
 864                GLfloat a[4], result[4];
 865                const GLuint *rawBits = (const GLuint *) a;
 866                GLuint *rawResult = (GLuint *) result;
 867                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 868                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 869                   = rawBits[0] | (rawBits[1] << 16);
 870                store_vector4( inst, machine, result );
 871             }
 872             break;
 873          case FP_OPCODE_PK2US: /* pack two GLushorts */
 874             {
 875                GLfloat a[4], result[4];
 876                GLuint usx, usy, *rawResult = (GLuint *) result;
 877                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 878                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 879                a[1] = CLAMP(a[0], 0.0F, 1.0F);
 880                usx = IROUND(a[0] * 65535.0F);
 881                usy = IROUND(a[1] * 65535.0F);
 882                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 883                   = usx | (usy << 16);
 884                store_vector4( inst, machine, result );
 885             }
 886             break;
 887          case FP_OPCODE_PK4B: /* pack four GLbytes */
 888             {
 889                GLfloat a[4], result[4];
 890                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 891                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 892                a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
 893                a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
 894                a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
 895                a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
 896                ubx = IROUND(127.0F * a[0] + 128.0F);
 897                uby = IROUND(127.0F * a[1] + 128.0F);
 898                ubz = IROUND(127.0F * a[2] + 128.0F);
 899                ubw = IROUND(127.0F * a[3] + 128.0F);
 900                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 901                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 902                store_vector4( inst, machine, result );
 903             }
 904             break;
 905          case FP_OPCODE_PK4UB: /* pack four GLubytes */
 906             {
 907                GLfloat a[4], result[4];
 908                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 909                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 910                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 911                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 912                a[2] = CLAMP(a[2], 0.0F, 1.0F);
 913                a[3] = CLAMP(a[3], 0.0F, 1.0F);
 914                ubx = IROUND(255.0F * a[0]);
 915                uby = IROUND(255.0F * a[1]);
 916                ubz = IROUND(255.0F * a[2]);
 917                ubw = IROUND(255.0F * a[3]);
 918                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 919                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 920                store_vector4( inst, machine, result );
 921             }
 922             break;
 923          case FP_OPCODE_POW:
 924             {
 925                GLfloat a[4], b[4], result[4];
 926                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 927                fetch_vector1( ctx, &inst->SrcReg[1], machine, program, b );
 928                result[0] = result[1] = result[2] = result[3]
 929                   = (GLfloat)_mesa_pow(a[0], b[0]);
 930                store_vector4( inst, machine, result );
 931             }
 932             break;
 933          case FP_OPCODE_RCP:
 934             {
 935                GLfloat a[4], result[4];
 936                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 937 #if DEBUG_FRAG
 938                if (a[0] == 0)
 939                   printf("RCP(0)\n");
 940                else if (IS_INF_OR_NAN(a[0]))
 941                   printf("RCP(inf)\n");
 942 #endif
 943                result[0] = result[1] = result[2] = result[3]
 944                   = 1.0F / a[0];
 945                store_vector4( inst, machine, result );
 946             }
 947             break;
 948          case FP_OPCODE_RFL:
 949             {
 950                GLfloat axis[4], dir[4], result[4], tmp[4];
 951                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, axis );
 952                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dir );
 953                tmp[3] = axis[0] * axis[0]
 954                       + axis[1] * axis[1]
 955                       + axis[2] * axis[2];
 956                tmp[0] = (2.0F * (axis[0] * dir[0] +
 957                                  axis[1] * dir[1] +
 958                                  axis[2] * dir[2])) / tmp[3];
 959                result[0] = tmp[0] * axis[0] - dir[0];
 960                result[1] = tmp[0] * axis[1] - dir[1];
 961                result[2] = tmp[0] * axis[2] - dir[2];
 962                /* result[3] is never written! XXX enforce in parser! */
 963                store_vector4( inst, machine, result );
 964             }
 965             break;
 966          case FP_OPCODE_RSQ: /* 1 / sqrt() */
 967             {
 968                GLfloat a[4], result[4];
 969                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 970                result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
 971                store_vector4( inst, machine, result );
 972 #if DEBUG_FRAG
 973                printf("RSQ %g = 1/sqrt(%g)\n", result[0], a[0]);
 974 #endif
 975             }
 976             break;
 977          case FP_OPCODE_SCS: /* sine and cos */
 978             {
 979                GLfloat a[4], result[4];
 980                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 981                result[0] = (GLfloat)cos(a[0]);
 982                result[1] = (GLfloat)sin(a[0]);
 983                result[2] = 0.0;  /* undefined! */
 984                result[3] = 0.0;  /* undefined! */
 985                store_vector4( inst, machine, result );
 986             }
 987             break;
 988          case FP_OPCODE_SEQ: /* set on equal */
 989             {
 990                GLfloat a[4], b[4], result[4];
 991                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 992                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 993                result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
 994                result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
 995                result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
 996                result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
 997                store_vector4( inst, machine, result );
 998             }
 999             break;
1000          case FP_OPCODE_SFL: /* set false, operands ignored */
1001             {
1002                static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1003                store_vector4( inst, machine, result );
1004             }
1005             break;
1006          case FP_OPCODE_SGE: /* set on greater or equal */
1007             {
1008                GLfloat a[4], b[4], result[4];
1009                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1010                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1011                result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1012                result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1013                result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1014                result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1015                store_vector4( inst, machine, result );
1016             }
1017             break;
1018          case FP_OPCODE_SGT: /* set on greater */
1019             {
1020                GLfloat a[4], b[4], result[4];
1021                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1022                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1023                result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1024                result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1025                result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1026                result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1027                store_vector4( inst, machine, result );
1028             }
1029             break;
1030          case FP_OPCODE_SIN:
1031             {
1032                GLfloat a[4], result[4];
1033                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1034                result[0] = result[1] = result[2] =
1035                        result[3] = (GLfloat)_mesa_sin(a[0]);
1036                store_vector4( inst, machine, result );
1037             }
1038             break;
1039          case FP_OPCODE_SLE: /* set on less or equal */
1040             {
1041                GLfloat a[4], b[4], result[4];
1042                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1043                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1044                result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1045                result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1046                result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1047                result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1048                store_vector4( inst, machine, result );
1049             }
1050             break;
1051          case FP_OPCODE_SLT: /* set on less */
1052             {
1053                GLfloat a[4], b[4], result[4];
1054                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1055                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1056                result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1057                result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1058                result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1059                result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1060                store_vector4( inst, machine, result );
1061             }
1062             break;
1063          case FP_OPCODE_SNE: /* set on not equal */
1064             {
1065                GLfloat a[4], b[4], result[4];
1066                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1067                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1068                result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1069                result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1070                result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1071                result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1072                store_vector4( inst, machine, result );
1073             }
1074             break;
1075          case FP_OPCODE_STR: /* set true, operands ignored */
1076             {
1077                static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1078                store_vector4( inst, machine, result );
1079             }
1080             break;
1081          case FP_OPCODE_SUB:
1082             {
1083                GLfloat a[4], b[4], result[4];
1084                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1085                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1086                result[0] = a[0] - b[0];
1087                result[1] = a[1] - b[1];
1088                result[2] = a[2] - b[2];
1089                result[3] = a[3] - b[3];
1090                store_vector4( inst, machine, result );
1091             }
1092             break;
1093          case FP_OPCODE_SWZ:
1094             {
1095                const struct fp_src_register *source = &inst->SrcReg[0];
1096                const GLfloat *src = get_register_pointer(ctx, source,
1097                                                          machine, program);
1098                GLfloat result[4];
1099                GLuint i;
1100
1101                /* do extended swizzling here */
1102                for (i = 0; i < 3; i++) {
1103                   if (source->Swizzle[i] == SWIZZLE_ZERO)
1104                      result[i] = 0.0;
1105                   else if (source->Swizzle[i] == SWIZZLE_ONE)
1106                      result[i] = -1.0;
1107                   else
1108                      result[i] = -src[source->Swizzle[i]];
1109                   if (source->NegateBase)
1110                      result[i] = -result[i];
1111                }
1112                store_vector4( inst, machine, result );
1113             }
1114             break;
1115          case FP_OPCODE_TEX:
1116             /* Texel lookup */
1117             {
1118                GLfloat texcoord[4], color[4];
1119                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1120                /* XXX: Undo perspective divide from interpolate_texcoords() */
1121                fetch_texel( ctx, texcoord,
1122                             span->array->lambda[inst->TexSrcUnit][column],
1123                             inst->TexSrcUnit, color );
1124                store_vector4( inst, machine, color );
1125             }
1126             break;
1127          case FP_OPCODE_TXB:
1128             /* Texel lookup with LOD bias */
1129             {
1130                GLfloat texcoord[4], color[4], bias, lambda;
1131
1132                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1133                /* texcoord[3] is the bias to add to lambda */
1134                bias = ctx->Texture.Unit[inst->TexSrcUnit].LodBias
1135                     + ctx->Texture.Unit[inst->TexSrcUnit]._Current->LodBias
1136                     + texcoord[3];
1137                lambda = span->array->lambda[inst->TexSrcUnit][column] + bias;
1138                fetch_texel( ctx, texcoord, lambda,
1139                             inst->TexSrcUnit, color );
1140                store_vector4( inst, machine, color );
1141             }
1142             break;
1143          case FP_OPCODE_TXD:
1144             /* Texture lookup w/ partial derivatives for LOD */
1145             {
1146                GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1147                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1148                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dtdx );
1149                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, dtdy );
1150                fetch_texel_deriv( ctx, texcoord, dtdx, dtdy, inst->TexSrcUnit,
1151                                   color );
1152                store_vector4( inst, machine, color );
1153             }
1154             break;
1155          case FP_OPCODE_TXP:
1156             /* Texture lookup w/ perspective divide */
1157             {
1158                GLfloat texcoord[4], color[4];
1159                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1160                /* Already did perspective divide in interpolate_texcoords() */
1161                fetch_texel( ctx, texcoord,
1162                             span->array->lambda[inst->TexSrcUnit][column],
1163                             inst->TexSrcUnit, color );
1164                store_vector4( inst, machine, color );
1165             }
1166             break;
1167          case FP_OPCODE_UP2H: /* unpack two 16-bit floats */
1168             /* XXX this is probably wrong */
1169             {
1170                GLfloat a[4], result[4];
1171                const GLuint *rawBits = (const GLuint *) a;
1172                GLuint *rawResult = (GLuint *) result;
1173                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1174                rawResult[0] = rawBits[0] & 0xffff;
1175                rawResult[1] = (rawBits[0] >> 16) & 0xffff;
1176                rawResult[2] = rawBits[0] & 0xffff;
1177                rawResult[3] = (rawBits[0] >> 16) & 0xffff;
1178                store_vector4( inst, machine, result );
1179             }
1180             break;
1181          case FP_OPCODE_UP2US: /* unpack two GLushorts */
1182             {
1183                GLfloat a[4], result[4];
1184                const GLuint *rawBits = (const GLuint *) a;
1185                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1186                result[0] = (GLfloat) ((rawBits[0] >>  0) & 0xffff) / 65535.0F;
1187                result[1] = (GLfloat) ((rawBits[0] >> 16) & 0xffff) / 65535.0F;
1188                result[2] = result[0];
1189                result[3] = result[1];
1190                store_vector4( inst, machine, result );
1191             }
1192             break;
1193          case FP_OPCODE_UP4B: /* unpack four GLbytes */
1194             {
1195                GLfloat a[4], result[4];
1196                const GLuint *rawBits = (const GLuint *) a;
1197                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1198                result[0] = (((rawBits[0] >>  0) & 0xff) - 128) / 127.0F;
1199                result[0] = (((rawBits[0] >>  8) & 0xff) - 128) / 127.0F;
1200                result[0] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1201                result[0] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1202                store_vector4( inst, machine, result );
1203             }
1204             break;
1205          case FP_OPCODE_UP4UB: /* unpack four GLubytes */
1206             {
1207                GLfloat a[4], result[4];
1208                const GLuint *rawBits = (const GLuint *) a;
1209                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1210                result[0] = ((rawBits[0] >>  0) & 0xff) / 255.0F;
1211                result[0] = ((rawBits[0] >>  8) & 0xff) / 255.0F;
1212                result[0] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1213                result[0] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1214                store_vector4( inst, machine, result );
1215             }
1216             break;
1217          case FP_OPCODE_X2D: /* 2-D matrix transform */
1218             {
1219                GLfloat a[4], b[4], c[4], result[4];
1220                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1221                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1222                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
1223                result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1224                result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1225                result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1226                result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1227                store_vector4( inst, machine, result );
1228             }
1229             break;
1230          case FP_OPCODE_END:
1231             return GL_TRUE;
1232          default:
1233             _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1234                           inst->Opcode);
1235             return GL_TRUE; /* return value doesn't matter */
1236       }
1237    }
1238    return GL_TRUE;
1239 }
1240
1241
1242 static void
1243 init_machine( GLcontext *ctx, struct fp_machine *machine,
1244               const struct fragment_program *program,
1245               const struct sw_span *span, GLuint col )
1246 {
1247    GLuint inputsRead = program->InputsRead;
1248    GLuint u;
1249
1250    if (ctx->FragmentProgram.CallbackEnabled)
1251       inputsRead = ~0;
1252
1253    /* Clear temporary registers */
1254    _mesa_bzero(machine->Temporaries,
1255                MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
1256
1257    /* Load input registers */
1258    if (inputsRead & (1 << FRAG_ATTRIB_WPOS)) {
1259       GLfloat *wpos = machine->Inputs[FRAG_ATTRIB_WPOS];
1260       wpos[0] = (GLfloat) span->x + col;
1261       wpos[1] = (GLfloat) span->y;
1262       wpos[2] = (GLfloat) span->array->z[col] / ctx->DepthMaxF;
1263       wpos[3] = span->w + col * span->dwdx;
1264    }
1265    if (inputsRead & (1 << FRAG_ATTRIB_COL0)) {
1266       GLfloat *col0 = machine->Inputs[FRAG_ATTRIB_COL0];
1267       col0[0] = CHAN_TO_FLOAT(span->array->rgba[col][RCOMP]);
1268       col0[1] = CHAN_TO_FLOAT(span->array->rgba[col][GCOMP]);
1269       col0[2] = CHAN_TO_FLOAT(span->array->rgba[col][BCOMP]);
1270       col0[3] = CHAN_TO_FLOAT(span->array->rgba[col][ACOMP]);
1271    }
1272    if (inputsRead & (1 << FRAG_ATTRIB_COL1)) {
1273       GLfloat *col1 = machine->Inputs[FRAG_ATTRIB_COL1];
1274       col1[0] = CHAN_TO_FLOAT(span->array->spec[col][RCOMP]);
1275       col1[1] = CHAN_TO_FLOAT(span->array->spec[col][GCOMP]);
1276       col1[2] = CHAN_TO_FLOAT(span->array->spec[col][BCOMP]);
1277       col1[3] = CHAN_TO_FLOAT(span->array->spec[col][ACOMP]);
1278    }
1279    if (inputsRead & (1 << FRAG_ATTRIB_FOGC)) {
1280       GLfloat *fogc = machine->Inputs[FRAG_ATTRIB_FOGC];
1281       fogc[0] = span->array->fog[col];
1282       fogc[1] = 0.0F;
1283       fogc[2] = 0.0F;
1284       fogc[3] = 0.0F;
1285    }
1286    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
1287       if (inputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
1288          GLfloat *tex = machine->Inputs[FRAG_ATTRIB_TEX0 + u];
1289          /*ASSERT(ctx->Texture._EnabledCoordUnits & (1 << u));*/
1290          COPY_4V(tex, span->array->texcoords[u][col]);
1291          /*ASSERT(tex[0] != 0 || tex[1] != 0 || tex[2] != 0);*/
1292       }
1293    }
1294
1295    /* init condition codes */
1296    machine->CondCodes[0] = COND_EQ;
1297    machine->CondCodes[1] = COND_EQ;
1298    machine->CondCodes[2] = COND_EQ;
1299    machine->CondCodes[3] = COND_EQ;
1300 }
1301
1302
1303 void
1304 _swrast_exec_nv_fragment_program( GLcontext *ctx, struct sw_span *span )
1305 {
1306    const struct fragment_program *program = ctx->FragmentProgram.Current;
1307    GLuint i;
1308
1309    ctx->_CurrentProgram = GL_FRAGMENT_PROGRAM_ARB; /* or NV, doesn't matter */
1310
1311    for (i = 0; i < span->end; i++) {
1312       if (span->array->mask[i]) {
1313          init_machine(ctx, &ctx->FragmentProgram.Machine,
1314                       ctx->FragmentProgram.Current, span, i);
1315
1316          if (!execute_program(ctx, program, ~0,
1317                               &ctx->FragmentProgram.Machine, span, i)) {
1318             span->array->mask[i] = GL_FALSE;  /* killed fragment */
1319          }
1320
1321          /* Store output registers */
1322          {
1323             const GLfloat *colOut
1324                = ctx->FragmentProgram.Machine.Outputs[FRAG_OUTPUT_COLR];
1325             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][RCOMP], colOut[0]);
1326             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][GCOMP], colOut[1]);
1327             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][BCOMP], colOut[2]);
1328             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][ACOMP], colOut[3]);
1329          }
1330          /* depth value */
1331          if (program->OutputsWritten & (1 << FRAG_OUTPUT_DEPR))
1332             span->array->z[i] = IROUND(ctx->FragmentProgram.Machine.Outputs[FRAG_OUTPUT_DEPR][0] * ctx->DepthMaxF);
1333       }
1334    }
1335
1336    ctx->_CurrentProgram = 0;
1337 }
1338