src/mesa/swrast/s_nvfragprog.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  5.1
   4  *
   5  * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25
  26 #include "glheader.h"
  27 #include "colormac.h"
  28 #include "context.h"
  29 #include "nvfragprog.h"
  30 #include "macros.h"
  31
  32 #include "s_nvfragprog.h"
  33 #include "s_span.h"
  34 #include "s_texture.h"
  35
  36
  37 /* if 1, print some debugging info */
  38 #define DEBUG_FRAG 0
  39
  40
  41 /**
  42  * Fetch a texel.
  43  */
  44 static void
  45 fetch_texel( GLcontext *ctx, const GLfloat texcoord[4], GLfloat lambda,
  46              GLuint unit, GLfloat color[4] )
  47 {
  48    GLchan rgba[4];
  49    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  50
  51    swrast->TextureSample[unit](ctx, unit, ctx->Texture.Unit[unit]._Current,
  52                                1, (const GLfloat (*)[4]) texcoord,
  53                                &lambda, &rgba);
  54    color[0] = CHAN_TO_FLOAT(rgba[0]);
  55    color[1] = CHAN_TO_FLOAT(rgba[1]);
  56    color[2] = CHAN_TO_FLOAT(rgba[2]);
  57    color[3] = CHAN_TO_FLOAT(rgba[3]);
  58 }
  59
  60
  61 /**
  62  * Fetch a texel with the given partial derivatives to compute a level
  63  * of detail in the mipmap.
  64  */
  65 static void
  66 fetch_texel_deriv( GLcontext *ctx, const GLfloat texcoord[4],
  67                    const GLfloat texdx[4], const GLfloat texdy[4],
  68                    GLuint unit, GLfloat color[4] )
  69 {
  70    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  71    const struct gl_texture_object *texObj = ctx->Texture.Unit[unit]._Current;
  72    const struct gl_texture_image *texImg = texObj->Image[texObj->BaseLevel];
  73    const GLfloat texW = (GLfloat) texImg->WidthScale;
  74    const GLfloat texH = (GLfloat) texImg->HeightScale;
  75    GLchan rgba[4];
  76
  77    GLfloat lambda = _swrast_compute_lambda(texdx[0], texdy[0], /* ds/dx, ds/dy */
  78                                          texdx[1], texdy[1], /* dt/dx, dt/dy */
  79                                          texdx[3], texdy[2], /* dq/dx, dq/dy */
  80                                          texW, texH,
  81                                          texcoord[0], texcoord[1], texcoord[3],
  82                                          1.0F / texcoord[3]);
  83
  84    swrast->TextureSample[unit](ctx, unit, ctx->Texture.Unit[unit]._Current,
  85                                1, (const GLfloat (*)[4]) texcoord,
  86                                &lambda, &rgba);
  87    color[0] = CHAN_TO_FLOAT(rgba[0]);
  88    color[1] = CHAN_TO_FLOAT(rgba[1]);
  89    color[2] = CHAN_TO_FLOAT(rgba[2]);
  90    color[3] = CHAN_TO_FLOAT(rgba[3]);
  91 }
  92
  93
  94
  95 /**
  96  * Fetch a 4-element float vector from the given source register.
  97  * Apply swizzling and negating as needed.
  98  */
  99 static void
 100 fetch_vector4( const struct fp_src_register *source,
 101                const struct fp_machine *machine,
 102                const struct fragment_program *program,
 103                GLfloat result[4] )
 104 {
 105    const GLfloat *src;
 106
 107    if (source->IsParameter) {
 108       src = program->Parameters[source->Register].Values;
 109    }
 110    else {
 111       src = machine->Registers[source->Register];
 112    }
 113
 114    result[0] = src[source->Swizzle[0]];
 115    result[1] = src[source->Swizzle[1]];
 116    result[2] = src[source->Swizzle[2]];
 117    result[3] = src[source->Swizzle[3]];
 118
 119    if (source->NegateBase) {
 120       result[0] = -result[0];
 121       result[1] = -result[1];
 122       result[2] = -result[2];
 123       result[3] = -result[3];
 124    }
 125    if (source->Abs) {
 126       result[0] = FABSF(result[0]);
 127       result[1] = FABSF(result[1]);
 128       result[2] = FABSF(result[2]);
 129       result[3] = FABSF(result[3]);
 130    }
 131    if (source->NegateAbs) {
 132       result[0] = -result[0];
 133       result[1] = -result[1];
 134       result[2] = -result[2];
 135       result[3] = -result[3];
 136    }
 137 }
 138
 139
 140 /**
 141  * Fetch the derivative with respect to X for the given register.
 142  * \return GL_TRUE if it was easily computed or GL_FALSE if we
 143  * need to execute another instance of the program (ugh)!
 144  */
 145 static GLboolean
 146 fetch_vector4_deriv( const struct fp_src_register *source,
 147                      const struct sw_span *span,
 148                      char xOrY, GLfloat result[4] )
 149 {
 150    GLfloat src[4];
 151
 152    ASSERT(xOrY == 'X' || xOrY == 'Y');
 153
 154    switch (source->Register) {
 155    case FRAG_ATTRIB_WPOS:
 156       if (xOrY == 'X') {
 157          src[0] = 1.0;
 158          src[1] = 0.0;
 159          src[2] = span->dzdx;
 160          src[3] = span->dwdx;
 161       }
 162       else {
 163          src[0] = 0.0;
 164          src[1] = 1.0;
 165          src[2] = span->dzdy;
 166          src[3] = span->dwdy;
 167       }
 168       break;
 169    case FRAG_ATTRIB_COL0:
 170       if (xOrY == 'X') {
 171          src[0] = span->drdx * (1.0F / CHAN_MAXF);
 172          src[1] = span->dgdx * (1.0F / CHAN_MAXF);
 173          src[2] = span->dbdx * (1.0F / CHAN_MAXF);
 174          src[3] = span->dadx * (1.0F / CHAN_MAXF);
 175       }
 176       else {
 177          src[0] = span->drdy * (1.0F / CHAN_MAXF);
 178          src[1] = span->dgdy * (1.0F / CHAN_MAXF);
 179          src[2] = span->dbdy * (1.0F / CHAN_MAXF);
 180          src[3] = span->dady * (1.0F / CHAN_MAXF);
 181       }
 182       break;
 183    case FRAG_ATTRIB_COL1:
 184       if (xOrY == 'X') {
 185          src[0] = span->dsrdx * (1.0F / CHAN_MAXF);
 186          src[1] = span->dsgdx * (1.0F / CHAN_MAXF);
 187          src[2] = span->dsbdx * (1.0F / CHAN_MAXF);
 188          src[3] = 0.0; /* XXX need this */
 189       }
 190       else {
 191          src[0] = span->dsrdy * (1.0F / CHAN_MAXF);
 192          src[1] = span->dsgdy * (1.0F / CHAN_MAXF);
 193          src[2] = span->dsbdy * (1.0F / CHAN_MAXF);
 194          src[3] = 0.0; /* XXX need this */
 195       }
 196       break;
 197    case FRAG_ATTRIB_FOGC:
 198       if (xOrY == 'X') {
 199          src[0] = span->dfogdx;
 200          src[1] = 0.0;
 201          src[2] = 0.0;
 202          src[3] = 0.0;
 203       }
 204       else {
 205          src[0] = span->dfogdy;
 206          src[1] = 0.0;
 207          src[2] = 0.0;
 208          src[3] = 0.0;
 209       }
 210       break;
 211    case FRAG_ATTRIB_TEX0:
 212    case FRAG_ATTRIB_TEX1:
 213    case FRAG_ATTRIB_TEX2:
 214    case FRAG_ATTRIB_TEX3:
 215    case FRAG_ATTRIB_TEX4:
 216    case FRAG_ATTRIB_TEX5:
 217    case FRAG_ATTRIB_TEX6:
 218    case FRAG_ATTRIB_TEX7:
 219       if (xOrY == 'X') {
 220          const GLuint u = source->Register - FRAG_ATTRIB_TEX0;
 221          src[0] = span->texStepX[u][0] * (1.0F / CHAN_MAXF);
 222          src[1] = span->texStepX[u][1] * (1.0F / CHAN_MAXF);
 223          src[2] = span->texStepX[u][2] * (1.0F / CHAN_MAXF);
 224          src[3] = span->texStepX[u][3] * (1.0F / CHAN_MAXF);
 225       }
 226       else {
 227          const GLuint u = source->Register - FRAG_ATTRIB_TEX0;
 228          src[0] = span->texStepY[u][0] * (1.0F / CHAN_MAXF);
 229          src[1] = span->texStepY[u][1] * (1.0F / CHAN_MAXF);
 230          src[2] = span->texStepY[u][2] * (1.0F / CHAN_MAXF);
 231          src[3] = span->texStepY[u][3] * (1.0F / CHAN_MAXF);
 232       }
 233       break;
 234    default:
 235       return GL_FALSE;
 236    }
 237
 238    result[0] = src[source->Swizzle[0]];
 239    result[1] = src[source->Swizzle[1]];
 240    result[2] = src[source->Swizzle[2]];
 241    result[3] = src[source->Swizzle[3]];
 242
 243    if (source->NegateBase) {
 244       result[0] = -result[0];
 245       result[1] = -result[1];
 246       result[2] = -result[2];
 247       result[3] = -result[3];
 248    }
 249    if (source->Abs) {
 250       result[0] = FABSF(result[0]);
 251       result[1] = FABSF(result[1]);
 252       result[2] = FABSF(result[2]);
 253       result[3] = FABSF(result[3]);
 254    }
 255    if (source->NegateAbs) {
 256       result[0] = -result[0];
 257       result[1] = -result[1];
 258       result[2] = -result[2];
 259       result[3] = -result[3];
 260    }
 261    return GL_TRUE;
 262 }
 263
 264
 265 /**
 266  * As above, but only return result[0] element.
 267  */
 268 static void
 269 fetch_vector1( const struct fp_src_register *source,
 270                const struct fp_machine *machine,
 271                const struct fragment_program *program,
 272                GLfloat result[4] )
 273 {
 274    const GLfloat *src;
 275
 276    if (source->IsParameter) {
 277       src = program->Parameters[source->Register].Values;
 278    }
 279    else {
 280       src = machine->Registers[source->Register];
 281    }
 282
 283    result[0] = src[source->Swizzle[0]];
 284
 285    if (source->NegateBase) {
 286       result[0] = -result[0];
 287    }
 288    if (source->Abs) {
 289       result[0] = FABSF(result[0]);
 290    }
 291    if (source->NegateAbs) {
 292       result[0] = -result[0];
 293    }
 294 }
 295
 296
 297 /*
 298  * Test value against zero and return GT, LT, EQ or UN if NaN.
 299  */
 300 static INLINE GLuint
 301 generate_cc( float value )
 302 {
 303    if (value != value)
 304       return COND_UN;  /* NaN */
 305    if (value > 0.0F)
 306       return COND_GT;
 307    if (value < 0.0F)
 308       return COND_LT;
 309    return COND_EQ;
 310 }
 311
 312 /*
 313  * Test if the ccMaskRule is satisfied by the given condition code.
 314  * Used to mask destination writes according to the current condition codee.
 315  */
 316 static INLINE GLboolean
 317 test_cc(GLuint condCode, GLuint ccMaskRule)
 318 {
 319    switch (ccMaskRule) {
 320    case COND_EQ: return (condCode == COND_EQ);
 321    case COND_NE: return (condCode != COND_EQ);
 322    case COND_LT: return (condCode == COND_LT);
 323    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 324    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 325    case COND_GT: return (condCode == COND_GT);
 326    case COND_TR: return GL_TRUE;
 327    case COND_FL: return GL_FALSE;
 328    default:      return GL_TRUE;
 329    }
 330 }
 331
 332
 333 /**
 334  * Store 4 floats into a register.  Observe the instructions saturate and
 335  * set-condition-code flags.
 336  */
 337 static void
 338 store_vector4( const struct fp_instruction *inst,
 339                struct fp_machine *machine,
 340                const GLfloat value[4] )
 341 {
 342    const struct fp_dst_register *dest = &(inst->DstReg);
 343    const GLboolean clamp = inst->Saturate;
 344    const GLboolean updateCC = inst->UpdateCondRegister;
 345    GLfloat *dstReg = machine->Registers[dest->Register];
 346    GLfloat clampedValue[4];
 347    const GLboolean *writeMask = dest->WriteMask;
 348    GLboolean condWriteMask[4];
 349
 350 #if DEBUG_FRAG
 351    if (value[0] > 1.0e10 ||
 352        IS_INF_OR_NAN(value[0]) ||
 353        IS_INF_OR_NAN(value[1]) ||
 354        IS_INF_OR_NAN(value[2]) ||
 355        IS_INF_OR_NAN(value[3])  )
 356       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 357 #endif
 358
 359    if (clamp) {
 360       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 361       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 362       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 363       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 364       value = clampedValue;
 365    }
 366
 367    if (dest->CondMask != COND_TR) {
 368       condWriteMask[0] = writeMask[0]
 369          && test_cc(machine->CondCodes[dest->CondSwizzle[0]], dest->CondMask);
 370       condWriteMask[1] = writeMask[1]
 371          && test_cc(machine->CondCodes[dest->CondSwizzle[1]], dest->CondMask);
 372       condWriteMask[2] = writeMask[2]
 373          && test_cc(machine->CondCodes[dest->CondSwizzle[2]], dest->CondMask);
 374       condWriteMask[3] = writeMask[3]
 375          && test_cc(machine->CondCodes[dest->CondSwizzle[3]], dest->CondMask);
 376       writeMask = condWriteMask;
 377    }
 378
 379    if (writeMask[0]) {
 380       dstReg[0] = value[0];
 381       if (updateCC)
 382          machine->CondCodes[0] = generate_cc(value[0]);
 383    }
 384    if (writeMask[1]) {
 385       dstReg[1] = value[1];
 386       if (updateCC)
 387          machine->CondCodes[1] = generate_cc(value[1]);
 388    }
 389    if (writeMask[2]) {
 390       dstReg[2] = value[2];
 391       if (updateCC)
 392          machine->CondCodes[2] = generate_cc(value[2]);
 393    }
 394    if (writeMask[3]) {
 395       dstReg[3] = value[3];
 396       if (updateCC)
 397          machine->CondCodes[3] = generate_cc(value[3]);
 398    }
 399 }
 400
 401
 402 /**
 403  * Initialize a new machine state instance from an existing one, adding
 404  * the partial derivatives onto the input registers.
 405  * Used to implement DDX and DDY instructions in non-trivial cases.
 406  */
 407 static void
 408 init_machine_deriv( GLcontext *ctx,
 409                     const struct fp_machine *machine,
 410                     const struct fragment_program *program,
 411                     const struct sw_span *span, char xOrY,
 412                     struct fp_machine *dMachine )
 413 {
 414    GLuint u;
 415
 416    ASSERT(xOrY == 'X' || xOrY == 'Y');
 417
 418    /* copy existing machine */
 419    _mesa_memcpy(dMachine, machine, sizeof(struct fp_machine));
 420
 421    /* Clear temporary registers */
 422    _mesa_bzero((GLfloat*) (machine->Registers + FP_TEMP_REG_START) ,
 423                MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
 424
 425    /* Add derivatives */
 426    if (program->InputsRead & (1 << FRAG_ATTRIB_WPOS)) {
 427       GLfloat *wpos = (GLfloat*) machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_WPOS];
 428       if (xOrY == 'X') {
 429          wpos[0] += 1.0F;
 430          wpos[1] += 0.0F;
 431          wpos[2] += span->dzdx;
 432          wpos[3] += span->dwdx;
 433       }
 434       else {
 435          wpos[0] += 0.0F;
 436          wpos[1] += 1.0F;
 437          wpos[2] += span->dzdy;
 438          wpos[3] += span->dwdy;
 439       }
 440    }
 441    if (program->InputsRead & (1 << FRAG_ATTRIB_COL0)) {
 442       GLfloat *col0 = (GLfloat*) machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_COL0];
 443       if (xOrY == 'X') {
 444          col0[0] += span->drdx * (1.0F / CHAN_MAXF);
 445          col0[1] += span->dgdx * (1.0F / CHAN_MAXF);
 446          col0[2] += span->dbdx * (1.0F / CHAN_MAXF);
 447          col0[3] += span->dadx * (1.0F / CHAN_MAXF);
 448       }
 449       else {
 450          col0[0] += span->drdy * (1.0F / CHAN_MAXF);
 451          col0[1] += span->dgdy * (1.0F / CHAN_MAXF);
 452          col0[2] += span->dbdy * (1.0F / CHAN_MAXF);
 453          col0[3] += span->dady * (1.0F / CHAN_MAXF);
 454       }
 455    }
 456    if (program->InputsRead & (1 << FRAG_ATTRIB_COL1)) {
 457       GLfloat *col1 = (GLfloat*) machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_COL1];
 458       if (xOrY == 'X') {
 459          col1[0] += span->dsrdx * (1.0F / CHAN_MAXF);
 460          col1[1] += span->dsgdx * (1.0F / CHAN_MAXF);
 461          col1[2] += span->dsbdx * (1.0F / CHAN_MAXF);
 462          col1[3] += 0.0; /*XXX fix */
 463       }
 464       else {
 465          col1[0] += span->dsrdy * (1.0F / CHAN_MAXF);
 466          col1[1] += span->dsgdy * (1.0F / CHAN_MAXF);
 467          col1[2] += span->dsbdy * (1.0F / CHAN_MAXF);
 468          col1[3] += 0.0; /*XXX fix */
 469       }
 470    }
 471    if (program->InputsRead & (1 << FRAG_ATTRIB_FOGC)) {
 472       GLfloat *fogc = (GLfloat*) machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_FOGC];
 473       if (xOrY == 'X') {
 474          fogc[0] += span->dfogdx;
 475       }
 476       else {
 477          fogc[0] += span->dfogdy;
 478       }
 479    }
 480    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
 481       if (program->InputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
 482          GLfloat *tex = (GLfloat*) machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_TEX0+u];
 483          if (xOrY == 'X') {
 484             tex[0] += span->texStepX[u][0];
 485             tex[1] += span->texStepX[u][1];
 486             tex[2] += span->texStepX[u][2];
 487             tex[3] += span->texStepX[u][3];
 488          }
 489          else {
 490             tex[0] += span->texStepY[u][0];
 491             tex[1] += span->texStepY[u][1];
 492             tex[2] += span->texStepY[u][2];
 493             tex[3] += span->texStepY[u][3];
 494          }
 495       }
 496    }
 497
 498    /* init condition codes */
 499    dMachine->CondCodes[0] = COND_EQ;
 500    dMachine->CondCodes[1] = COND_EQ;
 501    dMachine->CondCodes[2] = COND_EQ;
 502    dMachine->CondCodes[3] = COND_EQ;
 503 }
 504
 505
 506 /**
  507  * Execute the given fragment program.
 508  * NOTE: we do everything in single-precision floating point; we don't
 509  * currently observe the single/half/fixed-precision qualifiers.
 510  * \param ctx - rendering context
 511  * \param program - the fragment program to execute
 512  * \param machine - machine state (register file)
 513  * \param maxInst - max number of instructions to execute
 514  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 515  */
 516 static GLboolean
 517 execute_program( GLcontext *ctx,
 518                  const struct fragment_program *program, GLuint maxInst,
 519                  struct fp_machine *machine, const struct sw_span *span,
 520                  GLuint column )
 521 {
 522    GLuint pc;
 523
 524 #if DEBUG_FRAG
 525    printf("execute fragment program --------------------\n");
 526 #endif
 527
 528    for (pc = 0; pc < maxInst; pc++) {
 529       const struct fp_instruction *inst = program->Instructions + pc;
 530
 531       if (ctx->FragmentProgram.CallbackEnabled &&
 532           ctx->FragmentProgram.Callback) {
 533          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 534          ctx->FragmentProgram.Callback(program->Base.Target,
 535                                        ctx->FragmentProgram.CallbackData);
 536       }
 537
 538       switch (inst->Opcode) {
 539          case FP_OPCODE_ADD:
 540             {
 541                GLfloat a[4], b[4], result[4];
 542                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 543                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 544                result[0] = a[0] + b[0];
 545                result[1] = a[1] + b[1];
 546                result[2] = a[2] + b[2];
 547                result[3] = a[3] + b[3];
 548                store_vector4( inst, machine, result );
 549             }
 550             break;
 551          case FP_OPCODE_COS:
 552             {
 553                GLfloat a[4], result[4];
 554                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 555                result[0] = result[1] = result[2] = result[3] = _mesa_cos(a[0]);
 556                store_vector4( inst, machine, result );
 557             }
 558             break;
 559          case FP_OPCODE_DDX: /* Partial derivative with respect to X */
 560             {
 561                GLfloat a[4], aNext[4], result[4];
 562                struct fp_machine dMachine;
 563                if (!fetch_vector4_deriv(&inst->SrcReg[0], span, 'X', result)) {
 564                   /* This is tricky.  Make a copy of the current machine state,
 565                    * increment the input registers by the dx or dy partial
 566                    * derivatives, then re-execute the program up to the
 567                    * preceeding instruction, then fetch the source register.
 568                    * Finally, find the difference in the register values for
 569                    * the original and derivative runs.
 570                    */
 571                   fetch_vector4( &inst->SrcReg[0], machine, program, a);
 572                   init_machine_deriv(ctx, machine, program, span,
 573                                      'X', &dMachine);
 574                   execute_program(ctx, program, pc, &dMachine, span, column);
 575                   fetch_vector4( &inst->SrcReg[0], &dMachine, program, aNext );
 576                   result[0] = aNext[0] - a[0];
 577                   result[1] = aNext[1] - a[1];
 578                   result[2] = aNext[2] - a[2];
 579                   result[3] = aNext[3] - a[3];
 580                }
 581                store_vector4( inst, machine, result );
 582             }
 583             break;
 584          case FP_OPCODE_DDY: /* Partial derivative with respect to Y */
 585             {
 586                GLfloat a[4], aNext[4], result[4];
 587                struct fp_machine dMachine;
 588                if (!fetch_vector4_deriv(&inst->SrcReg[0], span, 'Y', result)) {
 589                   init_machine_deriv(ctx, machine, program, span,
 590                                      'Y', &dMachine);
 591                   fetch_vector4( &inst->SrcReg[0], machine, program, a);
 592                   execute_program(ctx, program, pc, &dMachine, span, column);
 593                   fetch_vector4( &inst->SrcReg[0], &dMachine, program, aNext );
 594                   result[0] = aNext[0] - a[0];
 595                   result[1] = aNext[1] - a[1];
 596                   result[2] = aNext[2] - a[2];
 597                   result[3] = aNext[3] - a[3];
 598                }
 599                store_vector4( inst, machine, result );
 600             }
 601             break;
 602          case FP_OPCODE_DP3:
 603             {
 604                GLfloat a[4], b[4], result[4];
 605                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 606                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 607                result[0] = result[1] = result[2] = result[3] =
 608                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
 609                store_vector4( inst, machine, result );
 610 #if DEBUG_FRAG
 611                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 612                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 613 #endif
 614             }
 615             break;
 616          case FP_OPCODE_DP4:
 617             {
 618                GLfloat a[4], b[4], result[4];
 619                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 620                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 621                result[0] = result[1] = result[2] = result[3] =
 622                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 623                store_vector4( inst, machine, result );
 624             }
 625             break;
 626          case FP_OPCODE_DST: /* Distance vector */
 627             {
 628                GLfloat a[4], b[4], result[4];
 629                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 630                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 631                result[0] = 1.0F;
 632                result[1] = a[1] * b[1];
 633                result[2] = a[2];
 634                result[3] = b[3];
 635                store_vector4( inst, machine, result );
 636             }
 637             break;
 638          case FP_OPCODE_EX2: /* Exponential base 2 */
 639             {
 640                GLfloat a[4], result[4];
 641                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 642                result[0] = result[1] = result[2] = result[3] =
 643                   (GLfloat) _mesa_pow(2.0, a[0]);
 644                store_vector4( inst, machine, result );
 645             }
 646             break;
 647          case FP_OPCODE_FLR:
 648             {
 649                GLfloat a[4], result[4];
 650                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 651                result[0] = FLOORF(a[0]);
 652                result[1] = FLOORF(a[1]);
 653                result[2] = FLOORF(a[2]);
 654                result[3] = FLOORF(a[3]);
 655                store_vector4( inst, machine, result );
 656             }
 657             break;
 658          case FP_OPCODE_FRC:
 659             {
 660                GLfloat a[4], result[4];
 661                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 662                result[0] = a[0] - FLOORF(a[0]);
 663                result[1] = a[1] - FLOORF(a[1]);
 664                result[2] = a[2] - FLOORF(a[2]);
 665                result[3] = a[3] - FLOORF(a[3]);
 666                store_vector4( inst, machine, result );
 667             }
 668             break;
 669          case FP_OPCODE_KIL:
 670             {
 671                const GLuint *swizzle = inst->DstReg.CondSwizzle;
 672                const GLuint condMask = inst->DstReg.CondMask;
 673                if (test_cc(machine->CondCodes[swizzle[0]], condMask) ||
 674                    test_cc(machine->CondCodes[swizzle[1]], condMask) ||
 675                    test_cc(machine->CondCodes[swizzle[2]], condMask) ||
 676                    test_cc(machine->CondCodes[swizzle[3]], condMask)) {
 677                   return GL_FALSE;
 678                }
 679             }
 680             break;
 681          case FP_OPCODE_LG2:  /* log base 2 */
 682             {
 683                GLfloat a[4], result[4];
 684                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 685                result[0] = result[1] = result[2] = result[3]
 686                   = LOG2(a[0]);
 687                store_vector4( inst, machine, result );
 688             }
 689             break;
 690          case FP_OPCODE_LIT:
 691             {
 692                GLfloat a[4], result[4];
 693                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 694                if (a[0] < 0.0F)
 695                   a[0] = 0.0F;
 696                if (a[1] < 0.0F)
 697                   a[1] = 0.0F;
 698                result[0] = 1.0F;
 699                result[1] = a[0];
 700                result[2] = (a[0] > 0.0) ? _mesa_pow(2.0, a[3]) : 0.0F;
 701                result[3] = 1.0F;
 702                store_vector4( inst, machine, result );
 703             }
 704             break;
 705          case FP_OPCODE_LRP:
 706             {
 707                GLfloat a[4], b[4], c[4], result[4];
 708                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 709                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 710                fetch_vector4( &inst->SrcReg[2], machine, program, c );
 711                result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 712                result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 713                result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 714                result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 715                store_vector4( inst, machine, result );
 716             }
 717             break;
 718          case FP_OPCODE_MAD:
 719             {
 720                GLfloat a[4], b[4], c[4], result[4];
 721                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 722                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 723                fetch_vector4( &inst->SrcReg[2], machine, program, c );
 724                result[0] = a[0] * b[0] + c[0];
 725                result[1] = a[1] * b[1] + c[1];
 726                result[2] = a[2] * b[2] + c[2];
 727                result[3] = a[3] * b[3] + c[3];
 728                store_vector4( inst, machine, result );
 729             }
 730             break;
 731          case FP_OPCODE_MAX:
 732             {
 733                GLfloat a[4], b[4], result[4];
 734                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 735                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 736                result[0] = MAX2(a[0], b[0]);
 737                result[1] = MAX2(a[1], b[1]);
 738                result[2] = MAX2(a[2], b[2]);
 739                result[3] = MAX2(a[3], b[3]);
 740                store_vector4( inst, machine, result );
 741             }
 742             break;
 743          case FP_OPCODE_MIN:
 744             {
 745                GLfloat a[4], b[4], result[4];
 746                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 747                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 748                result[0] = MIN2(a[0], b[0]);
 749                result[1] = MIN2(a[1], b[1]);
 750                result[2] = MIN2(a[2], b[2]);
 751                result[3] = MIN2(a[3], b[3]);
 752                store_vector4( inst, machine, result );
 753             }
 754             break;
 755          case FP_OPCODE_MOV:
 756             {
 757                GLfloat result[4];
 758                fetch_vector4( &inst->SrcReg[0], machine, program, result );
 759                store_vector4( inst, machine, result );
 760             }
 761             break;
 762          case FP_OPCODE_MUL:
 763             {
 764                GLfloat a[4], b[4], result[4];
 765                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 766                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 767                result[0] = a[0] * b[0];
 768                result[1] = a[1] * b[1];
 769                result[2] = a[2] * b[2];
 770                result[3] = a[3] * b[3];
 771                store_vector4( inst, machine, result );
 772 #if DEBUG_FRAG
 773                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 774                       result[0], result[1], result[2], result[3],
 775                       a[0], a[1], a[2], a[3],
 776                       b[0], b[1], b[2], b[3]);
 777 #endif
 778             }
 779             break;
 780          case FP_OPCODE_PK2H: /* pack two 16-bit floats */
 781             /* XXX this is probably wrong */
 782             {
 783                GLfloat a[4], result[4];
 784                const GLuint *rawBits = (const GLuint *) a;
 785                GLuint *rawResult = (GLuint *) result;
 786                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 787                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 788                   = rawBits[0] | (rawBits[1] << 16);
 789                store_vector4( inst, machine, result );
 790             }
 791             break;
 792          case FP_OPCODE_PK2US: /* pack two GLushorts */
 793             {
 794                GLfloat a[4], result[4];
 795                GLuint usx, usy, *rawResult = (GLuint *) result;
 796                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 797                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 798                a[1] = CLAMP(a[0], 0.0F, 1.0F);
 799                usx = IROUND(a[0] * 65535.0F);
 800                usy = IROUND(a[1] * 65535.0F);
 801                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 802                   = usx | (usy << 16);
 803                store_vector4( inst, machine, result );
 804             }
 805             break;
 806          case FP_OPCODE_PK4B: /* pack four GLbytes */
 807             {
 808                GLfloat a[4], result[4];
 809                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 810                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 811                a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
 812                a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
 813                a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
 814                a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
 815                ubx = IROUND(127.0F * a[0] + 128.0F);
 816                uby = IROUND(127.0F * a[1] + 128.0F);
 817                ubz = IROUND(127.0F * a[2] + 128.0F);
 818                ubw = IROUND(127.0F * a[3] + 128.0F);
 819                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 820                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 821                store_vector4( inst, machine, result );
 822             }
 823             break;
 824          case FP_OPCODE_PK4UB: /* pack four GLubytes */
 825             {
 826                GLfloat a[4], result[4];
 827                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 828                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 829                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 830                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 831                a[2] = CLAMP(a[2], 0.0F, 1.0F);
 832                a[3] = CLAMP(a[3], 0.0F, 1.0F);
 833                ubx = IROUND(255.0F * a[0]);
 834                uby = IROUND(255.0F * a[1]);
 835                ubz = IROUND(255.0F * a[2]);
 836                ubw = IROUND(255.0F * a[3]);
 837                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 838                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 839                store_vector4( inst, machine, result );
 840             }
 841             break;
 842          case FP_OPCODE_POW:
 843             {
 844                GLfloat a[4], b[4], result[4];
 845                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 846                fetch_vector1( &inst->SrcReg[1], machine, program, b );
 847                result[0] = result[1] = result[2] = result[3]
 848                   = _mesa_pow(a[0], b[0]);
 849                store_vector4( inst, machine, result );
 850             }
 851             break;
 852          case FP_OPCODE_RCP:
 853             {
 854                GLfloat a[4], result[4];
 855                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 856 #if DEBUG_FRAG
 857                if (a[0] == 0)
 858                   printf("RCP(0)\n");
 859                else if (IS_INF_OR_NAN(a[0]))
 860                   printf("RCP(inf)\n");
 861 #endif
 862                result[0] = result[1] = result[2] = result[3]
 863                   = 1.0F / a[0];
 864                store_vector4( inst, machine, result );
 865             }
 866             break;
 867          case FP_OPCODE_RFL:
 868             {
 869                GLfloat axis[4], dir[4], result[4], tmp[4];
 870                fetch_vector4( &inst->SrcReg[0], machine, program, axis );
 871                fetch_vector4( &inst->SrcReg[1], machine, program, dir );
 872                tmp[3] = axis[0] * axis[0]
 873                       + axis[1] * axis[1]
 874                       + axis[2] * axis[2];
 875                tmp[0] = (2.0F * (axis[0] * dir[0] +
 876                                  axis[1] * dir[1] +
 877                                  axis[2] * dir[2])) / tmp[3];
 878                result[0] = tmp[0] * axis[0] - dir[0];
 879                result[1] = tmp[0] * axis[1] - dir[1];
 880                result[2] = tmp[0] * axis[2] - dir[2];
 881                /* result[3] is never written! XXX enforce in parser! */
 882                store_vector4( inst, machine, result );
 883             }
 884             break;
 885          case FP_OPCODE_RSQ: /* 1 / sqrt() */
 886             {
 887                GLfloat a[4], result[4];
 888                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 889                result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
 890                store_vector4( inst, machine, result );
 891 #if DEBUG_FRAG
 892                printf("RSQ %g = 1/sqrt(%g)\n", result[0], a[0]);
 893 #endif
 894             }
 895             break;
 896          case FP_OPCODE_SEQ: /* set on equal */
 897             {
 898                GLfloat a[4], b[4], result[4];
 899                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 900                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 901                result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
 902                result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
 903                result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
 904                result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
 905                store_vector4( inst, machine, result );
 906             }
 907             break;
 908          case FP_OPCODE_SFL: /* set false, operands ignored */
 909             {
 910                static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
 911                store_vector4( inst, machine, result );
 912             }
 913             break;
 914          case FP_OPCODE_SGE: /* set on greater or equal */
 915             {
 916                GLfloat a[4], b[4], result[4];
 917                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 918                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 919                result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
 920                result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
 921                result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
 922                result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
 923                store_vector4( inst, machine, result );
 924             }
 925             break;
 926          case FP_OPCODE_SGT: /* set on greater */
 927             {
 928                GLfloat a[4], b[4], result[4];
 929                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 930                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 931                result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
 932                result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
 933                result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
 934                result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
 935                store_vector4( inst, machine, result );
 936             }
 937             break;
 938          case FP_OPCODE_SIN:
 939             {
 940                GLfloat a[4], result[4];
 941                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 942                result[0] = result[1] = result[2] = result[3] = _mesa_sin(a[0]);
 943                store_vector4( inst, machine, result );
 944             }
 945             break;
 946          case FP_OPCODE_SLE: /* set on less or equal */
 947             {
 948                GLfloat a[4], b[4], result[4];
 949                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 950                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 951                result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
 952                result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
 953                result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
 954                result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
 955                store_vector4( inst, machine, result );
 956             }
 957             break;
 958          case FP_OPCODE_SLT: /* set on less */
 959             {
 960                GLfloat a[4], b[4], result[4];
 961                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 962                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 963                result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
 964                result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
 965                result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
 966                result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
 967                store_vector4( inst, machine, result );
 968             }
 969             break;
 970          case FP_OPCODE_SNE: /* set on not equal */
 971             {
 972                GLfloat a[4], b[4], result[4];
 973                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 974                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 975                result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
 976                result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
 977                result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
 978                result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
 979                store_vector4( inst, machine, result );
 980             }
 981             break;
 982          case FP_OPCODE_STR: /* set true, operands ignored */
 983             {
 984                static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
 985                store_vector4( inst, machine, result );
 986             }
 987             break;
 988          case FP_OPCODE_SUB:
 989             {
 990                GLfloat a[4], b[4], result[4];
 991                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 992                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 993                result[0] = a[0] - b[0];
 994                result[1] = a[1] - b[1];
 995                result[2] = a[2] - b[2];
 996                result[3] = a[3] - b[3];
 997                store_vector4( inst, machine, result );
 998             }
 999             break;
1000          case FP_OPCODE_TEX:
1001             /* Texel lookup */
1002             {
1003                GLfloat texcoord[4], color[4];
1004                fetch_vector4( &inst->SrcReg[0], machine, program, texcoord );
1005                /* XXX: Undo perspective divide from interpolate_texcoords() */
1006                fetch_texel( ctx, texcoord,
1007                             span->array->lambda[inst->TexSrcUnit][column],
1008                             inst->TexSrcUnit, color );
1009                store_vector4( inst, machine, color );
1010             }
1011             break;
1012          case FP_OPCODE_TXD:
1013             /* Texture lookup w/ partial derivatives for LOD */
1014             {
1015                GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1016                fetch_vector4( &inst->SrcReg[0], machine, program, texcoord );
1017                fetch_vector4( &inst->SrcReg[1], machine, program, dtdx );
1018                fetch_vector4( &inst->SrcReg[2], machine, program, dtdy );
1019                fetch_texel_deriv( ctx, texcoord, dtdx, dtdy, inst->TexSrcUnit,
1020                                   color );
1021                store_vector4( inst, machine, color );
1022             }
1023             break;
1024          case FP_OPCODE_TXP:
1025             /* Texture lookup w/ perspective divide */
1026             {
1027                GLfloat texcoord[4], color[4];
1028                fetch_vector4( &inst->SrcReg[0], machine, program, texcoord );
1029                /* Already did perspective divide in interpolate_texcoords() */
1030                fetch_texel( ctx, texcoord,
1031                             span->array->lambda[inst->TexSrcUnit][column],
1032                             inst->TexSrcUnit, color );
1033                store_vector4( inst, machine, color );
1034             }
1035             break;
1036          case FP_OPCODE_UP2H: /* unpack two 16-bit floats */
1037             /* XXX this is probably wrong */
1038             {
1039                GLfloat a[4], result[4];
1040                const GLuint *rawBits = (const GLuint *) a;
1041                GLuint *rawResult = (GLuint *) result;
1042                fetch_vector1( &inst->SrcReg[0], machine, program, a );
1043                rawResult[0] = rawBits[0] & 0xffff;
1044                rawResult[1] = (rawBits[0] >> 16) & 0xffff;
1045                rawResult[2] = rawBits[0] & 0xffff;
1046                rawResult[3] = (rawBits[0] >> 16) & 0xffff;
1047                store_vector4( inst, machine, result );
1048             }
1049             break;
1050          case FP_OPCODE_UP2US: /* unpack two GLushorts */
1051             {
1052                GLfloat a[4], result[4];
1053                const GLuint *rawBits = (const GLuint *) a;
1054                fetch_vector1( &inst->SrcReg[0], machine, program, a );
1055                result[0] = (GLfloat) ((rawBits[0] >>  0) & 0xffff) / 65535.0F;
1056                result[1] = (GLfloat) ((rawBits[0] >> 16) & 0xffff) / 65535.0F;
1057                result[2] = result[0];
1058                result[3] = result[1];
1059                store_vector4( inst, machine, result );
1060             }
1061             break;
1062          case FP_OPCODE_UP4B: /* unpack four GLbytes */
1063             {
1064                GLfloat a[4], result[4];
1065                const GLuint *rawBits = (const GLuint *) a;
1066                fetch_vector1( &inst->SrcReg[0], machine, program, a );
1067                result[0] = (((rawBits[0] >>  0) & 0xff) - 128) / 127.0F;
1068                result[0] = (((rawBits[0] >>  8) & 0xff) - 128) / 127.0F;
1069                result[0] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1070                result[0] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1071                store_vector4( inst, machine, result );
1072             }
1073             break;
1074          case FP_OPCODE_UP4UB: /* unpack four GLubytes */
1075             {
1076                GLfloat a[4], result[4];
1077                const GLuint *rawBits = (const GLuint *) a;
1078                fetch_vector1( &inst->SrcReg[0], machine, program, a );
1079                result[0] = ((rawBits[0] >>  0) & 0xff) / 255.0F;
1080                result[0] = ((rawBits[0] >>  8) & 0xff) / 255.0F;
1081                result[0] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1082                result[0] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1083                store_vector4( inst, machine, result );
1084             }
1085             break;
1086          case FP_OPCODE_X2D: /* 2-D matrix transform */
1087             {
1088                GLfloat a[4], b[4], c[4], result[4];
1089                fetch_vector4( &inst->SrcReg[0], machine, program, a );
1090                fetch_vector4( &inst->SrcReg[1], machine, program, b );
1091                fetch_vector4( &inst->SrcReg[2], machine, program, c );
1092                result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1093                result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1094                result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1095                result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1096                store_vector4( inst, machine, result );
1097             }
1098             break;
1099          case FP_OPCODE_END:
1100             return GL_TRUE;
1101          default:
1102             _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1103                           inst->Opcode);
1104             return GL_TRUE; /* return value doesn't matter */
1105       }
1106    }
1107    return GL_TRUE;
1108 }
1109
1110
1111 static void
1112 init_machine( GLcontext *ctx, struct fp_machine *machine,
1113               const struct fragment_program *program,
1114               const struct sw_span *span, GLuint col )
1115 {
1116    GLuint inputsRead = program->InputsRead;
1117    GLuint j, u;
1118
1119    if (ctx->FragmentProgram.CallbackEnabled)
1120       inputsRead = ~0;
1121
1122    /* Clear temporary registers */
1123    _mesa_bzero(machine->Registers + FP_TEMP_REG_START,
1124                MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
1125
1126    /* Load program local parameters */
1127    for (j = 0; j < MAX_NV_FRAGMENT_PROGRAM_PARAMS; j++) {
1128       COPY_4V(machine->Registers[FP_PROG_REG_START + j],
1129               program->Base.LocalParams[j]);
1130    }
1131
1132    /* Load input registers */
1133    if (inputsRead & (1 << FRAG_ATTRIB_WPOS)) {
1134       GLfloat *wpos = machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_WPOS];
1135       wpos[0] = span->x + col;
1136       wpos[1] = span->y;
1137       wpos[2] = (GLfloat) span->array->z[col] / ctx->DepthMaxF;
1138       wpos[3] = span->w + col * span->dwdx;
1139    }
1140    if (inputsRead & (1 << FRAG_ATTRIB_COL0)) {
1141       GLfloat *col0 = machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_COL0];
1142       col0[0] = CHAN_TO_FLOAT(span->array->rgba[col][RCOMP]);
1143       col0[1] = CHAN_TO_FLOAT(span->array->rgba[col][GCOMP]);
1144       col0[2] = CHAN_TO_FLOAT(span->array->rgba[col][BCOMP]);
1145       col0[3] = CHAN_TO_FLOAT(span->array->rgba[col][ACOMP]);
1146    }
1147    if (inputsRead & (1 << FRAG_ATTRIB_COL1)) {
1148       GLfloat *col1 = machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_COL1];
1149       col1[0] = CHAN_TO_FLOAT(span->array->spec[col][RCOMP]);
1150       col1[1] = CHAN_TO_FLOAT(span->array->spec[col][GCOMP]);
1151       col1[2] = CHAN_TO_FLOAT(span->array->spec[col][BCOMP]);
1152       col1[3] = CHAN_TO_FLOAT(span->array->spec[col][ACOMP]);
1153    }
1154    if (inputsRead & (1 << FRAG_ATTRIB_FOGC)) {
1155       GLfloat *fogc = machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_FOGC];
1156       fogc[0] = span->array->fog[col];
1157       fogc[1] = 0.0F;
1158       fogc[2] = 0.0F;
1159       fogc[3] = 0.0F;
1160    }
1161    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
1162       if (inputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
1163          GLfloat *tex = machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_TEX0+u];
1164          /*ASSERT(ctx->Texture._EnabledCoordUnits & (1 << u));*/
1165          COPY_4V(tex, span->array->texcoords[u][col]);
1166          /*ASSERT(tex[0] != 0 || tex[1] != 0 || tex[2] != 0);*/
1167       }
1168    }
1169
1170    /* init condition codes */
1171    machine->CondCodes[0] = COND_EQ;
1172    machine->CondCodes[1] = COND_EQ;
1173    machine->CondCodes[2] = COND_EQ;
1174    machine->CondCodes[3] = COND_EQ;
1175 }
1176
1177
1178 void
1179 _swrast_exec_nv_fragment_program( GLcontext *ctx, struct sw_span *span )
1180 {
1181    const struct fragment_program *program = ctx->FragmentProgram.Current;
1182    GLuint i;
1183
1184    ctx->_CurrentProgram = GL_FRAGMENT_PROGRAM_ARB; /* or NV, doesn't matter */
1185
1186    for (i = 0; i < span->end; i++) {
1187       if (span->array->mask[i]) {
1188          init_machine(ctx, &ctx->FragmentProgram.Machine,
1189                       ctx->FragmentProgram.Current, span, i);
1190
1191          if (!execute_program(ctx, program, ~0,
1192                               &ctx->FragmentProgram.Machine, span, i)) {
1193             span->array->mask[i] = GL_FALSE;  /* killed fragment */
1194          }
1195
1196          /* Store output registers */
1197          {
1198             const GLfloat *colOut
1199                = ctx->FragmentProgram.Machine.Registers[FP_OUTPUT_REG_START];
1200             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][RCOMP], colOut[0]);
1201             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][GCOMP], colOut[1]);
1202             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][BCOMP], colOut[2]);
1203             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][ACOMP], colOut[3]);
1204          }
1205          /* depth value */
1206          if (program->OutputsWritten & (1 << FRAG_OUTPUT_DEPR))
1207             span->array->z[i] = IROUND(ctx->FragmentProgram.Machine.Registers[FP_OUTPUT_REG_START + 2][0] * ctx->DepthMaxF);
1208       }
1209    }
1210
1211    ctx->_CurrentProgram = 0;
1212 }
1213