src/mesa/swrast/s_nvfragprog.c

   1 /* $Id: s_nvfragprog.c,v 1.13 2003/04/05 00:38:10 brianp Exp $ */
   2
   3 /*
   4  * Mesa 3-D graphics library
   5  * Version:  5.1
   6  *
   7  * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
   8  *
   9  * Permission is hereby granted, free of charge, to any person obtaining a
  10  * copy of this software and associated documentation files (the "Software"),
  11  * to deal in the Software without restriction, including without limitation
  12  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  13  * and/or sell copies of the Software, and to permit persons to whom the
  14  * Software is furnished to do so, subject to the following conditions:
  15  *
  16  * The above copyright notice and this permission notice shall be included
  17  * in all copies or substantial portions of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  22  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  23  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  24  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  */
  26
  27
  28
  29 #include "glheader.h"
  30 #include "colormac.h"
  31 #include "context.h"
  32 #include "nvfragprog.h"
  33 #include "macros.h"
  34
  35 #include "s_nvfragprog.h"
  36 #include "s_span.h"
  37 #include "s_texture.h"
  38
  39
  40 /**
  41  * Fetch a texel.
  42  */
  43 static void
  44 fetch_texel( GLcontext *ctx, const GLfloat texcoord[4], GLuint unit,
  45              GLfloat color[4] )
  46 {
  47    const GLfloat *lambda = NULL;
  48    GLchan rgba[4];
  49    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  50
  51    swrast->TextureSample[unit](ctx, unit, ctx->Texture.Unit[unit]._Current,
  52                                1, (const GLfloat (*)[4]) texcoord,
  53                                lambda, &rgba);
  54    color[0] = CHAN_TO_FLOAT(rgba[0]);
  55    color[1] = CHAN_TO_FLOAT(rgba[1]);
  56    color[2] = CHAN_TO_FLOAT(rgba[2]);
  57    color[3] = CHAN_TO_FLOAT(rgba[3]);
  58 }
  59
  60
  61 /**
  62  * Fetch a texel with the given partial derivatives to compute a level
  63  * of detail in the mipmap.
  64  */
  65 static void
  66 fetch_texel_deriv( GLcontext *ctx, const GLfloat texcoord[4],
  67                    const GLfloat texdx[4], const GLfloat texdy[4],
  68                    GLuint unit, GLfloat color[4] )
  69 {
  70    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  71    const struct gl_texture_object *texObj = ctx->Texture.Unit[unit]._Current;
  72    const struct gl_texture_image *texImg = texObj->Image[texObj->BaseLevel];
  73    const GLfloat texW = (GLfloat) texImg->WidthScale;
  74    const GLfloat texH = (GLfloat) texImg->HeightScale;
  75    GLchan rgba[4];
  76
  77    GLfloat lambda = _swrast_compute_lambda(texdx[0], texdy[0], /* ds/dx, ds/dy */
  78                                          texdx[1], texdy[1], /* dt/dx, dt/dy */
  79                                          texdx[3], texdy[2], /* dq/dx, dq/dy */
  80                                          texW, texH,
  81                                          texcoord[0], texcoord[1], texcoord[3],
  82                                          1.0F / texcoord[3]);
  83
  84    swrast->TextureSample[unit](ctx, unit, ctx->Texture.Unit[unit]._Current,
  85                                1, (const GLfloat (*)[4]) texcoord,
  86                                &lambda, &rgba);
  87    color[0] = CHAN_TO_FLOAT(rgba[0]);
  88    color[1] = CHAN_TO_FLOAT(rgba[1]);
  89    color[2] = CHAN_TO_FLOAT(rgba[2]);
  90    color[3] = CHAN_TO_FLOAT(rgba[3]);
  91 }
  92
  93
  94
  95 /**
  96  * Fetch a 4-element float vector from the given source register.
  97  * Apply swizzling and negating as needed.
  98  */
  99 static void
 100 fetch_vector4( const struct fp_src_register *source,
 101                const struct fp_machine *machine,
 102                const struct fragment_program *program,
 103                GLfloat result[4] )
 104 {
 105    const GLfloat *src;
 106
 107    if (source->IsParameter) {
 108       src = program->Parameters[source->Register].Values;
 109    }
 110    else {
 111       src = machine->Registers[source->Register];
 112    }
 113
 114    result[0] = src[source->Swizzle[0]];
 115    result[1] = src[source->Swizzle[1]];
 116    result[2] = src[source->Swizzle[2]];
 117    result[3] = src[source->Swizzle[3]];
 118
 119    if (source->NegateBase) {
 120       result[0] = -result[0];
 121       result[1] = -result[1];
 122       result[2] = -result[2];
 123       result[3] = -result[3];
 124    }
 125    if (source->Abs) {
 126       result[0] = FABSF(result[0]);
 127       result[1] = FABSF(result[1]);
 128       result[2] = FABSF(result[2]);
 129       result[3] = FABSF(result[3]);
 130    }
 131    if (source->NegateAbs) {
 132       result[0] = -result[0];
 133       result[1] = -result[1];
 134       result[2] = -result[2];
 135       result[3] = -result[3];
 136    }
 137 }
 138
 139
 140 /**
 141  * Fetch the derivative with respect to X for the given register.
 142  * \return GL_TRUE if it was easily computed or GL_FALSE if we
 143  * need to execute another instance of the program (ugh)!
 144  */
 145 static GLboolean
 146 fetch_vector4_deriv( const struct fp_src_register *source,
 147                      const struct sw_span *span,
 148                      char xOrY, GLfloat result[4] )
 149 {
 150    GLfloat src[4];
 151
 152    ASSERT(xOrY == 'X' || xOrY == 'Y');
 153
 154    switch (source->Register) {
 155    case FRAG_ATTRIB_WPOS:
 156       if (xOrY == 'X') {
 157          src[0] = 1.0;
 158          src[1] = 0.0;
 159          src[2] = span->dzdx;
 160          src[3] = span->dwdx;
 161       }
 162       else {
 163          src[0] = 0.0;
 164          src[1] = 1.0;
 165          src[2] = span->dzdy;
 166          src[3] = span->dwdy;
 167       }
 168       break;
 169    case FRAG_ATTRIB_COL0:
 170       if (xOrY == 'X') {
 171          src[0] = span->drdx * (1.0F / CHAN_MAXF);
 172          src[1] = span->dgdx * (1.0F / CHAN_MAXF);
 173          src[2] = span->dbdx * (1.0F / CHAN_MAXF);
 174          src[3] = span->dadx * (1.0F / CHAN_MAXF);
 175       }
 176       else {
 177          src[0] = span->drdy * (1.0F / CHAN_MAXF);
 178          src[1] = span->dgdy * (1.0F / CHAN_MAXF);
 179          src[2] = span->dbdy * (1.0F / CHAN_MAXF);
 180          src[3] = span->dady * (1.0F / CHAN_MAXF);
 181       }
 182       break;
 183    case FRAG_ATTRIB_COL1:
 184       if (xOrY == 'X') {
 185          src[0] = span->dsrdx * (1.0F / CHAN_MAXF);
 186          src[1] = span->dsgdx * (1.0F / CHAN_MAXF);
 187          src[2] = span->dsbdx * (1.0F / CHAN_MAXF);
 188          src[3] = 0.0; /* XXX need this */
 189       }
 190       else {
 191          src[0] = span->dsrdy * (1.0F / CHAN_MAXF);
 192          src[1] = span->dsgdy * (1.0F / CHAN_MAXF);
 193          src[2] = span->dsbdy * (1.0F / CHAN_MAXF);
 194          src[3] = 0.0; /* XXX need this */
 195       }
 196       break;
 197    case FRAG_ATTRIB_FOGC:
 198       if (xOrY == 'X') {
 199          src[0] = span->dfogdx;
 200          src[1] = 0.0;
 201          src[2] = 0.0;
 202          src[3] = 0.0;
 203       }
 204       else {
 205          src[0] = span->dfogdy;
 206          src[1] = 0.0;
 207          src[2] = 0.0;
 208          src[3] = 0.0;
 209       }
 210       break;
 211    case FRAG_ATTRIB_TEX0:
 212    case FRAG_ATTRIB_TEX1:
 213    case FRAG_ATTRIB_TEX2:
 214    case FRAG_ATTRIB_TEX3:
 215    case FRAG_ATTRIB_TEX4:
 216    case FRAG_ATTRIB_TEX5:
 217    case FRAG_ATTRIB_TEX6:
 218    case FRAG_ATTRIB_TEX7:
 219       if (xOrY == 'X') {
 220          const GLuint u = source->Register - FRAG_ATTRIB_TEX0;
 221          src[0] = span->texStepX[u][0] * (1.0F / CHAN_MAXF);
 222          src[1] = span->texStepX[u][1] * (1.0F / CHAN_MAXF);
 223          src[2] = span->texStepX[u][2] * (1.0F / CHAN_MAXF);
 224          src[3] = span->texStepX[u][3] * (1.0F / CHAN_MAXF);
 225       }
 226       else {
 227          const GLuint u = source->Register - FRAG_ATTRIB_TEX0;
 228          src[0] = span->texStepY[u][0] * (1.0F / CHAN_MAXF);
 229          src[1] = span->texStepY[u][1] * (1.0F / CHAN_MAXF);
 230          src[2] = span->texStepY[u][2] * (1.0F / CHAN_MAXF);
 231          src[3] = span->texStepY[u][3] * (1.0F / CHAN_MAXF);
 232       }
 233       break;
 234    default:
 235       return GL_FALSE;
 236    }
 237
 238    result[0] = src[source->Swizzle[0]];
 239    result[1] = src[source->Swizzle[1]];
 240    result[2] = src[source->Swizzle[2]];
 241    result[3] = src[source->Swizzle[3]];
 242
 243    if (source->NegateBase) {
 244       result[0] = -result[0];
 245       result[1] = -result[1];
 246       result[2] = -result[2];
 247       result[3] = -result[3];
 248    }
 249    if (source->Abs) {
 250       result[0] = FABSF(result[0]);
 251       result[1] = FABSF(result[1]);
 252       result[2] = FABSF(result[2]);
 253       result[3] = FABSF(result[3]);
 254    }
 255    if (source->NegateAbs) {
 256       result[0] = -result[0];
 257       result[1] = -result[1];
 258       result[2] = -result[2];
 259       result[3] = -result[3];
 260    }
 261    return GL_TRUE;
 262 }
 263
 264
 265 /**
 266  * As above, but only return result[0] element.
 267  */
 268 static void
 269 fetch_vector1( const struct fp_src_register *source,
 270                const struct fp_machine *machine,
 271                const struct fragment_program *program,
 272                GLfloat result[4] )
 273 {
 274    const GLfloat *src;
 275
 276    if (source->IsParameter) {
 277       src = program->Parameters[source->Register].Values;
 278    }
 279    else {
 280       src = machine->Registers[source->Register];
 281    }
 282
 283    result[0] = src[source->Swizzle[0]];
 284
 285    if (source->NegateBase) {
 286       result[0] = -result[0];
 287    }
 288    if (source->Abs) {
 289       result[0] = FABSF(result[0]);
 290    }
 291    if (source->NegateAbs) {
 292       result[0] = -result[0];
 293    }
 294 }
 295
 296
 297 /*
 298  * Test value against zero and return GT, LT, EQ or UN if NaN.
 299  */
 300 static INLINE GLuint
 301 generate_cc( float value )
 302 {
 303    if (value != value)
 304       return COND_UN;  /* NaN */
 305    if (value > 0.0F)
 306       return COND_GT;
 307    if (value < 0.0F)
 308       return COND_LT;
 309    return COND_EQ;
 310 }
 311
 312 /*
 313  * Test if the ccMaskRule is satisfied by the given condition code.
 314  * Used to mask destination writes according to the current condition codee.
 315  */
 316 static INLINE GLboolean
 317 test_cc(GLuint condCode, GLuint ccMaskRule)
 318 {
 319    switch (ccMaskRule) {
 320    case COND_EQ: return (condCode == COND_EQ);
 321    case COND_NE: return (condCode != COND_EQ);
 322    case COND_LT: return (condCode == COND_LT);
 323    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 324    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 325    case COND_GT: return (condCode == COND_GT);
 326    case COND_TR: return GL_TRUE;
 327    case COND_FL: return GL_FALSE;
 328    default:      return GL_TRUE;
 329    }
 330 }
 331
 332
 333 /**
 334  * Store 4 floats into a register.  Observe the instructions saturate and
 335  * set-condition-code flags.
 336  */
 337 static void
 338 store_vector4( const struct fp_instruction *inst,
 339                struct fp_machine *machine,
 340                const GLfloat value[4] )
 341 {
 342    const struct fp_dst_register *dest = &(inst->DstReg);
 343    const GLboolean clamp = inst->Saturate;
 344    const GLboolean updateCC = inst->UpdateCondRegister;
 345    GLfloat *dstReg = machine->Registers[dest->Register];
 346    GLfloat clampedValue[4];
 347    const GLboolean *writeMask = dest->WriteMask;
 348    GLboolean condWriteMask[4];
 349
 350    if (clamp) {
 351       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 352       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 353       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 354       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 355       value = clampedValue;
 356    }
 357
 358    if (dest->CondMask != COND_TR) {
 359       condWriteMask[0] = writeMask[0]
 360          && test_cc(machine->CondCodes[dest->CondSwizzle[0]], dest->CondMask);
 361       condWriteMask[1] = writeMask[1]
 362          && test_cc(machine->CondCodes[dest->CondSwizzle[1]], dest->CondMask);
 363       condWriteMask[2] = writeMask[2]
 364          && test_cc(machine->CondCodes[dest->CondSwizzle[2]], dest->CondMask);
 365       condWriteMask[3] = writeMask[3]
 366          && test_cc(machine->CondCodes[dest->CondSwizzle[3]], dest->CondMask);
 367       writeMask = condWriteMask;
 368    }
 369
 370    if (writeMask[0]) {
 371       dstReg[0] = value[0];
 372       if (updateCC)
 373          machine->CondCodes[0] = generate_cc(value[0]);
 374    }
 375    if (writeMask[1]) {
 376       dstReg[1] = value[1];
 377       if (updateCC)
 378          machine->CondCodes[1] = generate_cc(value[1]);
 379    }
 380    if (writeMask[2]) {
 381       dstReg[2] = value[2];
 382       if (updateCC)
 383          machine->CondCodes[2] = generate_cc(value[2]);
 384    }
 385    if (writeMask[3]) {
 386       dstReg[3] = value[3];
 387       if (updateCC)
 388          machine->CondCodes[3] = generate_cc(value[3]);
 389    }
 390 }
 391
 392
 393 /**
 394  * Initialize a new machine state instance from an existing one, adding
 395  * the partial derivatives onto the input registers.
 396  * Used to implement DDX and DDY instructions in non-trivial cases.
 397  */
 398 static void
 399 init_machine_deriv( GLcontext *ctx,
 400                     const struct fp_machine *machine,
 401                     const struct fragment_program *program,
 402                     const struct sw_span *span, char xOrY,
 403                     struct fp_machine *dMachine )
 404 {
 405    GLuint u;
 406
 407    ASSERT(xOrY == 'X' || xOrY == 'Y');
 408
 409    /* copy existing machine */
 410    _mesa_memcpy(dMachine, machine, sizeof(struct fp_machine));
 411
 412    /* Clear temporary registers */
 413    _mesa_bzero((GLfloat*) (machine->Registers + FP_TEMP_REG_START) ,
 414                MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
 415
 416    /* Add derivatives */
 417    if (program->InputsRead & (1 << FRAG_ATTRIB_WPOS)) {
 418       GLfloat *wpos = (GLfloat*) machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_WPOS];
 419       if (xOrY == 'X') {
 420          wpos[0] += 1.0F;
 421          wpos[1] += 0.0F;
 422          wpos[2] += span->dzdx;
 423          wpos[3] += span->dwdx;
 424       }
 425       else {
 426          wpos[0] += 0.0F;
 427          wpos[1] += 1.0F;
 428          wpos[2] += span->dzdy;
 429          wpos[3] += span->dwdy;
 430       }
 431    }
 432    if (program->InputsRead & (1 << FRAG_ATTRIB_COL0)) {
 433       GLfloat *col0 = (GLfloat*) machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_COL0];
 434       if (xOrY == 'X') {
 435          col0[0] += span->drdx * (1.0F / CHAN_MAXF);
 436          col0[1] += span->dgdx * (1.0F / CHAN_MAXF);
 437          col0[2] += span->dbdx * (1.0F / CHAN_MAXF);
 438          col0[3] += span->dadx * (1.0F / CHAN_MAXF);
 439       }
 440       else {
 441          col0[0] += span->drdy * (1.0F / CHAN_MAXF);
 442          col0[1] += span->dgdy * (1.0F / CHAN_MAXF);
 443          col0[2] += span->dbdy * (1.0F / CHAN_MAXF);
 444          col0[3] += span->dady * (1.0F / CHAN_MAXF);
 445       }
 446    }
 447    if (program->InputsRead & (1 << FRAG_ATTRIB_COL1)) {
 448       GLfloat *col1 = (GLfloat*) machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_COL1];
 449       if (xOrY == 'X') {
 450          col1[0] += span->dsrdx * (1.0F / CHAN_MAXF);
 451          col1[1] += span->dsgdx * (1.0F / CHAN_MAXF);
 452          col1[2] += span->dsbdx * (1.0F / CHAN_MAXF);
 453          col1[3] += 0.0; /*XXX fix */
 454       }
 455       else {
 456          col1[0] += span->dsrdy * (1.0F / CHAN_MAXF);
 457          col1[1] += span->dsgdy * (1.0F / CHAN_MAXF);
 458          col1[2] += span->dsbdy * (1.0F / CHAN_MAXF);
 459          col1[3] += 0.0; /*XXX fix */
 460       }
 461    }
 462    if (program->InputsRead & (1 << FRAG_ATTRIB_FOGC)) {
 463       GLfloat *fogc = (GLfloat*) machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_FOGC];
 464       if (xOrY == 'X') {
 465          fogc[0] += span->dfogdx;
 466       }
 467       else {
 468          fogc[0] += span->dfogdy;
 469       }
 470    }
 471    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
 472       if (program->InputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
 473          GLfloat *tex = (GLfloat*) machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_TEX0+u];
 474          if (xOrY == 'X') {
 475             tex[0] += span->texStepX[u][0];
 476             tex[1] += span->texStepX[u][1];
 477             tex[2] += span->texStepX[u][2];
 478             tex[3] += span->texStepX[u][3];
 479          }
 480          else {
 481             tex[0] += span->texStepY[u][0];
 482             tex[1] += span->texStepY[u][1];
 483             tex[2] += span->texStepY[u][2];
 484             tex[3] += span->texStepY[u][3];
 485          }
 486       }
 487    }
 488 }
 489
 490
 491 /**
 * Execute the given fragment program.
 493  * NOTE: we do everything in single-precision floating point; we don't
 494  * currently observe the single/half/fixed-precision qualifiers.
 495  * \param ctx - rendering context
 496  * \param program - the fragment program to execute
 497  * \param machine - machine state (register file)
 498  * \param maxInst - max number of instructions to execute
 499  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 500  */
 501 static GLboolean
 502 execute_program( GLcontext *ctx,
 503                  const struct fragment_program *program, GLuint maxInst,
 504                  struct fp_machine *machine, const struct sw_span *span )
 505 {
 506    GLuint pc = 0;
 507
 508    for (pc = 0; pc < maxInst; pc++) {
 509       const struct fp_instruction *inst = program->Instructions + pc;
 510       switch (inst->Opcode) {
 511          case FP_OPCODE_ADD:
 512             {
 513                GLfloat a[4], b[4], result[4];
 514                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 515                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 516                result[0] = a[0] + b[0];
 517                result[1] = a[1] + b[1];
 518                result[2] = a[2] + b[2];
 519                result[3] = a[3] + b[3];
 520                store_vector4( inst, machine, result );
 521             }
 522             break;
 523          case FP_OPCODE_COS:
 524             {
 525                GLfloat a[4], result[4];
 526                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 527                result[0] = result[1] = result[2] = result[3] = _mesa_cos(a[0]);
 528                store_vector4( inst, machine, result );
 529             }
 530             break;
 531          case FP_OPCODE_DDX: /* Partial derivative with respect to X */
 532             {
 533                GLfloat a[4], aNext[4], result[4];
 534                struct fp_machine dMachine;
 535                if (!fetch_vector4_deriv(&inst->SrcReg[0], span, 'X', result)) {
 536                   /* This is tricky.  Make a copy of the current machine state,
 537                    * increment the input registers by the dx or dy partial
 538                    * derivatives, then re-execute the program up to the
 539                    * preceeding instruction, then fetch the source register.
 540                    * Finally, find the difference in the register values for
 541                    * the original and derivative runs.
 542                    */
 543                   init_machine_deriv(ctx, machine, program, span,
 544                                      'X', &dMachine);
 545                   execute_program(ctx, program, pc, &dMachine, span);
 546                   fetch_vector4( &inst->SrcReg[0], &dMachine, program, aNext );
 547                   result[0] = aNext[0] - a[0];
 548                   result[1] = aNext[1] - a[1];
 549                   result[2] = aNext[2] - a[2];
 550                   result[3] = aNext[3] - a[3];
 551                }
 552                store_vector4( inst, machine, result );
 553             }
 554             break;
 555          case FP_OPCODE_DDY: /* Partial derivative with respect to Y */
 556             {
 557                GLfloat a[4], aNext[4], result[4];
 558                struct fp_machine dMachine;
 559                if (!fetch_vector4_deriv(&inst->SrcReg[0], span, 'Y', result)) {
 560                   init_machine_deriv(ctx, machine, program, span,
 561                                      'Y', &dMachine);
 562                   execute_program(ctx, program, pc, &dMachine, span);
 563                   fetch_vector4( &inst->SrcReg[0], &dMachine, program, aNext );
 564                   result[0] = aNext[0] - a[0];
 565                   result[1] = aNext[1] - a[1];
 566                   result[2] = aNext[2] - a[2];
 567                   result[3] = aNext[3] - a[3];
 568                }
 569                store_vector4( inst, machine, result );
 570             }
 571             break;
 572          case FP_OPCODE_DP3:
 573             {
 574                GLfloat a[4], b[4], result[4];
 575                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 576                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 577                result[0] = result[1] = result[2] = result[3] =
 578                   a[0] + b[0] + a[1] * b[1] + a[2] * b[2];
 579                store_vector4( inst, machine, result );
 580             }
 581             break;
 582          case FP_OPCODE_DP4:
 583             {
 584                GLfloat a[4], b[4], result[4];
 585                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 586                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 587                result[0] = result[1] = result[2] = result[3] =
 588                   a[0] + b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 589                store_vector4( inst, machine, result );
 590             }
 591             break;
 592          case FP_OPCODE_DST: /* Distance vector */
 593             {
 594                GLfloat a[4], b[4], result[4];
 595                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 596                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 597                result[0] = 1.0F;
 598                result[1] = a[1] * b[1];
 599                result[2] = a[2];
 600                result[3] = b[3];
 601                store_vector4( inst, machine, result );
 602             }
 603             break;
 604          case FP_OPCODE_EX2: /* Exponential base 2 */
 605             {
 606                GLfloat a[4], result[4];
 607                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 608                result[0] = result[1] = result[2] = result[3] =
 609                   (GLfloat) _mesa_pow(2.0, a[0]);
 610                store_vector4( inst, machine, result );
 611             }
 612             break;
 613          case FP_OPCODE_FLR:
 614             {
 615                GLfloat a[4], result[4];
 616                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 617                result[0] = FLOORF(a[0]);
 618                result[1] = FLOORF(a[1]);
 619                result[2] = FLOORF(a[2]);
 620                result[3] = FLOORF(a[3]);
 621                store_vector4( inst, machine, result );
 622             }
 623             break;
 624          case FP_OPCODE_FRC:
 625             {
 626                GLfloat a[4], result[4];
 627                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 628                result[0] = a[0] - FLOORF(a[0]);
 629                result[1] = a[1] - FLOORF(a[1]);
 630                result[2] = a[2] - FLOORF(a[2]);
 631                result[3] = a[3] - FLOORF(a[3]);
 632                store_vector4( inst, machine, result );
 633             }
 634             break;
 635          case FP_OPCODE_KIL:
 636             {
 637                const GLuint *swizzle = inst->DstReg.CondSwizzle;
 638                const GLuint condMask = inst->DstReg.CondMask;
 639                if (test_cc(machine->CondCodes[swizzle[0]], condMask) ||
 640                    test_cc(machine->CondCodes[swizzle[1]], condMask) ||
 641                    test_cc(machine->CondCodes[swizzle[2]], condMask) ||
 642                    test_cc(machine->CondCodes[swizzle[3]], condMask))
 643                   return GL_FALSE;
 644             }
 645             break;
 646          case FP_OPCODE_LG2:  /* log base 2 */
 647             {
 648                GLfloat a[4], result[4];
 649                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 650                result[0] = result[1] = result[2] = result[3]
 651                   = LOG2(a[0]);
 652                store_vector4( inst, machine, result );
 653             }
 654             break;
 655          case FP_OPCODE_LIT:
 656             {
 657                GLfloat a[4], result[4];
 658                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 659                if (a[0] < 0.0F)
 660                   a[0] = 0.0F;
 661                if (a[1] < 0.0F)
 662                   a[1] = 0.0F;
 663                result[0] = 1.0F;
 664                result[1] = a[0];
 665                result[2] = (a[0] > 0.0) ? _mesa_pow(2.0, a[3]) : 0.0F;
 666                result[3] = 1.0F;
 667                store_vector4( inst, machine, result );
 668             }
 669             break;
 670          case FP_OPCODE_LRP:
 671             {
 672                GLfloat a[4], b[4], c[4], result[4];
 673                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 674                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 675                fetch_vector4( &inst->SrcReg[2], machine, program, c );
 676                result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 677                result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 678                result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 679                result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 680                store_vector4( inst, machine, result );
 681             }
 682             break;
 683          case FP_OPCODE_MAD:
 684             {
 685                GLfloat a[4], b[4], c[4], result[4];
 686                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 687                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 688                fetch_vector4( &inst->SrcReg[2], machine, program, c );
 689                result[0] = a[0] * b[0] + c[0];
 690                result[1] = a[1] * b[1] + c[1];
 691                result[2] = a[2] * b[2] + c[2];
 692                result[3] = a[3] * b[3] + c[3];
 693                store_vector4( inst, machine, result );
 694             }
 695             break;
 696          case FP_OPCODE_MAX:
 697             {
 698                GLfloat a[4], b[4], result[4];
 699                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 700                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 701                result[0] = MAX2(a[0], b[0]);
 702                result[1] = MAX2(a[1], b[1]);
 703                result[2] = MAX2(a[2], b[2]);
 704                result[3] = MAX2(a[3], b[3]);
 705                store_vector4( inst, machine, result );
 706             }
 707             break;
 708          case FP_OPCODE_MIN:
 709             {
 710                GLfloat a[4], b[4], result[4];
 711                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 712                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 713                result[0] = MIN2(a[0], b[0]);
 714                result[1] = MIN2(a[1], b[1]);
 715                result[2] = MIN2(a[2], b[2]);
 716                result[3] = MIN2(a[3], b[3]);
 717                store_vector4( inst, machine, result );
 718             }
 719             break;
 720          case FP_OPCODE_MOV:
 721             {
 722                GLfloat result[4];
 723                fetch_vector4( &inst->SrcReg[0], machine, program, result );
 724                store_vector4( inst, machine, result );
 725             }
 726             break;
 727          case FP_OPCODE_MUL:
 728             {
 729                GLfloat a[4], b[4], result[4];
 730                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 731                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 732                result[0] = a[0] * b[0];
 733                result[1] = a[1] * b[1];
 734                result[2] = a[2] * b[2];
 735                result[3] = a[3] * b[3];
 736                store_vector4( inst, machine, result );
 737             }
 738             break;
 739          case FP_OPCODE_PK2H: /* pack two 16-bit floats */
 740             /* XXX this is probably wrong */
 741             {
 742                GLfloat a[4], result[4];
 743                const GLuint *rawBits = (const GLuint *) a;
 744                GLuint *rawResult = (GLuint *) result;
 745                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 746                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 747                   = rawBits[0] | (rawBits[1] << 16);
 748                store_vector4( inst, machine, result );
 749             }
 750             break;
 751          case FP_OPCODE_PK2US: /* pack two GLushorts */
 752             {
 753                GLfloat a[4], result[4];
 754                GLuint usx, usy, *rawResult = (GLuint *) result;
 755                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 756                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 757                a[1] = CLAMP(a[0], 0.0F, 1.0F);
 758                usx = IROUND(a[0] * 65535.0F);
 759                usy = IROUND(a[1] * 65535.0F);
 760                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 761                   = usx | (usy << 16);
 762                store_vector4( inst, machine, result );
 763             }
 764             break;
 765          case FP_OPCODE_PK4B: /* pack four GLbytes */
 766             {
 767                GLfloat a[4], result[4];
 768                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 769                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 770                a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
 771                a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
 772                a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
 773                a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
 774                ubx = IROUND(127.0F * a[0] + 128.0F);
 775                uby = IROUND(127.0F * a[1] + 128.0F);
 776                ubz = IROUND(127.0F * a[2] + 128.0F);
 777                ubw = IROUND(127.0F * a[3] + 128.0F);
 778                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 779                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 780                store_vector4( inst, machine, result );
 781             }
 782             break;
 783          case FP_OPCODE_PK4UB: /* pack four GLubytes */
 784             {
 785                GLfloat a[4], result[4];
 786                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 787                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 788                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 789                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 790                a[2] = CLAMP(a[2], 0.0F, 1.0F);
 791                a[3] = CLAMP(a[3], 0.0F, 1.0F);
 792                ubx = IROUND(255.0F * a[0]);
 793                uby = IROUND(255.0F * a[1]);
 794                ubz = IROUND(255.0F * a[2]);
 795                ubw = IROUND(255.0F * a[3]);
 796                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 797                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 798                store_vector4( inst, machine, result );
 799             }
 800             break;
 801          case FP_OPCODE_POW:
 802             {
 803                GLfloat a[4], b[4], result[4];
 804                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 805                fetch_vector1( &inst->SrcReg[1], machine, program, b );
 806                result[0] = result[1] = result[2] = result[3]
 807                   = _mesa_pow(a[0], b[0]);
 808                store_vector4( inst, machine, result );
 809             }
 810             break;
 811          case FP_OPCODE_RCP:
 812             {
 813                GLfloat a[4], result[4];
 814                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 815                result[0] = result[1] = result[2] = result[3]
 816                   = 1.0F / a[0];
 817                store_vector4( inst, machine, result );
 818             }
 819             break;
 820          case FP_OPCODE_RFL:
 821             {
 822                GLfloat axis[4], dir[4], result[4], tmp[4];
 823                fetch_vector4( &inst->SrcReg[0], machine, program, axis );
 824                fetch_vector4( &inst->SrcReg[1], machine, program, dir );
 825                tmp[3] = axis[0] * axis[0]
 826                       + axis[1] * axis[1]
 827                       + axis[2] * axis[2];
 828                tmp[0] = (2.0F * (axis[0] * dir[0] +
 829                                  axis[1] * dir[1] +
 830                                  axis[2] * dir[2])) / tmp[3];
 831                result[0] = tmp[0] * axis[0] - dir[0];
 832                result[1] = tmp[0] * axis[1] - dir[1];
 833                result[2] = tmp[0] * axis[2] - dir[2];
 834                /* result[3] is never written! XXX enforce in parser! */
 835                store_vector4( inst, machine, result );
 836             }
 837             break;
 838          case FP_OPCODE_RSQ: /* 1 / sqrt() */
 839             {
 840                GLfloat a[4], result[4];
 841                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 842                result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
 843                store_vector4( inst, machine, result );
 844             }
 845             break;
 846          case FP_OPCODE_SEQ: /* set on equal */
 847             {
 848                GLfloat a[4], b[4], result[4];
 849                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 850                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 851                result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
 852                result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
 853                result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
 854                result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
 855                store_vector4( inst, machine, result );
 856             }
 857             break;
 858          case FP_OPCODE_SFL: /* set false, operands ignored */
 859             {
 860                static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
 861                store_vector4( inst, machine, result );
 862             }
 863             break;
 864          case FP_OPCODE_SGE: /* set on greater or equal */
 865             {
 866                GLfloat a[4], b[4], result[4];
 867                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 868                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 869                result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
 870                result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
 871                result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
 872                result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
 873                store_vector4( inst, machine, result );
 874             }
 875             break;
 876          case FP_OPCODE_SGT: /* set on greater */
 877             {
 878                GLfloat a[4], b[4], result[4];
 879                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 880                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 881                result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
 882                result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
 883                result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
 884                result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
 885                store_vector4( inst, machine, result );
 886             }
 887             break;
 888          case FP_OPCODE_SIN:
 889             {
 890                GLfloat a[4], result[4];
 891                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 892                result[0] = result[1] = result[2] = result[3] = _mesa_sin(a[0]);
 893                store_vector4( inst, machine, result );
 894             }
 895             break;
 896          case FP_OPCODE_SLE: /* set on less or equal */
 897             {
 898                GLfloat a[4], b[4], result[4];
 899                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 900                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 901                result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
 902                result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
 903                result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
 904                result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
 905                store_vector4( inst, machine, result );
 906             }
 907             break;
 908          case FP_OPCODE_SLT: /* set on less */
 909             {
 910                GLfloat a[4], b[4], result[4];
 911                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 912                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 913                result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
 914                result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
 915                result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
 916                result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
 917                store_vector4( inst, machine, result );
 918             }
 919             break;
 920          case FP_OPCODE_SNE: /* set on not equal */
 921             {
 922                GLfloat a[4], b[4], result[4];
 923                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 924                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 925                result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
 926                result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
 927                result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
 928                result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
 929                store_vector4( inst, machine, result );
 930             }
 931             break;
 932          case FP_OPCODE_STR: /* set true, operands ignored */
 933             {
 934                static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
 935                store_vector4( inst, machine, result );
 936             }
 937             break;
 938          case FP_OPCODE_SUB:
 939             {
 940                GLfloat a[4], b[4], result[4];
 941                fetch_vector4( &inst->SrcReg[0], machine, program, a );
 942                fetch_vector4( &inst->SrcReg[1], machine, program, b );
 943                result[0] = a[0] - b[0];
 944                result[1] = a[1] - b[1];
 945                result[2] = a[2] - b[2];
 946                result[3] = a[3] - b[3];
 947                store_vector4( inst, machine, result );
 948             }
 949             break;
 950          case FP_OPCODE_TEX:
 951             /* Texel lookup */
 952             {
 953                GLfloat texcoord[4], color[4];
 954                fetch_vector4( &inst->SrcReg[0], machine, program, texcoord );
 955                /* XXX: Undo perspective divide from interpolate_texcoords() */
 956                fetch_texel( ctx, texcoord, inst->TexSrcUnit, color );
 957                store_vector4( inst, machine, color );
 958             }
 959             break;
 960          case FP_OPCODE_TXD:
 961             /* Texture lookup w/ partial derivatives for LOD */
 962             {
 963                GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
 964                fetch_vector4( &inst->SrcReg[0], machine, program, texcoord );
 965                fetch_vector4( &inst->SrcReg[1], machine, program, dtdx );
 966                fetch_vector4( &inst->SrcReg[2], machine, program, dtdy );
 967                fetch_texel_deriv( ctx, texcoord, dtdx, dtdy, inst->TexSrcUnit,
 968                                   color );
 969                store_vector4( inst, machine, color );
 970             }
 971             break;
 972          case FP_OPCODE_TXP:
 973             /* Texture lookup w/ perspective divide */
 974             {
 975                GLfloat texcoord[4], color[4];
 976                fetch_vector4( &inst->SrcReg[0], machine, program, texcoord );
 977                /* Already did perspective divide in interpolate_texcoords() */
 978                fetch_texel( ctx, texcoord, inst->TexSrcUnit, color );
 979                store_vector4( inst, machine, color );
 980             }
 981             break;
 982          case FP_OPCODE_UP2H: /* unpack two 16-bit floats */
 983             /* XXX this is probably wrong */
 984             {
 985                GLfloat a[4], result[4];
 986                const GLuint *rawBits = (const GLuint *) a;
 987                GLuint *rawResult = (GLuint *) result;
 988                fetch_vector1( &inst->SrcReg[0], machine, program, a );
 989                rawResult[0] = rawBits[0] & 0xffff;
 990                rawResult[1] = (rawBits[0] >> 16) & 0xffff;
 991                rawResult[2] = rawBits[0] & 0xffff;
 992                rawResult[3] = (rawBits[0] >> 16) & 0xffff;
 993                store_vector4( inst, machine, result );
 994             }
 995             break;
 996          case FP_OPCODE_UP2US: /* unpack two GLushorts */
 997             {
 998                GLfloat a[4], result[4];
 999                const GLuint *rawBits = (const GLuint *) a;
1000                fetch_vector1( &inst->SrcReg[0], machine, program, a );
1001                result[0] = (GLfloat) ((rawBits[0] >>  0) & 0xffff) / 65535.0F;
1002                result[1] = (GLfloat) ((rawBits[0] >> 16) & 0xffff) / 65535.0F;
1003                result[2] = result[0];
1004                result[3] = result[1];
1005                store_vector4( inst, machine, result );
1006             }
1007             break;
1008          case FP_OPCODE_UP4B: /* unpack four GLbytes */
1009             {
1010                GLfloat a[4], result[4];
1011                const GLuint *rawBits = (const GLuint *) a;
1012                fetch_vector1( &inst->SrcReg[0], machine, program, a );
1013                result[0] = (((rawBits[0] >>  0) & 0xff) - 128) / 127.0F;
1014                result[0] = (((rawBits[0] >>  8) & 0xff) - 128) / 127.0F;
1015                result[0] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1016                result[0] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1017                store_vector4( inst, machine, result );
1018             }
1019             break;
1020          case FP_OPCODE_UP4UB: /* unpack four GLubytes */
1021             {
1022                GLfloat a[4], result[4];
1023                const GLuint *rawBits = (const GLuint *) a;
1024                fetch_vector1( &inst->SrcReg[0], machine, program, a );
1025                result[0] = ((rawBits[0] >>  0) & 0xff) / 255.0F;
1026                result[0] = ((rawBits[0] >>  8) & 0xff) / 255.0F;
1027                result[0] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1028                result[0] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1029                store_vector4( inst, machine, result );
1030             }
1031             break;
1032          case FP_OPCODE_X2D: /* 2-D matrix transform */
1033             {
1034                GLfloat a[4], b[4], c[4], result[4];
1035                fetch_vector4( &inst->SrcReg[0], machine, program, a );
1036                fetch_vector4( &inst->SrcReg[1], machine, program, b );
1037                fetch_vector4( &inst->SrcReg[2], machine, program, c );
1038                result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1039                result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1040                result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1041                result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1042                store_vector4( inst, machine, result );
1043             }
1044             break;
1045          case FP_OPCODE_END:
1046             return GL_TRUE;
1047          default:
1048             _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1049                           inst->Opcode);
1050             return GL_TRUE; /* return value doesn't matter */
1051       }
1052    }
1053    return GL_TRUE;
1054 }
1055
1056
1057 static void
1058 init_machine( GLcontext *ctx, struct fp_machine *machine,
1059               const struct fragment_program *program,
1060               const struct sw_span *span, GLuint col )
1061 {
1062    GLuint j, u;
1063
1064    /* Clear temporary registers */
1065    _mesa_bzero(machine->Registers + FP_TEMP_REG_START,
1066                MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
1067
1068    /* Load program local parameters */
1069    for (j = 0; j < MAX_NV_FRAGMENT_PROGRAM_PARAMS; j++) {
1070       COPY_4V(machine->Registers[FP_PROG_REG_START + j],
1071               program->LocalParams[j]);
1072    }
1073
1074    /* Load input registers */
1075    if (program->InputsRead & (1 << FRAG_ATTRIB_WPOS)) {
1076       GLfloat *wpos = machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_WPOS];
1077       wpos[0] = span->x + col;
1078       wpos[1] = span->y;
1079       wpos[2] = (GLfloat) span->array->z[col] / ctx->DepthMaxF;
1080       wpos[3] = span->w + col * span->dwdx;
1081    }
1082    if (program->InputsRead & (1 << FRAG_ATTRIB_COL0)) {
1083       GLfloat *col0 = machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_COL0];
1084       col0[0] = CHAN_TO_FLOAT(span->array->rgba[col][RCOMP]);
1085       col0[1] = CHAN_TO_FLOAT(span->array->rgba[col][GCOMP]);
1086       col0[2] = CHAN_TO_FLOAT(span->array->rgba[col][BCOMP]);
1087       col0[3] = CHAN_TO_FLOAT(span->array->rgba[col][ACOMP]);
1088    }
1089    if (program->InputsRead & (1 << FRAG_ATTRIB_COL1)) {
1090       GLfloat *col1 = machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_COL1];
1091       col1[0] = CHAN_TO_FLOAT(span->array->spec[col][RCOMP]);
1092       col1[1] = CHAN_TO_FLOAT(span->array->spec[col][GCOMP]);
1093       col1[2] = CHAN_TO_FLOAT(span->array->spec[col][BCOMP]);
1094       col1[3] = CHAN_TO_FLOAT(span->array->spec[col][ACOMP]);
1095    }
1096    if (program->InputsRead & (1 << FRAG_ATTRIB_FOGC)) {
1097       GLfloat *fogc = machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_FOGC];
1098       fogc[0] = span->array->fog[col];
1099       fogc[1] = 0.0F;
1100       fogc[2] = 0.0F;
1101       fogc[3] = 0.0F;
1102    }
1103    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
1104       if (program->InputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
1105          GLfloat *tex = machine->Registers[FP_INPUT_REG_START+FRAG_ATTRIB_TEX0+u];
1106          if (ctx->Texture.Unit[u]._ReallyEnabled) {
1107             COPY_4V(tex, span->array->texcoords[u][col]);
1108          }
1109          else {
1110             COPY_4V(tex, ctx->Current.Attrib[VERT_ATTRIB_TEX0 + u]);
1111          }
1112       }
1113    }
1114 }
1115
1116
1117 void
1118 _swrast_exec_nv_fragment_program( GLcontext *ctx, struct sw_span *span )
1119 {
1120    const struct fragment_program *program = ctx->FragmentProgram.Current;
1121    GLuint i;
1122
1123    for (i = 0; i < span->end; i++) {
1124       if (span->array->mask[i]) {
1125          init_machine(ctx, &ctx->FragmentProgram.Machine,
1126                       ctx->FragmentProgram.Current, span, i);
1127
1128          if (!execute_program(ctx, program, ~0,
1129                               &ctx->FragmentProgram.Machine, span))
1130             span->array->mask[i] = GL_FALSE;  /* killed fragment */
1131
1132          /* Store output registers */
1133          {
1134             const GLfloat *colOut
1135                = ctx->FragmentProgram.Machine.Registers[FP_OUTPUT_REG_START];
1136             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][RCOMP], colOut[0]);
1137             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][GCOMP], colOut[1]);
1138             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][BCOMP], colOut[2]);
1139             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][ACOMP], colOut[3]);
1140          }
1141          /* depth value */
1142          if (ctx->FragmentProgram.Current->OutputsWritten & 2)
1143             span->array->z[i] = IROUND(ctx->FragmentProgram.Machine.Registers[FP_OUTPUT_REG_START + 2][0] * ctx->DepthMaxF);
1144       }
1145    }
1146 }
1147