src/mesa/swrast/s_nvfragprog.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.5.2
   4  *
   5  * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /*
  26  * Regarding GL_NV_fragment_program:
  27  *
  28  * Portions of this software may use or implement intellectual
  29  * property owned and licensed by NVIDIA Corporation. NVIDIA disclaims
  30  * any and all warranties with respect to such intellectual property,
  31  * including any use thereof or modifications thereto.
  32  */
  33
  34 #include "glheader.h"
  35 #include "colormac.h"
  36 #include "context.h"
  37 #include "program_instruction.h"
  38 #include "program.h"
  39
  40 #include "s_nvfragprog.h"
  41 #include "s_span.h"
  42
  43
  44 /* if 1, print some debugging info */
  45 #define DEBUG_FRAG 0
  46
  47
/**
 * Virtual machine state used during execution of a fragment program.
 *
 * Every register is a vector of four GLfloats.
 */
struct fp_machine
{
   /** Temporary registers; readable and writable by program instructions. */
   GLfloat Temporaries[MAX_NV_FRAGMENT_PROGRAM_TEMPS][4];
   /** Input registers, indexed by the FRAG_ATTRIB_* values. */
   GLfloat Inputs[MAX_NV_FRAGMENT_PROGRAM_INPUTS][4];
   /** Output (result) registers written by the program. */
   GLfloat Outputs[MAX_NV_FRAGMENT_PROGRAM_OUTPUTS][4];
   GLuint CondCodes[4];  /**< COND_* value for x/y/z/w */
};
  58
  59
  60 #if FEATURE_MESA_program_debug
  61 static struct fp_machine *CurrentMachine = NULL;
  62
  63 /**
  64  * For GL_MESA_program_debug.
  65  * Return current value (4*GLfloat) of a fragment program register.
  66  * Called via ctx->Driver.GetFragmentProgramRegister().
  67  */
  68 void
  69 _swrast_get_program_register(GLcontext *ctx, enum register_file file,
  70                              GLuint index, GLfloat val[4])
  71 {
  72    if (CurrentMachine) {
  73       switch (file) {
  74       case PROGRAM_INPUT:
  75          COPY_4V(val, CurrentMachine->Inputs[index]);
  76          break;
  77       case PROGRAM_OUTPUT:
  78          COPY_4V(val, CurrentMachine->Outputs[index]);
  79          break;
  80       case PROGRAM_TEMPORARY:
  81          COPY_4V(val, CurrentMachine->Temporaries[index]);
  82          break;
  83       default:
  84          _mesa_problem(NULL,
  85                        "bad register file in _swrast_get_program_register");
  86       }
  87    }
  88 }
  89 #endif /* FEATURE_MESA_program_debug */
  90
  91
  92 /**
  93  * Fetch a texel.
  94  */
  95 static void
  96 fetch_texel( GLcontext *ctx, const GLfloat texcoord[4], GLfloat lambda,
  97              GLuint unit, GLfloat color[4] )
  98 {
  99    GLchan rgba[4];
 100    SWcontext *swrast = SWRAST_CONTEXT(ctx);
 101
 102    /* XXX use a float-valued TextureSample routine here!!! */
 103    swrast->TextureSample[unit](ctx, ctx->Texture.Unit[unit]._Current,
 104                                1, (const GLfloat (*)[4]) texcoord,
 105                                &lambda, &rgba);
 106    color[0] = CHAN_TO_FLOAT(rgba[0]);
 107    color[1] = CHAN_TO_FLOAT(rgba[1]);
 108    color[2] = CHAN_TO_FLOAT(rgba[2]);
 109    color[3] = CHAN_TO_FLOAT(rgba[3]);
 110 }
 111
 112
 113 /**
 114  * Fetch a texel with the given partial derivatives to compute a level
 115  * of detail in the mipmap.
 116  */
 117 static void
 118 fetch_texel_deriv( GLcontext *ctx, const GLfloat texcoord[4],
 119                    const GLfloat texdx[4], const GLfloat texdy[4],
 120                    GLuint unit, GLfloat color[4] )
 121 {
 122    SWcontext *swrast = SWRAST_CONTEXT(ctx);
 123    const struct gl_texture_object *texObj = ctx->Texture.Unit[unit]._Current;
 124    const struct gl_texture_image *texImg = texObj->Image[0][texObj->BaseLevel];
 125    const GLfloat texW = (GLfloat) texImg->WidthScale;
 126    const GLfloat texH = (GLfloat) texImg->HeightScale;
 127    GLchan rgba[4];
 128
 129    GLfloat lambda = _swrast_compute_lambda(texdx[0], texdy[0], /* ds/dx, ds/dy */
 130                                          texdx[1], texdy[1], /* dt/dx, dt/dy */
 131                                          texdx[3], texdy[2], /* dq/dx, dq/dy */
 132                                          texW, texH,
 133                                          texcoord[0], texcoord[1], texcoord[3],
 134                                          1.0F / texcoord[3]);
 135
 136    swrast->TextureSample[unit](ctx, ctx->Texture.Unit[unit]._Current,
 137                                1, (const GLfloat (*)[4]) texcoord,
 138                                &lambda, &rgba);
 139    color[0] = CHAN_TO_FLOAT(rgba[0]);
 140    color[1] = CHAN_TO_FLOAT(rgba[1]);
 141    color[2] = CHAN_TO_FLOAT(rgba[2]);
 142    color[3] = CHAN_TO_FLOAT(rgba[3]);
 143 }
 144
 145
 146 /**
 147  * Return a pointer to the 4-element float vector specified by the given
 148  * source register.
 149  */
 150 static INLINE const GLfloat *
 151 get_register_pointer( GLcontext *ctx,
 152                       const struct prog_src_register *source,
 153                       const struct fp_machine *machine,
 154                       const struct gl_fragment_program *program )
 155 {
 156    switch (source->File) {
 157    case PROGRAM_TEMPORARY:
 158       ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_TEMPS);
 159       return machine->Temporaries[source->Index];
 160    case PROGRAM_INPUT:
 161       ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_INPUTS);
 162       return machine->Inputs[source->Index];
 163    case PROGRAM_OUTPUT:
 164       /* This is only for PRINT */
 165       ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_OUTPUTS);
 166       return machine->Outputs[source->Index];
 167    case PROGRAM_LOCAL_PARAM:
 168       ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 169       return program->Base.LocalParams[source->Index];
 170    case PROGRAM_ENV_PARAM:
 171       ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_PARAMS);
 172       return ctx->FragmentProgram.Parameters[source->Index];
 173    case PROGRAM_STATE_VAR:
 174       /* Fallthrough */
 175    case PROGRAM_CONSTANT:
 176       /* Fallthrough */
 177    case PROGRAM_NAMED_PARAM:
 178       ASSERT(source->Index < (GLint) program->Base.Parameters->NumParameters);
 179       return program->Base.Parameters->ParameterValues[source->Index];
 180    default:
 181       _mesa_problem(ctx, "Invalid input register file %d in fetch_vector4",
 182                     source->File);
 183       return NULL;
 184    }
 185 }
 186
 187
 188 /**
 189  * Fetch a 4-element float vector from the given source register.
 190  * Apply swizzling and negating as needed.
 191  */
 192 static void
 193 fetch_vector4( GLcontext *ctx,
 194                const struct prog_src_register *source,
 195                const struct fp_machine *machine,
 196                const struct gl_fragment_program *program,
 197                GLfloat result[4] )
 198 {
 199    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 200    ASSERT(src);
 201
 202    if (source->Swizzle == MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
 203                                         SWIZZLE_Z, SWIZZLE_W)) {
 204       /* no swizzling */
 205       COPY_4V(result, src);
 206    }
 207    else {
 208       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 209       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 210       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 211       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 212    }
 213
 214    if (source->NegateBase) {
 215       result[0] = -result[0];
 216       result[1] = -result[1];
 217       result[2] = -result[2];
 218       result[3] = -result[3];
 219    }
 220    if (source->Abs) {
 221       result[0] = FABSF(result[0]);
 222       result[1] = FABSF(result[1]);
 223       result[2] = FABSF(result[2]);
 224       result[3] = FABSF(result[3]);
 225    }
 226    if (source->NegateAbs) {
 227       result[0] = -result[0];
 228       result[1] = -result[1];
 229       result[2] = -result[2];
 230       result[3] = -result[3];
 231    }
 232 }
 233
 234
 235 /**
 236  * Fetch the derivative with respect to X for the given register.
 237  * \return GL_TRUE if it was easily computed or GL_FALSE if we
 238  * need to execute another instance of the program (ugh)!
 239  */
 240 static GLboolean
 241 fetch_vector4_deriv( GLcontext *ctx,
 242                      const struct prog_src_register *source,
 243                      const SWspan *span,
 244                      char xOrY, GLint column, GLfloat result[4] )
 245 {
 246    GLfloat src[4];
 247
 248    ASSERT(xOrY == 'X' || xOrY == 'Y');
 249
 250    switch (source->Index) {
 251    case FRAG_ATTRIB_WPOS:
 252       if (xOrY == 'X') {
 253          src[0] = 1.0;
 254          src[1] = 0.0;
 255          src[2] = span->dzdx / ctx->DrawBuffer->_DepthMaxF;
 256          src[3] = span->dwdx;
 257       }
 258       else {
 259          src[0] = 0.0;
 260          src[1] = 1.0;
 261          src[2] = span->dzdy / ctx->DrawBuffer->_DepthMaxF;
 262          src[3] = span->dwdy;
 263       }
 264       break;
 265    case FRAG_ATTRIB_COL0:
 266       if (xOrY == 'X') {
 267          src[0] = span->drdx * (1.0F / CHAN_MAXF);
 268          src[1] = span->dgdx * (1.0F / CHAN_MAXF);
 269          src[2] = span->dbdx * (1.0F / CHAN_MAXF);
 270          src[3] = span->dadx * (1.0F / CHAN_MAXF);
 271       }
 272       else {
 273          src[0] = span->drdy * (1.0F / CHAN_MAXF);
 274          src[1] = span->dgdy * (1.0F / CHAN_MAXF);
 275          src[2] = span->dbdy * (1.0F / CHAN_MAXF);
 276          src[3] = span->dady * (1.0F / CHAN_MAXF);
 277       }
 278       break;
 279    case FRAG_ATTRIB_COL1:
 280       if (xOrY == 'X') {
 281          src[0] = span->dsrdx * (1.0F / CHAN_MAXF);
 282          src[1] = span->dsgdx * (1.0F / CHAN_MAXF);
 283          src[2] = span->dsbdx * (1.0F / CHAN_MAXF);
 284          src[3] = 0.0; /* XXX need this */
 285       }
 286       else {
 287          src[0] = span->dsrdy * (1.0F / CHAN_MAXF);
 288          src[1] = span->dsgdy * (1.0F / CHAN_MAXF);
 289          src[2] = span->dsbdy * (1.0F / CHAN_MAXF);
 290          src[3] = 0.0; /* XXX need this */
 291       }
 292       break;
 293    case FRAG_ATTRIB_FOGC:
 294       if (xOrY == 'X') {
 295          src[0] = span->dfogdx;
 296          src[1] = 0.0;
 297          src[2] = 0.0;
 298          src[3] = 0.0;
 299       }
 300       else {
 301          src[0] = span->dfogdy;
 302          src[1] = 0.0;
 303          src[2] = 0.0;
 304          src[3] = 0.0;
 305       }
 306       break;
 307    case FRAG_ATTRIB_TEX0:
 308    case FRAG_ATTRIB_TEX1:
 309    case FRAG_ATTRIB_TEX2:
 310    case FRAG_ATTRIB_TEX3:
 311    case FRAG_ATTRIB_TEX4:
 312    case FRAG_ATTRIB_TEX5:
 313    case FRAG_ATTRIB_TEX6:
 314    case FRAG_ATTRIB_TEX7:
 315       if (xOrY == 'X') {
 316          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 317          /* this is a little tricky - I think I've got it right */
 318          const GLfloat invQ = 1.0f / (span->tex[u][3]
 319                                       + span->texStepX[u][3] * column);
 320          src[0] = span->texStepX[u][0] * invQ;
 321          src[1] = span->texStepX[u][1] * invQ;
 322          src[2] = span->texStepX[u][2] * invQ;
 323          src[3] = span->texStepX[u][3] * invQ;
 324       }
 325       else {
 326          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 327          /* Tricky, as above, but in Y direction */
 328          const GLfloat invQ = 1.0f / (span->tex[u][3] + span->texStepY[u][3]);
 329          src[0] = span->texStepY[u][0] * invQ;
 330          src[1] = span->texStepY[u][1] * invQ;
 331          src[2] = span->texStepY[u][2] * invQ;
 332          src[3] = span->texStepY[u][3] * invQ;
 333       }
 334       break;
 335    default:
 336       return GL_FALSE;
 337    }
 338
 339    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 340    result[1] = src[GET_SWZ(source->Swizzle, 1)];
 341    result[2] = src[GET_SWZ(source->Swizzle, 2)];
 342    result[3] = src[GET_SWZ(source->Swizzle, 3)];
 343
 344    if (source->NegateBase) {
 345       result[0] = -result[0];
 346       result[1] = -result[1];
 347       result[2] = -result[2];
 348       result[3] = -result[3];
 349    }
 350    if (source->Abs) {
 351       result[0] = FABSF(result[0]);
 352       result[1] = FABSF(result[1]);
 353       result[2] = FABSF(result[2]);
 354       result[3] = FABSF(result[3]);
 355    }
 356    if (source->NegateAbs) {
 357       result[0] = -result[0];
 358       result[1] = -result[1];
 359       result[2] = -result[2];
 360       result[3] = -result[3];
 361    }
 362    return GL_TRUE;
 363 }
 364
 365
 366 /**
 367  * As above, but only return result[0] element.
 368  */
 369 static void
 370 fetch_vector1( GLcontext *ctx,
 371                const struct prog_src_register *source,
 372                const struct fp_machine *machine,
 373                const struct gl_fragment_program *program,
 374                GLfloat result[4] )
 375 {
 376    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 377    ASSERT(src);
 378
 379    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 380
 381    if (source->NegateBase) {
 382       result[0] = -result[0];
 383    }
 384    if (source->Abs) {
 385       result[0] = FABSF(result[0]);
 386    }
 387    if (source->NegateAbs) {
 388       result[0] = -result[0];
 389    }
 390 }
 391
 392
 393 /**
 394  * Test value against zero and return GT, LT, EQ or UN if NaN.
 395  */
 396 static INLINE GLuint
 397 generate_cc( float value )
 398 {
 399    if (value != value)
 400       return COND_UN;  /* NaN */
 401    if (value > 0.0F)
 402       return COND_GT;
 403    if (value < 0.0F)
 404       return COND_LT;
 405    return COND_EQ;
 406 }
 407
 408
 409 /**
 410  * Test if the ccMaskRule is satisfied by the given condition code.
 411  * Used to mask destination writes according to the current condition code.
 412  */
 413 static INLINE GLboolean
 414 test_cc(GLuint condCode, GLuint ccMaskRule)
 415 {
 416    switch (ccMaskRule) {
 417    case COND_EQ: return (condCode == COND_EQ);
 418    case COND_NE: return (condCode != COND_EQ);
 419    case COND_LT: return (condCode == COND_LT);
 420    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 421    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 422    case COND_GT: return (condCode == COND_GT);
 423    case COND_TR: return GL_TRUE;
 424    case COND_FL: return GL_FALSE;
 425    default:      return GL_TRUE;
 426    }
 427 }
 428
 429
 430 /**
 431  * Store 4 floats into a register.  Observe the instructions saturate and
 432  * set-condition-code flags.
 433  */
 434 static void
 435 store_vector4( const struct prog_instruction *inst,
 436                struct fp_machine *machine,
 437                const GLfloat value[4] )
 438 {
 439    const struct prog_dst_register *dest = &(inst->DstReg);
 440    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 441    GLfloat *dstReg;
 442    GLfloat dummyReg[4];
 443    GLfloat clampedValue[4];
 444    GLuint writeMask = dest->WriteMask;
 445
 446    switch (dest->File) {
 447       case PROGRAM_OUTPUT:
 448          dstReg = machine->Outputs[dest->Index];
 449          break;
 450       case PROGRAM_TEMPORARY:
 451          dstReg = machine->Temporaries[dest->Index];
 452          break;
 453       case PROGRAM_WRITE_ONLY:
 454          dstReg = dummyReg;
 455          return;
 456       default:
 457          _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 458          return;
 459    }
 460
 461 #if DEBUG_FRAG
 462    if (value[0] > 1.0e10 ||
 463        IS_INF_OR_NAN(value[0]) ||
 464        IS_INF_OR_NAN(value[1]) ||
 465        IS_INF_OR_NAN(value[2]) ||
 466        IS_INF_OR_NAN(value[3])  )
 467       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 468 #endif
 469
 470    if (clamp) {
 471       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 472       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 473       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 474       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 475       value = clampedValue;
 476    }
 477
 478    if (dest->CondMask != COND_TR) {
 479       /* condition codes may turn off some writes */
 480       if (writeMask & WRITEMASK_X) {
 481          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 0)],
 482                       dest->CondMask))
 483             writeMask &= ~WRITEMASK_X;
 484       }
 485       if (writeMask & WRITEMASK_Y) {
 486          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 1)],
 487                       dest->CondMask))
 488             writeMask &= ~WRITEMASK_Y;
 489       }
 490       if (writeMask & WRITEMASK_Z) {
 491          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 2)],
 492                       dest->CondMask))
 493             writeMask &= ~WRITEMASK_Z;
 494       }
 495       if (writeMask & WRITEMASK_W) {
 496          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 3)],
 497                       dest->CondMask))
 498             writeMask &= ~WRITEMASK_W;
 499       }
 500    }
 501
 502    if (writeMask & WRITEMASK_X)
 503       dstReg[0] = value[0];
 504    if (writeMask & WRITEMASK_Y)
 505       dstReg[1] = value[1];
 506    if (writeMask & WRITEMASK_Z)
 507       dstReg[2] = value[2];
 508    if (writeMask & WRITEMASK_W)
 509       dstReg[3] = value[3];
 510
 511    if (inst->CondUpdate) {
 512       if (writeMask & WRITEMASK_X)
 513          machine->CondCodes[0] = generate_cc(value[0]);
 514       if (writeMask & WRITEMASK_Y)
 515          machine->CondCodes[1] = generate_cc(value[1]);
 516       if (writeMask & WRITEMASK_Z)
 517          machine->CondCodes[2] = generate_cc(value[2]);
 518       if (writeMask & WRITEMASK_W)
 519          machine->CondCodes[3] = generate_cc(value[3]);
 520    }
 521 }
 522
 523
 524 /**
 525  * Initialize a new machine state instance from an existing one, adding
 526  * the partial derivatives onto the input registers.
 527  * Used to implement DDX and DDY instructions in non-trivial cases.
 528  */
 529 static void
 530 init_machine_deriv( GLcontext *ctx,
 531                     const struct fp_machine *machine,
 532                     const struct gl_fragment_program *program,
 533                     const SWspan *span, char xOrY,
 534                     struct fp_machine *dMachine )
 535 {
 536    GLuint u;
 537
 538    ASSERT(xOrY == 'X' || xOrY == 'Y');
 539
 540    /* copy existing machine */
 541    _mesa_memcpy(dMachine, machine, sizeof(struct fp_machine));
 542
 543    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
 544       /* Clear temporary registers (undefined for ARB_f_p) */
 545       _mesa_bzero( (void*) machine->Temporaries,
 546                    MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
 547    }
 548
 549    /* Add derivatives */
 550    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) {
 551       GLfloat *wpos = (GLfloat*) machine->Inputs[FRAG_ATTRIB_WPOS];
 552       if (xOrY == 'X') {
 553          wpos[0] += 1.0F;
 554          wpos[1] += 0.0F;
 555          wpos[2] += span->dzdx;
 556          wpos[3] += span->dwdx;
 557       }
 558       else {
 559          wpos[0] += 0.0F;
 560          wpos[1] += 1.0F;
 561          wpos[2] += span->dzdy;
 562          wpos[3] += span->dwdy;
 563       }
 564    }
 565    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_COL0)) {
 566       GLfloat *col0 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL0];
 567       if (xOrY == 'X') {
 568          col0[0] += span->drdx * (1.0F / CHAN_MAXF);
 569          col0[1] += span->dgdx * (1.0F / CHAN_MAXF);
 570          col0[2] += span->dbdx * (1.0F / CHAN_MAXF);
 571          col0[3] += span->dadx * (1.0F / CHAN_MAXF);
 572       }
 573       else {
 574          col0[0] += span->drdy * (1.0F / CHAN_MAXF);
 575          col0[1] += span->dgdy * (1.0F / CHAN_MAXF);
 576          col0[2] += span->dbdy * (1.0F / CHAN_MAXF);
 577          col0[3] += span->dady * (1.0F / CHAN_MAXF);
 578       }
 579    }
 580    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_COL1)) {
 581       GLfloat *col1 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL1];
 582       if (xOrY == 'X') {
 583          col1[0] += span->dsrdx * (1.0F / CHAN_MAXF);
 584          col1[1] += span->dsgdx * (1.0F / CHAN_MAXF);
 585          col1[2] += span->dsbdx * (1.0F / CHAN_MAXF);
 586          col1[3] += 0.0; /*XXX fix */
 587       }
 588       else {
 589          col1[0] += span->dsrdy * (1.0F / CHAN_MAXF);
 590          col1[1] += span->dsgdy * (1.0F / CHAN_MAXF);
 591          col1[2] += span->dsbdy * (1.0F / CHAN_MAXF);
 592          col1[3] += 0.0; /*XXX fix */
 593       }
 594    }
 595    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_FOGC)) {
 596       GLfloat *fogc = (GLfloat*) machine->Inputs[FRAG_ATTRIB_FOGC];
 597       if (xOrY == 'X') {
 598          fogc[0] += span->dfogdx;
 599       }
 600       else {
 601          fogc[0] += span->dfogdy;
 602       }
 603    }
 604    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
 605       if (program->Base.InputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
 606          GLfloat *tex = (GLfloat*) machine->Inputs[FRAG_ATTRIB_TEX0 + u];
 607          /* XXX perspective-correct interpolation */
 608          if (xOrY == 'X') {
 609             tex[0] += span->texStepX[u][0];
 610             tex[1] += span->texStepX[u][1];
 611             tex[2] += span->texStepX[u][2];
 612             tex[3] += span->texStepX[u][3];
 613          }
 614          else {
 615             tex[0] += span->texStepY[u][0];
 616             tex[1] += span->texStepY[u][1];
 617             tex[2] += span->texStepY[u][2];
 618             tex[3] += span->texStepY[u][3];
 619          }
 620       }
 621    }
 622
 623    /* init condition codes */
 624    dMachine->CondCodes[0] = COND_EQ;
 625    dMachine->CondCodes[1] = COND_EQ;
 626    dMachine->CondCodes[2] = COND_EQ;
 627    dMachine->CondCodes[3] = COND_EQ;
 628 }
 629
 630
 631 /**
 * Execute the given fragment program.
 633  * NOTE: we do everything in single-precision floating point; we don't
 634  * currently observe the single/half/fixed-precision qualifiers.
 635  * \param ctx - rendering context
 636  * \param program - the fragment program to execute
 637  * \param machine - machine state (register file)
 638  * \param maxInst - max number of instructions to execute
 639  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 640  */
 641 static GLboolean
 642 execute_program( GLcontext *ctx,
 643                  const struct gl_fragment_program *program, GLuint maxInst,
 644                  struct fp_machine *machine, const SWspan *span,
 645                  GLuint column )
 646 {
 647    GLuint pc;
 648
 649 #if DEBUG_FRAG
 650    printf("execute fragment program --------------------\n");
 651 #endif
 652
 653    for (pc = 0; pc < maxInst; pc++) {
 654       const struct prog_instruction *inst = program->Base.Instructions + pc;
 655
 656       if (ctx->FragmentProgram.CallbackEnabled &&
 657           ctx->FragmentProgram.Callback) {
 658          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 659          ctx->FragmentProgram.Callback(program->Base.Target,
 660                                        ctx->FragmentProgram.CallbackData);
 661       }
 662
 663       switch (inst->Opcode) {
 664          case OPCODE_ABS:
 665             {
 666                GLfloat a[4], result[4];
 667                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 668                result[0] = FABSF(a[0]);
 669                result[1] = FABSF(a[1]);
 670                result[2] = FABSF(a[2]);
 671                result[3] = FABSF(a[3]);
 672                store_vector4( inst, machine, result );
 673             }
 674             break;
 675          case OPCODE_ADD:
 676             {
 677                GLfloat a[4], b[4], result[4];
 678                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 679                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 680                result[0] = a[0] + b[0];
 681                result[1] = a[1] + b[1];
 682                result[2] = a[2] + b[2];
 683                result[3] = a[3] + b[3];
 684                store_vector4( inst, machine, result );
 685             }
 686             break;
 687          case OPCODE_CMP:
 688             {
 689                GLfloat a[4], b[4], c[4], result[4];
 690                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 691                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 692                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 693                result[0] = a[0] < 0.0F ? b[0] : c[0];
 694                result[1] = a[1] < 0.0F ? b[1] : c[1];
 695                result[2] = a[2] < 0.0F ? b[2] : c[2];
 696                result[3] = a[3] < 0.0F ? b[3] : c[3];
 697                store_vector4( inst, machine, result );
 698             }
 699             break;
 700          case OPCODE_COS:
 701             {
 702                GLfloat a[4], result[4];
 703                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 704                result[0] = result[1] = result[2] = result[3]
 705                   = (GLfloat) _mesa_cos(a[0]);
 706                store_vector4( inst, machine, result );
 707             }
 708             break;
 709          case OPCODE_DDX: /* Partial derivative with respect to X */
 710             {
 711                GLfloat a[4], aNext[4], result[4];
 712                struct fp_machine dMachine;
 713                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'X',
 714                                         column, result)) {
 715                   /* This is tricky.  Make a copy of the current machine state,
 716                    * increment the input registers by the dx or dy partial
 717                    * derivatives, then re-execute the program up to the
 718                    * preceeding instruction, then fetch the source register.
 719                    * Finally, find the difference in the register values for
 720                    * the original and derivative runs.
 721                    */
 722                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 723                   init_machine_deriv(ctx, machine, program, span,
 724                                      'X', &dMachine);
 725                   execute_program(ctx, program, pc, &dMachine, span, column);
 726                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 727                   result[0] = aNext[0] - a[0];
 728                   result[1] = aNext[1] - a[1];
 729                   result[2] = aNext[2] - a[2];
 730                   result[3] = aNext[3] - a[3];
 731                }
 732                store_vector4( inst, machine, result );
 733             }
 734             break;
 735          case OPCODE_DDY: /* Partial derivative with respect to Y */
 736             {
 737                GLfloat a[4], aNext[4], result[4];
 738                struct fp_machine dMachine;
 739                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'Y',
 740                                         column, result)) {
 741                   init_machine_deriv(ctx, machine, program, span,
 742                                      'Y', &dMachine);
 743                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 744                   execute_program(ctx, program, pc, &dMachine, span, column);
 745                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 746                   result[0] = aNext[0] - a[0];
 747                   result[1] = aNext[1] - a[1];
 748                   result[2] = aNext[2] - a[2];
 749                   result[3] = aNext[3] - a[3];
 750                }
 751                store_vector4( inst, machine, result );
 752             }
 753             break;
 754          case OPCODE_DP3:
 755             {
 756                GLfloat a[4], b[4], result[4];
 757                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 758                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 759                result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
 760                store_vector4( inst, machine, result );
 761 #if DEBUG_FRAG
 762                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 763                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 764 #endif
 765             }
 766             break;
 767          case OPCODE_DP4:
 768             {
 769                GLfloat a[4], b[4], result[4];
 770                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 771                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 772                result[0] = result[1] = result[2] = result[3] = DOT4(a,b);
 773                store_vector4( inst, machine, result );
 774 #if DEBUG_FRAG
 775                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 776                       result[0], a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 777 #endif
 778             }
 779             break;
 780          case OPCODE_DPH:
 781             {
 782                GLfloat a[4], b[4], result[4];
 783                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 784                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 785                result[0] = result[1] = result[2] = result[3] =
 786                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
 787                store_vector4( inst, machine, result );
 788             }
 789             break;
 790          case OPCODE_DST: /* Distance vector */
 791             {
 792                GLfloat a[4], b[4], result[4];
 793                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 794                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 795                result[0] = 1.0F;
 796                result[1] = a[1] * b[1];
 797                result[2] = a[2];
 798                result[3] = b[3];
 799                store_vector4( inst, machine, result );
 800             }
 801             break;
 802          case OPCODE_EX2: /* Exponential base 2 */
 803             {
 804                GLfloat a[4], result[4];
 805                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 806                result[0] = result[1] = result[2] = result[3] =
 807                   (GLfloat) _mesa_pow(2.0, a[0]);
 808                store_vector4( inst, machine, result );
 809             }
 810             break;
 811          case OPCODE_FLR:
 812             {
 813                GLfloat a[4], result[4];
 814                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 815                result[0] = FLOORF(a[0]);
 816                result[1] = FLOORF(a[1]);
 817                result[2] = FLOORF(a[2]);
 818                result[3] = FLOORF(a[3]);
 819                store_vector4( inst, machine, result );
 820             }
 821             break;
 822          case OPCODE_FRC:
 823             {
 824                GLfloat a[4], result[4];
 825                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 826                result[0] = a[0] - FLOORF(a[0]);
 827                result[1] = a[1] - FLOORF(a[1]);
 828                result[2] = a[2] - FLOORF(a[2]);
 829                result[3] = a[3] - FLOORF(a[3]);
 830                store_vector4( inst, machine, result );
 831             }
 832             break;
 833          case OPCODE_KIL_NV: /* NV_f_p only */
 834             {
 835                const GLuint swizzle = inst->DstReg.CondSwizzle;
 836                const GLuint condMask = inst->DstReg.CondMask;
 837                if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 838                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 839                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 840                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 841                   return GL_FALSE;
 842                }
 843             }
 844             break;
 845          case OPCODE_KIL: /* ARB_f_p only */
 846             {
 847                GLfloat a[4];
 848                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 849                if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 850                   return GL_FALSE;
 851                }
 852             }
 853             break;
 854          case OPCODE_LG2:  /* log base 2 */
 855             {
 856                GLfloat a[4], result[4];
 857                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 858                result[0] = result[1] = result[2] = result[3] = LOG2(a[0]);
 859                store_vector4( inst, machine, result );
 860             }
 861             break;
 862          case OPCODE_LIT:
 863             {
 864                const GLfloat epsilon = 1.0F / 256.0F; /* from NV VP spec */
 865                GLfloat a[4], result[4];
 866                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 867                a[0] = MAX2(a[0], 0.0F);
 868                a[1] = MAX2(a[1], 0.0F);
 869                /* XXX ARB version clamps a[3], NV version doesn't */
 870                a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 871                result[0] = 1.0F;
 872                result[1] = a[0];
 873                /* XXX we could probably just use pow() here */
 874                if (a[0] > 0.0F) {
 875                   if (a[1] == 0.0 && a[3] == 0.0)
 876                      result[2] = 1.0;
 877                   else
 878                      result[2] = EXPF(a[3] * LOGF(a[1]));
 879                }
 880                else {
 881                   result[2] = 0.0;
 882                }
 883                result[3] = 1.0F;
 884                store_vector4( inst, machine, result );
 885             }
 886             break;
 887          case OPCODE_LRP:
 888             {
 889                GLfloat a[4], b[4], c[4], result[4];
 890                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 891                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 892                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 893                result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 894                result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 895                result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 896                result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 897                store_vector4( inst, machine, result );
 898 #if DEBUG_FRAG
 899                printf("LRP (%g %g %g %g) = (%g %g %g %g), "
 900                       "(%g %g %g %g), (%g %g %g %g)\n",
 901                       result[0], result[1], result[2], result[3],
 902                       a[0], a[1], a[2], a[3],
 903                       b[0], b[1], b[2], b[3],
 904                       c[0], c[1], c[2], c[3]);
 905 #endif
 906             }
 907             break;
 908          case OPCODE_MAD:
 909             {
 910                GLfloat a[4], b[4], c[4], result[4];
 911                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 912                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 913                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 914                result[0] = a[0] * b[0] + c[0];
 915                result[1] = a[1] * b[1] + c[1];
 916                result[2] = a[2] * b[2] + c[2];
 917                result[3] = a[3] * b[3] + c[3];
 918                store_vector4( inst, machine, result );
 919             }
 920             break;
 921          case OPCODE_MAX:
 922             {
 923                GLfloat a[4], b[4], result[4];
 924                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 925                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 926                result[0] = MAX2(a[0], b[0]);
 927                result[1] = MAX2(a[1], b[1]);
 928                result[2] = MAX2(a[2], b[2]);
 929                result[3] = MAX2(a[3], b[3]);
 930                store_vector4( inst, machine, result );
 931 #if DEBUG_FRAG
 932                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
 933                       result[0], result[1], result[2], result[3],
 934                       a[0], a[1], a[2], a[3],
 935                       b[0], b[1], b[2], b[3]);
 936 #endif
 937             }
 938             break;
 939          case OPCODE_MIN:
 940             {
 941                GLfloat a[4], b[4], result[4];
 942                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 943                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 944                result[0] = MIN2(a[0], b[0]);
 945                result[1] = MIN2(a[1], b[1]);
 946                result[2] = MIN2(a[2], b[2]);
 947                result[3] = MIN2(a[3], b[3]);
 948                store_vector4( inst, machine, result );
 949             }
 950             break;
 951          case OPCODE_MOV:
 952             {
 953                GLfloat result[4];
 954                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, result );
 955                store_vector4( inst, machine, result );
 956 #if DEBUG_FRAG
 957                printf("MOV (%g %g %g %g)\n",
 958                       result[0], result[1], result[2], result[3]);
 959 #endif
 960             }
 961             break;
 962          case OPCODE_MUL:
 963             {
 964                GLfloat a[4], b[4], result[4];
 965                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 966                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 967                result[0] = a[0] * b[0];
 968                result[1] = a[1] * b[1];
 969                result[2] = a[2] * b[2];
 970                result[3] = a[3] * b[3];
 971                store_vector4( inst, machine, result );
 972 #if DEBUG_FRAG
 973                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 974                       result[0], result[1], result[2], result[3],
 975                       a[0], a[1], a[2], a[3],
 976                       b[0], b[1], b[2], b[3]);
 977 #endif
 978             }
 979             break;
 980          case OPCODE_PK2H: /* pack two 16-bit floats in one 32-bit float */
 981             {
 982                GLfloat a[4], result[4];
 983                GLhalfNV hx, hy;
 984                GLuint *rawResult = (GLuint *) result;
 985                GLuint twoHalves;
 986                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 987                hx = _mesa_float_to_half(a[0]);
 988                hy = _mesa_float_to_half(a[1]);
 989                twoHalves = hx | (hy << 16);
 990                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 991                   = twoHalves;
 992                store_vector4( inst, machine, result );
 993             }
 994             break;
 995          case OPCODE_PK2US: /* pack two GLushorts into one 32-bit float */
 996             {
 997                GLfloat a[4], result[4];
 998                GLuint usx, usy, *rawResult = (GLuint *) result;
 999                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1000                a[0] = CLAMP(a[0], 0.0F, 1.0F);
1001                a[1] = CLAMP(a[1], 0.0F, 1.0F);
1002                usx = IROUND(a[0] * 65535.0F);
1003                usy = IROUND(a[1] * 65535.0F);
1004                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1005                   = usx | (usy << 16);
1006                store_vector4( inst, machine, result );
1007             }
1008             break;
1009          case OPCODE_PK4B: /* pack four GLbytes into one 32-bit float */
1010             {
1011                GLfloat a[4], result[4];
1012                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1013                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1014                a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
1015                a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
1016                a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
1017                a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
1018                ubx = IROUND(127.0F * a[0] + 128.0F);
1019                uby = IROUND(127.0F * a[1] + 128.0F);
1020                ubz = IROUND(127.0F * a[2] + 128.0F);
1021                ubw = IROUND(127.0F * a[3] + 128.0F);
1022                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1023                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1024                store_vector4( inst, machine, result );
1025             }
1026             break;
1027          case OPCODE_PK4UB: /* pack four GLubytes into one 32-bit float */
1028             {
1029                GLfloat a[4], result[4];
1030                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1031                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1032                a[0] = CLAMP(a[0], 0.0F, 1.0F);
1033                a[1] = CLAMP(a[1], 0.0F, 1.0F);
1034                a[2] = CLAMP(a[2], 0.0F, 1.0F);
1035                a[3] = CLAMP(a[3], 0.0F, 1.0F);
1036                ubx = IROUND(255.0F * a[0]);
1037                uby = IROUND(255.0F * a[1]);
1038                ubz = IROUND(255.0F * a[2]);
1039                ubw = IROUND(255.0F * a[3]);
1040                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1041                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1042                store_vector4( inst, machine, result );
1043             }
1044             break;
1045          case OPCODE_POW:
1046             {
1047                GLfloat a[4], b[4], result[4];
1048                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1049                fetch_vector1( ctx, &inst->SrcReg[1], machine, program, b );
1050                result[0] = result[1] = result[2] = result[3]
1051                   = (GLfloat)_mesa_pow(a[0], b[0]);
1052                store_vector4( inst, machine, result );
1053             }
1054             break;
1055          case OPCODE_RCP:
1056             {
1057                GLfloat a[4], result[4];
1058                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1059 #if DEBUG_FRAG
1060                if (a[0] == 0)
1061                   printf("RCP(0)\n");
1062                else if (IS_INF_OR_NAN(a[0]))
1063                   printf("RCP(inf)\n");
1064 #endif
1065                result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
1066                store_vector4( inst, machine, result );
1067             }
1068             break;
1069          case OPCODE_RFL: /* reflection vector */
1070             {
1071                GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
1072                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, axis );
1073                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dir );
1074                tmpW = DOT3(axis, axis);
1075                tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
1076                result[0] = tmpX * axis[0] - dir[0];
1077                result[1] = tmpX * axis[1] - dir[1];
1078                result[2] = tmpX * axis[2] - dir[2];
1079                /* result[3] is never written! XXX enforce in parser! */
1080                store_vector4( inst, machine, result );
1081             }
1082             break;
1083          case OPCODE_RSQ: /* 1 / sqrt() */
1084             {
1085                GLfloat a[4], result[4];
1086                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1087                a[0] = FABSF(a[0]);
1088                result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1089                store_vector4( inst, machine, result );
1090 #if DEBUG_FRAG
1091                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1092 #endif
1093             }
1094             break;
1095          case OPCODE_SCS: /* sine and cos */
1096             {
1097                GLfloat a[4], result[4];
1098                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1099                result[0] = (GLfloat)_mesa_cos(a[0]);
1100                result[1] = (GLfloat)_mesa_sin(a[0]);
1101                result[2] = 0.0;  /* undefined! */
1102                result[3] = 0.0;  /* undefined! */
1103                store_vector4( inst, machine, result );
1104             }
1105             break;
1106          case OPCODE_SEQ: /* set on equal */
1107             {
1108                GLfloat a[4], b[4], result[4];
1109                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1110                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1111                result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1112                result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1113                result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1114                result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1115                store_vector4( inst, machine, result );
1116             }
1117             break;
1118          case OPCODE_SFL: /* set false, operands ignored */
1119             {
1120                static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1121                store_vector4( inst, machine, result );
1122             }
1123             break;
1124          case OPCODE_SGE: /* set on greater or equal */
1125             {
1126                GLfloat a[4], b[4], result[4];
1127                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1128                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1129                result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1130                result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1131                result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1132                result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1133                store_vector4( inst, machine, result );
1134             }
1135             break;
1136          case OPCODE_SGT: /* set on greater */
1137             {
1138                GLfloat a[4], b[4], result[4];
1139                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1140                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1141                result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1142                result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1143                result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1144                result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1145                store_vector4( inst, machine, result );
1146             }
1147             break;
1148          case OPCODE_SIN:
1149             {
1150                GLfloat a[4], result[4];
1151                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1152                result[0] = result[1] = result[2] = result[3]
1153                   = (GLfloat) _mesa_sin(a[0]);
1154                store_vector4( inst, machine, result );
1155             }
1156             break;
1157          case OPCODE_SLE: /* set on less or equal */
1158             {
1159                GLfloat a[4], b[4], result[4];
1160                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1161                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1162                result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1163                result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1164                result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1165                result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1166                store_vector4( inst, machine, result );
1167             }
1168             break;
1169          case OPCODE_SLT: /* set on less */
1170             {
1171                GLfloat a[4], b[4], result[4];
1172                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1173                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1174                result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1175                result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1176                result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1177                result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1178                store_vector4( inst, machine, result );
1179             }
1180             break;
1181          case OPCODE_SNE: /* set on not equal */
1182             {
1183                GLfloat a[4], b[4], result[4];
1184                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1185                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1186                result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1187                result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1188                result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1189                result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1190                store_vector4( inst, machine, result );
1191             }
1192             break;
1193          case OPCODE_STR: /* set true, operands ignored */
1194             {
1195                static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1196                store_vector4( inst, machine, result );
1197             }
1198             break;
1199          case OPCODE_SUB:
1200             {
1201                GLfloat a[4], b[4], result[4];
1202                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1203                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1204                result[0] = a[0] - b[0];
1205                result[1] = a[1] - b[1];
1206                result[2] = a[2] - b[2];
1207                result[3] = a[3] - b[3];
1208                store_vector4( inst, machine, result );
1209 #if DEBUG_FRAG
1210                printf("SUB (%g %g %g %g) = (%g %g %g %g) - (%g %g %g %g)\n",
1211                       result[0], result[1], result[2], result[3],
1212                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1213 #endif
1214             }
1215             break;
1216          case OPCODE_SWZ: /* extended swizzle */
1217             {
1218                const struct prog_src_register *source = &inst->SrcReg[0];
1219                const GLfloat *src = get_register_pointer(ctx, source,
1220                                                          machine, program);
1221                GLfloat result[4];
1222                GLuint i;
1223                for (i = 0; i < 4; i++) {
1224                   const GLuint swz = GET_SWZ(source->Swizzle, i);
1225                   if (swz == SWIZZLE_ZERO)
1226                      result[i] = 0.0;
1227                   else if (swz == SWIZZLE_ONE)
1228                      result[i] = 1.0;
1229                   else {
1230                      ASSERT(swz >= 0);
1231                      ASSERT(swz <= 3);
1232                      result[i] = src[swz];
1233                   }
1234                   if (source->NegateBase & (1 << i))
1235                      result[i] = -result[i];
1236                }
1237                store_vector4( inst, machine, result );
1238             }
1239             break;
1240          case OPCODE_TEX: /* Both ARB and NV frag prog */
1241             /* Texel lookup */
1242             {
1243                GLfloat texcoord[4], color[4];
1244                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1245                /* Note: we pass 0 for LOD.  The ARB extension requires it
1246                 * while the NV extension says it's implementation dependent.
1247                 */
1248                /* KW: Previously lambda was passed as zero, but I
1249                 * believe this is incorrect, the spec seems to
1250                 * indicate rather that lambda should not be
1251                 * changed/biased, unlike TXB where texcoord[3] is
1252                 * added to the lambda calculations.  The lambda should
1253                 * still be calculated normally for TEX & TXP though,
1254                 * not set to zero.  Otherwise it's very difficult to
1255                 * implement normal GL semantics through the fragment
1256                 * shader.
1257                 */
1258                fetch_texel( ctx, texcoord,
1259                             span->array->lambda[inst->TexSrcUnit][column],
1260                             inst->TexSrcUnit, color );
1261 #if DEBUG_FRAG
1262                if (color[3])
1263                   printf("color[3] = %f\n", color[3]);
1264 #endif
1265                store_vector4( inst, machine, color );
1266             }
1267             break;
1268          case OPCODE_TXB: /* GL_ARB_fragment_program only */
1269             /* Texel lookup with LOD bias */
1270             {
1271                GLfloat texcoord[4], color[4], bias, lambda;
1272
1273                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1274                /* texcoord[3] is the bias to add to lambda */
1275                bias = ctx->Texture.Unit[inst->TexSrcUnit].LodBias
1276                     + ctx->Texture.Unit[inst->TexSrcUnit]._Current->LodBias
1277                     + texcoord[3];
1278                lambda = span->array->lambda[inst->TexSrcUnit][column] + bias;
1279                fetch_texel( ctx, texcoord, lambda,
1280                             inst->TexSrcUnit, color );
1281                store_vector4( inst, machine, color );
1282             }
1283             break;
1284          case OPCODE_TXD: /* GL_NV_fragment_program only */
1285             /* Texture lookup w/ partial derivatives for LOD */
1286             {
1287                GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1288                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1289                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dtdx );
1290                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, dtdy );
1291                fetch_texel_deriv( ctx, texcoord, dtdx, dtdy, inst->TexSrcUnit,
1292                                   color );
1293                store_vector4( inst, machine, color );
1294             }
1295             break;
1296          case OPCODE_TXP: /* GL_ARB_fragment_program only */
1297             /* Texture lookup w/ projective divide */
1298             {
1299                GLfloat texcoord[4], color[4];
1300                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1301                /* Not so sure about this test - if texcoord[3] is
1302                 * zero, we'd probably be fine except for an ASSERT in
1303                 * IROUND_POS() which gets triggered by the inf values created.
1304                 */
1305                if (texcoord[3] != 0.0) {
1306                   texcoord[0] /= texcoord[3];
1307                   texcoord[1] /= texcoord[3];
1308                   texcoord[2] /= texcoord[3];
1309                }
1310                /* KW: Previously lambda was passed as zero, but I
1311                 * believe this is incorrect, the spec seems to
1312                 * indicate rather that lambda should not be
1313                 * changed/biased, unlike TXB where texcoord[3] is
1314                 * added to the lambda calculations.  The lambda should
1315                 * still be calculated normally for TEX & TXP though,
1316                 * not set to zero.
1317                 */
1318                fetch_texel( ctx, texcoord,
1319                             span->array->lambda[inst->TexSrcUnit][column],
1320                             inst->TexSrcUnit, color );
1321                store_vector4( inst, machine, color );
1322             }
1323             break;
1324          case OPCODE_TXP_NV: /* GL_NV_fragment_program only */
1325             /* Texture lookup w/ projective divide */
1326             {
1327                GLfloat texcoord[4], color[4];
1328                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1329                if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1330                    texcoord[3] != 0.0) {
1331                   texcoord[0] /= texcoord[3];
1332                   texcoord[1] /= texcoord[3];
1333                   texcoord[2] /= texcoord[3];
1334                }
1335                fetch_texel( ctx, texcoord,
1336                             span->array->lambda[inst->TexSrcUnit][column],
1337                             inst->TexSrcUnit, color );
1338                store_vector4( inst, machine, color );
1339             }
1340             break;
1341          case OPCODE_UP2H: /* unpack two 16-bit floats */
1342             {
1343                GLfloat a[4], result[4];
1344                const GLuint *rawBits = (const GLuint *) a;
1345                GLhalfNV hx, hy;
1346                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1347                hx = rawBits[0] & 0xffff;
1348                hy = rawBits[0] >> 16;
1349                result[0] = result[2] = _mesa_half_to_float(hx);
1350                result[1] = result[3] = _mesa_half_to_float(hy);
1351                store_vector4( inst, machine, result );
1352             }
1353             break;
1354          case OPCODE_UP2US: /* unpack two GLushorts */
1355             {
1356                GLfloat a[4], result[4];
1357                const GLuint *rawBits = (const GLuint *) a;
1358                GLushort usx, usy;
1359                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1360                usx = rawBits[0] & 0xffff;
1361                usy = rawBits[0] >> 16;
1362                result[0] = result[2] = usx * (1.0f / 65535.0f);
1363                result[1] = result[3] = usy * (1.0f / 65535.0f);
1364                store_vector4( inst, machine, result );
1365             }
1366             break;
1367          case OPCODE_UP4B: /* unpack four GLbytes */
1368             {
1369                GLfloat a[4], result[4];
1370                const GLuint *rawBits = (const GLuint *) a;
1371                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1372                result[0] = (((rawBits[0] >>  0) & 0xff) - 128) / 127.0F;
1373                result[1] = (((rawBits[0] >>  8) & 0xff) - 128) / 127.0F;
1374                result[2] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1375                result[3] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1376                store_vector4( inst, machine, result );
1377             }
1378             break;
1379          case OPCODE_UP4UB: /* unpack four GLubytes */
1380             {
1381                GLfloat a[4], result[4];
1382                const GLuint *rawBits = (const GLuint *) a;
1383                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1384                result[0] = ((rawBits[0] >>  0) & 0xff) / 255.0F;
1385                result[1] = ((rawBits[0] >>  8) & 0xff) / 255.0F;
1386                result[2] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1387                result[3] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1388                store_vector4( inst, machine, result );
1389             }
1390             break;
1391          case OPCODE_XPD: /* cross product */
1392             {
1393                GLfloat a[4], b[4], result[4];
1394                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1395                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1396                result[0] = a[1] * b[2] - a[2] * b[1];
1397                result[1] = a[2] * b[0] - a[0] * b[2];
1398                result[2] = a[0] * b[1] - a[1] * b[0];
1399                result[3] = 1.0;
1400                store_vector4( inst, machine, result );
1401             }
1402             break;
1403          case OPCODE_X2D: /* 2-D matrix transform */
1404             {
1405                GLfloat a[4], b[4], c[4], result[4];
1406                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1407                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1408                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
1409                result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1410                result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1411                result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1412                result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1413                store_vector4( inst, machine, result );
1414             }
1415             break;
1416          case OPCODE_PRINT:
1417             {
1418                if (inst->SrcReg[0].File != -1) {
1419                   GLfloat a[4];
1420                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
1421                   _mesa_printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
1422                                a[0], a[1], a[2], a[3]);
1423                }
1424                else {
1425                   _mesa_printf("%s\n", (const char *) inst->Data);
1426                }
1427             }
1428             break;
1429          case OPCODE_END:
1430             return GL_TRUE;
1431          default:
1432             _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1433                           inst->Opcode);
1434             return GL_TRUE; /* return value doesn't matter */
1435       }
1436    }
1437    return GL_TRUE;
1438 }
1439
1440
1441 /**
1442  * Initialize the virtual fragment program machine state prior to running
1443  * a fragment program on a fragment.  This involves initializing the input
1444  * registers, condition codes, etc.
1445  * \param machine  the virtual machine state to init
1446  * \param program  the fragment program we're about to run
1447  * \param span  the span of pixels we'll operate on
1448  * \param col  which element (column) of the span we'll operate on
1449  */
1450 static void
1451 init_machine( GLcontext *ctx, struct fp_machine *machine,
1452               const struct gl_fragment_program *program,
1453               const SWspan *span, GLuint col )
1454 {
1455    GLuint inputsRead = program->Base.InputsRead;
1456    GLuint u;
1457
1458    if (ctx->FragmentProgram.CallbackEnabled)
1459       inputsRead = ~0;
1460
1461    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
1462       /* Clear temporary registers (undefined for ARB_f_p) */
1463       _mesa_bzero(machine->Temporaries,
1464                   MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
1465    }
1466
1467    /* Load input registers */
1468    if (inputsRead & (1 << FRAG_ATTRIB_WPOS)) {
1469       GLfloat *wpos = machine->Inputs[FRAG_ATTRIB_WPOS];
1470       ASSERT(span->arrayMask & SPAN_Z);
1471       if (span->arrayMask & SPAN_XY) {
1472          wpos[0] = (GLfloat) span->array->x[col];
1473          wpos[1] = (GLfloat) span->array->y[col];
1474       }
1475       else {
1476          wpos[0] = (GLfloat) span->x + col;
1477          wpos[1] = (GLfloat) span->y;
1478       }
1479       wpos[2] = (GLfloat) span->array->z[col] / ctx->DrawBuffer->_DepthMaxF;
1480       wpos[3] = span->w + col * span->dwdx;
1481    }
1482    if (inputsRead & (1 << FRAG_ATTRIB_COL0)) {
1483       ASSERT(span->arrayMask & SPAN_RGBA);
1484       COPY_4V(machine->Inputs[FRAG_ATTRIB_COL0],
1485               span->array->color.sz4.rgba[col]);
1486    }
1487    if (inputsRead & (1 << FRAG_ATTRIB_COL1)) {
1488       ASSERT(span->arrayMask & SPAN_SPEC);
1489       COPY_4V(machine->Inputs[FRAG_ATTRIB_COL1],
1490               span->array->color.sz4.spec[col]);
1491    }
1492    if (inputsRead & (1 << FRAG_ATTRIB_FOGC)) {
1493       GLfloat *fogc = machine->Inputs[FRAG_ATTRIB_FOGC];
1494       ASSERT(span->arrayMask & SPAN_FOG);
1495       fogc[0] = span->array->fog[col];
1496       fogc[1] = 0.0F;
1497       fogc[2] = 0.0F;
1498       fogc[3] = 0.0F;
1499    }
1500    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
1501       if (inputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
1502          GLfloat *tex = machine->Inputs[FRAG_ATTRIB_TEX0 + u];
1503          /*ASSERT(ctx->Texture._EnabledCoordUnits & (1 << u));*/
1504          COPY_4V(tex, span->array->texcoords[u][col]);
1505          /*ASSERT(tex[0] != 0 || tex[1] != 0 || tex[2] != 0);*/
1506       }
1507    }
1508
1509    /* init condition codes */
1510    machine->CondCodes[0] = COND_EQ;
1511    machine->CondCodes[1] = COND_EQ;
1512    machine->CondCodes[2] = COND_EQ;
1513    machine->CondCodes[3] = COND_EQ;
1514 }
1515
1516
1517 /**
1518  * Run fragment program on the pixels in span from 'start' to 'end' - 1.
1519  */
1520 static void
1521 run_program(GLcontext *ctx, SWspan *span, GLuint start, GLuint end)
1522 {
1523    const struct gl_fragment_program *program = ctx->FragmentProgram._Current;
1524    struct fp_machine machine;
1525    GLuint i;
1526
1527    CurrentMachine = &machine;
1528
1529    for (i = start; i < end; i++) {
1530       if (span->array->mask[i]) {
1531          init_machine(ctx, &machine, program, span, i);
1532
1533          if (execute_program(ctx, program, ~0, &machine, span, i)) {
1534             /* Store result color */
1535             COPY_4V(span->array->color.sz4.rgba[i],
1536                     machine.Outputs[FRAG_RESULT_COLR]);
1537
1538             /* Store result depth/z */
1539             if (program->Base.OutputsWritten & (1 << FRAG_RESULT_DEPR)) {
1540                const GLfloat depth = machine.Outputs[FRAG_RESULT_DEPR][2];
1541                if (depth <= 0.0)
1542                   span->array->z[i] = 0;
1543                else if (depth >= 1.0)
1544                   span->array->z[i] = ctx->DrawBuffer->_DepthMax;
1545                else
1546                   span->array->z[i] = IROUND(depth * ctx->DrawBuffer->_DepthMaxF);
1547             }
1548          }
1549          else {
1550             /* killed fragment */
1551             span->array->mask[i] = GL_FALSE;
1552             span->writeAll = GL_FALSE;
1553          }
1554       }
1555    }
1556
1557    CurrentMachine = NULL;
1558 }
1559
1560
1561 /**
1562  * Execute the current fragment program for all the fragments
1563  * in the given span.
1564  */
1565 void
1566 _swrast_exec_fragment_program( GLcontext *ctx, SWspan *span )
1567 {
1568    const struct gl_fragment_program *program = ctx->FragmentProgram._Current;
1569
1570    /* incoming colors should be floats */
1571    ASSERT(span->array->ChanType == GL_FLOAT);
1572
1573    ctx->_CurrentProgram = GL_FRAGMENT_PROGRAM_ARB; /* or NV, doesn't matter */
1574
1575    if (program->Base.Parameters) {
1576       _mesa_load_state_parameters(ctx, program->Base.Parameters);
1577    }
1578
1579    run_program(ctx, span, 0, span->end);
1580
1581    if (program->Base.OutputsWritten & (1 << FRAG_RESULT_DEPR)) {
1582       span->interpMask &= ~SPAN_Z;
1583       span->arrayMask |= SPAN_Z;
1584    }
1585
1586    ctx->_CurrentProgram = 0;
1587 }
1588