src/mesa/swrast/s_nvfragprog.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.5.2
   4  *
   5  * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /*
  26  * Regarding GL_NV_fragment_program:
  27  *
  28  * Portions of this software may use or implement intellectual
  29  * property owned and licensed by NVIDIA Corporation. NVIDIA disclaims
  30  * any and all warranties with respect to such intellectual property,
  31  * including any use thereof or modifications thereto.
  32  */
  33
  34 #include "glheader.h"
  35 #include "colormac.h"
  36 #include "context.h"
  37 #include "program_instruction.h"
  38 #include "program.h"
  39
  40 #include "s_nvfragprog.h"
  41 #include "s_span.h"
  42
  43
  44 /* if 1, print some debugging info */
  45 #define DEBUG_FRAG 0
  46
  47
  48 /**
  49  * Virtual machine state used during execution of a fragment programs.
  50  */
  51 struct fp_machine
  52 {
  53    GLfloat Temporaries[MAX_NV_FRAGMENT_PROGRAM_TEMPS][4];
  54    GLfloat Inputs[MAX_NV_FRAGMENT_PROGRAM_INPUTS][4];
  55    GLfloat Outputs[MAX_NV_FRAGMENT_PROGRAM_OUTPUTS][4];
  56    GLuint CondCodes[4];  /**< COND_* value for x/y/z/w */
  57 };
  58
  59
  60 #if FEATURE_MESA_program_debug
  61 static struct fp_machine *CurrentMachine = NULL;
  62
  63 /**
  64  * For GL_MESA_program_debug.
  65  * Return current value (4*GLfloat) of a fragment program register.
  66  * Called via ctx->Driver.GetFragmentProgramRegister().
  67  */
  68 void
  69 _swrast_get_program_register(GLcontext *ctx, enum register_file file,
  70                              GLuint index, GLfloat val[4])
  71 {
  72    if (CurrentMachine) {
  73       switch (file) {
  74       case PROGRAM_INPUT:
  75          COPY_4V(val, CurrentMachine->Inputs[index]);
  76          break;
  77       case PROGRAM_OUTPUT:
  78          COPY_4V(val, CurrentMachine->Outputs[index]);
  79          break;
  80       case PROGRAM_TEMPORARY:
  81          COPY_4V(val, CurrentMachine->Temporaries[index]);
  82          break;
  83       default:
  84          _mesa_problem(NULL,
  85                        "bad register file in _swrast_get_program_register");
  86       }
  87    }
  88 }
  89 #endif /* FEATURE_MESA_program_debug */
  90
  91
  92 /**
  93  * Fetch a texel.
  94  */
  95 static void
  96 fetch_texel( GLcontext *ctx, const GLfloat texcoord[4], GLfloat lambda,
  97              GLuint unit, GLfloat color[4] )
  98 {
  99    GLchan rgba[4];
 100    SWcontext *swrast = SWRAST_CONTEXT(ctx);
 101
 102    /* XXX use a float-valued TextureSample routine here!!! */
 103    swrast->TextureSample[unit](ctx, ctx->Texture.Unit[unit]._Current,
 104                                1, (const GLfloat (*)[4]) texcoord,
 105                                &lambda, &rgba);
 106    color[0] = CHAN_TO_FLOAT(rgba[0]);
 107    color[1] = CHAN_TO_FLOAT(rgba[1]);
 108    color[2] = CHAN_TO_FLOAT(rgba[2]);
 109    color[3] = CHAN_TO_FLOAT(rgba[3]);
 110 }
 111
 112
 113 /**
 114  * Fetch a texel with the given partial derivatives to compute a level
 115  * of detail in the mipmap.
 116  */
 117 static void
 118 fetch_texel_deriv( GLcontext *ctx, const GLfloat texcoord[4],
 119                    const GLfloat texdx[4], const GLfloat texdy[4],
 120                    GLuint unit, GLfloat color[4] )
 121 {
 122    SWcontext *swrast = SWRAST_CONTEXT(ctx);
 123    const struct gl_texture_object *texObj = ctx->Texture.Unit[unit]._Current;
 124    const struct gl_texture_image *texImg = texObj->Image[0][texObj->BaseLevel];
 125    const GLfloat texW = (GLfloat) texImg->WidthScale;
 126    const GLfloat texH = (GLfloat) texImg->HeightScale;
 127    GLchan rgba[4];
 128
 129    GLfloat lambda = _swrast_compute_lambda(texdx[0], texdy[0], /* ds/dx, ds/dy */
 130                                          texdx[1], texdy[1], /* dt/dx, dt/dy */
 131                                          texdx[3], texdy[2], /* dq/dx, dq/dy */
 132                                          texW, texH,
 133                                          texcoord[0], texcoord[1], texcoord[3],
 134                                          1.0F / texcoord[3]);
 135
 136    swrast->TextureSample[unit](ctx, ctx->Texture.Unit[unit]._Current,
 137                                1, (const GLfloat (*)[4]) texcoord,
 138                                &lambda, &rgba);
 139    color[0] = CHAN_TO_FLOAT(rgba[0]);
 140    color[1] = CHAN_TO_FLOAT(rgba[1]);
 141    color[2] = CHAN_TO_FLOAT(rgba[2]);
 142    color[3] = CHAN_TO_FLOAT(rgba[3]);
 143 }
 144
 145
 146 /**
 147  * Return a pointer to the 4-element float vector specified by the given
 148  * source register.
 149  */
 150 static INLINE const GLfloat *
 151 get_register_pointer( GLcontext *ctx,
 152                       const struct prog_src_register *source,
 153                       const struct fp_machine *machine,
 154                       const struct gl_fragment_program *program )
 155 {
 156    switch (source->File) {
 157    case PROGRAM_TEMPORARY:
 158       ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_TEMPS);
 159       return machine->Temporaries[source->Index];
 160    case PROGRAM_INPUT:
 161       ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_INPUTS);
 162       return machine->Inputs[source->Index];
 163    case PROGRAM_OUTPUT:
 164       /* This is only for PRINT */
 165       ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_OUTPUTS);
 166       return machine->Outputs[source->Index];
 167    case PROGRAM_LOCAL_PARAM:
 168       ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 169       return program->Base.LocalParams[source->Index];
 170    case PROGRAM_ENV_PARAM:
 171       ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_PARAMS);
 172       return ctx->FragmentProgram.Parameters[source->Index];
 173    case PROGRAM_STATE_VAR:
 174       /* Fallthrough */
 175    case PROGRAM_NAMED_PARAM:
 176       ASSERT(source->Index < (GLint) program->Base.Parameters->NumParameters);
 177       return program->Base.Parameters->ParameterValues[source->Index];
 178    default:
 179       _mesa_problem(ctx, "Invalid input register file %d in fetch_vector4",
 180                     source->File);
 181       return NULL;
 182    }
 183 }
 184
 185
 186 /**
 187  * Fetch a 4-element float vector from the given source register.
 188  * Apply swizzling and negating as needed.
 189  */
 190 static void
 191 fetch_vector4( GLcontext *ctx,
 192                const struct prog_src_register *source,
 193                const struct fp_machine *machine,
 194                const struct gl_fragment_program *program,
 195                GLfloat result[4] )
 196 {
 197    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 198    ASSERT(src);
 199
 200    if (source->Swizzle == MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
 201                                         SWIZZLE_Z, SWIZZLE_W)) {
 202       /* no swizzling */
 203       COPY_4V(result, src);
 204    }
 205    else {
 206       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 207       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 208       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 209       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 210    }
 211
 212    if (source->NegateBase) {
 213       result[0] = -result[0];
 214       result[1] = -result[1];
 215       result[2] = -result[2];
 216       result[3] = -result[3];
 217    }
 218    if (source->Abs) {
 219       result[0] = FABSF(result[0]);
 220       result[1] = FABSF(result[1]);
 221       result[2] = FABSF(result[2]);
 222       result[3] = FABSF(result[3]);
 223    }
 224    if (source->NegateAbs) {
 225       result[0] = -result[0];
 226       result[1] = -result[1];
 227       result[2] = -result[2];
 228       result[3] = -result[3];
 229    }
 230 }
 231
 232
 233 /**
 234  * Fetch the derivative with respect to X for the given register.
 235  * \return GL_TRUE if it was easily computed or GL_FALSE if we
 236  * need to execute another instance of the program (ugh)!
 237  */
 238 static GLboolean
 239 fetch_vector4_deriv( GLcontext *ctx,
 240                      const struct prog_src_register *source,
 241                      const SWspan *span,
 242                      char xOrY, GLint column, GLfloat result[4] )
 243 {
 244    GLfloat src[4];
 245
 246    ASSERT(xOrY == 'X' || xOrY == 'Y');
 247
 248    switch (source->Index) {
 249    case FRAG_ATTRIB_WPOS:
 250       if (xOrY == 'X') {
 251          src[0] = 1.0;
 252          src[1] = 0.0;
 253          src[2] = span->dzdx / ctx->DrawBuffer->_DepthMaxF;
 254          src[3] = span->dwdx;
 255       }
 256       else {
 257          src[0] = 0.0;
 258          src[1] = 1.0;
 259          src[2] = span->dzdy / ctx->DrawBuffer->_DepthMaxF;
 260          src[3] = span->dwdy;
 261       }
 262       break;
 263    case FRAG_ATTRIB_COL0:
 264       if (xOrY == 'X') {
 265          src[0] = span->drdx * (1.0F / CHAN_MAXF);
 266          src[1] = span->dgdx * (1.0F / CHAN_MAXF);
 267          src[2] = span->dbdx * (1.0F / CHAN_MAXF);
 268          src[3] = span->dadx * (1.0F / CHAN_MAXF);
 269       }
 270       else {
 271          src[0] = span->drdy * (1.0F / CHAN_MAXF);
 272          src[1] = span->dgdy * (1.0F / CHAN_MAXF);
 273          src[2] = span->dbdy * (1.0F / CHAN_MAXF);
 274          src[3] = span->dady * (1.0F / CHAN_MAXF);
 275       }
 276       break;
 277    case FRAG_ATTRIB_COL1:
 278       if (xOrY == 'X') {
 279          src[0] = span->dsrdx * (1.0F / CHAN_MAXF);
 280          src[1] = span->dsgdx * (1.0F / CHAN_MAXF);
 281          src[2] = span->dsbdx * (1.0F / CHAN_MAXF);
 282          src[3] = 0.0; /* XXX need this */
 283       }
 284       else {
 285          src[0] = span->dsrdy * (1.0F / CHAN_MAXF);
 286          src[1] = span->dsgdy * (1.0F / CHAN_MAXF);
 287          src[2] = span->dsbdy * (1.0F / CHAN_MAXF);
 288          src[3] = 0.0; /* XXX need this */
 289       }
 290       break;
 291    case FRAG_ATTRIB_FOGC:
 292       if (xOrY == 'X') {
 293          src[0] = span->dfogdx;
 294          src[1] = 0.0;
 295          src[2] = 0.0;
 296          src[3] = 0.0;
 297       }
 298       else {
 299          src[0] = span->dfogdy;
 300          src[1] = 0.0;
 301          src[2] = 0.0;
 302          src[3] = 0.0;
 303       }
 304       break;
 305    case FRAG_ATTRIB_TEX0:
 306    case FRAG_ATTRIB_TEX1:
 307    case FRAG_ATTRIB_TEX2:
 308    case FRAG_ATTRIB_TEX3:
 309    case FRAG_ATTRIB_TEX4:
 310    case FRAG_ATTRIB_TEX5:
 311    case FRAG_ATTRIB_TEX6:
 312    case FRAG_ATTRIB_TEX7:
 313       if (xOrY == 'X') {
 314          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 315          /* this is a little tricky - I think I've got it right */
 316          const GLfloat invQ = 1.0f / (span->tex[u][3]
 317                                       + span->texStepX[u][3] * column);
 318          src[0] = span->texStepX[u][0] * invQ;
 319          src[1] = span->texStepX[u][1] * invQ;
 320          src[2] = span->texStepX[u][2] * invQ;
 321          src[3] = span->texStepX[u][3] * invQ;
 322       }
 323       else {
 324          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 325          /* Tricky, as above, but in Y direction */
 326          const GLfloat invQ = 1.0f / (span->tex[u][3] + span->texStepY[u][3]);
 327          src[0] = span->texStepY[u][0] * invQ;
 328          src[1] = span->texStepY[u][1] * invQ;
 329          src[2] = span->texStepY[u][2] * invQ;
 330          src[3] = span->texStepY[u][3] * invQ;
 331       }
 332       break;
 333    default:
 334       return GL_FALSE;
 335    }
 336
 337    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 338    result[1] = src[GET_SWZ(source->Swizzle, 1)];
 339    result[2] = src[GET_SWZ(source->Swizzle, 2)];
 340    result[3] = src[GET_SWZ(source->Swizzle, 3)];
 341
 342    if (source->NegateBase) {
 343       result[0] = -result[0];
 344       result[1] = -result[1];
 345       result[2] = -result[2];
 346       result[3] = -result[3];
 347    }
 348    if (source->Abs) {
 349       result[0] = FABSF(result[0]);
 350       result[1] = FABSF(result[1]);
 351       result[2] = FABSF(result[2]);
 352       result[3] = FABSF(result[3]);
 353    }
 354    if (source->NegateAbs) {
 355       result[0] = -result[0];
 356       result[1] = -result[1];
 357       result[2] = -result[2];
 358       result[3] = -result[3];
 359    }
 360    return GL_TRUE;
 361 }
 362
 363
 364 /**
 365  * As above, but only return result[0] element.
 366  */
 367 static void
 368 fetch_vector1( GLcontext *ctx,
 369                const struct prog_src_register *source,
 370                const struct fp_machine *machine,
 371                const struct gl_fragment_program *program,
 372                GLfloat result[4] )
 373 {
 374    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 375    ASSERT(src);
 376
 377    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 378
 379    if (source->NegateBase) {
 380       result[0] = -result[0];
 381    }
 382    if (source->Abs) {
 383       result[0] = FABSF(result[0]);
 384    }
 385    if (source->NegateAbs) {
 386       result[0] = -result[0];
 387    }
 388 }
 389
 390
 391 /**
 392  * Test value against zero and return GT, LT, EQ or UN if NaN.
 393  */
 394 static INLINE GLuint
 395 generate_cc( float value )
 396 {
 397    if (value != value)
 398       return COND_UN;  /* NaN */
 399    if (value > 0.0F)
 400       return COND_GT;
 401    if (value < 0.0F)
 402       return COND_LT;
 403    return COND_EQ;
 404 }
 405
 406
 407 /**
 408  * Test if the ccMaskRule is satisfied by the given condition code.
 409  * Used to mask destination writes according to the current condition code.
 410  */
 411 static INLINE GLboolean
 412 test_cc(GLuint condCode, GLuint ccMaskRule)
 413 {
 414    switch (ccMaskRule) {
 415    case COND_EQ: return (condCode == COND_EQ);
 416    case COND_NE: return (condCode != COND_EQ);
 417    case COND_LT: return (condCode == COND_LT);
 418    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 419    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 420    case COND_GT: return (condCode == COND_GT);
 421    case COND_TR: return GL_TRUE;
 422    case COND_FL: return GL_FALSE;
 423    default:      return GL_TRUE;
 424    }
 425 }
 426
 427
 428 /**
 429  * Store 4 floats into a register.  Observe the instructions saturate and
 430  * set-condition-code flags.
 431  */
 432 static void
 433 store_vector4( const struct prog_instruction *inst,
 434                struct fp_machine *machine,
 435                const GLfloat value[4] )
 436 {
 437    const struct prog_dst_register *dest = &(inst->DstReg);
 438    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 439    GLfloat *dstReg;
 440    GLfloat dummyReg[4];
 441    GLfloat clampedValue[4];
 442    GLuint writeMask = dest->WriteMask;
 443
 444    switch (dest->File) {
 445       case PROGRAM_OUTPUT:
 446          dstReg = machine->Outputs[dest->Index];
 447          break;
 448       case PROGRAM_TEMPORARY:
 449          dstReg = machine->Temporaries[dest->Index];
 450          break;
 451       case PROGRAM_WRITE_ONLY:
 452          dstReg = dummyReg;
 453          return;
 454       default:
 455          _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 456          return;
 457    }
 458
 459 #if DEBUG_FRAG
 460    if (value[0] > 1.0e10 ||
 461        IS_INF_OR_NAN(value[0]) ||
 462        IS_INF_OR_NAN(value[1]) ||
 463        IS_INF_OR_NAN(value[2]) ||
 464        IS_INF_OR_NAN(value[3])  )
 465       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 466 #endif
 467
 468    if (clamp) {
 469       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 470       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 471       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 472       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 473       value = clampedValue;
 474    }
 475
 476    if (dest->CondMask != COND_TR) {
 477       /* condition codes may turn off some writes */
 478       if (writeMask & WRITEMASK_X) {
 479          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 0)],
 480                       dest->CondMask))
 481             writeMask &= ~WRITEMASK_X;
 482       }
 483       if (writeMask & WRITEMASK_Y) {
 484          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 1)],
 485                       dest->CondMask))
 486             writeMask &= ~WRITEMASK_Y;
 487       }
 488       if (writeMask & WRITEMASK_Z) {
 489          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 2)],
 490                       dest->CondMask))
 491             writeMask &= ~WRITEMASK_Z;
 492       }
 493       if (writeMask & WRITEMASK_W) {
 494          if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 3)],
 495                       dest->CondMask))
 496             writeMask &= ~WRITEMASK_W;
 497       }
 498    }
 499
 500    if (writeMask & WRITEMASK_X)
 501       dstReg[0] = value[0];
 502    if (writeMask & WRITEMASK_Y)
 503       dstReg[1] = value[1];
 504    if (writeMask & WRITEMASK_Z)
 505       dstReg[2] = value[2];
 506    if (writeMask & WRITEMASK_W)
 507       dstReg[3] = value[3];
 508
 509    if (inst->CondUpdate) {
 510       if (writeMask & WRITEMASK_X)
 511          machine->CondCodes[0] = generate_cc(value[0]);
 512       if (writeMask & WRITEMASK_Y)
 513          machine->CondCodes[1] = generate_cc(value[1]);
 514       if (writeMask & WRITEMASK_Z)
 515          machine->CondCodes[2] = generate_cc(value[2]);
 516       if (writeMask & WRITEMASK_W)
 517          machine->CondCodes[3] = generate_cc(value[3]);
 518    }
 519 }
 520
 521
 522 /**
 523  * Initialize a new machine state instance from an existing one, adding
 524  * the partial derivatives onto the input registers.
 525  * Used to implement DDX and DDY instructions in non-trivial cases.
 526  */
 527 static void
 528 init_machine_deriv( GLcontext *ctx,
 529                     const struct fp_machine *machine,
 530                     const struct gl_fragment_program *program,
 531                     const SWspan *span, char xOrY,
 532                     struct fp_machine *dMachine )
 533 {
 534    GLuint u;
 535
 536    ASSERT(xOrY == 'X' || xOrY == 'Y');
 537
 538    /* copy existing machine */
 539    _mesa_memcpy(dMachine, machine, sizeof(struct fp_machine));
 540
 541    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
 542       /* Clear temporary registers (undefined for ARB_f_p) */
 543       _mesa_bzero( (void*) machine->Temporaries,
 544                    MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
 545    }
 546
 547    /* Add derivatives */
 548    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) {
 549       GLfloat *wpos = (GLfloat*) machine->Inputs[FRAG_ATTRIB_WPOS];
 550       if (xOrY == 'X') {
 551          wpos[0] += 1.0F;
 552          wpos[1] += 0.0F;
 553          wpos[2] += span->dzdx;
 554          wpos[3] += span->dwdx;
 555       }
 556       else {
 557          wpos[0] += 0.0F;
 558          wpos[1] += 1.0F;
 559          wpos[2] += span->dzdy;
 560          wpos[3] += span->dwdy;
 561       }
 562    }
 563    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_COL0)) {
 564       GLfloat *col0 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL0];
 565       if (xOrY == 'X') {
 566          col0[0] += span->drdx * (1.0F / CHAN_MAXF);
 567          col0[1] += span->dgdx * (1.0F / CHAN_MAXF);
 568          col0[2] += span->dbdx * (1.0F / CHAN_MAXF);
 569          col0[3] += span->dadx * (1.0F / CHAN_MAXF);
 570       }
 571       else {
 572          col0[0] += span->drdy * (1.0F / CHAN_MAXF);
 573          col0[1] += span->dgdy * (1.0F / CHAN_MAXF);
 574          col0[2] += span->dbdy * (1.0F / CHAN_MAXF);
 575          col0[3] += span->dady * (1.0F / CHAN_MAXF);
 576       }
 577    }
 578    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_COL1)) {
 579       GLfloat *col1 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL1];
 580       if (xOrY == 'X') {
 581          col1[0] += span->dsrdx * (1.0F / CHAN_MAXF);
 582          col1[1] += span->dsgdx * (1.0F / CHAN_MAXF);
 583          col1[2] += span->dsbdx * (1.0F / CHAN_MAXF);
 584          col1[3] += 0.0; /*XXX fix */
 585       }
 586       else {
 587          col1[0] += span->dsrdy * (1.0F / CHAN_MAXF);
 588          col1[1] += span->dsgdy * (1.0F / CHAN_MAXF);
 589          col1[2] += span->dsbdy * (1.0F / CHAN_MAXF);
 590          col1[3] += 0.0; /*XXX fix */
 591       }
 592    }
 593    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_FOGC)) {
 594       GLfloat *fogc = (GLfloat*) machine->Inputs[FRAG_ATTRIB_FOGC];
 595       if (xOrY == 'X') {
 596          fogc[0] += span->dfogdx;
 597       }
 598       else {
 599          fogc[0] += span->dfogdy;
 600       }
 601    }
 602    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
 603       if (program->Base.InputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
 604          GLfloat *tex = (GLfloat*) machine->Inputs[FRAG_ATTRIB_TEX0 + u];
 605          /* XXX perspective-correct interpolation */
 606          if (xOrY == 'X') {
 607             tex[0] += span->texStepX[u][0];
 608             tex[1] += span->texStepX[u][1];
 609             tex[2] += span->texStepX[u][2];
 610             tex[3] += span->texStepX[u][3];
 611          }
 612          else {
 613             tex[0] += span->texStepY[u][0];
 614             tex[1] += span->texStepY[u][1];
 615             tex[2] += span->texStepY[u][2];
 616             tex[3] += span->texStepY[u][3];
 617          }
 618       }
 619    }
 620
 621    /* init condition codes */
 622    dMachine->CondCodes[0] = COND_EQ;
 623    dMachine->CondCodes[1] = COND_EQ;
 624    dMachine->CondCodes[2] = COND_EQ;
 625    dMachine->CondCodes[3] = COND_EQ;
 626 }
 627
 628
 629 /**
  630  * Execute the given fragment program.
 631  * NOTE: we do everything in single-precision floating point; we don't
 632  * currently observe the single/half/fixed-precision qualifiers.
 633  * \param ctx - rendering context
 634  * \param program - the fragment program to execute
 635  * \param machine - machine state (register file)
 636  * \param maxInst - max number of instructions to execute
 637  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 638  */
 639 static GLboolean
 640 execute_program( GLcontext *ctx,
 641                  const struct gl_fragment_program *program, GLuint maxInst,
 642                  struct fp_machine *machine, const SWspan *span,
 643                  GLuint column )
 644 {
 645    GLuint pc;
 646
 647 #if DEBUG_FRAG
 648    printf("execute fragment program --------------------\n");
 649 #endif
 650
 651    for (pc = 0; pc < maxInst; pc++) {
 652       const struct prog_instruction *inst = program->Base.Instructions + pc;
 653
 654       if (ctx->FragmentProgram.CallbackEnabled &&
 655           ctx->FragmentProgram.Callback) {
 656          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 657          ctx->FragmentProgram.Callback(program->Base.Target,
 658                                        ctx->FragmentProgram.CallbackData);
 659       }
 660
 661       switch (inst->Opcode) {
 662          case OPCODE_ABS:
 663             {
 664                GLfloat a[4], result[4];
 665                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 666                result[0] = FABSF(a[0]);
 667                result[1] = FABSF(a[1]);
 668                result[2] = FABSF(a[2]);
 669                result[3] = FABSF(a[3]);
 670                store_vector4( inst, machine, result );
 671             }
 672             break;
 673          case OPCODE_ADD:
 674             {
 675                GLfloat a[4], b[4], result[4];
 676                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 677                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 678                result[0] = a[0] + b[0];
 679                result[1] = a[1] + b[1];
 680                result[2] = a[2] + b[2];
 681                result[3] = a[3] + b[3];
 682                store_vector4( inst, machine, result );
 683             }
 684             break;
 685          case OPCODE_CMP:
 686             {
 687                GLfloat a[4], b[4], c[4], result[4];
 688                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 689                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 690                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 691                result[0] = a[0] < 0.0F ? b[0] : c[0];
 692                result[1] = a[1] < 0.0F ? b[1] : c[1];
 693                result[2] = a[2] < 0.0F ? b[2] : c[2];
 694                result[3] = a[3] < 0.0F ? b[3] : c[3];
 695                store_vector4( inst, machine, result );
 696             }
 697             break;
 698          case OPCODE_COS:
 699             {
 700                GLfloat a[4], result[4];
 701                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 702                result[0] = result[1] = result[2] = result[3]
 703                   = (GLfloat) _mesa_cos(a[0]);
 704                store_vector4( inst, machine, result );
 705             }
 706             break;
 707          case OPCODE_DDX: /* Partial derivative with respect to X */
 708             {
 709                GLfloat a[4], aNext[4], result[4];
 710                struct fp_machine dMachine;
 711                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'X',
 712                                         column, result)) {
 713                   /* This is tricky.  Make a copy of the current machine state,
 714                    * increment the input registers by the dx or dy partial
 715                    * derivatives, then re-execute the program up to the
 716                    * preceeding instruction, then fetch the source register.
 717                    * Finally, find the difference in the register values for
 718                    * the original and derivative runs.
 719                    */
 720                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 721                   init_machine_deriv(ctx, machine, program, span,
 722                                      'X', &dMachine);
 723                   execute_program(ctx, program, pc, &dMachine, span, column);
 724                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 725                   result[0] = aNext[0] - a[0];
 726                   result[1] = aNext[1] - a[1];
 727                   result[2] = aNext[2] - a[2];
 728                   result[3] = aNext[3] - a[3];
 729                }
 730                store_vector4( inst, machine, result );
 731             }
 732             break;
 733          case OPCODE_DDY: /* Partial derivative with respect to Y */
 734             {
 735                GLfloat a[4], aNext[4], result[4];
 736                struct fp_machine dMachine;
 737                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'Y',
 738                                         column, result)) {
 739                   init_machine_deriv(ctx, machine, program, span,
 740                                      'Y', &dMachine);
 741                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 742                   execute_program(ctx, program, pc, &dMachine, span, column);
 743                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 744                   result[0] = aNext[0] - a[0];
 745                   result[1] = aNext[1] - a[1];
 746                   result[2] = aNext[2] - a[2];
 747                   result[3] = aNext[3] - a[3];
 748                }
 749                store_vector4( inst, machine, result );
 750             }
 751             break;
 752          case OPCODE_DP3:
 753             {
 754                GLfloat a[4], b[4], result[4];
 755                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 756                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 757                result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
 758                store_vector4( inst, machine, result );
 759 #if DEBUG_FRAG
 760                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 761                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 762 #endif
 763             }
 764             break;
 765          case OPCODE_DP4:
 766             {
 767                GLfloat a[4], b[4], result[4];
 768                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 769                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 770                result[0] = result[1] = result[2] = result[3] = DOT4(a,b);
 771                store_vector4( inst, machine, result );
 772 #if DEBUG_FRAG
 773                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 774                       result[0], a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 775 #endif
 776             }
 777             break;
 778          case OPCODE_DPH:
 779             {
 780                GLfloat a[4], b[4], result[4];
 781                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 782                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 783                result[0] = result[1] = result[2] = result[3] =
 784                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
 785                store_vector4( inst, machine, result );
 786             }
 787             break;
 788          case OPCODE_DST: /* Distance vector */
 789             {
 790                GLfloat a[4], b[4], result[4];
 791                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 792                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 793                result[0] = 1.0F;
 794                result[1] = a[1] * b[1];
 795                result[2] = a[2];
 796                result[3] = b[3];
 797                store_vector4( inst, machine, result );
 798             }
 799             break;
 800          case OPCODE_EX2: /* Exponential base 2 */
 801             {
 802                GLfloat a[4], result[4];
 803                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 804                result[0] = result[1] = result[2] = result[3] =
 805                   (GLfloat) _mesa_pow(2.0, a[0]);
 806                store_vector4( inst, machine, result );
 807             }
 808             break;
 809          case OPCODE_FLR:
 810             {
 811                GLfloat a[4], result[4];
 812                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 813                result[0] = FLOORF(a[0]);
 814                result[1] = FLOORF(a[1]);
 815                result[2] = FLOORF(a[2]);
 816                result[3] = FLOORF(a[3]);
 817                store_vector4( inst, machine, result );
 818             }
 819             break;
 820          case OPCODE_FRC:
 821             {
 822                GLfloat a[4], result[4];
 823                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 824                result[0] = a[0] - FLOORF(a[0]);
 825                result[1] = a[1] - FLOORF(a[1]);
 826                result[2] = a[2] - FLOORF(a[2]);
 827                result[3] = a[3] - FLOORF(a[3]);
 828                store_vector4( inst, machine, result );
 829             }
 830             break;
 831          case OPCODE_KIL_NV: /* NV_f_p only */
 832             {
 833                const GLuint swizzle = inst->DstReg.CondSwizzle;
 834                const GLuint condMask = inst->DstReg.CondMask;
 835                if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 836                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 837                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 838                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 839                   return GL_FALSE;
 840                }
 841             }
 842             break;
 843          case OPCODE_KIL: /* ARB_f_p only */
 844             {
 845                GLfloat a[4];
 846                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 847                if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 848                   return GL_FALSE;
 849                }
 850             }
 851             break;
 852          case OPCODE_LG2:  /* log base 2 */
 853             {
 854                GLfloat a[4], result[4];
 855                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 856                result[0] = result[1] = result[2] = result[3] = LOG2(a[0]);
 857                store_vector4( inst, machine, result );
 858             }
 859             break;
 860          case OPCODE_LIT:
 861             {
 862                const GLfloat epsilon = 1.0F / 256.0F; /* from NV VP spec */
 863                GLfloat a[4], result[4];
 864                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 865                a[0] = MAX2(a[0], 0.0F);
 866                a[1] = MAX2(a[1], 0.0F);
 867                /* XXX ARB version clamps a[3], NV version doesn't */
 868                a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 869                result[0] = 1.0F;
 870                result[1] = a[0];
 871                /* XXX we could probably just use pow() here */
 872                if (a[0] > 0.0F) {
 873                   if (a[1] == 0.0 && a[3] == 0.0)
 874                      result[2] = 1.0;
 875                   else
 876                      result[2] = EXPF(a[3] * LOGF(a[1]));
 877                }
 878                else {
 879                   result[2] = 0.0;
 880                }
 881                result[3] = 1.0F;
 882                store_vector4( inst, machine, result );
 883             }
 884             break;
 885          case OPCODE_LRP:
 886             {
 887                GLfloat a[4], b[4], c[4], result[4];
 888                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 889                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 890                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 891                result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 892                result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 893                result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 894                result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 895                store_vector4( inst, machine, result );
 896             }
 897             break;
 898          case OPCODE_MAD:
 899             {
 900                GLfloat a[4], b[4], c[4], result[4];
 901                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 902                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 903                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 904                result[0] = a[0] * b[0] + c[0];
 905                result[1] = a[1] * b[1] + c[1];
 906                result[2] = a[2] * b[2] + c[2];
 907                result[3] = a[3] * b[3] + c[3];
 908                store_vector4( inst, machine, result );
 909             }
 910             break;
 911          case OPCODE_MAX:
 912             {
 913                GLfloat a[4], b[4], result[4];
 914                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 915                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 916                result[0] = MAX2(a[0], b[0]);
 917                result[1] = MAX2(a[1], b[1]);
 918                result[2] = MAX2(a[2], b[2]);
 919                result[3] = MAX2(a[3], b[3]);
 920                store_vector4( inst, machine, result );
 921 #if DEBUG_FRAG
 922                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
 923                       result[0], result[1], result[2], result[3],
 924                       a[0], a[1], a[2], a[3],
 925                       b[0], b[1], b[2], b[3]);
 926 #endif
 927             }
 928             break;
 929          case OPCODE_MIN:
 930             {
 931                GLfloat a[4], b[4], result[4];
 932                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 933                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 934                result[0] = MIN2(a[0], b[0]);
 935                result[1] = MIN2(a[1], b[1]);
 936                result[2] = MIN2(a[2], b[2]);
 937                result[3] = MIN2(a[3], b[3]);
 938                store_vector4( inst, machine, result );
 939             }
 940             break;
 941          case OPCODE_MOV:
 942             {
 943                GLfloat result[4];
 944                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, result );
 945                store_vector4( inst, machine, result );
 946 #if DEBUG_FRAG
 947                printf("MOV (%g %g %g %g)\n",
 948                       result[0], result[1], result[2], result[3]);
 949 #endif
 950             }
 951             break;
 952          case OPCODE_MUL:
 953             {
 954                GLfloat a[4], b[4], result[4];
 955                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 956                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 957                result[0] = a[0] * b[0];
 958                result[1] = a[1] * b[1];
 959                result[2] = a[2] * b[2];
 960                result[3] = a[3] * b[3];
 961                store_vector4( inst, machine, result );
 962 #if DEBUG_FRAG
 963                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 964                       result[0], result[1], result[2], result[3],
 965                       a[0], a[1], a[2], a[3],
 966                       b[0], b[1], b[2], b[3]);
 967 #endif
 968             }
 969             break;
 970          case OPCODE_PK2H: /* pack two 16-bit floats in one 32-bit float */
 971             {
 972                GLfloat a[4], result[4];
 973                GLhalfNV hx, hy;
 974                GLuint *rawResult = (GLuint *) result;
 975                GLuint twoHalves;
 976                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 977                hx = _mesa_float_to_half(a[0]);
 978                hy = _mesa_float_to_half(a[1]);
 979                twoHalves = hx | (hy << 16);
 980                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 981                   = twoHalves;
 982                store_vector4( inst, machine, result );
 983             }
 984             break;
 985          case OPCODE_PK2US: /* pack two GLushorts into one 32-bit float */
 986             {
 987                GLfloat a[4], result[4];
 988                GLuint usx, usy, *rawResult = (GLuint *) result;
 989                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 990                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 991                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 992                usx = IROUND(a[0] * 65535.0F);
 993                usy = IROUND(a[1] * 65535.0F);
 994                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 995                   = usx | (usy << 16);
 996                store_vector4( inst, machine, result );
 997             }
 998             break;
 999          case OPCODE_PK4B: /* pack four GLbytes into one 32-bit float */
1000             {
1001                GLfloat a[4], result[4];
1002                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1003                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1004                a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
1005                a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
1006                a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
1007                a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
1008                ubx = IROUND(127.0F * a[0] + 128.0F);
1009                uby = IROUND(127.0F * a[1] + 128.0F);
1010                ubz = IROUND(127.0F * a[2] + 128.0F);
1011                ubw = IROUND(127.0F * a[3] + 128.0F);
1012                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1013                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1014                store_vector4( inst, machine, result );
1015             }
1016             break;
1017          case OPCODE_PK4UB: /* pack four GLubytes into one 32-bit float */
1018             {
1019                GLfloat a[4], result[4];
1020                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1021                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1022                a[0] = CLAMP(a[0], 0.0F, 1.0F);
1023                a[1] = CLAMP(a[1], 0.0F, 1.0F);
1024                a[2] = CLAMP(a[2], 0.0F, 1.0F);
1025                a[3] = CLAMP(a[3], 0.0F, 1.0F);
1026                ubx = IROUND(255.0F * a[0]);
1027                uby = IROUND(255.0F * a[1]);
1028                ubz = IROUND(255.0F * a[2]);
1029                ubw = IROUND(255.0F * a[3]);
1030                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1031                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1032                store_vector4( inst, machine, result );
1033             }
1034             break;
1035          case OPCODE_POW:
1036             {
1037                GLfloat a[4], b[4], result[4];
1038                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1039                fetch_vector1( ctx, &inst->SrcReg[1], machine, program, b );
1040                result[0] = result[1] = result[2] = result[3]
1041                   = (GLfloat)_mesa_pow(a[0], b[0]);
1042                store_vector4( inst, machine, result );
1043             }
1044             break;
1045          case OPCODE_RCP:
1046             {
1047                GLfloat a[4], result[4];
1048                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1049 #if DEBUG_FRAG
1050                if (a[0] == 0)
1051                   printf("RCP(0)\n");
1052                else if (IS_INF_OR_NAN(a[0]))
1053                   printf("RCP(inf)\n");
1054 #endif
1055                result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
1056                store_vector4( inst, machine, result );
1057             }
1058             break;
1059          case OPCODE_RFL: /* reflection vector */
1060             {
1061                GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
1062                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, axis );
1063                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dir );
1064                tmpW = DOT3(axis, axis);
1065                tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
1066                result[0] = tmpX * axis[0] - dir[0];
1067                result[1] = tmpX * axis[1] - dir[1];
1068                result[2] = tmpX * axis[2] - dir[2];
1069                /* result[3] is never written! XXX enforce in parser! */
1070                store_vector4( inst, machine, result );
1071             }
1072             break;
1073          case OPCODE_RSQ: /* 1 / sqrt() */
1074             {
1075                GLfloat a[4], result[4];
1076                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1077                a[0] = FABSF(a[0]);
1078                result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1079                store_vector4( inst, machine, result );
1080 #if DEBUG_FRAG
1081                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1082 #endif
1083             }
1084             break;
1085          case OPCODE_SCS: /* sine and cos */
1086             {
1087                GLfloat a[4], result[4];
1088                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1089                result[0] = (GLfloat)_mesa_cos(a[0]);
1090                result[1] = (GLfloat)_mesa_sin(a[0]);
1091                result[2] = 0.0;  /* undefined! */
1092                result[3] = 0.0;  /* undefined! */
1093                store_vector4( inst, machine, result );
1094             }
1095             break;
1096          case OPCODE_SEQ: /* set on equal */
1097             {
1098                GLfloat a[4], b[4], result[4];
1099                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1100                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1101                result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1102                result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1103                result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1104                result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1105                store_vector4( inst, machine, result );
1106             }
1107             break;
1108          case OPCODE_SFL: /* set false, operands ignored */
1109             {
1110                static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1111                store_vector4( inst, machine, result );
1112             }
1113             break;
1114          case OPCODE_SGE: /* set on greater or equal */
1115             {
1116                GLfloat a[4], b[4], result[4];
1117                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1118                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1119                result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1120                result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1121                result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1122                result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1123                store_vector4( inst, machine, result );
1124             }
1125             break;
1126          case OPCODE_SGT: /* set on greater */
1127             {
1128                GLfloat a[4], b[4], result[4];
1129                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1130                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1131                result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1132                result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1133                result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1134                result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1135                store_vector4( inst, machine, result );
1136             }
1137             break;
1138          case OPCODE_SIN:
1139             {
1140                GLfloat a[4], result[4];
1141                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1142                result[0] = result[1] = result[2] = result[3]
1143                   = (GLfloat) _mesa_sin(a[0]);
1144                store_vector4( inst, machine, result );
1145             }
1146             break;
1147          case OPCODE_SLE: /* set on less or equal */
1148             {
1149                GLfloat a[4], b[4], result[4];
1150                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1151                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1152                result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1153                result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1154                result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1155                result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1156                store_vector4( inst, machine, result );
1157             }
1158             break;
1159          case OPCODE_SLT: /* set on less */
1160             {
1161                GLfloat a[4], b[4], result[4];
1162                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1163                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1164                result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1165                result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1166                result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1167                result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1168                store_vector4( inst, machine, result );
1169             }
1170             break;
1171          case OPCODE_SNE: /* set on not equal */
1172             {
1173                GLfloat a[4], b[4], result[4];
1174                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1175                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1176                result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1177                result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1178                result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1179                result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1180                store_vector4( inst, machine, result );
1181             }
1182             break;
1183          case OPCODE_STR: /* set true, operands ignored */
1184             {
1185                static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1186                store_vector4( inst, machine, result );
1187             }
1188             break;
1189          case OPCODE_SUB:
1190             {
1191                GLfloat a[4], b[4], result[4];
1192                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1193                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1194                result[0] = a[0] - b[0];
1195                result[1] = a[1] - b[1];
1196                result[2] = a[2] - b[2];
1197                result[3] = a[3] - b[3];
1198                store_vector4( inst, machine, result );
1199             }
1200             break;
1201          case OPCODE_SWZ: /* extended swizzle */
1202             {
1203                const struct prog_src_register *source = &inst->SrcReg[0];
1204                const GLfloat *src = get_register_pointer(ctx, source,
1205                                                          machine, program);
1206                GLfloat result[4];
1207                GLuint i;
1208                for (i = 0; i < 4; i++) {
1209                   const GLuint swz = GET_SWZ(source->Swizzle, i);
1210                   if (swz == SWIZZLE_ZERO)
1211                      result[i] = 0.0;
1212                   else if (swz == SWIZZLE_ONE)
1213                      result[i] = 1.0;
1214                   else {
1215                      ASSERT(swz >= 0);
1216                      ASSERT(swz <= 3);
1217                      result[i] = src[swz];
1218                   }
1219                   if (source->NegateBase & (1 << i))
1220                      result[i] = -result[i];
1221                }
1222                store_vector4( inst, machine, result );
1223             }
1224             break;
1225          case OPCODE_TEX: /* Both ARB and NV frag prog */
1226             /* Texel lookup */
1227             {
1228                GLfloat texcoord[4], color[4];
1229                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1230                /* Note: we pass 0 for LOD.  The ARB extension requires it
1231                 * while the NV extension says it's implementation dependant.
1232                 */
1233                /* KW: Previously lambda was passed as zero, but I
1234                 * believe this is incorrect, the spec seems to
1235                 * indicate rather that lambda should not be
1236                 * changed/biased, unlike TXB where texcoord[3] is
1237                 * added to the lambda calculations.  The lambda should
1238                 * still be calculated normally for TEX & TXP though,
1239                 * not set to zero.  Otherwise it's very difficult to
1240                 * implement normal GL semantics through the fragment
1241                 * shader.
1242                 */
1243                fetch_texel( ctx, texcoord,
1244                             span->array->lambda[inst->TexSrcUnit][column],
1245                             inst->TexSrcUnit, color );
1246 #if DEBUG_FRAG
1247                if (color[3])
1248                   printf("color[3] = %f\n", color[3]);
1249 #endif
1250                store_vector4( inst, machine, color );
1251             }
1252             break;
1253          case OPCODE_TXB: /* GL_ARB_fragment_program only */
1254             /* Texel lookup with LOD bias */
1255             {
1256                GLfloat texcoord[4], color[4], bias, lambda;
1257
1258                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1259                /* texcoord[3] is the bias to add to lambda */
1260                bias = ctx->Texture.Unit[inst->TexSrcUnit].LodBias
1261                     + ctx->Texture.Unit[inst->TexSrcUnit]._Current->LodBias
1262                     + texcoord[3];
1263                lambda = span->array->lambda[inst->TexSrcUnit][column] + bias;
1264                fetch_texel( ctx, texcoord, lambda,
1265                             inst->TexSrcUnit, color );
1266                store_vector4( inst, machine, color );
1267             }
1268             break;
1269          case OPCODE_TXD: /* GL_NV_fragment_program only */
1270             /* Texture lookup w/ partial derivatives for LOD */
1271             {
1272                GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1273                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1274                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dtdx );
1275                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, dtdy );
1276                fetch_texel_deriv( ctx, texcoord, dtdx, dtdy, inst->TexSrcUnit,
1277                                   color );
1278                store_vector4( inst, machine, color );
1279             }
1280             break;
1281          case OPCODE_TXP: /* GL_ARB_fragment_program only */
1282             /* Texture lookup w/ projective divide */
1283             {
1284                GLfloat texcoord[4], color[4];
1285                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1286                /* Not so sure about this test - if texcoord[3] is
1287                 * zero, we'd probably be fine except for an ASSERT in
1288                 * IROUND_POS() which gets triggered by the inf values created.
1289                 */
1290                if (texcoord[3] != 0.0) {
1291                   texcoord[0] /= texcoord[3];
1292                   texcoord[1] /= texcoord[3];
1293                   texcoord[2] /= texcoord[3];
1294                }
1295                /* KW: Previously lambda was passed as zero, but I
1296                 * believe this is incorrect, the spec seems to
1297                 * indicate rather that lambda should not be
1298                 * changed/biased, unlike TXB where texcoord[3] is
1299                 * added to the lambda calculations.  The lambda should
1300                 * still be calculated normally for TEX & TXP though,
1301                 * not set to zero.
1302                 */
1303                fetch_texel( ctx, texcoord,
1304                             span->array->lambda[inst->TexSrcUnit][column],
1305                             inst->TexSrcUnit, color );
1306                store_vector4( inst, machine, color );
1307             }
1308             break;
1309          case OPCODE_TXP_NV: /* GL_NV_fragment_program only */
1310             /* Texture lookup w/ projective divide */
1311             {
1312                GLfloat texcoord[4], color[4];
1313                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1314                if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1315                    texcoord[3] != 0.0) {
1316                   texcoord[0] /= texcoord[3];
1317                   texcoord[1] /= texcoord[3];
1318                   texcoord[2] /= texcoord[3];
1319                }
1320                fetch_texel( ctx, texcoord,
1321                             span->array->lambda[inst->TexSrcUnit][column],
1322                             inst->TexSrcUnit, color );
1323                store_vector4( inst, machine, color );
1324             }
1325             break;
1326          case OPCODE_UP2H: /* unpack two 16-bit floats */
1327             {
1328                GLfloat a[4], result[4];
1329                const GLuint *rawBits = (const GLuint *) a;
1330                GLhalfNV hx, hy;
1331                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1332                hx = rawBits[0] & 0xffff;
1333                hy = rawBits[0] >> 16;
1334                result[0] = result[2] = _mesa_half_to_float(hx);
1335                result[1] = result[3] = _mesa_half_to_float(hy);
1336                store_vector4( inst, machine, result );
1337             }
1338             break;
1339          case OPCODE_UP2US: /* unpack two GLushorts */
1340             {
1341                GLfloat a[4], result[4];
1342                const GLuint *rawBits = (const GLuint *) a;
1343                GLushort usx, usy;
1344                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1345                usx = rawBits[0] & 0xffff;
1346                usy = rawBits[0] >> 16;
1347                result[0] = result[2] = usx * (1.0f / 65535.0f);
1348                result[1] = result[3] = usy * (1.0f / 65535.0f);
1349                store_vector4( inst, machine, result );
1350             }
1351             break;
1352          case OPCODE_UP4B: /* unpack four GLbytes */
1353             {
1354                GLfloat a[4], result[4];
1355                const GLuint *rawBits = (const GLuint *) a;
1356                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1357                result[0] = (((rawBits[0] >>  0) & 0xff) - 128) / 127.0F;
1358                result[1] = (((rawBits[0] >>  8) & 0xff) - 128) / 127.0F;
1359                result[2] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1360                result[3] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1361                store_vector4( inst, machine, result );
1362             }
1363             break;
1364          case OPCODE_UP4UB: /* unpack four GLubytes */
1365             {
1366                GLfloat a[4], result[4];
1367                const GLuint *rawBits = (const GLuint *) a;
1368                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1369                result[0] = ((rawBits[0] >>  0) & 0xff) / 255.0F;
1370                result[1] = ((rawBits[0] >>  8) & 0xff) / 255.0F;
1371                result[2] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1372                result[3] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1373                store_vector4( inst, machine, result );
1374             }
1375             break;
1376          case OPCODE_XPD: /* cross product */
1377             {
1378                GLfloat a[4], b[4], result[4];
1379                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1380                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1381                result[0] = a[1] * b[2] - a[2] * b[1];
1382                result[1] = a[2] * b[0] - a[0] * b[2];
1383                result[2] = a[0] * b[1] - a[1] * b[0];
1384                result[3] = 1.0;
1385                store_vector4( inst, machine, result );
1386             }
1387             break;
1388          case OPCODE_X2D: /* 2-D matrix transform */
1389             {
1390                GLfloat a[4], b[4], c[4], result[4];
1391                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1392                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1393                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
1394                result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1395                result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1396                result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1397                result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1398                store_vector4( inst, machine, result );
1399             }
1400             break;
1401          case OPCODE_PRINT:
1402             {
1403                if (inst->SrcReg[0].File != -1) {
1404                   GLfloat a[4];
1405                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
1406                   _mesa_printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
1407                                a[0], a[1], a[2], a[3]);
1408                }
1409                else {
1410                   _mesa_printf("%s\n", (const char *) inst->Data);
1411                }
1412             }
1413             break;
1414          case OPCODE_END:
1415             return GL_TRUE;
1416          default:
1417             _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1418                           inst->Opcode);
1419             return GL_TRUE; /* return value doesn't matter */
1420       }
1421    }
1422    return GL_TRUE;
1423 }
1424
1425
1426 /**
1427  * Initialize the virtual fragment program machine state prior to running
1428  * fragment program on a fragment.  This involves initializing the input
1429  * registers, condition codes, etc.
1430  * \param machine  the virtual machine state to init
1431  * \param program  the fragment program we're about to run
1432  * \param span  the span of pixels we'll operate on
1433  * \param col  which element (column) of the span we'll operate on
1434  */
1435 static void
1436 init_machine( GLcontext *ctx, struct fp_machine *machine,
1437               const struct gl_fragment_program *program,
1438               const SWspan *span, GLuint col )
1439 {
1440    GLuint inputsRead = program->Base.InputsRead;
1441    GLuint u;
1442
1443    if (ctx->FragmentProgram.CallbackEnabled)
1444       inputsRead = ~0;
1445
1446    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
1447       /* Clear temporary registers (undefined for ARB_f_p) */
1448       _mesa_bzero(machine->Temporaries,
1449                   MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
1450    }
1451
1452    /* Load input registers */
1453    if (inputsRead & (1 << FRAG_ATTRIB_WPOS)) {
1454       GLfloat *wpos = machine->Inputs[FRAG_ATTRIB_WPOS];
1455       ASSERT(span->arrayMask & SPAN_Z);
1456       if (span->arrayMask & SPAN_XY) {
1457          wpos[0] = (GLfloat) span->array->x[col];
1458          wpos[1] = (GLfloat) span->array->y[col];
1459       }
1460       else {
1461          wpos[0] = (GLfloat) span->x + col;
1462          wpos[1] = (GLfloat) span->y;
1463       }
1464       wpos[2] = (GLfloat) span->array->z[col] / ctx->DrawBuffer->_DepthMaxF;
1465       wpos[3] = span->w + col * span->dwdx;
1466    }
1467    if (inputsRead & (1 << FRAG_ATTRIB_COL0)) {
1468       ASSERT(span->arrayMask & SPAN_RGBA);
1469       COPY_4V(machine->Inputs[FRAG_ATTRIB_COL0],
1470               span->array->color.sz4.rgba[col]);
1471    }
1472    if (inputsRead & (1 << FRAG_ATTRIB_COL1)) {
1473       ASSERT(span->arrayMask & SPAN_SPEC);
1474       COPY_4V(machine->Inputs[FRAG_ATTRIB_COL1],
1475               span->array->color.sz4.spec[col]);
1476    }
1477    if (inputsRead & (1 << FRAG_ATTRIB_FOGC)) {
1478       GLfloat *fogc = machine->Inputs[FRAG_ATTRIB_FOGC];
1479       ASSERT(span->arrayMask & SPAN_FOG);
1480       fogc[0] = span->array->fog[col];
1481       fogc[1] = 0.0F;
1482       fogc[2] = 0.0F;
1483       fogc[3] = 0.0F;
1484    }
1485    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
1486       if (inputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
1487          GLfloat *tex = machine->Inputs[FRAG_ATTRIB_TEX0 + u];
1488          /*ASSERT(ctx->Texture._EnabledCoordUnits & (1 << u));*/
1489          COPY_4V(tex, span->array->texcoords[u][col]);
1490          /*ASSERT(tex[0] != 0 || tex[1] != 0 || tex[2] != 0);*/
1491       }
1492    }
1493
1494    /* init condition codes */
1495    machine->CondCodes[0] = COND_EQ;
1496    machine->CondCodes[1] = COND_EQ;
1497    machine->CondCodes[2] = COND_EQ;
1498    machine->CondCodes[3] = COND_EQ;
1499 }
1500
1501
1502 /**
1503  * Run fragment program on the pixels in span from 'start' to 'end' - 1.
1504  */
1505 static void
1506 run_program(GLcontext *ctx, SWspan *span, GLuint start, GLuint end)
1507 {
1508    const struct gl_fragment_program *program = ctx->FragmentProgram._Current;
1509    struct fp_machine machine;
1510    GLuint i;
1511
1512    CurrentMachine = &machine;
1513
1514    for (i = start; i < end; i++) {
1515       if (span->array->mask[i]) {
1516          init_machine(ctx, &machine, program, span, i);
1517
1518          if (execute_program(ctx, program, ~0, &machine, span, i)) {
1519             /* Store result color */
1520             COPY_4V(span->array->color.sz4.rgba[i],
1521                     machine.Outputs[FRAG_RESULT_COLR]);
1522
1523             /* Store result depth/z */
1524             if (program->Base.OutputsWritten & (1 << FRAG_RESULT_DEPR)) {
1525                const GLfloat depth = machine.Outputs[FRAG_RESULT_DEPR][2];
1526                if (depth <= 0.0)
1527                   span->array->z[i] = 0;
1528                else if (depth >= 1.0)
1529                   span->array->z[i] = ctx->DrawBuffer->_DepthMax;
1530                else
1531                   span->array->z[i] = IROUND(depth * ctx->DrawBuffer->_DepthMaxF);
1532             }
1533          }
1534          else {
1535             /* killed fragment */
1536             span->array->mask[i] = GL_FALSE;
1537             span->writeAll = GL_FALSE;
1538          }
1539       }
1540    }
1541
1542    CurrentMachine = NULL;
1543 }
1544
1545
1546 /**
1547  * Execute the current fragment program for all the fragments
1548  * in the given span.
1549  */
1550 void
1551 _swrast_exec_fragment_program( GLcontext *ctx, SWspan *span )
1552 {
1553    const struct gl_fragment_program *program = ctx->FragmentProgram._Current;
1554
1555    /* incoming colors should be floats */
1556    ASSERT(span->array->ChanType == GL_FLOAT);
1557
1558    ctx->_CurrentProgram = GL_FRAGMENT_PROGRAM_ARB; /* or NV, doesn't matter */
1559
1560    if (program->Base.Parameters) {
1561       _mesa_load_state_parameters(ctx, program->Base.Parameters);
1562    }
1563
1564    run_program(ctx, span, 0, span->end);
1565
1566    if (program->Base.OutputsWritten & (1 << FRAG_RESULT_DEPR)) {
1567       span->interpMask &= ~SPAN_Z;
1568       span->arrayMask |= SPAN_Z;
1569    }
1570
1571    ctx->_CurrentProgram = 0;
1572 }
1573