src/mesa/swrast/s_nvfragprog.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.5.2
   4  *
   5  * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /*
  26  * Regarding GL_NV_fragment_program:
  27  *
  28  * Portions of this software may use or implement intellectual
  29  * property owned and licensed by NVIDIA Corporation. NVIDIA disclaims
  30  * any and all warranties with respect to such intellectual property,
  31  * including any use thereof or modifications thereto.
  32  */
  33
  34 #include "glheader.h"
  35 #include "colormac.h"
  36 #include "context.h"
  37 #include "program_instruction.h"
  38 #include "program.h"
  39
  40 #include "s_nvfragprog.h"
  41 #include "s_span.h"
  42
  43
  44 /* if 1, print some debugging info */
  45 #define DEBUG_FRAG 0
  46
  47
  48 /**
  49  * Virtual machine state used during execution of a fragment programs.
  50  */
  51 struct fp_machine
  52 {
  53    GLfloat Temporaries[MAX_NV_FRAGMENT_PROGRAM_TEMPS][4];
  54    GLfloat Inputs[MAX_NV_FRAGMENT_PROGRAM_INPUTS][4];
  55    GLfloat Outputs[MAX_NV_FRAGMENT_PROGRAM_OUTPUTS][4];
  56    GLuint CondCodes[4];
  57 };
  58
  59
  60 #if FEATURE_MESA_program_debug
  61 static struct fp_machine *CurrentMachine = NULL;
  62
  63 /**
  64  * For GL_MESA_program_debug.
  65  * Return current value (4*GLfloat) of a fragment program register.
  66  * Called via ctx->Driver.GetFragmentProgramRegister().
  67  */
  68 void
  69 _swrast_get_program_register(GLcontext *ctx, enum register_file file,
  70                              GLuint index, GLfloat val[4])
  71 {
  72    if (CurrentMachine) {
  73       switch (file) {
  74       case PROGRAM_INPUT:
  75          COPY_4V(val, CurrentMachine->Inputs[index]);
  76          break;
  77       case PROGRAM_OUTPUT:
  78          COPY_4V(val, CurrentMachine->Outputs[index]);
  79          break;
  80       case PROGRAM_TEMPORARY:
  81          COPY_4V(val, CurrentMachine->Temporaries[index]);
  82          break;
  83       default:
  84          _mesa_problem(NULL,
  85                        "bad register file in _swrast_get_program_register");
  86       }
  87    }
  88 }
  89 #endif /* FEATURE_MESA_program_debug */
  90
  91
  92 /**
  93  * Fetch a texel.
  94  */
  95 static void
  96 fetch_texel( GLcontext *ctx, const GLfloat texcoord[4], GLfloat lambda,
  97              GLuint unit, GLfloat color[4] )
  98 {
  99    GLchan rgba[4];
 100    SWcontext *swrast = SWRAST_CONTEXT(ctx);
 101
 102    /* XXX use a float-valued TextureSample routine here!!! */
 103    swrast->TextureSample[unit](ctx, ctx->Texture.Unit[unit]._Current,
 104                                1, (const GLfloat (*)[4]) texcoord,
 105                                &lambda, &rgba);
 106    color[0] = CHAN_TO_FLOAT(rgba[0]);
 107    color[1] = CHAN_TO_FLOAT(rgba[1]);
 108    color[2] = CHAN_TO_FLOAT(rgba[2]);
 109    color[3] = CHAN_TO_FLOAT(rgba[3]);
 110 }
 111
 112
 113 /**
 114  * Fetch a texel with the given partial derivatives to compute a level
 115  * of detail in the mipmap.
 116  */
 117 static void
 118 fetch_texel_deriv( GLcontext *ctx, const GLfloat texcoord[4],
 119                    const GLfloat texdx[4], const GLfloat texdy[4],
 120                    GLuint unit, GLfloat color[4] )
 121 {
 122    SWcontext *swrast = SWRAST_CONTEXT(ctx);
 123    const struct gl_texture_object *texObj = ctx->Texture.Unit[unit]._Current;
 124    const struct gl_texture_image *texImg = texObj->Image[0][texObj->BaseLevel];
 125    const GLfloat texW = (GLfloat) texImg->WidthScale;
 126    const GLfloat texH = (GLfloat) texImg->HeightScale;
 127    GLchan rgba[4];
 128
 129    GLfloat lambda = _swrast_compute_lambda(texdx[0], texdy[0], /* ds/dx, ds/dy */
 130                                          texdx[1], texdy[1], /* dt/dx, dt/dy */
 131                                          texdx[3], texdy[2], /* dq/dx, dq/dy */
 132                                          texW, texH,
 133                                          texcoord[0], texcoord[1], texcoord[3],
 134                                          1.0F / texcoord[3]);
 135
 136    swrast->TextureSample[unit](ctx, ctx->Texture.Unit[unit]._Current,
 137                                1, (const GLfloat (*)[4]) texcoord,
 138                                &lambda, &rgba);
 139    color[0] = CHAN_TO_FLOAT(rgba[0]);
 140    color[1] = CHAN_TO_FLOAT(rgba[1]);
 141    color[2] = CHAN_TO_FLOAT(rgba[2]);
 142    color[3] = CHAN_TO_FLOAT(rgba[3]);
 143 }
 144
 145
 146 /**
 147  * Return a pointer to the 4-element float vector specified by the given
 148  * source register.
 149  */
 150 static INLINE const GLfloat *
 151 get_register_pointer( GLcontext *ctx,
 152                       const struct prog_src_register *source,
 153                       const struct fp_machine *machine,
 154                       const struct gl_fragment_program *program )
 155 {
 156    const GLfloat *src;
 157    switch (source->File) {
 158       case PROGRAM_TEMPORARY:
 159          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_TEMPS);
 160          src = machine->Temporaries[source->Index];
 161          break;
 162       case PROGRAM_INPUT:
 163          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_INPUTS);
 164          src = machine->Inputs[source->Index];
 165          break;
 166       case PROGRAM_OUTPUT:
 167          /* This is only for PRINT */
 168          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_OUTPUTS);
 169          src = machine->Outputs[source->Index];
 170          break;
 171       case PROGRAM_LOCAL_PARAM:
 172          ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 173          src = program->Base.LocalParams[source->Index];
 174          break;
 175       case PROGRAM_ENV_PARAM:
 176          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_PARAMS);
 177          src = ctx->FragmentProgram.Parameters[source->Index];
 178          break;
 179       case PROGRAM_STATE_VAR:
 180          /* Fallthrough */
 181       case PROGRAM_NAMED_PARAM:
 182          ASSERT(source->Index < (GLint) program->Base.Parameters->NumParameters);
 183          src = program->Base.Parameters->ParameterValues[source->Index];
 184          break;
 185       default:
 186          _mesa_problem(ctx, "Invalid input register file %d in fetch_vector4", source->File);
 187          src = NULL;
 188    }
 189    return src;
 190 }
 191
 192
 193 /**
 194  * Fetch a 4-element float vector from the given source register.
 195  * Apply swizzling and negating as needed.
 196  */
 197 static void
 198 fetch_vector4( GLcontext *ctx,
 199                const struct prog_src_register *source,
 200                const struct fp_machine *machine,
 201                const struct gl_fragment_program *program,
 202                GLfloat result[4] )
 203 {
 204    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 205    ASSERT(src);
 206
 207    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 208    result[1] = src[GET_SWZ(source->Swizzle, 1)];
 209    result[2] = src[GET_SWZ(source->Swizzle, 2)];
 210    result[3] = src[GET_SWZ(source->Swizzle, 3)];
 211
 212    if (source->NegateBase) {
 213       result[0] = -result[0];
 214       result[1] = -result[1];
 215       result[2] = -result[2];
 216       result[3] = -result[3];
 217    }
 218    if (source->Abs) {
 219       result[0] = FABSF(result[0]);
 220       result[1] = FABSF(result[1]);
 221       result[2] = FABSF(result[2]);
 222       result[3] = FABSF(result[3]);
 223    }
 224    if (source->NegateAbs) {
 225       result[0] = -result[0];
 226       result[1] = -result[1];
 227       result[2] = -result[2];
 228       result[3] = -result[3];
 229    }
 230 }
 231
 232
 233 /**
 234  * Fetch the derivative with respect to X for the given register.
 235  * \return GL_TRUE if it was easily computed or GL_FALSE if we
 236  * need to execute another instance of the program (ugh)!
 237  */
 238 static GLboolean
 239 fetch_vector4_deriv( GLcontext *ctx,
 240                      const struct prog_src_register *source,
 241                      const SWspan *span,
 242                      char xOrY, GLint column, GLfloat result[4] )
 243 {
 244    GLfloat src[4];
 245
 246    ASSERT(xOrY == 'X' || xOrY == 'Y');
 247
 248    switch (source->Index) {
 249    case FRAG_ATTRIB_WPOS:
 250       if (xOrY == 'X') {
 251          src[0] = 1.0;
 252          src[1] = 0.0;
 253          src[2] = span->dzdx / ctx->DrawBuffer->_DepthMaxF;
 254          src[3] = span->dwdx;
 255       }
 256       else {
 257          src[0] = 0.0;
 258          src[1] = 1.0;
 259          src[2] = span->dzdy / ctx->DrawBuffer->_DepthMaxF;
 260          src[3] = span->dwdy;
 261       }
 262       break;
 263    case FRAG_ATTRIB_COL0:
 264       if (xOrY == 'X') {
 265          src[0] = span->drdx * (1.0F / CHAN_MAXF);
 266          src[1] = span->dgdx * (1.0F / CHAN_MAXF);
 267          src[2] = span->dbdx * (1.0F / CHAN_MAXF);
 268          src[3] = span->dadx * (1.0F / CHAN_MAXF);
 269       }
 270       else {
 271          src[0] = span->drdy * (1.0F / CHAN_MAXF);
 272          src[1] = span->dgdy * (1.0F / CHAN_MAXF);
 273          src[2] = span->dbdy * (1.0F / CHAN_MAXF);
 274          src[3] = span->dady * (1.0F / CHAN_MAXF);
 275       }
 276       break;
 277    case FRAG_ATTRIB_COL1:
 278       if (xOrY == 'X') {
 279          src[0] = span->dsrdx * (1.0F / CHAN_MAXF);
 280          src[1] = span->dsgdx * (1.0F / CHAN_MAXF);
 281          src[2] = span->dsbdx * (1.0F / CHAN_MAXF);
 282          src[3] = 0.0; /* XXX need this */
 283       }
 284       else {
 285          src[0] = span->dsrdy * (1.0F / CHAN_MAXF);
 286          src[1] = span->dsgdy * (1.0F / CHAN_MAXF);
 287          src[2] = span->dsbdy * (1.0F / CHAN_MAXF);
 288          src[3] = 0.0; /* XXX need this */
 289       }
 290       break;
 291    case FRAG_ATTRIB_FOGC:
 292       if (xOrY == 'X') {
 293          src[0] = span->dfogdx;
 294          src[1] = 0.0;
 295          src[2] = 0.0;
 296          src[3] = 0.0;
 297       }
 298       else {
 299          src[0] = span->dfogdy;
 300          src[1] = 0.0;
 301          src[2] = 0.0;
 302          src[3] = 0.0;
 303       }
 304       break;
 305    case FRAG_ATTRIB_TEX0:
 306    case FRAG_ATTRIB_TEX1:
 307    case FRAG_ATTRIB_TEX2:
 308    case FRAG_ATTRIB_TEX3:
 309    case FRAG_ATTRIB_TEX4:
 310    case FRAG_ATTRIB_TEX5:
 311    case FRAG_ATTRIB_TEX6:
 312    case FRAG_ATTRIB_TEX7:
 313       if (xOrY == 'X') {
 314          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 315          /* this is a little tricky - I think I've got it right */
 316          const GLfloat invQ = 1.0f / (span->tex[u][3]
 317                                       + span->texStepX[u][3] * column);
 318          src[0] = span->texStepX[u][0] * invQ;
 319          src[1] = span->texStepX[u][1] * invQ;
 320          src[2] = span->texStepX[u][2] * invQ;
 321          src[3] = span->texStepX[u][3] * invQ;
 322       }
 323       else {
 324          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 325          /* Tricky, as above, but in Y direction */
 326          const GLfloat invQ = 1.0f / (span->tex[u][3] + span->texStepY[u][3]);
 327          src[0] = span->texStepY[u][0] * invQ;
 328          src[1] = span->texStepY[u][1] * invQ;
 329          src[2] = span->texStepY[u][2] * invQ;
 330          src[3] = span->texStepY[u][3] * invQ;
 331       }
 332       break;
 333    default:
 334       return GL_FALSE;
 335    }
 336
 337    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 338    result[1] = src[GET_SWZ(source->Swizzle, 1)];
 339    result[2] = src[GET_SWZ(source->Swizzle, 2)];
 340    result[3] = src[GET_SWZ(source->Swizzle, 3)];
 341
 342    if (source->NegateBase) {
 343       result[0] = -result[0];
 344       result[1] = -result[1];
 345       result[2] = -result[2];
 346       result[3] = -result[3];
 347    }
 348    if (source->Abs) {
 349       result[0] = FABSF(result[0]);
 350       result[1] = FABSF(result[1]);
 351       result[2] = FABSF(result[2]);
 352       result[3] = FABSF(result[3]);
 353    }
 354    if (source->NegateAbs) {
 355       result[0] = -result[0];
 356       result[1] = -result[1];
 357       result[2] = -result[2];
 358       result[3] = -result[3];
 359    }
 360    return GL_TRUE;
 361 }
 362
 363
 364 /**
 365  * As above, but only return result[0] element.
 366  */
 367 static void
 368 fetch_vector1( GLcontext *ctx,
 369                const struct prog_src_register *source,
 370                const struct fp_machine *machine,
 371                const struct gl_fragment_program *program,
 372                GLfloat result[4] )
 373 {
 374    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 375    ASSERT(src);
 376
 377    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 378
 379    if (source->NegateBase) {
 380       result[0] = -result[0];
 381    }
 382    if (source->Abs) {
 383       result[0] = FABSF(result[0]);
 384    }
 385    if (source->NegateAbs) {
 386       result[0] = -result[0];
 387    }
 388 }
 389
 390
 391 /**
 392  * Test value against zero and return GT, LT, EQ or UN if NaN.
 393  */
 394 static INLINE GLuint
 395 generate_cc( float value )
 396 {
 397    if (value != value)
 398       return COND_UN;  /* NaN */
 399    if (value > 0.0F)
 400       return COND_GT;
 401    if (value < 0.0F)
 402       return COND_LT;
 403    return COND_EQ;
 404 }
 405
 406
 407 /**
 408  * Test if the ccMaskRule is satisfied by the given condition code.
 409  * Used to mask destination writes according to the current condition codee.
 410  */
 411 static INLINE GLboolean
 412 test_cc(GLuint condCode, GLuint ccMaskRule)
 413 {
 414    switch (ccMaskRule) {
 415    case COND_EQ: return (condCode == COND_EQ);
 416    case COND_NE: return (condCode != COND_EQ);
 417    case COND_LT: return (condCode == COND_LT);
 418    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 419    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 420    case COND_GT: return (condCode == COND_GT);
 421    case COND_TR: return GL_TRUE;
 422    case COND_FL: return GL_FALSE;
 423    default:      return GL_TRUE;
 424    }
 425 }
 426
 427
 428 /**
 429  * Store 4 floats into a register.  Observe the instructions saturate and
 430  * set-condition-code flags.
 431  */
 432 static void
 433 store_vector4( const struct prog_instruction *inst,
 434                struct fp_machine *machine,
 435                const GLfloat value[4] )
 436 {
 437    const struct prog_dst_register *dest = &(inst->DstReg);
 438    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 439    const GLboolean updateCC = inst->CondUpdate;
 440    GLfloat *dstReg;
 441    GLfloat dummyReg[4];
 442    GLfloat clampedValue[4];
 443    GLboolean condWriteMask[4];
 444    GLuint writeMask = dest->WriteMask;
 445
 446    switch (dest->File) {
 447       case PROGRAM_OUTPUT:
 448          dstReg = machine->Outputs[dest->Index];
 449          break;
 450       case PROGRAM_TEMPORARY:
 451          dstReg = machine->Temporaries[dest->Index];
 452          break;
 453       case PROGRAM_WRITE_ONLY:
 454          dstReg = dummyReg;
 455          return;
 456       default:
 457          _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 458          return;
 459    }
 460
 461 #if DEBUG_FRAG
 462    if (value[0] > 1.0e10 ||
 463        IS_INF_OR_NAN(value[0]) ||
 464        IS_INF_OR_NAN(value[1]) ||
 465        IS_INF_OR_NAN(value[2]) ||
 466        IS_INF_OR_NAN(value[3])  )
 467       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 468 #endif
 469
 470    if (clamp) {
 471       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 472       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 473       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 474       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 475       value = clampedValue;
 476    }
 477
 478    if (dest->CondMask != COND_TR) {
 479       condWriteMask[0] = GET_BIT(writeMask, 0)
 480          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 0)], dest->CondMask);
 481       condWriteMask[1] = GET_BIT(writeMask, 1)
 482          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 1)], dest->CondMask);
 483       condWriteMask[2] = GET_BIT(writeMask, 2)
 484          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 2)], dest->CondMask);
 485       condWriteMask[3] = GET_BIT(writeMask, 3)
 486          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 3)], dest->CondMask);
 487
 488       writeMask = ((condWriteMask[0] << 0) |
 489                    (condWriteMask[1] << 1) |
 490                    (condWriteMask[2] << 2) |
 491                    (condWriteMask[3] << 3));
 492    }
 493
 494    if (GET_BIT(writeMask, 0)) {
 495       dstReg[0] = value[0];
 496       if (updateCC)
 497          machine->CondCodes[0] = generate_cc(value[0]);
 498    }
 499    if (GET_BIT(writeMask, 1)) {
 500       dstReg[1] = value[1];
 501       if (updateCC)
 502          machine->CondCodes[1] = generate_cc(value[1]);
 503    }
 504    if (GET_BIT(writeMask, 2)) {
 505       dstReg[2] = value[2];
 506       if (updateCC)
 507          machine->CondCodes[2] = generate_cc(value[2]);
 508    }
 509    if (GET_BIT(writeMask, 3)) {
 510       dstReg[3] = value[3];
 511       if (updateCC)
 512          machine->CondCodes[3] = generate_cc(value[3]);
 513    }
 514 }
 515
 516
 517 /**
 518  * Initialize a new machine state instance from an existing one, adding
 519  * the partial derivatives onto the input registers.
 520  * Used to implement DDX and DDY instructions in non-trivial cases.
 521  */
 522 static void
 523 init_machine_deriv( GLcontext *ctx,
 524                     const struct fp_machine *machine,
 525                     const struct gl_fragment_program *program,
 526                     const SWspan *span, char xOrY,
 527                     struct fp_machine *dMachine )
 528 {
 529    GLuint u;
 530
 531    ASSERT(xOrY == 'X' || xOrY == 'Y');
 532
 533    /* copy existing machine */
 534    _mesa_memcpy(dMachine, machine, sizeof(struct fp_machine));
 535
 536    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
 537       /* Clear temporary registers (undefined for ARB_f_p) */
 538       _mesa_bzero( (void*) machine->Temporaries,
 539                    MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
 540    }
 541
 542    /* Add derivatives */
 543    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) {
 544       GLfloat *wpos = (GLfloat*) machine->Inputs[FRAG_ATTRIB_WPOS];
 545       if (xOrY == 'X') {
 546          wpos[0] += 1.0F;
 547          wpos[1] += 0.0F;
 548          wpos[2] += span->dzdx;
 549          wpos[3] += span->dwdx;
 550       }
 551       else {
 552          wpos[0] += 0.0F;
 553          wpos[1] += 1.0F;
 554          wpos[2] += span->dzdy;
 555          wpos[3] += span->dwdy;
 556       }
 557    }
 558    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_COL0)) {
 559       GLfloat *col0 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL0];
 560       if (xOrY == 'X') {
 561          col0[0] += span->drdx * (1.0F / CHAN_MAXF);
 562          col0[1] += span->dgdx * (1.0F / CHAN_MAXF);
 563          col0[2] += span->dbdx * (1.0F / CHAN_MAXF);
 564          col0[3] += span->dadx * (1.0F / CHAN_MAXF);
 565       }
 566       else {
 567          col0[0] += span->drdy * (1.0F / CHAN_MAXF);
 568          col0[1] += span->dgdy * (1.0F / CHAN_MAXF);
 569          col0[2] += span->dbdy * (1.0F / CHAN_MAXF);
 570          col0[3] += span->dady * (1.0F / CHAN_MAXF);
 571       }
 572    }
 573    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_COL1)) {
 574       GLfloat *col1 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL1];
 575       if (xOrY == 'X') {
 576          col1[0] += span->dsrdx * (1.0F / CHAN_MAXF);
 577          col1[1] += span->dsgdx * (1.0F / CHAN_MAXF);
 578          col1[2] += span->dsbdx * (1.0F / CHAN_MAXF);
 579          col1[3] += 0.0; /*XXX fix */
 580       }
 581       else {
 582          col1[0] += span->dsrdy * (1.0F / CHAN_MAXF);
 583          col1[1] += span->dsgdy * (1.0F / CHAN_MAXF);
 584          col1[2] += span->dsbdy * (1.0F / CHAN_MAXF);
 585          col1[3] += 0.0; /*XXX fix */
 586       }
 587    }
 588    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_FOGC)) {
 589       GLfloat *fogc = (GLfloat*) machine->Inputs[FRAG_ATTRIB_FOGC];
 590       if (xOrY == 'X') {
 591          fogc[0] += span->dfogdx;
 592       }
 593       else {
 594          fogc[0] += span->dfogdy;
 595       }
 596    }
 597    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
 598       if (program->Base.InputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
 599          GLfloat *tex = (GLfloat*) machine->Inputs[FRAG_ATTRIB_TEX0 + u];
 600          /* XXX perspective-correct interpolation */
 601          if (xOrY == 'X') {
 602             tex[0] += span->texStepX[u][0];
 603             tex[1] += span->texStepX[u][1];
 604             tex[2] += span->texStepX[u][2];
 605             tex[3] += span->texStepX[u][3];
 606          }
 607          else {
 608             tex[0] += span->texStepY[u][0];
 609             tex[1] += span->texStepY[u][1];
 610             tex[2] += span->texStepY[u][2];
 611             tex[3] += span->texStepY[u][3];
 612          }
 613       }
 614    }
 615
 616    /* init condition codes */
 617    dMachine->CondCodes[0] = COND_EQ;
 618    dMachine->CondCodes[1] = COND_EQ;
 619    dMachine->CondCodes[2] = COND_EQ;
 620    dMachine->CondCodes[3] = COND_EQ;
 621 }
 622
 623
 624 /**
  625  * Execute the given fragment program.
 626  * NOTE: we do everything in single-precision floating point; we don't
 627  * currently observe the single/half/fixed-precision qualifiers.
 628  * \param ctx - rendering context
 629  * \param program - the fragment program to execute
 630  * \param machine - machine state (register file)
 631  * \param maxInst - max number of instructions to execute
 632  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 633  */
 634 static GLboolean
 635 execute_program( GLcontext *ctx,
 636                  const struct gl_fragment_program *program, GLuint maxInst,
 637                  struct fp_machine *machine, const SWspan *span,
 638                  GLuint column )
 639 {
 640    GLuint pc;
 641
 642 #if DEBUG_FRAG
 643    printf("execute fragment program --------------------\n");
 644 #endif
 645
 646    for (pc = 0; pc < maxInst; pc++) {
 647       const struct prog_instruction *inst = program->Base.Instructions + pc;
 648
 649       if (ctx->FragmentProgram.CallbackEnabled &&
 650           ctx->FragmentProgram.Callback) {
 651          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 652          ctx->FragmentProgram.Callback(program->Base.Target,
 653                                        ctx->FragmentProgram.CallbackData);
 654       }
 655
 656       switch (inst->Opcode) {
 657          case OPCODE_ABS:
 658             {
 659                GLfloat a[4], result[4];
 660                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 661                result[0] = FABSF(a[0]);
 662                result[1] = FABSF(a[1]);
 663                result[2] = FABSF(a[2]);
 664                result[3] = FABSF(a[3]);
 665                store_vector4( inst, machine, result );
 666             }
 667             break;
 668          case OPCODE_ADD:
 669             {
 670                GLfloat a[4], b[4], result[4];
 671                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 672                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 673                result[0] = a[0] + b[0];
 674                result[1] = a[1] + b[1];
 675                result[2] = a[2] + b[2];
 676                result[3] = a[3] + b[3];
 677                store_vector4( inst, machine, result );
 678             }
 679             break;
 680          case OPCODE_CMP:
 681             {
 682                GLfloat a[4], b[4], c[4], result[4];
 683                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 684                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 685                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 686                result[0] = a[0] < 0.0F ? b[0] : c[0];
 687                result[1] = a[1] < 0.0F ? b[1] : c[1];
 688                result[2] = a[2] < 0.0F ? b[2] : c[2];
 689                result[3] = a[3] < 0.0F ? b[3] : c[3];
 690                store_vector4( inst, machine, result );
 691             }
 692             break;
 693          case OPCODE_COS:
 694             {
 695                GLfloat a[4], result[4];
 696                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 697                result[0] = result[1] = result[2] = result[3] = (GLfloat)_mesa_cos(a[0]);
 698                store_vector4( inst, machine, result );
 699             }
 700             break;
 701          case OPCODE_DDX: /* Partial derivative with respect to X */
 702             {
 703                GLfloat a[4], aNext[4], result[4];
 704                struct fp_machine dMachine;
 705                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'X',
 706                                         column, result)) {
 707                   /* This is tricky.  Make a copy of the current machine state,
 708                    * increment the input registers by the dx or dy partial
 709                    * derivatives, then re-execute the program up to the
 710                    * preceeding instruction, then fetch the source register.
 711                    * Finally, find the difference in the register values for
 712                    * the original and derivative runs.
 713                    */
 714                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 715                   init_machine_deriv(ctx, machine, program, span,
 716                                      'X', &dMachine);
 717                   execute_program(ctx, program, pc, &dMachine, span, column);
 718                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 719                   result[0] = aNext[0] - a[0];
 720                   result[1] = aNext[1] - a[1];
 721                   result[2] = aNext[2] - a[2];
 722                   result[3] = aNext[3] - a[3];
 723                }
 724                store_vector4( inst, machine, result );
 725             }
 726             break;
 727          case OPCODE_DDY: /* Partial derivative with respect to Y */
 728             {
 729                GLfloat a[4], aNext[4], result[4];
 730                struct fp_machine dMachine;
 731                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'Y',
 732                                         column, result)) {
 733                   init_machine_deriv(ctx, machine, program, span,
 734                                      'Y', &dMachine);
 735                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 736                   execute_program(ctx, program, pc, &dMachine, span, column);
 737                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 738                   result[0] = aNext[0] - a[0];
 739                   result[1] = aNext[1] - a[1];
 740                   result[2] = aNext[2] - a[2];
 741                   result[3] = aNext[3] - a[3];
 742                }
 743                store_vector4( inst, machine, result );
 744             }
 745             break;
 746          case OPCODE_DP3:
 747             {
 748                GLfloat a[4], b[4], result[4];
 749                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 750                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 751                result[0] = result[1] = result[2] = result[3] =
 752                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
 753                store_vector4( inst, machine, result );
 754 #if DEBUG_FRAG
 755                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 756                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 757 #endif
 758             }
 759             break;
 760          case OPCODE_DP4:
 761             {
 762                GLfloat a[4], b[4], result[4];
 763                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 764                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 765                result[0] = result[1] = result[2] = result[3] =
 766                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 767                store_vector4( inst, machine, result );
 768 #if DEBUG_FRAG
 769                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 770                       result[0], a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 771 #endif
 772             }
 773             break;
 774          case OPCODE_DPH:
 775             {
 776                GLfloat a[4], b[4], result[4];
 777                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 778                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 779                result[0] = result[1] = result[2] = result[3] =
 780                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
 781                store_vector4( inst, machine, result );
 782             }
 783             break;
 784          case OPCODE_DST: /* Distance vector */
 785             {
 786                GLfloat a[4], b[4], result[4];
 787                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 788                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 789                result[0] = 1.0F;
 790                result[1] = a[1] * b[1];
 791                result[2] = a[2];
 792                result[3] = b[3];
 793                store_vector4( inst, machine, result );
 794             }
 795             break;
 796          case OPCODE_EX2: /* Exponential base 2 */
 797             {
 798                GLfloat a[4], result[4];
 799                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 800                result[0] = result[1] = result[2] = result[3] =
 801                   (GLfloat) _mesa_pow(2.0, a[0]);
 802                store_vector4( inst, machine, result );
 803             }
 804             break;
 805          case OPCODE_FLR:
 806             {
 807                GLfloat a[4], result[4];
 808                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 809                result[0] = FLOORF(a[0]);
 810                result[1] = FLOORF(a[1]);
 811                result[2] = FLOORF(a[2]);
 812                result[3] = FLOORF(a[3]);
 813                store_vector4( inst, machine, result );
 814             }
 815             break;
 816          case OPCODE_FRC:
 817             {
 818                GLfloat a[4], result[4];
 819                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 820                result[0] = a[0] - FLOORF(a[0]);
 821                result[1] = a[1] - FLOORF(a[1]);
 822                result[2] = a[2] - FLOORF(a[2]);
 823                result[3] = a[3] - FLOORF(a[3]);
 824                store_vector4( inst, machine, result );
 825             }
 826             break;
 827          case OPCODE_KIL_NV: /* NV_f_p only */
 828             {
 829                const GLuint swizzle = inst->DstReg.CondSwizzle;
 830                const GLuint condMask = inst->DstReg.CondMask;
 831                if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 832                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 833                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 834                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 835                   return GL_FALSE;
 836                }
 837             }
 838             break;
 839          case OPCODE_KIL: /* ARB_f_p only */
 840             {
 841                GLfloat a[4];
 842                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 843                if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 844                   return GL_FALSE;
 845                }
 846             }
 847             break;
 848          case OPCODE_LG2:  /* log base 2 */
 849             {
 850                GLfloat a[4], result[4];
 851                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 852                result[0] = result[1] = result[2] = result[3]
 853                   = LOG2(a[0]);
 854                store_vector4( inst, machine, result );
 855             }
 856             break;
 857          case OPCODE_LIT:
 858             {
 859                const GLfloat epsilon = 1.0F / 256.0F; /* from NV VP spec */
 860                GLfloat a[4], result[4];
 861                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 862                a[0] = MAX2(a[0], 0.0F);
 863                a[1] = MAX2(a[1], 0.0F);
 864                /* XXX ARB version clamps a[3], NV version doesn't */
 865                a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 866                result[0] = 1.0F;
 867                result[1] = a[0];
 868                /* XXX we could probably just use pow() here */
 869                if (a[0] > 0.0F) {
 870                   if (a[1] == 0.0 && a[3] == 0.0)
 871                      result[2] = 1.0;
 872                   else
 873                      result[2] = EXPF(a[3] * LOGF(a[1]));
 874                }
 875                else {
 876                   result[2] = 0.0;
 877                }
 878                result[3] = 1.0F;
 879                store_vector4( inst, machine, result );
 880             }
 881             break;
 882          case OPCODE_LRP:
 883             {
 884                GLfloat a[4], b[4], c[4], result[4];
 885                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 886                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 887                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 888                result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 889                result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 890                result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 891                result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 892                store_vector4( inst, machine, result );
 893             }
 894             break;
 895          case OPCODE_MAD:
 896             {
 897                GLfloat a[4], b[4], c[4], result[4];
 898                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 899                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 900                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 901                result[0] = a[0] * b[0] + c[0];
 902                result[1] = a[1] * b[1] + c[1];
 903                result[2] = a[2] * b[2] + c[2];
 904                result[3] = a[3] * b[3] + c[3];
 905                store_vector4( inst, machine, result );
 906             }
 907             break;
 908          case OPCODE_MAX:
 909             {
 910                GLfloat a[4], b[4], result[4];
 911                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 912                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 913                result[0] = MAX2(a[0], b[0]);
 914                result[1] = MAX2(a[1], b[1]);
 915                result[2] = MAX2(a[2], b[2]);
 916                result[3] = MAX2(a[3], b[3]);
 917                store_vector4( inst, machine, result );
 918 #if DEBUG_FRAG
 919                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
 920                       result[0], result[1], result[2], result[3],
 921                       a[0], a[1], a[2], a[3],
 922                       b[0], b[1], b[2], b[3]);
 923 #endif
 924             }
 925             break;
 926          case OPCODE_MIN:
 927             {
 928                GLfloat a[4], b[4], result[4];
 929                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 930                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 931                result[0] = MIN2(a[0], b[0]);
 932                result[1] = MIN2(a[1], b[1]);
 933                result[2] = MIN2(a[2], b[2]);
 934                result[3] = MIN2(a[3], b[3]);
 935                store_vector4( inst, machine, result );
 936             }
 937             break;
 938          case OPCODE_MOV:
 939             {
 940                GLfloat result[4];
 941                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, result );
 942                store_vector4( inst, machine, result );
 943 #if DEBUG_FRAG
 944                printf("MOV (%g %g %g %g)\n",
 945                       result[0], result[1], result[2], result[3]);
 946 #endif
 947             }
 948             break;
 949          case OPCODE_MUL:
 950             {
 951                GLfloat a[4], b[4], result[4];
 952                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 953                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 954                result[0] = a[0] * b[0];
 955                result[1] = a[1] * b[1];
 956                result[2] = a[2] * b[2];
 957                result[3] = a[3] * b[3];
 958                store_vector4( inst, machine, result );
 959 #if DEBUG_FRAG
 960                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 961                       result[0], result[1], result[2], result[3],
 962                       a[0], a[1], a[2], a[3],
 963                       b[0], b[1], b[2], b[3]);
 964 #endif
 965             }
 966             break;
 967          case OPCODE_PK2H: /* pack two 16-bit floats in one 32-bit float */
 968             {
 969                GLfloat a[4], result[4];
 970                GLhalfNV hx, hy;
 971                GLuint *rawResult = (GLuint *) result;
 972                GLuint twoHalves;
 973                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 974                hx = _mesa_float_to_half(a[0]);
 975                hy = _mesa_float_to_half(a[1]);
 976                twoHalves = hx | (hy << 16);
 977                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 978                   = twoHalves;
 979                store_vector4( inst, machine, result );
 980             }
 981             break;
 982          case OPCODE_PK2US: /* pack two GLushorts into one 32-bit float */
 983             {
 984                GLfloat a[4], result[4];
 985                GLuint usx, usy, *rawResult = (GLuint *) result;
 986                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 987                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 988                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 989                usx = IROUND(a[0] * 65535.0F);
 990                usy = IROUND(a[1] * 65535.0F);
 991                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 992                   = usx | (usy << 16);
 993                store_vector4( inst, machine, result );
 994             }
 995             break;
 996          case OPCODE_PK4B: /* pack four GLbytes into one 32-bit float */
 997             {
 998                GLfloat a[4], result[4];
 999                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1000                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1001                a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
1002                a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
1003                a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
1004                a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
1005                ubx = IROUND(127.0F * a[0] + 128.0F);
1006                uby = IROUND(127.0F * a[1] + 128.0F);
1007                ubz = IROUND(127.0F * a[2] + 128.0F);
1008                ubw = IROUND(127.0F * a[3] + 128.0F);
1009                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1010                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1011                store_vector4( inst, machine, result );
1012             }
1013             break;
1014          case OPCODE_PK4UB: /* pack four GLubytes into one 32-bit float */
1015             {
1016                GLfloat a[4], result[4];
1017                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
1018                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1019                a[0] = CLAMP(a[0], 0.0F, 1.0F);
1020                a[1] = CLAMP(a[1], 0.0F, 1.0F);
1021                a[2] = CLAMP(a[2], 0.0F, 1.0F);
1022                a[3] = CLAMP(a[3], 0.0F, 1.0F);
1023                ubx = IROUND(255.0F * a[0]);
1024                uby = IROUND(255.0F * a[1]);
1025                ubz = IROUND(255.0F * a[2]);
1026                ubw = IROUND(255.0F * a[3]);
1027                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
1028                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1029                store_vector4( inst, machine, result );
1030             }
1031             break;
1032          case OPCODE_POW:
1033             {
1034                GLfloat a[4], b[4], result[4];
1035                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1036                fetch_vector1( ctx, &inst->SrcReg[1], machine, program, b );
1037                result[0] = result[1] = result[2] = result[3]
1038                   = (GLfloat)_mesa_pow(a[0], b[0]);
1039                store_vector4( inst, machine, result );
1040             }
1041             break;
1042          case OPCODE_RCP:
1043             {
1044                GLfloat a[4], result[4];
1045                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1046 #if DEBUG_FRAG
1047                if (a[0] == 0)
1048                   printf("RCP(0)\n");
1049                else if (IS_INF_OR_NAN(a[0]))
1050                   printf("RCP(inf)\n");
1051 #endif
1052                result[0] = result[1] = result[2] = result[3]
1053                   = 1.0F / a[0];
1054                store_vector4( inst, machine, result );
1055             }
1056             break;
1057          case OPCODE_RFL:
1058             {
1059                GLfloat axis[4], dir[4], result[4], tmp[4];
1060                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, axis );
1061                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dir );
1062                tmp[3] = axis[0] * axis[0]
1063                       + axis[1] * axis[1]
1064                       + axis[2] * axis[2];
1065                tmp[0] = (2.0F * (axis[0] * dir[0] +
1066                                  axis[1] * dir[1] +
1067                                  axis[2] * dir[2])) / tmp[3];
1068                result[0] = tmp[0] * axis[0] - dir[0];
1069                result[1] = tmp[0] * axis[1] - dir[1];
1070                result[2] = tmp[0] * axis[2] - dir[2];
1071                /* result[3] is never written! XXX enforce in parser! */
1072                store_vector4( inst, machine, result );
1073             }
1074             break;
1075          case OPCODE_RSQ: /* 1 / sqrt() */
1076             {
1077                GLfloat a[4], result[4];
1078                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1079                a[0] = FABSF(a[0]);
1080                result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1081                store_vector4( inst, machine, result );
1082 #if DEBUG_FRAG
1083                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1084 #endif
1085             }
1086             break;
1087          case OPCODE_SCS: /* sine and cos */
1088             {
1089                GLfloat a[4], result[4];
1090                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1091                result[0] = (GLfloat)_mesa_cos(a[0]);
1092                result[1] = (GLfloat)_mesa_sin(a[0]);
1093                result[2] = 0.0;  /* undefined! */
1094                result[3] = 0.0;  /* undefined! */
1095                store_vector4( inst, machine, result );
1096             }
1097             break;
1098          case OPCODE_SEQ: /* set on equal */
1099             {
1100                GLfloat a[4], b[4], result[4];
1101                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1102                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1103                result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1104                result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1105                result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1106                result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1107                store_vector4( inst, machine, result );
1108             }
1109             break;
1110          case OPCODE_SFL: /* set false, operands ignored */
1111             {
1112                static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1113                store_vector4( inst, machine, result );
1114             }
1115             break;
1116          case OPCODE_SGE: /* set on greater or equal */
1117             {
1118                GLfloat a[4], b[4], result[4];
1119                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1120                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1121                result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1122                result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1123                result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1124                result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1125                store_vector4( inst, machine, result );
1126             }
1127             break;
1128          case OPCODE_SGT: /* set on greater */
1129             {
1130                GLfloat a[4], b[4], result[4];
1131                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1132                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1133                result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1134                result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1135                result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1136                result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1137                store_vector4( inst, machine, result );
1138             }
1139             break;
1140          case OPCODE_SIN:
1141             {
1142                GLfloat a[4], result[4];
1143                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1144                result[0] = result[1] = result[2] =
1145                        result[3] = (GLfloat)_mesa_sin(a[0]);
1146                store_vector4( inst, machine, result );
1147             }
1148             break;
1149          case OPCODE_SLE: /* set on less or equal */
1150             {
1151                GLfloat a[4], b[4], result[4];
1152                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1153                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1154                result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1155                result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1156                result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1157                result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1158                store_vector4( inst, machine, result );
1159             }
1160             break;
1161          case OPCODE_SLT: /* set on less */
1162             {
1163                GLfloat a[4], b[4], result[4];
1164                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1165                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1166                result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1167                result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1168                result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1169                result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1170                store_vector4( inst, machine, result );
1171             }
1172             break;
1173          case OPCODE_SNE: /* set on not equal */
1174             {
1175                GLfloat a[4], b[4], result[4];
1176                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1177                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1178                result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1179                result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1180                result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1181                result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1182                store_vector4( inst, machine, result );
1183             }
1184             break;
1185          case OPCODE_STR: /* set true, operands ignored */
1186             {
1187                static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1188                store_vector4( inst, machine, result );
1189             }
1190             break;
1191          case OPCODE_SUB:
1192             {
1193                GLfloat a[4], b[4], result[4];
1194                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1195                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1196                result[0] = a[0] - b[0];
1197                result[1] = a[1] - b[1];
1198                result[2] = a[2] - b[2];
1199                result[3] = a[3] - b[3];
1200                store_vector4( inst, machine, result );
1201             }
1202             break;
1203          case OPCODE_SWZ:
1204             {
1205                const struct prog_src_register *source = &inst->SrcReg[0];
1206                const GLfloat *src = get_register_pointer(ctx, source,
1207                                                          machine, program);
1208                GLfloat result[4];
1209                GLuint i;
1210
1211                /* do extended swizzling here */
1212                for (i = 0; i < 4; i++) {
1213                   if (GET_SWZ(source->Swizzle, i) == SWIZZLE_ZERO)
1214                      result[i] = 0.0;
1215                   else if (GET_SWZ(source->Swizzle, i) == SWIZZLE_ONE)
1216                      result[i] = 1.0;
1217                   else
1218                      result[i] = src[GET_SWZ(source->Swizzle, i)];
1219
1220                   if (source->NegateBase & (1 << i))
1221                      result[i] = -result[i];
1222                }
1223                store_vector4( inst, machine, result );
1224             }
1225             break;
1226          case OPCODE_TEX: /* Both ARB and NV frag prog */
1227             /* Texel lookup */
1228             {
1229                GLfloat texcoord[4], color[4];
1230                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1231                /* Note: we pass 0 for LOD.  The ARB extension requires it
1232                 * while the NV extension says it's implementation dependent.
1233                 */
1234                /* KW: Previously lambda was passed as zero, but I
1235                 * believe this is incorrect, the spec seems to
1236                 * indicate rather that lambda should not be
1237                 * changed/biased, unlike TXB where texcoord[3] is
1238                 * added to the lambda calculations.  The lambda should
1239                 * still be calculated normally for TEX & TXP though,
1240                 * not set to zero.  Otherwise it's very difficult to
1241                 * implement normal GL semantics through the fragment
1242                 * shader.
1243                 */
1244                fetch_texel( ctx, texcoord,
1245                             span->array->lambda[inst->TexSrcUnit][column],
1246                             inst->TexSrcUnit, color );
1247 #if DEBUG_FRAG
1248                if (color[3])
1249                   printf("color[3] = %f\n", color[3]);
1250 #endif
1251                store_vector4( inst, machine, color );
1252             }
1253             break;
1254          case OPCODE_TXB: /* GL_ARB_fragment_program only */
1255             /* Texel lookup with LOD bias */
1256             {
1257                GLfloat texcoord[4], color[4], bias, lambda;
1258
1259                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1260                /* texcoord[3] is the bias to add to lambda */
1261                bias = ctx->Texture.Unit[inst->TexSrcUnit].LodBias
1262                     + ctx->Texture.Unit[inst->TexSrcUnit]._Current->LodBias
1263                     + texcoord[3];
1264                lambda = span->array->lambda[inst->TexSrcUnit][column] + bias;
1265                fetch_texel( ctx, texcoord, lambda,
1266                             inst->TexSrcUnit, color );
1267                store_vector4( inst, machine, color );
1268             }
1269             break;
1270          case OPCODE_TXD: /* GL_NV_fragment_program only */
1271             /* Texture lookup w/ partial derivatives for LOD */
1272             {
1273                GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1274                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1275                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dtdx );
1276                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, dtdy );
1277                fetch_texel_deriv( ctx, texcoord, dtdx, dtdy, inst->TexSrcUnit,
1278                                   color );
1279                store_vector4( inst, machine, color );
1280             }
1281             break;
1282          case OPCODE_TXP: /* GL_ARB_fragment_program only */
1283             /* Texture lookup w/ projective divide */
1284             {
1285                GLfloat texcoord[4], color[4];
1286                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1287                /* Not so sure about this test - if texcoord[3] is
1288                 * zero, we'd probably be fine except for an ASSERT in
1289                 * IROUND_POS() which gets triggered by the inf values created.
1290                 */
1291                if (texcoord[3] != 0.0) {
1292                   texcoord[0] /= texcoord[3];
1293                   texcoord[1] /= texcoord[3];
1294                   texcoord[2] /= texcoord[3];
1295                }
1296                /* KW: Previously lambda was passed as zero, but I
1297                 * believe this is incorrect, the spec seems to
1298                 * indicate rather that lambda should not be
1299                 * changed/biased, unlike TXB where texcoord[3] is
1300                 * added to the lambda calculations.  The lambda should
1301                 * still be calculated normally for TEX & TXP though,
1302                 * not set to zero.
1303                 */
1304                fetch_texel( ctx, texcoord,
1305                             span->array->lambda[inst->TexSrcUnit][column],
1306                             inst->TexSrcUnit, color );
1307                store_vector4( inst, machine, color );
1308             }
1309             break;
1310          case OPCODE_TXP_NV: /* GL_NV_fragment_program only */
1311             /* Texture lookup w/ projective divide */
1312             {
1313                GLfloat texcoord[4], color[4];
1314                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1315                if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1316                    texcoord[3] != 0.0) {
1317                   texcoord[0] /= texcoord[3];
1318                   texcoord[1] /= texcoord[3];
1319                   texcoord[2] /= texcoord[3];
1320                }
1321                fetch_texel( ctx, texcoord,
1322                             span->array->lambda[inst->TexSrcUnit][column],
1323                             inst->TexSrcUnit, color );
1324                store_vector4( inst, machine, color );
1325             }
1326             break;
1327          case OPCODE_UP2H: /* unpack two 16-bit floats */
1328             {
1329                GLfloat a[4], result[4];
1330                const GLuint *rawBits = (const GLuint *) a;
1331                GLhalfNV hx, hy;
1332                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1333                hx = rawBits[0] & 0xffff;
1334                hy = rawBits[0] >> 16;
1335                result[0] = result[2] = _mesa_half_to_float(hx);
1336                result[1] = result[3] = _mesa_half_to_float(hy);
1337                store_vector4( inst, machine, result );
1338             }
1339             break;
1340          case OPCODE_UP2US: /* unpack two GLushorts */
1341             {
1342                GLfloat a[4], result[4];
1343                const GLuint *rawBits = (const GLuint *) a;
1344                GLushort usx, usy;
1345                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1346                usx = rawBits[0] & 0xffff;
1347                usy = rawBits[0] >> 16;
1348                result[0] = result[2] = usx * (1.0f / 65535.0f);
1349                result[1] = result[3] = usy * (1.0f / 65535.0f);
1350                store_vector4( inst, machine, result );
1351             }
1352             break;
1353          case OPCODE_UP4B: /* unpack four GLbytes */
1354             {
1355                GLfloat a[4], result[4];
1356                const GLuint *rawBits = (const GLuint *) a;
1357                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1358                result[0] = (((rawBits[0] >>  0) & 0xff) - 128) / 127.0F;
1359                result[1] = (((rawBits[0] >>  8) & 0xff) - 128) / 127.0F;
1360                result[2] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1361                result[3] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1362                store_vector4( inst, machine, result );
1363             }
1364             break;
1365          case OPCODE_UP4UB: /* unpack four GLubytes */
1366             {
1367                GLfloat a[4], result[4];
1368                const GLuint *rawBits = (const GLuint *) a;
1369                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1370                result[0] = ((rawBits[0] >>  0) & 0xff) / 255.0F;
1371                result[1] = ((rawBits[0] >>  8) & 0xff) / 255.0F;
1372                result[2] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1373                result[3] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1374                store_vector4( inst, machine, result );
1375             }
1376             break;
1377          case OPCODE_XPD: /* cross product */
1378             {
1379                GLfloat a[4], b[4], result[4];
1380                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1381                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1382                result[0] = a[1] * b[2] - a[2] * b[1];
1383                result[1] = a[2] * b[0] - a[0] * b[2];
1384                result[2] = a[0] * b[1] - a[1] * b[0];
1385                result[3] = 1.0;
1386                store_vector4( inst, machine, result );
1387             }
1388             break;
1389          case OPCODE_X2D: /* 2-D matrix transform */
1390             {
1391                GLfloat a[4], b[4], c[4], result[4];
1392                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1393                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1394                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
1395                result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1396                result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1397                result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1398                result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1399                store_vector4( inst, machine, result );
1400             }
1401             break;
1402          case OPCODE_PRINT:
1403             {
1404                if (inst->SrcReg[0].File != -1) {
1405                   GLfloat a[4];
1406                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
1407                   _mesa_printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
1408                                a[0], a[1], a[2], a[3]);
1409                }
1410                else {
1411                   _mesa_printf("%s\n", (const char *) inst->Data);
1412                }
1413             }
1414             break;
1415          case OPCODE_END:
1416             return GL_TRUE;
1417          default:
1418             _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1419                           inst->Opcode);
1420             return GL_TRUE; /* return value doesn't matter */
1421       }
1422    }
1423    return GL_TRUE;
1424 }
1425
1426
1427 /**
1428  * Initialize the virtual fragment program machine state prior to running
1429  * a fragment program on a fragment.  This involves initializing the input
1430  * registers, condition codes, etc.
1431  * \param machine  the virtual machine state to init
1432  * \param program  the fragment program we're about to run
1433  * \param span  the span of pixels we'll operate on
1434  * \param col  which element (column) of the span we'll operate on
1435  */
1436 static void
1437 init_machine( GLcontext *ctx, struct fp_machine *machine,
1438               const struct gl_fragment_program *program,
1439               const SWspan *span, GLuint col )
1440 {
1441    GLuint inputsRead = program->Base.InputsRead;
1442    GLuint u;
1443
1444    if (ctx->FragmentProgram.CallbackEnabled)
1445       inputsRead = ~0;
1446
1447    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
1448       /* Clear temporary registers (undefined for ARB_f_p) */
1449       _mesa_bzero(machine->Temporaries,
1450                   MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
1451    }
1452
1453    /* Load input registers */
1454    if (inputsRead & (1 << FRAG_ATTRIB_WPOS)) {
1455       GLfloat *wpos = machine->Inputs[FRAG_ATTRIB_WPOS];
1456       ASSERT(span->arrayMask & SPAN_Z);
1457       if (span->arrayMask & SPAN_XY) {
1458          wpos[0] = (GLfloat) span->array->x[col];
1459          wpos[1] = (GLfloat) span->array->y[col];
1460       }
1461       else {
1462          wpos[0] = (GLfloat) span->x + col;
1463          wpos[1] = (GLfloat) span->y;
1464       }
1465       wpos[2] = (GLfloat) span->array->z[col] / ctx->DrawBuffer->_DepthMaxF;
1466       wpos[3] = span->w + col * span->dwdx;
1467    }
1468    if (inputsRead & (1 << FRAG_ATTRIB_COL0)) {
1469       GLfloat *col0 = machine->Inputs[FRAG_ATTRIB_COL0];
1470       ASSERT(span->arrayMask & SPAN_RGBA);
1471       col0[0] = CHAN_TO_FLOAT(span->array->rgba[col][RCOMP]);
1472       col0[1] = CHAN_TO_FLOAT(span->array->rgba[col][GCOMP]);
1473       col0[2] = CHAN_TO_FLOAT(span->array->rgba[col][BCOMP]);
1474       col0[3] = CHAN_TO_FLOAT(span->array->rgba[col][ACOMP]);
1475    }
1476    if (inputsRead & (1 << FRAG_ATTRIB_COL1)) {
1477       GLfloat *col1 = machine->Inputs[FRAG_ATTRIB_COL1];
1478       col1[0] = CHAN_TO_FLOAT(span->array->spec[col][RCOMP]);
1479       col1[1] = CHAN_TO_FLOAT(span->array->spec[col][GCOMP]);
1480       col1[2] = CHAN_TO_FLOAT(span->array->spec[col][BCOMP]);
1481       col1[3] = CHAN_TO_FLOAT(span->array->spec[col][ACOMP]);
1482    }
1483    if (inputsRead & (1 << FRAG_ATTRIB_FOGC)) {
1484       GLfloat *fogc = machine->Inputs[FRAG_ATTRIB_FOGC];
1485       ASSERT(span->arrayMask & SPAN_FOG);
1486       fogc[0] = span->array->fog[col];
1487       fogc[1] = 0.0F;
1488       fogc[2] = 0.0F;
1489       fogc[3] = 0.0F;
1490    }
1491    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
1492       if (inputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
1493          GLfloat *tex = machine->Inputs[FRAG_ATTRIB_TEX0 + u];
1494          /*ASSERT(ctx->Texture._EnabledCoordUnits & (1 << u));*/
1495          COPY_4V(tex, span->array->texcoords[u][col]);
1496          /*ASSERT(tex[0] != 0 || tex[1] != 0 || tex[2] != 0);*/
1497       }
1498    }
1499
1500    /* init condition codes */
1501    machine->CondCodes[0] = COND_EQ;
1502    machine->CondCodes[1] = COND_EQ;
1503    machine->CondCodes[2] = COND_EQ;
1504    machine->CondCodes[3] = COND_EQ;
1505 }
1506
1507
1508 /**
1509  * Run fragment program on the pixels in span from 'start' to 'end' - 1.
1510  */
1511 static void
1512 run_program(GLcontext *ctx, SWspan *span, GLuint start, GLuint end)
1513 {
1514    const struct gl_fragment_program *program = ctx->FragmentProgram._Current;
1515    struct fp_machine machine;
1516    GLuint i;
1517
1518    CurrentMachine = &machine;
1519
1520    for (i = start; i < end; i++) {
1521       if (span->array->mask[i]) {
1522          init_machine(ctx, &machine, program, span, i);
1523
1524          if (!execute_program(ctx, program, ~0, &machine, span, i)) {
1525             span->array->mask[i] = GL_FALSE;  /* killed fragment */
1526             span->writeAll = GL_FALSE;
1527          }
1528
1529          /* Store output registers */
1530          {
1531             const GLfloat *colOut = machine.Outputs[FRAG_RESULT_COLR];
1532             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][RCOMP], colOut[0]);
1533             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][GCOMP], colOut[1]);
1534             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][BCOMP], colOut[2]);
1535             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][ACOMP], colOut[3]);
1536          }
1537          /* depth value */
1538          if (program->Base.OutputsWritten & (1 << FRAG_RESULT_DEPR)) {
1539             const GLfloat depth = machine.Outputs[FRAG_RESULT_DEPR][2];
1540             if (depth <= 0.0)
1541                span->array->z[i] = 0;
1542             else if (depth >= 1.0)
1543                span->array->z[i] = ctx->DrawBuffer->_DepthMax;
1544             else
1545                span->array->z[i] = IROUND(depth * ctx->DrawBuffer->_DepthMaxF);
1546          }
1547       }
1548    }
1549    CurrentMachine = NULL;
1550 }
1551
1552
1553 /**
1554  * Execute the current fragment program for all the fragments
1555  * in the given span.
1556  */
1557 void
1558 _swrast_exec_fragment_program( GLcontext *ctx, SWspan *span )
1559 {
1560    const struct gl_fragment_program *program = ctx->FragmentProgram._Current;
1561
1562    ctx->_CurrentProgram = GL_FRAGMENT_PROGRAM_ARB; /* or NV, doesn't matter */
1563
1564    if (program->Base.Parameters) {
1565       _mesa_load_state_parameters(ctx, program->Base.Parameters);
1566    }
1567
1568    run_program(ctx, span, 0, span->end);
1569
1570    if (program->Base.OutputsWritten & (1 << FRAG_RESULT_DEPR)) {
1571       span->interpMask &= ~SPAN_Z;
1572       span->arrayMask |= SPAN_Z;
1573    }
1574
1575    ctx->_CurrentProgram = 0;
1576 }
1577