src/mesa/swrast/s_nvfragprog.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.5.1
   4  *
   5  * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /*
  26  * Regarding GL_NV_fragment_program:
  27  *
  28  * Portions of this software may use or implement intellectual
  29  * property owned and licensed by NVIDIA Corporation. NVIDIA disclaims
  30  * any and all warranties with respect to such intellectual property,
  31  * including any use thereof or modifications thereto.
  32  */
  33
  34 #include "glheader.h"
  35 #include "colormac.h"
  36 #include "context.h"
  37 #include "program_instruction.h"
  38 #include "program.h"
  39
  40 #include "s_nvfragprog.h"
  41 #include "s_span.h"
  42
  43
   44 /* Set to 1 to compile in the printf() tracing guarded by "#if DEBUG_FRAG". */
   45 #define DEBUG_FRAG 0
  46
  47 /**
  48  * Fetch a texel.
  49  */
  50 static void
  51 fetch_texel( GLcontext *ctx, const GLfloat texcoord[4], GLfloat lambda,
  52              GLuint unit, GLfloat color[4] )
  53 {
  54    GLchan rgba[4];
  55    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  56
  57    /* XXX use a float-valued TextureSample routine here!!! */
  58    swrast->TextureSample[unit](ctx, ctx->Texture.Unit[unit]._Current,
  59                                1, (const GLfloat (*)[4]) texcoord,
  60                                &lambda, &rgba);
  61    color[0] = CHAN_TO_FLOAT(rgba[0]);
  62    color[1] = CHAN_TO_FLOAT(rgba[1]);
  63    color[2] = CHAN_TO_FLOAT(rgba[2]);
  64    color[3] = CHAN_TO_FLOAT(rgba[3]);
  65 }
  66
  67
  68 /**
  69  * Fetch a texel with the given partial derivatives to compute a level
  70  * of detail in the mipmap.
  71  */
  72 static void
  73 fetch_texel_deriv( GLcontext *ctx, const GLfloat texcoord[4],
  74                    const GLfloat texdx[4], const GLfloat texdy[4],
  75                    GLuint unit, GLfloat color[4] )
  76 {
  77    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  78    const struct gl_texture_object *texObj = ctx->Texture.Unit[unit]._Current;
  79    const struct gl_texture_image *texImg = texObj->Image[0][texObj->BaseLevel];
  80    const GLfloat texW = (GLfloat) texImg->WidthScale;
  81    const GLfloat texH = (GLfloat) texImg->HeightScale;
  82    GLchan rgba[4];
  83
  84    GLfloat lambda = _swrast_compute_lambda(texdx[0], texdy[0], /* ds/dx, ds/dy */
  85                                          texdx[1], texdy[1], /* dt/dx, dt/dy */
  86                                          texdx[3], texdy[2], /* dq/dx, dq/dy */
  87                                          texW, texH,
  88                                          texcoord[0], texcoord[1], texcoord[3],
  89                                          1.0F / texcoord[3]);
  90
  91    swrast->TextureSample[unit](ctx, ctx->Texture.Unit[unit]._Current,
  92                                1, (const GLfloat (*)[4]) texcoord,
  93                                &lambda, &rgba);
  94    color[0] = CHAN_TO_FLOAT(rgba[0]);
  95    color[1] = CHAN_TO_FLOAT(rgba[1]);
  96    color[2] = CHAN_TO_FLOAT(rgba[2]);
  97    color[3] = CHAN_TO_FLOAT(rgba[3]);
  98 }
  99
 100
 101 /**
 102  * Return a pointer to the 4-element float vector specified by the given
 103  * source register.
 104  */
 105 static INLINE const GLfloat *
 106 get_register_pointer( GLcontext *ctx,
 107                       const struct prog_src_register *source,
 108                       const struct fp_machine *machine,
 109                       const struct gl_fragment_program *program )
 110 {
 111    const GLfloat *src;
 112    switch (source->File) {
 113       case PROGRAM_TEMPORARY:
 114          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_TEMPS);
 115          src = machine->Temporaries[source->Index];
 116          break;
 117       case PROGRAM_INPUT:
 118          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_INPUTS);
 119          src = machine->Inputs[source->Index];
 120          break;
 121       case PROGRAM_OUTPUT:
 122          /* This is only for PRINT */
 123          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_OUTPUTS);
 124          src = machine->Outputs[source->Index];
 125          break;
 126       case PROGRAM_LOCAL_PARAM:
 127          ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 128          src = program->Base.LocalParams[source->Index];
 129          break;
 130       case PROGRAM_ENV_PARAM:
 131          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_PARAMS);
 132          src = ctx->FragmentProgram.Parameters[source->Index];
 133          break;
 134       case PROGRAM_STATE_VAR:
 135          /* Fallthrough */
 136       case PROGRAM_NAMED_PARAM:
 137          ASSERT(source->Index < (GLint) program->Base.Parameters->NumParameters);
 138          src = program->Base.Parameters->ParameterValues[source->Index];
 139          break;
 140       default:
 141          _mesa_problem(ctx, "Invalid input register file %d in fetch_vector4", source->File);
 142          src = NULL;
 143    }
 144    return src;
 145 }
 146
 147
 148 /**
 149  * Fetch a 4-element float vector from the given source register.
 150  * Apply swizzling and negating as needed.
 151  */
 152 static void
 153 fetch_vector4( GLcontext *ctx,
 154                const struct prog_src_register *source,
 155                const struct fp_machine *machine,
 156                const struct gl_fragment_program *program,
 157                GLfloat result[4] )
 158 {
 159    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 160    ASSERT(src);
 161
 162    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 163    result[1] = src[GET_SWZ(source->Swizzle, 1)];
 164    result[2] = src[GET_SWZ(source->Swizzle, 2)];
 165    result[3] = src[GET_SWZ(source->Swizzle, 3)];
 166
 167    if (source->NegateBase) {
 168       result[0] = -result[0];
 169       result[1] = -result[1];
 170       result[2] = -result[2];
 171       result[3] = -result[3];
 172    }
 173    if (source->Abs) {
 174       result[0] = FABSF(result[0]);
 175       result[1] = FABSF(result[1]);
 176       result[2] = FABSF(result[2]);
 177       result[3] = FABSF(result[3]);
 178    }
 179    if (source->NegateAbs) {
 180       result[0] = -result[0];
 181       result[1] = -result[1];
 182       result[2] = -result[2];
 183       result[3] = -result[3];
 184    }
 185 }
 186
 187
 188 /**
 189  * Fetch the derivative with respect to X for the given register.
 190  * \return GL_TRUE if it was easily computed or GL_FALSE if we
 191  * need to execute another instance of the program (ugh)!
 192  */
 193 static GLboolean
 194 fetch_vector4_deriv( GLcontext *ctx,
 195                      const struct prog_src_register *source,
 196                      const struct sw_span *span,
 197                      char xOrY, GLint column, GLfloat result[4] )
 198 {
 199    GLfloat src[4];
 200
 201    ASSERT(xOrY == 'X' || xOrY == 'Y');
 202
 203    switch (source->Index) {
 204    case FRAG_ATTRIB_WPOS:
 205       if (xOrY == 'X') {
 206          src[0] = 1.0;
 207          src[1] = 0.0;
 208          src[2] = span->dzdx / ctx->DrawBuffer->_DepthMaxF;
 209          src[3] = span->dwdx;
 210       }
 211       else {
 212          src[0] = 0.0;
 213          src[1] = 1.0;
 214          src[2] = span->dzdy / ctx->DrawBuffer->_DepthMaxF;
 215          src[3] = span->dwdy;
 216       }
 217       break;
 218    case FRAG_ATTRIB_COL0:
 219       if (xOrY == 'X') {
 220          src[0] = span->drdx * (1.0F / CHAN_MAXF);
 221          src[1] = span->dgdx * (1.0F / CHAN_MAXF);
 222          src[2] = span->dbdx * (1.0F / CHAN_MAXF);
 223          src[3] = span->dadx * (1.0F / CHAN_MAXF);
 224       }
 225       else {
 226          src[0] = span->drdy * (1.0F / CHAN_MAXF);
 227          src[1] = span->dgdy * (1.0F / CHAN_MAXF);
 228          src[2] = span->dbdy * (1.0F / CHAN_MAXF);
 229          src[3] = span->dady * (1.0F / CHAN_MAXF);
 230       }
 231       break;
 232    case FRAG_ATTRIB_COL1:
 233       if (xOrY == 'X') {
 234          src[0] = span->dsrdx * (1.0F / CHAN_MAXF);
 235          src[1] = span->dsgdx * (1.0F / CHAN_MAXF);
 236          src[2] = span->dsbdx * (1.0F / CHAN_MAXF);
 237          src[3] = 0.0; /* XXX need this */
 238       }
 239       else {
 240          src[0] = span->dsrdy * (1.0F / CHAN_MAXF);
 241          src[1] = span->dsgdy * (1.0F / CHAN_MAXF);
 242          src[2] = span->dsbdy * (1.0F / CHAN_MAXF);
 243          src[3] = 0.0; /* XXX need this */
 244       }
 245       break;
 246    case FRAG_ATTRIB_FOGC:
 247       if (xOrY == 'X') {
 248          src[0] = span->dfogdx;
 249          src[1] = 0.0;
 250          src[2] = 0.0;
 251          src[3] = 0.0;
 252       }
 253       else {
 254          src[0] = span->dfogdy;
 255          src[1] = 0.0;
 256          src[2] = 0.0;
 257          src[3] = 0.0;
 258       }
 259       break;
 260    case FRAG_ATTRIB_TEX0:
 261    case FRAG_ATTRIB_TEX1:
 262    case FRAG_ATTRIB_TEX2:
 263    case FRAG_ATTRIB_TEX3:
 264    case FRAG_ATTRIB_TEX4:
 265    case FRAG_ATTRIB_TEX5:
 266    case FRAG_ATTRIB_TEX6:
 267    case FRAG_ATTRIB_TEX7:
 268       if (xOrY == 'X') {
 269          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 270          /* this is a little tricky - I think I've got it right */
 271          const GLfloat invQ = 1.0f / (span->tex[u][3]
 272                                       + span->texStepX[u][3] * column);
 273          src[0] = span->texStepX[u][0] * invQ;
 274          src[1] = span->texStepX[u][1] * invQ;
 275          src[2] = span->texStepX[u][2] * invQ;
 276          src[3] = span->texStepX[u][3] * invQ;
 277       }
 278       else {
 279          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 280          /* Tricky, as above, but in Y direction */
 281          const GLfloat invQ = 1.0f / (span->tex[u][3] + span->texStepY[u][3]);
 282          src[0] = span->texStepY[u][0] * invQ;
 283          src[1] = span->texStepY[u][1] * invQ;
 284          src[2] = span->texStepY[u][2] * invQ;
 285          src[3] = span->texStepY[u][3] * invQ;
 286       }
 287       break;
 288    default:
 289       return GL_FALSE;
 290    }
 291
 292    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 293    result[1] = src[GET_SWZ(source->Swizzle, 1)];
 294    result[2] = src[GET_SWZ(source->Swizzle, 2)];
 295    result[3] = src[GET_SWZ(source->Swizzle, 3)];
 296
 297    if (source->NegateBase) {
 298       result[0] = -result[0];
 299       result[1] = -result[1];
 300       result[2] = -result[2];
 301       result[3] = -result[3];
 302    }
 303    if (source->Abs) {
 304       result[0] = FABSF(result[0]);
 305       result[1] = FABSF(result[1]);
 306       result[2] = FABSF(result[2]);
 307       result[3] = FABSF(result[3]);
 308    }
 309    if (source->NegateAbs) {
 310       result[0] = -result[0];
 311       result[1] = -result[1];
 312       result[2] = -result[2];
 313       result[3] = -result[3];
 314    }
 315    return GL_TRUE;
 316 }
 317
 318
 319 /**
 320  * As above, but only return result[0] element.
 321  */
 322 static void
 323 fetch_vector1( GLcontext *ctx,
 324                const struct prog_src_register *source,
 325                const struct fp_machine *machine,
 326                const struct gl_fragment_program *program,
 327                GLfloat result[4] )
 328 {
 329    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 330    ASSERT(src);
 331
 332    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 333
 334    if (source->NegateBase) {
 335       result[0] = -result[0];
 336    }
 337    if (source->Abs) {
 338       result[0] = FABSF(result[0]);
 339    }
 340    if (source->NegateAbs) {
 341       result[0] = -result[0];
 342    }
 343 }
 344
 345
 346 /**
 347  * Test value against zero and return GT, LT, EQ or UN if NaN.
 348  */
 349 static INLINE GLuint
 350 generate_cc( float value )
 351 {
 352    if (value != value)
 353       return COND_UN;  /* NaN */
 354    if (value > 0.0F)
 355       return COND_GT;
 356    if (value < 0.0F)
 357       return COND_LT;
 358    return COND_EQ;
 359 }
 360
 361
 362 /**
 363  * Test if the ccMaskRule is satisfied by the given condition code.
 364  * Used to mask destination writes according to the current condition codee.
 365  */
 366 static INLINE GLboolean
 367 test_cc(GLuint condCode, GLuint ccMaskRule)
 368 {
 369    switch (ccMaskRule) {
 370    case COND_EQ: return (condCode == COND_EQ);
 371    case COND_NE: return (condCode != COND_EQ);
 372    case COND_LT: return (condCode == COND_LT);
 373    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 374    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 375    case COND_GT: return (condCode == COND_GT);
 376    case COND_TR: return GL_TRUE;
 377    case COND_FL: return GL_FALSE;
 378    default:      return GL_TRUE;
 379    }
 380 }
 381
 382
 383 /**
 384  * Store 4 floats into a register.  Observe the instructions saturate and
 385  * set-condition-code flags.
 386  */
 387 static void
 388 store_vector4( const struct prog_instruction *inst,
 389                struct fp_machine *machine,
 390                const GLfloat value[4] )
 391 {
 392    const struct prog_dst_register *dest = &(inst->DstReg);
 393    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 394    const GLboolean updateCC = inst->CondUpdate;
 395    GLfloat *dstReg;
 396    GLfloat dummyReg[4];
 397    GLfloat clampedValue[4];
 398    GLboolean condWriteMask[4];
 399    GLuint writeMask = dest->WriteMask;
 400
 401    switch (dest->File) {
 402       case PROGRAM_OUTPUT:
 403          dstReg = machine->Outputs[dest->Index];
 404          break;
 405       case PROGRAM_TEMPORARY:
 406          dstReg = machine->Temporaries[dest->Index];
 407          break;
 408       case PROGRAM_WRITE_ONLY:
 409          dstReg = dummyReg;
 410          return;
 411       default:
 412          _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 413          return;
 414    }
 415
 416 #if DEBUG_FRAG
 417    if (value[0] > 1.0e10 ||
 418        IS_INF_OR_NAN(value[0]) ||
 419        IS_INF_OR_NAN(value[1]) ||
 420        IS_INF_OR_NAN(value[2]) ||
 421        IS_INF_OR_NAN(value[3])  )
 422       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 423 #endif
 424
 425    if (clamp) {
 426       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 427       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 428       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 429       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 430       value = clampedValue;
 431    }
 432
 433    if (dest->CondMask != COND_TR) {
 434       condWriteMask[0] = GET_BIT(writeMask, 0)
 435          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 0)], dest->CondMask);
 436       condWriteMask[1] = GET_BIT(writeMask, 1)
 437          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 1)], dest->CondMask);
 438       condWriteMask[2] = GET_BIT(writeMask, 2)
 439          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 2)], dest->CondMask);
 440       condWriteMask[3] = GET_BIT(writeMask, 3)
 441          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 3)], dest->CondMask);
 442
 443       writeMask = ((condWriteMask[0] << 0) |
 444                    (condWriteMask[1] << 1) |
 445                    (condWriteMask[2] << 2) |
 446                    (condWriteMask[3] << 3));
 447    }
 448
 449    if (GET_BIT(writeMask, 0)) {
 450       dstReg[0] = value[0];
 451       if (updateCC)
 452          machine->CondCodes[0] = generate_cc(value[0]);
 453    }
 454    if (GET_BIT(writeMask, 1)) {
 455       dstReg[1] = value[1];
 456       if (updateCC)
 457          machine->CondCodes[1] = generate_cc(value[1]);
 458    }
 459    if (GET_BIT(writeMask, 2)) {
 460       dstReg[2] = value[2];
 461       if (updateCC)
 462          machine->CondCodes[2] = generate_cc(value[2]);
 463    }
 464    if (GET_BIT(writeMask, 3)) {
 465       dstReg[3] = value[3];
 466       if (updateCC)
 467          machine->CondCodes[3] = generate_cc(value[3]);
 468    }
 469 }
 470
 471
 472 /**
 473  * Initialize a new machine state instance from an existing one, adding
 474  * the partial derivatives onto the input registers.
 475  * Used to implement DDX and DDY instructions in non-trivial cases.
 476  */
 477 static void
 478 init_machine_deriv( GLcontext *ctx,
 479                     const struct fp_machine *machine,
 480                     const struct gl_fragment_program *program,
 481                     const struct sw_span *span, char xOrY,
 482                     struct fp_machine *dMachine )
 483 {
 484    GLuint u;
 485
 486    ASSERT(xOrY == 'X' || xOrY == 'Y');
 487
 488    /* copy existing machine */
 489    _mesa_memcpy(dMachine, machine, sizeof(struct fp_machine));
 490
 491    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
 492       /* Clear temporary registers (undefined for ARB_f_p) */
 493       _mesa_bzero( (void*) machine->Temporaries,
 494                    MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
 495    }
 496
 497    /* Add derivatives */
 498    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) {
 499       GLfloat *wpos = (GLfloat*) machine->Inputs[FRAG_ATTRIB_WPOS];
 500       if (xOrY == 'X') {
 501          wpos[0] += 1.0F;
 502          wpos[1] += 0.0F;
 503          wpos[2] += span->dzdx;
 504          wpos[3] += span->dwdx;
 505       }
 506       else {
 507          wpos[0] += 0.0F;
 508          wpos[1] += 1.0F;
 509          wpos[2] += span->dzdy;
 510          wpos[3] += span->dwdy;
 511       }
 512    }
 513    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_COL0)) {
 514       GLfloat *col0 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL0];
 515       if (xOrY == 'X') {
 516          col0[0] += span->drdx * (1.0F / CHAN_MAXF);
 517          col0[1] += span->dgdx * (1.0F / CHAN_MAXF);
 518          col0[2] += span->dbdx * (1.0F / CHAN_MAXF);
 519          col0[3] += span->dadx * (1.0F / CHAN_MAXF);
 520       }
 521       else {
 522          col0[0] += span->drdy * (1.0F / CHAN_MAXF);
 523          col0[1] += span->dgdy * (1.0F / CHAN_MAXF);
 524          col0[2] += span->dbdy * (1.0F / CHAN_MAXF);
 525          col0[3] += span->dady * (1.0F / CHAN_MAXF);
 526       }
 527    }
 528    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_COL1)) {
 529       GLfloat *col1 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL1];
 530       if (xOrY == 'X') {
 531          col1[0] += span->dsrdx * (1.0F / CHAN_MAXF);
 532          col1[1] += span->dsgdx * (1.0F / CHAN_MAXF);
 533          col1[2] += span->dsbdx * (1.0F / CHAN_MAXF);
 534          col1[3] += 0.0; /*XXX fix */
 535       }
 536       else {
 537          col1[0] += span->dsrdy * (1.0F / CHAN_MAXF);
 538          col1[1] += span->dsgdy * (1.0F / CHAN_MAXF);
 539          col1[2] += span->dsbdy * (1.0F / CHAN_MAXF);
 540          col1[3] += 0.0; /*XXX fix */
 541       }
 542    }
 543    if (program->Base.InputsRead & (1 << FRAG_ATTRIB_FOGC)) {
 544       GLfloat *fogc = (GLfloat*) machine->Inputs[FRAG_ATTRIB_FOGC];
 545       if (xOrY == 'X') {
 546          fogc[0] += span->dfogdx;
 547       }
 548       else {
 549          fogc[0] += span->dfogdy;
 550       }
 551    }
 552    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
 553       if (program->Base.InputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
 554          GLfloat *tex = (GLfloat*) machine->Inputs[FRAG_ATTRIB_TEX0 + u];
 555          /* XXX perspective-correct interpolation */
 556          if (xOrY == 'X') {
 557             tex[0] += span->texStepX[u][0];
 558             tex[1] += span->texStepX[u][1];
 559             tex[2] += span->texStepX[u][2];
 560             tex[3] += span->texStepX[u][3];
 561          }
 562          else {
 563             tex[0] += span->texStepY[u][0];
 564             tex[1] += span->texStepY[u][1];
 565             tex[2] += span->texStepY[u][2];
 566             tex[3] += span->texStepY[u][3];
 567          }
 568       }
 569    }
 570
 571    /* init condition codes */
 572    dMachine->CondCodes[0] = COND_EQ;
 573    dMachine->CondCodes[1] = COND_EQ;
 574    dMachine->CondCodes[2] = COND_EQ;
 575    dMachine->CondCodes[3] = COND_EQ;
 576 }
 577
 578
 579 /**
  580  * Execute the given fragment program.
 581  * NOTE: we do everything in single-precision floating point; we don't
 582  * currently observe the single/half/fixed-precision qualifiers.
 583  * \param ctx - rendering context
 584  * \param program - the fragment program to execute
 585  * \param machine - machine state (register file)
 586  * \param maxInst - max number of instructions to execute
 587  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 588  */
 589 static GLboolean
 590 execute_program( GLcontext *ctx,
 591                  const struct gl_fragment_program *program, GLuint maxInst,
 592                  struct fp_machine *machine, const struct sw_span *span,
 593                  GLuint column )
 594 {
 595    GLuint pc;
 596
 597 #if DEBUG_FRAG
 598    printf("execute fragment program --------------------\n");
 599 #endif
 600
 601    for (pc = 0; pc < maxInst; pc++) {
 602       const struct prog_instruction *inst = program->Base.Instructions + pc;
 603
 604       if (ctx->FragmentProgram.CallbackEnabled &&
 605           ctx->FragmentProgram.Callback) {
 606          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 607          ctx->FragmentProgram.Callback(program->Base.Target,
 608                                        ctx->FragmentProgram.CallbackData);
 609       }
 610
 611       switch (inst->Opcode) {
 612          case OPCODE_ABS:
 613             {
 614                GLfloat a[4], result[4];
 615                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 616                result[0] = FABSF(a[0]);
 617                result[1] = FABSF(a[1]);
 618                result[2] = FABSF(a[2]);
 619                result[3] = FABSF(a[3]);
 620                store_vector4( inst, machine, result );
 621             }
 622             break;
 623          case OPCODE_ADD:
 624             {
 625                GLfloat a[4], b[4], result[4];
 626                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 627                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 628                result[0] = a[0] + b[0];
 629                result[1] = a[1] + b[1];
 630                result[2] = a[2] + b[2];
 631                result[3] = a[3] + b[3];
 632                store_vector4( inst, machine, result );
 633             }
 634             break;
 635          case OPCODE_CMP:
 636             {
 637                GLfloat a[4], b[4], c[4], result[4];
 638                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 639                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 640                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 641                result[0] = a[0] < 0.0F ? b[0] : c[0];
 642                result[1] = a[1] < 0.0F ? b[1] : c[1];
 643                result[2] = a[2] < 0.0F ? b[2] : c[2];
 644                result[3] = a[3] < 0.0F ? b[3] : c[3];
 645                store_vector4( inst, machine, result );
 646             }
 647             break;
 648          case OPCODE_COS:
 649             {
 650                GLfloat a[4], result[4];
 651                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 652                result[0] = result[1] = result[2] = result[3] = (GLfloat)_mesa_cos(a[0]);
 653                store_vector4( inst, machine, result );
 654             }
 655             break;
 656          case OPCODE_DDX: /* Partial derivative with respect to X */
 657             {
 658                GLfloat a[4], aNext[4], result[4];
 659                struct fp_machine dMachine;
 660                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'X',
 661                                         column, result)) {
 662                   /* This is tricky.  Make a copy of the current machine state,
 663                    * increment the input registers by the dx or dy partial
 664                    * derivatives, then re-execute the program up to the
 665                    * preceeding instruction, then fetch the source register.
 666                    * Finally, find the difference in the register values for
 667                    * the original and derivative runs.
 668                    */
 669                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 670                   init_machine_deriv(ctx, machine, program, span,
 671                                      'X', &dMachine);
 672                   execute_program(ctx, program, pc, &dMachine, span, column);
 673                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 674                   result[0] = aNext[0] - a[0];
 675                   result[1] = aNext[1] - a[1];
 676                   result[2] = aNext[2] - a[2];
 677                   result[3] = aNext[3] - a[3];
 678                }
 679                store_vector4( inst, machine, result );
 680             }
 681             break;
 682          case OPCODE_DDY: /* Partial derivative with respect to Y */
 683             {
 684                GLfloat a[4], aNext[4], result[4];
 685                struct fp_machine dMachine;
 686                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'Y',
 687                                         column, result)) {
 688                   init_machine_deriv(ctx, machine, program, span,
 689                                      'Y', &dMachine);
 690                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 691                   execute_program(ctx, program, pc, &dMachine, span, column);
 692                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 693                   result[0] = aNext[0] - a[0];
 694                   result[1] = aNext[1] - a[1];
 695                   result[2] = aNext[2] - a[2];
 696                   result[3] = aNext[3] - a[3];
 697                }
 698                store_vector4( inst, machine, result );
 699             }
 700             break;
 701          case OPCODE_DP3:
 702             {
 703                GLfloat a[4], b[4], result[4];
 704                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 705                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 706                result[0] = result[1] = result[2] = result[3] =
 707                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
 708                store_vector4( inst, machine, result );
 709 #if DEBUG_FRAG
 710                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 711                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 712 #endif
 713             }
 714             break;
 715          case OPCODE_DP4:
 716             {
 717                GLfloat a[4], b[4], result[4];
 718                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 719                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 720                result[0] = result[1] = result[2] = result[3] =
 721                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 722                store_vector4( inst, machine, result );
 723 #if DEBUG_FRAG
 724                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 725                       result[0], a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 726 #endif
 727             }
 728             break;
 729          case OPCODE_DPH:
 730             {
 731                GLfloat a[4], b[4], result[4];
 732                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 733                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 734                result[0] = result[1] = result[2] = result[3] =
 735                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
 736                store_vector4( inst, machine, result );
 737             }
 738             break;
 739          case OPCODE_DST: /* Distance vector */
 740             {
 741                GLfloat a[4], b[4], result[4];
 742                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 743                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 744                result[0] = 1.0F;
 745                result[1] = a[1] * b[1];
 746                result[2] = a[2];
 747                result[3] = b[3];
 748                store_vector4( inst, machine, result );
 749             }
 750             break;
 751          case OPCODE_EX2: /* Exponential base 2 */
 752             {
 753                GLfloat a[4], result[4];
 754                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 755                result[0] = result[1] = result[2] = result[3] =
 756                   (GLfloat) _mesa_pow(2.0, a[0]);
 757                store_vector4( inst, machine, result );
 758             }
 759             break;
 760          case OPCODE_FLR:
 761             {
 762                GLfloat a[4], result[4];
 763                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 764                result[0] = FLOORF(a[0]);
 765                result[1] = FLOORF(a[1]);
 766                result[2] = FLOORF(a[2]);
 767                result[3] = FLOORF(a[3]);
 768                store_vector4( inst, machine, result );
 769             }
 770             break;
 771          case OPCODE_FRC:
 772             {
 773                GLfloat a[4], result[4];
 774                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 775                result[0] = a[0] - FLOORF(a[0]);
 776                result[1] = a[1] - FLOORF(a[1]);
 777                result[2] = a[2] - FLOORF(a[2]);
 778                result[3] = a[3] - FLOORF(a[3]);
 779                store_vector4( inst, machine, result );
 780             }
 781             break;
 782          case OPCODE_KIL_NV: /* NV_f_p only */
 783             {
 784                const GLuint swizzle = inst->DstReg.CondSwizzle;
 785                const GLuint condMask = inst->DstReg.CondMask;
 786                if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 787                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 788                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 789                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 790                   return GL_FALSE;
 791                }
 792             }
 793             break;
 794          case OPCODE_KIL: /* ARB_f_p only */
 795             {
 796                GLfloat a[4];
 797                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 798                if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 799                   return GL_FALSE;
 800                }
 801             }
 802             break;
 803          case OPCODE_LG2:  /* log base 2 */
 804             {
 805                GLfloat a[4], result[4];
 806                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 807                result[0] = result[1] = result[2] = result[3]
 808                   = LOG2(a[0]);
 809                store_vector4( inst, machine, result );
 810             }
 811             break;
 812          case OPCODE_LIT:
 813             {
 814                const GLfloat epsilon = 1.0F / 256.0F; /* from NV VP spec */
 815                GLfloat a[4], result[4];
 816                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 817                a[0] = MAX2(a[0], 0.0F);
 818                a[1] = MAX2(a[1], 0.0F);
 819                /* XXX ARB version clamps a[3], NV version doesn't */
 820                a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 821                result[0] = 1.0F;
 822                result[1] = a[0];
 823                /* XXX we could probably just use pow() here */
 824                if (a[0] > 0.0F) {
 825                   if (a[1] == 0.0 && a[3] == 0.0)
 826                      result[2] = 1.0;
 827                   else
 828                      result[2] = EXPF(a[3] * LOGF(a[1]));
 829                }
 830                else {
 831                   result[2] = 0.0;
 832                }
 833                result[3] = 1.0F;
 834                store_vector4( inst, machine, result );
 835             }
 836             break;
 837          case OPCODE_LRP:
 838             {
 839                GLfloat a[4], b[4], c[4], result[4];
 840                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 841                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 842                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 843                result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 844                result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 845                result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 846                result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 847                store_vector4( inst, machine, result );
 848             }
 849             break;
 850          case OPCODE_MAD:
 851             {
 852                GLfloat a[4], b[4], c[4], result[4];
 853                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 854                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 855                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 856                result[0] = a[0] * b[0] + c[0];
 857                result[1] = a[1] * b[1] + c[1];
 858                result[2] = a[2] * b[2] + c[2];
 859                result[3] = a[3] * b[3] + c[3];
 860                store_vector4( inst, machine, result );
 861             }
 862             break;
 863          case OPCODE_MAX:
 864             {
 865                GLfloat a[4], b[4], result[4];
 866                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 867                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 868                result[0] = MAX2(a[0], b[0]);
 869                result[1] = MAX2(a[1], b[1]);
 870                result[2] = MAX2(a[2], b[2]);
 871                result[3] = MAX2(a[3], b[3]);
 872                store_vector4( inst, machine, result );
 873 #if DEBUG_FRAG
 874                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
 875                       result[0], result[1], result[2], result[3],
 876                       a[0], a[1], a[2], a[3],
 877                       b[0], b[1], b[2], b[3]);
 878 #endif
 879             }
 880             break;
 881          case OPCODE_MIN:
 882             {
 883                GLfloat a[4], b[4], result[4];
 884                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 885                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 886                result[0] = MIN2(a[0], b[0]);
 887                result[1] = MIN2(a[1], b[1]);
 888                result[2] = MIN2(a[2], b[2]);
 889                result[3] = MIN2(a[3], b[3]);
 890                store_vector4( inst, machine, result );
 891             }
 892             break;
 893          case OPCODE_MOV:
 894             {
 895                GLfloat result[4];
 896                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, result );
 897                store_vector4( inst, machine, result );
 898 #if DEBUG_FRAG
 899                printf("MOV (%g %g %g %g)\n",
 900                       result[0], result[1], result[2], result[3]);
 901 #endif
 902             }
 903             break;
 904          case OPCODE_MUL:
 905             {
 906                GLfloat a[4], b[4], result[4];
 907                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 908                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 909                result[0] = a[0] * b[0];
 910                result[1] = a[1] * b[1];
 911                result[2] = a[2] * b[2];
 912                result[3] = a[3] * b[3];
 913                store_vector4( inst, machine, result );
 914 #if DEBUG_FRAG
 915                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 916                       result[0], result[1], result[2], result[3],
 917                       a[0], a[1], a[2], a[3],
 918                       b[0], b[1], b[2], b[3]);
 919 #endif
 920             }
 921             break;
 922          case OPCODE_PK2H: /* pack two 16-bit floats in one 32-bit float */
 923             {
 924                GLfloat a[4], result[4];
 925                GLhalfNV hx, hy;
 926                GLuint *rawResult = (GLuint *) result;
 927                GLuint twoHalves;
 928                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 929                hx = _mesa_float_to_half(a[0]);
 930                hy = _mesa_float_to_half(a[1]);
 931                twoHalves = hx | (hy << 16);
 932                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 933                   = twoHalves;
 934                store_vector4( inst, machine, result );
 935             }
 936             break;
 937          case OPCODE_PK2US: /* pack two GLushorts into one 32-bit float */
 938             {
 939                GLfloat a[4], result[4];
 940                GLuint usx, usy, *rawResult = (GLuint *) result;
 941                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 942                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 943                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 944                usx = IROUND(a[0] * 65535.0F);
 945                usy = IROUND(a[1] * 65535.0F);
 946                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 947                   = usx | (usy << 16);
 948                store_vector4( inst, machine, result );
 949             }
 950             break;
 951          case OPCODE_PK4B: /* pack four GLbytes into one 32-bit float */
 952             {
 953                GLfloat a[4], result[4];
 954                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 955                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 956                a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
 957                a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
 958                a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
 959                a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
 960                ubx = IROUND(127.0F * a[0] + 128.0F);
 961                uby = IROUND(127.0F * a[1] + 128.0F);
 962                ubz = IROUND(127.0F * a[2] + 128.0F);
 963                ubw = IROUND(127.0F * a[3] + 128.0F);
 964                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 965                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 966                store_vector4( inst, machine, result );
 967             }
 968             break;
 969          case OPCODE_PK4UB: /* pack four GLubytes into one 32-bit float */
 970             {
 971                GLfloat a[4], result[4];
 972                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 973                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 974                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 975                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 976                a[2] = CLAMP(a[2], 0.0F, 1.0F);
 977                a[3] = CLAMP(a[3], 0.0F, 1.0F);
 978                ubx = IROUND(255.0F * a[0]);
 979                uby = IROUND(255.0F * a[1]);
 980                ubz = IROUND(255.0F * a[2]);
 981                ubw = IROUND(255.0F * a[3]);
 982                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 983                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 984                store_vector4( inst, machine, result );
 985             }
 986             break;
 987          case OPCODE_POW:
 988             {
 989                GLfloat a[4], b[4], result[4];
 990                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 991                fetch_vector1( ctx, &inst->SrcReg[1], machine, program, b );
 992                result[0] = result[1] = result[2] = result[3]
 993                   = (GLfloat)_mesa_pow(a[0], b[0]);
 994                store_vector4( inst, machine, result );
 995             }
 996             break;
 997          case OPCODE_RCP:
 998             {
 999                GLfloat a[4], result[4];
1000                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1001 #if DEBUG_FRAG
1002                if (a[0] == 0)
1003                   printf("RCP(0)\n");
1004                else if (IS_INF_OR_NAN(a[0]))
1005                   printf("RCP(inf)\n");
1006 #endif
1007                result[0] = result[1] = result[2] = result[3]
1008                   = 1.0F / a[0];
1009                store_vector4( inst, machine, result );
1010             }
1011             break;
1012          case OPCODE_RFL:
1013             {
1014                GLfloat axis[4], dir[4], result[4], tmp[4];
1015                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, axis );
1016                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dir );
1017                tmp[3] = axis[0] * axis[0]
1018                       + axis[1] * axis[1]
1019                       + axis[2] * axis[2];
1020                tmp[0] = (2.0F * (axis[0] * dir[0] +
1021                                  axis[1] * dir[1] +
1022                                  axis[2] * dir[2])) / tmp[3];
1023                result[0] = tmp[0] * axis[0] - dir[0];
1024                result[1] = tmp[0] * axis[1] - dir[1];
1025                result[2] = tmp[0] * axis[2] - dir[2];
1026                /* result[3] is never written! XXX enforce in parser! */
1027                store_vector4( inst, machine, result );
1028             }
1029             break;
1030          case OPCODE_RSQ: /* 1 / sqrt() */
1031             {
1032                GLfloat a[4], result[4];
1033                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1034                a[0] = FABSF(a[0]);
1035                result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1036                store_vector4( inst, machine, result );
1037 #if DEBUG_FRAG
1038                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1039 #endif
1040             }
1041             break;
1042          case OPCODE_SCS: /* sine and cos */
1043             {
1044                GLfloat a[4], result[4];
1045                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1046                result[0] = (GLfloat)_mesa_cos(a[0]);
1047                result[1] = (GLfloat)_mesa_sin(a[0]);
1048                result[2] = 0.0;  /* undefined! */
1049                result[3] = 0.0;  /* undefined! */
1050                store_vector4( inst, machine, result );
1051             }
1052             break;
1053          case OPCODE_SEQ: /* set on equal */
1054             {
1055                GLfloat a[4], b[4], result[4];
1056                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1057                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1058                result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1059                result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1060                result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1061                result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1062                store_vector4( inst, machine, result );
1063             }
1064             break;
1065          case OPCODE_SFL: /* set false, operands ignored */
1066             {
1067                static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1068                store_vector4( inst, machine, result );
1069             }
1070             break;
1071          case OPCODE_SGE: /* set on greater or equal */
1072             {
1073                GLfloat a[4], b[4], result[4];
1074                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1075                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1076                result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1077                result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1078                result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1079                result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1080                store_vector4( inst, machine, result );
1081             }
1082             break;
1083          case OPCODE_SGT: /* set on greater */
1084             {
1085                GLfloat a[4], b[4], result[4];
1086                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1087                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1088                result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1089                result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1090                result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1091                result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1092                store_vector4( inst, machine, result );
1093             }
1094             break;
1095          case OPCODE_SIN:
1096             {
1097                GLfloat a[4], result[4];
1098                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1099                result[0] = result[1] = result[2] =
1100                        result[3] = (GLfloat)_mesa_sin(a[0]);
1101                store_vector4( inst, machine, result );
1102             }
1103             break;
1104          case OPCODE_SLE: /* set on less or equal */
1105             {
1106                GLfloat a[4], b[4], result[4];
1107                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1108                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1109                result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1110                result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1111                result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1112                result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1113                store_vector4( inst, machine, result );
1114             }
1115             break;
1116          case OPCODE_SLT: /* set on less */
1117             {
1118                GLfloat a[4], b[4], result[4];
1119                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1120                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1121                result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1122                result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1123                result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1124                result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1125                store_vector4( inst, machine, result );
1126             }
1127             break;
1128          case OPCODE_SNE: /* set on not equal */
1129             {
1130                GLfloat a[4], b[4], result[4];
1131                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1132                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1133                result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1134                result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1135                result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1136                result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1137                store_vector4( inst, machine, result );
1138             }
1139             break;
1140          case OPCODE_STR: /* set true, operands ignored */
1141             {
1142                static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1143                store_vector4( inst, machine, result );
1144             }
1145             break;
1146          case OPCODE_SUB:
1147             {
1148                GLfloat a[4], b[4], result[4];
1149                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1150                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1151                result[0] = a[0] - b[0];
1152                result[1] = a[1] - b[1];
1153                result[2] = a[2] - b[2];
1154                result[3] = a[3] - b[3];
1155                store_vector4( inst, machine, result );
1156             }
1157             break;
1158          case OPCODE_SWZ:
1159             {
1160                const struct prog_src_register *source = &inst->SrcReg[0];
1161                const GLfloat *src = get_register_pointer(ctx, source,
1162                                                          machine, program);
1163                GLfloat result[4];
1164                GLuint i;
1165
1166                /* do extended swizzling here */
1167                for (i = 0; i < 4; i++) {
1168                   if (GET_SWZ(source->Swizzle, i) == SWIZZLE_ZERO)
1169                      result[i] = 0.0;
1170                   else if (GET_SWZ(source->Swizzle, i) == SWIZZLE_ONE)
1171                      result[i] = 1.0;
1172                   else
1173                      result[i] = src[GET_SWZ(source->Swizzle, i)];
1174
1175                   if (source->NegateBase & (1 << i))
1176                      result[i] = -result[i];
1177                }
1178                store_vector4( inst, machine, result );
1179             }
1180             break;
1181          case OPCODE_TEX: /* Both ARB and NV frag prog */
1182             /* Texel lookup */
1183             {
1184                GLfloat texcoord[4], color[4];
1185                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1186                /* Note: we pass 0 for LOD.  The ARB extension requires it
1187                 * while the NV extension says it's implementation dependant.
1188                 */
1189                /* KW: Previously lambda was passed as zero, but I
1190                 * believe this is incorrect, the spec seems to
1191                 * indicate rather that lambda should not be
1192                 * changed/biased, unlike TXB where texcoord[3] is
1193                 * added to the lambda calculations.  The lambda should
1194                 * still be calculated normally for TEX & TXP though,
1195                 * not set to zero.  Otherwise it's very difficult to
1196                 * implement normal GL semantics through the fragment
1197                 * shader.
1198                 */
1199                fetch_texel( ctx, texcoord,
1200                             span->array->lambda[inst->TexSrcUnit][column],
1201                             inst->TexSrcUnit, color );
1202 #if DEBUG_FRAG
1203                if (color[3])
1204                   printf("color[3] = %f\n", color[3]);
1205 #endif
1206                store_vector4( inst, machine, color );
1207             }
1208             break;
1209          case OPCODE_TXB: /* GL_ARB_fragment_program only */
1210             /* Texel lookup with LOD bias */
1211             {
1212                GLfloat texcoord[4], color[4], bias, lambda;
1213
1214                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1215                /* texcoord[3] is the bias to add to lambda */
1216                bias = ctx->Texture.Unit[inst->TexSrcUnit].LodBias
1217                     + ctx->Texture.Unit[inst->TexSrcUnit]._Current->LodBias
1218                     + texcoord[3];
1219                lambda = span->array->lambda[inst->TexSrcUnit][column] + bias;
1220                fetch_texel( ctx, texcoord, lambda,
1221                             inst->TexSrcUnit, color );
1222                store_vector4( inst, machine, color );
1223             }
1224             break;
1225          case OPCODE_TXD: /* GL_NV_fragment_program only */
1226             /* Texture lookup w/ partial derivatives for LOD */
1227             {
1228                GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1229                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1230                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dtdx );
1231                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, dtdy );
1232                fetch_texel_deriv( ctx, texcoord, dtdx, dtdy, inst->TexSrcUnit,
1233                                   color );
1234                store_vector4( inst, machine, color );
1235             }
1236             break;
1237          case OPCODE_TXP: /* GL_ARB_fragment_program only */
1238             /* Texture lookup w/ projective divide */
1239             {
1240                GLfloat texcoord[4], color[4];
1241                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1242                /* Not so sure about this test - if texcoord[3] is
1243                 * zero, we'd probably be fine except for an ASSERT in
1244                 * IROUND_POS() which gets triggered by the inf values created.
1245                 */
1246                if (texcoord[3] != 0.0) {
1247                   texcoord[0] /= texcoord[3];
1248                   texcoord[1] /= texcoord[3];
1249                   texcoord[2] /= texcoord[3];
1250                }
1251                /* KW: Previously lambda was passed as zero, but I
1252                 * believe this is incorrect, the spec seems to
1253                 * indicate rather that lambda should not be
1254                 * changed/biased, unlike TXB where texcoord[3] is
1255                 * added to the lambda calculations.  The lambda should
1256                 * still be calculated normally for TEX & TXP though,
1257                 * not set to zero.
1258                 */
1259                fetch_texel( ctx, texcoord,
1260                             span->array->lambda[inst->TexSrcUnit][column],
1261                             inst->TexSrcUnit, color );
1262                store_vector4( inst, machine, color );
1263             }
1264             break;
1265          case OPCODE_TXP_NV: /* GL_NV_fragment_program only */
1266             /* Texture lookup w/ projective divide */
1267             {
1268                GLfloat texcoord[4], color[4];
1269                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1270                if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1271                    texcoord[3] != 0.0) {
1272                   texcoord[0] /= texcoord[3];
1273                   texcoord[1] /= texcoord[3];
1274                   texcoord[2] /= texcoord[3];
1275                }
1276                fetch_texel( ctx, texcoord,
1277                             span->array->lambda[inst->TexSrcUnit][column],
1278                             inst->TexSrcUnit, color );
1279                store_vector4( inst, machine, color );
1280             }
1281             break;
1282          case OPCODE_UP2H: /* unpack two 16-bit floats */
1283             {
1284                GLfloat a[4], result[4];
1285                const GLuint *rawBits = (const GLuint *) a;
1286                GLhalfNV hx, hy;
1287                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1288                hx = rawBits[0] & 0xffff;
1289                hy = rawBits[0] >> 16;
1290                result[0] = result[2] = _mesa_half_to_float(hx);
1291                result[1] = result[3] = _mesa_half_to_float(hy);
1292                store_vector4( inst, machine, result );
1293             }
1294             break;
1295          case OPCODE_UP2US: /* unpack two GLushorts */
1296             {
1297                GLfloat a[4], result[4];
1298                const GLuint *rawBits = (const GLuint *) a;
1299                GLushort usx, usy;
1300                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1301                usx = rawBits[0] & 0xffff;
1302                usy = rawBits[0] >> 16;
1303                result[0] = result[2] = usx * (1.0f / 65535.0f);
1304                result[1] = result[3] = usy * (1.0f / 65535.0f);
1305                store_vector4( inst, machine, result );
1306             }
1307             break;
1308          case OPCODE_UP4B: /* unpack four GLbytes */
1309             {
1310                GLfloat a[4], result[4];
1311                const GLuint *rawBits = (const GLuint *) a;
1312                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1313                result[0] = (((rawBits[0] >>  0) & 0xff) - 128) / 127.0F;
1314                result[1] = (((rawBits[0] >>  8) & 0xff) - 128) / 127.0F;
1315                result[2] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1316                result[3] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1317                store_vector4( inst, machine, result );
1318             }
1319             break;
1320          case OPCODE_UP4UB: /* unpack four GLubytes */
1321             {
1322                GLfloat a[4], result[4];
1323                const GLuint *rawBits = (const GLuint *) a;
1324                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1325                result[0] = ((rawBits[0] >>  0) & 0xff) / 255.0F;
1326                result[1] = ((rawBits[0] >>  8) & 0xff) / 255.0F;
1327                result[2] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1328                result[3] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1329                store_vector4( inst, machine, result );
1330             }
1331             break;
1332          case OPCODE_XPD: /* cross product */
1333             {
1334                GLfloat a[4], b[4], result[4];
1335                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1336                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1337                result[0] = a[1] * b[2] - a[2] * b[1];
1338                result[1] = a[2] * b[0] - a[0] * b[2];
1339                result[2] = a[0] * b[1] - a[1] * b[0];
1340                result[3] = 1.0;
1341                store_vector4( inst, machine, result );
1342             }
1343             break;
1344          case OPCODE_X2D: /* 2-D matrix transform */
1345             {
1346                GLfloat a[4], b[4], c[4], result[4];
1347                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1348                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1349                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
1350                result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1351                result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1352                result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1353                result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1354                store_vector4( inst, machine, result );
1355             }
1356             break;
1357          case OPCODE_PRINT:
1358             {
1359                if (inst->SrcReg[0].File != -1) {
1360                   GLfloat a[4];
1361                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
1362                   _mesa_printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
1363                                a[0], a[1], a[2], a[3]);
1364                }
1365                else {
1366                   _mesa_printf("%s\n", (const char *) inst->Data);
1367                }
1368             }
1369             break;
1370          case OPCODE_END:
1371             return GL_TRUE;
1372          default:
1373             _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1374                           inst->Opcode);
1375             return GL_TRUE; /* return value doesn't matter */
1376       }
1377    }
1378    return GL_TRUE;
1379 }
1380
1381
1382 static void
1383 init_machine( GLcontext *ctx, struct fp_machine *machine,
1384               const struct gl_fragment_program *program,
1385               const struct sw_span *span, GLuint col )
1386 {
1387    GLuint inputsRead = program->Base.InputsRead;
1388    GLuint u;
1389
1390    if (ctx->FragmentProgram.CallbackEnabled)
1391       inputsRead = ~0;
1392
1393    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
1394       /* Clear temporary registers (undefined for ARB_f_p) */
1395       _mesa_bzero(machine->Temporaries,
1396                   MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
1397    }
1398
1399    /* Load input registers */
1400    if (inputsRead & (1 << FRAG_ATTRIB_WPOS)) {
1401       GLfloat *wpos = machine->Inputs[FRAG_ATTRIB_WPOS];
1402       ASSERT(span->arrayMask & SPAN_Z);
1403       if (span->arrayMask & SPAN_XY) {
1404          wpos[0] = (GLfloat) span->array->x[col];
1405          wpos[1] = (GLfloat) span->array->y[col];
1406       }
1407       else {
1408          wpos[0] = (GLfloat) span->x + col;
1409          wpos[1] = (GLfloat) span->y;
1410       }
1411       wpos[2] = (GLfloat) span->array->z[col] / ctx->DrawBuffer->_DepthMaxF;
1412       wpos[3] = span->w + col * span->dwdx;
1413    }
1414    if (inputsRead & (1 << FRAG_ATTRIB_COL0)) {
1415       GLfloat *col0 = machine->Inputs[FRAG_ATTRIB_COL0];
1416       ASSERT(span->arrayMask & SPAN_RGBA);
1417       col0[0] = CHAN_TO_FLOAT(span->array->rgba[col][RCOMP]);
1418       col0[1] = CHAN_TO_FLOAT(span->array->rgba[col][GCOMP]);
1419       col0[2] = CHAN_TO_FLOAT(span->array->rgba[col][BCOMP]);
1420       col0[3] = CHAN_TO_FLOAT(span->array->rgba[col][ACOMP]);
1421    }
1422    if (inputsRead & (1 << FRAG_ATTRIB_COL1)) {
1423       GLfloat *col1 = machine->Inputs[FRAG_ATTRIB_COL1];
1424       col1[0] = CHAN_TO_FLOAT(span->array->spec[col][RCOMP]);
1425       col1[1] = CHAN_TO_FLOAT(span->array->spec[col][GCOMP]);
1426       col1[2] = CHAN_TO_FLOAT(span->array->spec[col][BCOMP]);
1427       col1[3] = CHAN_TO_FLOAT(span->array->spec[col][ACOMP]);
1428    }
1429    if (inputsRead & (1 << FRAG_ATTRIB_FOGC)) {
1430       GLfloat *fogc = machine->Inputs[FRAG_ATTRIB_FOGC];
1431       ASSERT(span->arrayMask & SPAN_FOG);
1432       fogc[0] = span->array->fog[col];
1433       fogc[1] = 0.0F;
1434       fogc[2] = 0.0F;
1435       fogc[3] = 0.0F;
1436    }
1437    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
1438       if (inputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
1439          GLfloat *tex = machine->Inputs[FRAG_ATTRIB_TEX0 + u];
1440          /*ASSERT(ctx->Texture._EnabledCoordUnits & (1 << u));*/
1441          COPY_4V(tex, span->array->texcoords[u][col]);
1442          /*ASSERT(tex[0] != 0 || tex[1] != 0 || tex[2] != 0);*/
1443       }
1444    }
1445
1446    /* init condition codes */
1447    machine->CondCodes[0] = COND_EQ;
1448    machine->CondCodes[1] = COND_EQ;
1449    machine->CondCodes[2] = COND_EQ;
1450    machine->CondCodes[3] = COND_EQ;
1451 }
1452
1453
1454
1455 /**
1456  * Execute the current fragment program, operating on the given span.
1457  */
1458 void
1459 _swrast_exec_fragment_program( GLcontext *ctx, struct sw_span *span )
1460 {
1461    const struct gl_fragment_program *program = ctx->FragmentProgram._Current;
1462    GLuint i;
1463
1464    ctx->_CurrentProgram = GL_FRAGMENT_PROGRAM_ARB; /* or NV, doesn't matter */
1465
1466    if (program->Base.Parameters) {
1467       _mesa_load_state_parameters(ctx, program->Base.Parameters);
1468    }
1469
1470    for (i = 0; i < span->end; i++) {
1471       if (span->array->mask[i]) {
1472          init_machine(ctx, &ctx->FragmentProgram.Machine,
1473                       ctx->FragmentProgram._Current, span, i);
1474
1475          if (!execute_program(ctx, program, ~0,
1476                               &ctx->FragmentProgram.Machine, span, i)) {
1477             span->array->mask[i] = GL_FALSE;  /* killed fragment */
1478             span->writeAll = GL_FALSE;
1479          }
1480
1481          /* Store output registers */
1482          {
1483             const GLfloat *colOut
1484                = ctx->FragmentProgram.Machine.Outputs[FRAG_RESULT_COLR];
1485             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][RCOMP], colOut[0]);
1486             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][GCOMP], colOut[1]);
1487             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][BCOMP], colOut[2]);
1488             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][ACOMP], colOut[3]);
1489          }
1490          /* depth value */
1491          if (program->Base.OutputsWritten & (1 << FRAG_RESULT_DEPR)) {
1492             const GLfloat depth
1493                = ctx->FragmentProgram.Machine.Outputs[FRAG_RESULT_DEPR][2];
1494             if (depth <= 0.0)
1495                span->array->z[i] = 0;
1496             else if (depth >= 1.0)
1497                span->array->z[i] = ctx->DrawBuffer->_DepthMax;
1498             else
1499                span->array->z[i] = IROUND(depth * ctx->DrawBuffer->_DepthMaxF);
1500          }
1501       }
1502    }
1503
1504    if (program->Base.OutputsWritten & (1 << FRAG_RESULT_DEPR)) {
1505       span->interpMask &= ~SPAN_Z;
1506       span->arrayMask |= SPAN_Z;
1507    }
1508
1509    ctx->_CurrentProgram = 0;
1510 }
1511