src/mesa/swrast/s_nvfragprog.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.3
   4  *
   5  * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /*
  26  * Regarding GL_NV_fragment_program:
  27  *
  28  * Portions of this software may use or implement intellectual
  29  * property owned and licensed by NVIDIA Corporation. NVIDIA disclaims
  30  * any and all warranties with respect to such intellectual property,
  31  * including any use thereof or modifications thereto.
  32  */
  33
  34 #include "glheader.h"
  35 #include "colormac.h"
  36 #include "context.h"
  37 #include "nvfragprog.h"
  38 #include "program.h"
  39
  40 #include "s_nvfragprog.h"
  41 #include "s_span.h"
  42
  43
  44 /* if 1, print some debugging info */
  45 #define DEBUG_FRAG 0
  46
  47 /**
  48  * Fetch a texel.
  49  */
  50 static void
  51 fetch_texel( GLcontext *ctx, const GLfloat texcoord[4], GLfloat lambda,
  52              GLuint unit, GLfloat color[4] )
  53 {
  54    GLchan rgba[4];
  55    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  56
  57    /* XXX use a float-valued TextureSample routine here!!! */
  58    swrast->TextureSample[unit](ctx, ctx->Texture.Unit[unit]._Current,
  59                                1, (const GLfloat (*)[4]) texcoord,
  60                                &lambda, &rgba);
  61    color[0] = CHAN_TO_FLOAT(rgba[0]);
  62    color[1] = CHAN_TO_FLOAT(rgba[1]);
  63    color[2] = CHAN_TO_FLOAT(rgba[2]);
  64    color[3] = CHAN_TO_FLOAT(rgba[3]);
  65 }
  66
  67
  68 /**
  69  * Fetch a texel with the given partial derivatives to compute a level
  70  * of detail in the mipmap.
  71  */
  72 static void
  73 fetch_texel_deriv( GLcontext *ctx, const GLfloat texcoord[4],
  74                    const GLfloat texdx[4], const GLfloat texdy[4],
  75                    GLuint unit, GLfloat color[4] )
  76 {
  77    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  78    const struct gl_texture_object *texObj = ctx->Texture.Unit[unit]._Current;
  79    const struct gl_texture_image *texImg = texObj->Image[0][texObj->BaseLevel];
  80    const GLfloat texW = (GLfloat) texImg->WidthScale;
  81    const GLfloat texH = (GLfloat) texImg->HeightScale;
  82    GLchan rgba[4];
  83
  84    GLfloat lambda = _swrast_compute_lambda(texdx[0], texdy[0], /* ds/dx, ds/dy */
  85                                          texdx[1], texdy[1], /* dt/dx, dt/dy */
  86                                          texdx[3], texdy[2], /* dq/dx, dq/dy */
  87                                          texW, texH,
  88                                          texcoord[0], texcoord[1], texcoord[3],
  89                                          1.0F / texcoord[3]);
  90
  91    swrast->TextureSample[unit](ctx, ctx->Texture.Unit[unit]._Current,
  92                                1, (const GLfloat (*)[4]) texcoord,
  93                                &lambda, &rgba);
  94    color[0] = CHAN_TO_FLOAT(rgba[0]);
  95    color[1] = CHAN_TO_FLOAT(rgba[1]);
  96    color[2] = CHAN_TO_FLOAT(rgba[2]);
  97    color[3] = CHAN_TO_FLOAT(rgba[3]);
  98 }
  99
 100
 101 /**
 102  * Return a pointer to the 4-element float vector specified by the given
 103  * source register.
 104  */
 105 static INLINE const GLfloat *
 106 get_register_pointer( GLcontext *ctx,
 107                       const struct fp_src_register *source,
 108                       const struct fp_machine *machine,
 109                       const struct fragment_program *program )
 110 {
 111    const GLfloat *src;
 112    switch (source->File) {
 113       case PROGRAM_TEMPORARY:
 114          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_TEMPS);
 115          src = machine->Temporaries[source->Index];
 116          break;
 117       case PROGRAM_INPUT:
 118          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_INPUTS);
 119          src = machine->Inputs[source->Index];
 120          break;
 121       case PROGRAM_OUTPUT:
 122          /* This is only for PRINT */
 123          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_OUTPUTS);
 124          src = machine->Outputs[source->Index];
 125          break;
 126       case PROGRAM_LOCAL_PARAM:
 127          ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 128          src = program->Base.LocalParams[source->Index];
 129          break;
 130       case PROGRAM_ENV_PARAM:
 131          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_PARAMS);
 132          src = ctx->FragmentProgram.Parameters[source->Index];
 133          break;
 134       case PROGRAM_STATE_VAR:
 135          /* Fallthrough */
 136       case PROGRAM_NAMED_PARAM:
 137          ASSERT(source->Index < (GLint) program->Parameters->NumParameters);
 138          src = program->Parameters->ParameterValues[source->Index];
 139          break;
 140       default:
 141          _mesa_problem(ctx, "Invalid input register file %d in fetch_vector4", source->File);
 142          src = NULL;
 143    }
 144    return src;
 145 }
 146
 147
 148 /**
 149  * Fetch a 4-element float vector from the given source register.
 150  * Apply swizzling and negating as needed.
 151  */
 152 static void
 153 fetch_vector4( GLcontext *ctx,
 154                const struct fp_src_register *source,
 155                const struct fp_machine *machine,
 156                const struct fragment_program *program,
 157                GLfloat result[4] )
 158 {
 159    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 160    ASSERT(src);
 161
 162    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 163    result[1] = src[GET_SWZ(source->Swizzle, 1)];
 164    result[2] = src[GET_SWZ(source->Swizzle, 2)];
 165    result[3] = src[GET_SWZ(source->Swizzle, 3)];
 166
 167    if (source->NegateBase) {
 168       result[0] = -result[0];
 169       result[1] = -result[1];
 170       result[2] = -result[2];
 171       result[3] = -result[3];
 172    }
 173    if (source->Abs) {
 174       result[0] = FABSF(result[0]);
 175       result[1] = FABSF(result[1]);
 176       result[2] = FABSF(result[2]);
 177       result[3] = FABSF(result[3]);
 178    }
 179    if (source->NegateAbs) {
 180       result[0] = -result[0];
 181       result[1] = -result[1];
 182       result[2] = -result[2];
 183       result[3] = -result[3];
 184    }
 185 }
 186
 187
 188 /**
 189  * Fetch the derivative with respect to X for the given register.
 190  * \return GL_TRUE if it was easily computed or GL_FALSE if we
 191  * need to execute another instance of the program (ugh)!
 192  */
 193 static GLboolean
 194 fetch_vector4_deriv( GLcontext *ctx,
 195                      const struct fp_src_register *source,
 196                      const struct sw_span *span,
 197                      char xOrY, GLint column, GLfloat result[4] )
 198 {
 199    GLfloat src[4];
 200
 201    ASSERT(xOrY == 'X' || xOrY == 'Y');
 202
 203    switch (source->Index) {
 204    case FRAG_ATTRIB_WPOS:
 205       if (xOrY == 'X') {
 206          src[0] = 1.0;
 207          src[1] = 0.0;
 208          src[2] = span->dzdx / ctx->DrawBuffer->_DepthMaxF;
 209          src[3] = span->dwdx;
 210       }
 211       else {
 212          src[0] = 0.0;
 213          src[1] = 1.0;
 214          src[2] = span->dzdy / ctx->DrawBuffer->_DepthMaxF;
 215          src[3] = span->dwdy;
 216       }
 217       break;
 218    case FRAG_ATTRIB_COL0:
 219       if (xOrY == 'X') {
 220          src[0] = span->drdx * (1.0F / CHAN_MAXF);
 221          src[1] = span->dgdx * (1.0F / CHAN_MAXF);
 222          src[2] = span->dbdx * (1.0F / CHAN_MAXF);
 223          src[3] = span->dadx * (1.0F / CHAN_MAXF);
 224       }
 225       else {
 226          src[0] = span->drdy * (1.0F / CHAN_MAXF);
 227          src[1] = span->dgdy * (1.0F / CHAN_MAXF);
 228          src[2] = span->dbdy * (1.0F / CHAN_MAXF);
 229          src[3] = span->dady * (1.0F / CHAN_MAXF);
 230       }
 231       break;
 232    case FRAG_ATTRIB_COL1:
 233       if (xOrY == 'X') {
 234          src[0] = span->dsrdx * (1.0F / CHAN_MAXF);
 235          src[1] = span->dsgdx * (1.0F / CHAN_MAXF);
 236          src[2] = span->dsbdx * (1.0F / CHAN_MAXF);
 237          src[3] = 0.0; /* XXX need this */
 238       }
 239       else {
 240          src[0] = span->dsrdy * (1.0F / CHAN_MAXF);
 241          src[1] = span->dsgdy * (1.0F / CHAN_MAXF);
 242          src[2] = span->dsbdy * (1.0F / CHAN_MAXF);
 243          src[3] = 0.0; /* XXX need this */
 244       }
 245       break;
 246    case FRAG_ATTRIB_FOGC:
 247       if (xOrY == 'X') {
 248          src[0] = span->dfogdx;
 249          src[1] = 0.0;
 250          src[2] = 0.0;
 251          src[3] = 0.0;
 252       }
 253       else {
 254          src[0] = span->dfogdy;
 255          src[1] = 0.0;
 256          src[2] = 0.0;
 257          src[3] = 0.0;
 258       }
 259       break;
 260    case FRAG_ATTRIB_TEX0:
 261    case FRAG_ATTRIB_TEX1:
 262    case FRAG_ATTRIB_TEX2:
 263    case FRAG_ATTRIB_TEX3:
 264    case FRAG_ATTRIB_TEX4:
 265    case FRAG_ATTRIB_TEX5:
 266    case FRAG_ATTRIB_TEX6:
 267    case FRAG_ATTRIB_TEX7:
 268       if (xOrY == 'X') {
 269          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 270          /* this is a little tricky - I think I've got it right */
 271          const GLfloat invQ = 1.0f / (span->tex[u][3]
 272                                       + span->texStepX[u][3] * column);
 273          src[0] = span->texStepX[u][0] * invQ;
 274          src[1] = span->texStepX[u][1] * invQ;
 275          src[2] = span->texStepX[u][2] * invQ;
 276          src[3] = span->texStepX[u][3] * invQ;
 277       }
 278       else {
 279          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 280          /* Tricky, as above, but in Y direction */
 281          const GLfloat invQ = 1.0f / (span->tex[u][3] + span->texStepY[u][3]);
 282          src[0] = span->texStepY[u][0] * invQ;
 283          src[1] = span->texStepY[u][1] * invQ;
 284          src[2] = span->texStepY[u][2] * invQ;
 285          src[3] = span->texStepY[u][3] * invQ;
 286       }
 287       break;
 288    default:
 289       return GL_FALSE;
 290    }
 291
 292    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 293    result[1] = src[GET_SWZ(source->Swizzle, 1)];
 294    result[2] = src[GET_SWZ(source->Swizzle, 2)];
 295    result[3] = src[GET_SWZ(source->Swizzle, 3)];
 296
 297    if (source->NegateBase) {
 298       result[0] = -result[0];
 299       result[1] = -result[1];
 300       result[2] = -result[2];
 301       result[3] = -result[3];
 302    }
 303    if (source->Abs) {
 304       result[0] = FABSF(result[0]);
 305       result[1] = FABSF(result[1]);
 306       result[2] = FABSF(result[2]);
 307       result[3] = FABSF(result[3]);
 308    }
 309    if (source->NegateAbs) {
 310       result[0] = -result[0];
 311       result[1] = -result[1];
 312       result[2] = -result[2];
 313       result[3] = -result[3];
 314    }
 315    return GL_TRUE;
 316 }
 317
 318
 319 /**
 320  * As above, but only return result[0] element.
 321  */
 322 static void
 323 fetch_vector1( GLcontext *ctx,
 324                const struct fp_src_register *source,
 325                const struct fp_machine *machine,
 326                const struct fragment_program *program,
 327                GLfloat result[4] )
 328 {
 329    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 330    ASSERT(src);
 331
 332    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 333
 334    if (source->NegateBase) {
 335       result[0] = -result[0];
 336    }
 337    if (source->Abs) {
 338       result[0] = FABSF(result[0]);
 339    }
 340    if (source->NegateAbs) {
 341       result[0] = -result[0];
 342    }
 343 }
 344
 345
 346 /**
 347  * Test value against zero and return GT, LT, EQ or UN if NaN.
 348  */
 349 static INLINE GLuint
 350 generate_cc( float value )
 351 {
 352    if (value != value)
 353       return COND_UN;  /* NaN */
 354    if (value > 0.0F)
 355       return COND_GT;
 356    if (value < 0.0F)
 357       return COND_LT;
 358    return COND_EQ;
 359 }
 360
 361
 362 /**
 363  * Test if the ccMaskRule is satisfied by the given condition code.
 364  * Used to mask destination writes according to the current condition codee.
 365  */
 366 static INLINE GLboolean
 367 test_cc(GLuint condCode, GLuint ccMaskRule)
 368 {
 369    switch (ccMaskRule) {
 370    case COND_EQ: return (condCode == COND_EQ);
 371    case COND_NE: return (condCode != COND_EQ);
 372    case COND_LT: return (condCode == COND_LT);
 373    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 374    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 375    case COND_GT: return (condCode == COND_GT);
 376    case COND_TR: return GL_TRUE;
 377    case COND_FL: return GL_FALSE;
 378    default:      return GL_TRUE;
 379    }
 380 }
 381
 382
 383 /**
 384  * Store 4 floats into a register.  Observe the instructions saturate and
 385  * set-condition-code flags.
 386  */
 387 static void
 388 store_vector4( const struct fp_instruction *inst,
 389                struct fp_machine *machine,
 390                const GLfloat value[4] )
 391 {
 392    const struct fp_dst_register *dest = &(inst->DstReg);
 393    const GLboolean clamp = inst->Saturate;
 394    const GLboolean updateCC = inst->UpdateCondRegister;
 395    GLfloat *dstReg;
 396    GLfloat dummyReg[4];
 397    GLfloat clampedValue[4];
 398    GLboolean condWriteMask[4];
 399    GLuint writeMask = dest->WriteMask;
 400
 401    switch (dest->File) {
 402       case PROGRAM_OUTPUT:
 403          dstReg = machine->Outputs[dest->Index];
 404          break;
 405       case PROGRAM_TEMPORARY:
 406          dstReg = machine->Temporaries[dest->Index];
 407          break;
 408       case PROGRAM_WRITE_ONLY:
 409          dstReg = dummyReg;
 410          return;
 411       default:
 412          _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 413          return;
 414    }
 415
 416 #if DEBUG_FRAG
 417    if (value[0] > 1.0e10 ||
 418        IS_INF_OR_NAN(value[0]) ||
 419        IS_INF_OR_NAN(value[1]) ||
 420        IS_INF_OR_NAN(value[2]) ||
 421        IS_INF_OR_NAN(value[3])  )
 422       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 423 #endif
 424
 425    if (clamp) {
 426       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 427       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 428       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 429       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 430       value = clampedValue;
 431    }
 432
 433    if (dest->CondMask != COND_TR) {
 434       condWriteMask[0] = GET_BIT(writeMask, 0)
 435          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 0)], dest->CondMask);
 436       condWriteMask[1] = GET_BIT(writeMask, 1)
 437          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 1)], dest->CondMask);
 438       condWriteMask[2] = GET_BIT(writeMask, 2)
 439          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 2)], dest->CondMask);
 440       condWriteMask[3] = GET_BIT(writeMask, 3)
 441          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 3)], dest->CondMask);
 442
 443       writeMask = ((condWriteMask[0] << 0) |
 444                    (condWriteMask[1] << 1) |
 445                    (condWriteMask[2] << 2) |
 446                    (condWriteMask[3] << 3));
 447    }
 448
 449    if (GET_BIT(writeMask, 0)) {
 450       dstReg[0] = value[0];
 451       if (updateCC)
 452          machine->CondCodes[0] = generate_cc(value[0]);
 453    }
 454    if (GET_BIT(writeMask, 1)) {
 455       dstReg[1] = value[1];
 456       if (updateCC)
 457          machine->CondCodes[1] = generate_cc(value[1]);
 458    }
 459    if (GET_BIT(writeMask, 2)) {
 460       dstReg[2] = value[2];
 461       if (updateCC)
 462          machine->CondCodes[2] = generate_cc(value[2]);
 463    }
 464    if (GET_BIT(writeMask, 3)) {
 465       dstReg[3] = value[3];
 466       if (updateCC)
 467          machine->CondCodes[3] = generate_cc(value[3]);
 468    }
 469 }
 470
 471
 472 /**
 473  * Initialize a new machine state instance from an existing one, adding
 474  * the partial derivatives onto the input registers.
 475  * Used to implement DDX and DDY instructions in non-trivial cases.
 476  */
 477 static void
 478 init_machine_deriv( GLcontext *ctx,
 479                     const struct fp_machine *machine,
 480                     const struct fragment_program *program,
 481                     const struct sw_span *span, char xOrY,
 482                     struct fp_machine *dMachine )
 483 {
 484    GLuint u;
 485
 486    ASSERT(xOrY == 'X' || xOrY == 'Y');
 487
 488    /* copy existing machine */
 489    _mesa_memcpy(dMachine, machine, sizeof(struct fp_machine));
 490
 491    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
 492       /* Clear temporary registers (undefined for ARB_f_p) */
 493       _mesa_bzero( (void*) machine->Temporaries,
 494                    MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
 495    }
 496
 497    /* Add derivatives */
 498    if (program->InputsRead & (1 << FRAG_ATTRIB_WPOS)) {
 499       GLfloat *wpos = (GLfloat*) machine->Inputs[FRAG_ATTRIB_WPOS];
 500       if (xOrY == 'X') {
 501          wpos[0] += 1.0F;
 502          wpos[1] += 0.0F;
 503          wpos[2] += span->dzdx;
 504          wpos[3] += span->dwdx;
 505       }
 506       else {
 507          wpos[0] += 0.0F;
 508          wpos[1] += 1.0F;
 509          wpos[2] += span->dzdy;
 510          wpos[3] += span->dwdy;
 511       }
 512    }
 513    if (program->InputsRead & (1 << FRAG_ATTRIB_COL0)) {
 514       GLfloat *col0 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL0];
 515       if (xOrY == 'X') {
 516          col0[0] += span->drdx * (1.0F / CHAN_MAXF);
 517          col0[1] += span->dgdx * (1.0F / CHAN_MAXF);
 518          col0[2] += span->dbdx * (1.0F / CHAN_MAXF);
 519          col0[3] += span->dadx * (1.0F / CHAN_MAXF);
 520       }
 521       else {
 522          col0[0] += span->drdy * (1.0F / CHAN_MAXF);
 523          col0[1] += span->dgdy * (1.0F / CHAN_MAXF);
 524          col0[2] += span->dbdy * (1.0F / CHAN_MAXF);
 525          col0[3] += span->dady * (1.0F / CHAN_MAXF);
 526       }
 527    }
 528    if (program->InputsRead & (1 << FRAG_ATTRIB_COL1)) {
 529       GLfloat *col1 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL1];
 530       if (xOrY == 'X') {
 531          col1[0] += span->dsrdx * (1.0F / CHAN_MAXF);
 532          col1[1] += span->dsgdx * (1.0F / CHAN_MAXF);
 533          col1[2] += span->dsbdx * (1.0F / CHAN_MAXF);
 534          col1[3] += 0.0; /*XXX fix */
 535       }
 536       else {
 537          col1[0] += span->dsrdy * (1.0F / CHAN_MAXF);
 538          col1[1] += span->dsgdy * (1.0F / CHAN_MAXF);
 539          col1[2] += span->dsbdy * (1.0F / CHAN_MAXF);
 540          col1[3] += 0.0; /*XXX fix */
 541       }
 542    }
 543    if (program->InputsRead & (1 << FRAG_ATTRIB_FOGC)) {
 544       GLfloat *fogc = (GLfloat*) machine->Inputs[FRAG_ATTRIB_FOGC];
 545       if (xOrY == 'X') {
 546          fogc[0] += span->dfogdx;
 547       }
 548       else {
 549          fogc[0] += span->dfogdy;
 550       }
 551    }
 552    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
 553       if (program->InputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
 554          GLfloat *tex = (GLfloat*) machine->Inputs[FRAG_ATTRIB_TEX0 + u];
 555          /* XXX perspective-correct interpolation */
 556          if (xOrY == 'X') {
 557             tex[0] += span->texStepX[u][0];
 558             tex[1] += span->texStepX[u][1];
 559             tex[2] += span->texStepX[u][2];
 560             tex[3] += span->texStepX[u][3];
 561          }
 562          else {
 563             tex[0] += span->texStepY[u][0];
 564             tex[1] += span->texStepY[u][1];
 565             tex[2] += span->texStepY[u][2];
 566             tex[3] += span->texStepY[u][3];
 567          }
 568       }
 569    }
 570
 571    /* init condition codes */
 572    dMachine->CondCodes[0] = COND_EQ;
 573    dMachine->CondCodes[1] = COND_EQ;
 574    dMachine->CondCodes[2] = COND_EQ;
 575    dMachine->CondCodes[3] = COND_EQ;
 576 }
 577
 578
 579 /**
 580  * Execute the given fragment program.
 581  * NOTE: we do everything in single-precision floating point; we don't
 582  * currently observe the single/half/fixed-precision qualifiers.
 583  * \param ctx - rendering context
 584  * \param program - the fragment program to execute
 585  * \param machine - machine state (register file)
 586  * \param maxInst - max number of instructions to execute
 587  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 588  */
 589 static GLboolean
 590 execute_program( GLcontext *ctx,
 591                  const struct fragment_program *program, GLuint maxInst,
 592                  struct fp_machine *machine, const struct sw_span *span,
 593                  GLuint column )
 594 {
 595    GLuint pc;
 596
 597 #if DEBUG_FRAG
 598    printf("execute fragment program --------------------\n");
 599 #endif
 600
 601    for (pc = 0; pc < maxInst; pc++) {
 602       const struct fp_instruction *inst = program->Instructions + pc;
 603
 604       if (ctx->FragmentProgram.CallbackEnabled &&
 605           ctx->FragmentProgram.Callback) {
 606          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 607          ctx->FragmentProgram.Callback(program->Base.Target,
 608                                        ctx->FragmentProgram.CallbackData);
 609       }
 610
 611       switch (inst->Opcode) {
 612          case FP_OPCODE_ABS:
 613             {
 614                GLfloat a[4], result[4];
 615                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 616                result[0] = FABSF(a[0]);
 617                result[1] = FABSF(a[1]);
 618                result[2] = FABSF(a[2]);
 619                result[3] = FABSF(a[3]);
 620                store_vector4( inst, machine, result );
 621             }
 622             break;
 623          case FP_OPCODE_ADD:
 624             {
 625                GLfloat a[4], b[4], result[4];
 626                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 627                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 628                result[0] = a[0] + b[0];
 629                result[1] = a[1] + b[1];
 630                result[2] = a[2] + b[2];
 631                result[3] = a[3] + b[3];
 632                store_vector4( inst, machine, result );
 633             }
 634             break;
 635          case FP_OPCODE_CMP:
 636             {
 637                GLfloat a[4], b[4], c[4], result[4];
 638                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 639                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 640                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 641                result[0] = a[0] < 0.0F ? b[0] : c[0];
 642                result[1] = a[1] < 0.0F ? b[1] : c[1];
 643                result[2] = a[2] < 0.0F ? b[2] : c[2];
 644                result[3] = a[3] < 0.0F ? b[3] : c[3];
 645                store_vector4( inst, machine, result );
 646             }
 647             break;
 648          case FP_OPCODE_COS:
 649             {
 650                GLfloat a[4], result[4];
 651                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 652                result[0] = result[1] = result[2] = result[3] = (GLfloat)_mesa_cos(a[0]);
 653                store_vector4( inst, machine, result );
 654             }
 655             break;
 656          case FP_OPCODE_DDX: /* Partial derivative with respect to X */
 657             {
 658                GLfloat a[4], aNext[4], result[4];
 659                struct fp_machine dMachine;
 660                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'X',
 661                                         column, result)) {
 662                   /* This is tricky.  Make a copy of the current machine state,
 663                    * increment the input registers by the dx or dy partial
 664                    * derivatives, then re-execute the program up to the
 665                    * preceeding instruction, then fetch the source register.
 666                    * Finally, find the difference in the register values for
 667                    * the original and derivative runs.
 668                    */
 669                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 670                   init_machine_deriv(ctx, machine, program, span,
 671                                      'X', &dMachine);
 672                   execute_program(ctx, program, pc, &dMachine, span, column);
 673                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 674                   result[0] = aNext[0] - a[0];
 675                   result[1] = aNext[1] - a[1];
 676                   result[2] = aNext[2] - a[2];
 677                   result[3] = aNext[3] - a[3];
 678                }
 679                store_vector4( inst, machine, result );
 680             }
 681             break;
 682          case FP_OPCODE_DDY: /* Partial derivative with respect to Y */
 683             {
 684                GLfloat a[4], aNext[4], result[4];
 685                struct fp_machine dMachine;
 686                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'Y',
 687                                         column, result)) {
 688                   init_machine_deriv(ctx, machine, program, span,
 689                                      'Y', &dMachine);
 690                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 691                   execute_program(ctx, program, pc, &dMachine, span, column);
 692                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 693                   result[0] = aNext[0] - a[0];
 694                   result[1] = aNext[1] - a[1];
 695                   result[2] = aNext[2] - a[2];
 696                   result[3] = aNext[3] - a[3];
 697                }
 698                store_vector4( inst, machine, result );
 699             }
 700             break;
 701          case FP_OPCODE_DP3:
 702             {
 703                GLfloat a[4], b[4], result[4];
 704                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 705                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 706                result[0] = result[1] = result[2] = result[3] =
 707                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
 708                store_vector4( inst, machine, result );
 709 #if DEBUG_FRAG
 710                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 711                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 712 #endif
 713             }
 714             break;
 715          case FP_OPCODE_DP4:
 716             {
 717                GLfloat a[4], b[4], result[4];
 718                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 719                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 720                result[0] = result[1] = result[2] = result[3] =
 721                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 722                store_vector4( inst, machine, result );
 723 #if DEBUG_FRAG
 724                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 725                       result[0], a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 726 #endif
 727             }
 728             break;
 729          case FP_OPCODE_DPH:
 730             {
 731                GLfloat a[4], b[4], result[4];
 732                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 733                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 734                result[0] = result[1] = result[2] = result[3] =
 735                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
 736                store_vector4( inst, machine, result );
 737             }
 738             break;
 739          case FP_OPCODE_DST: /* Distance vector */
 740             {
 741                GLfloat a[4], b[4], result[4];
 742                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 743                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 744                result[0] = 1.0F;
 745                result[1] = a[1] * b[1];
 746                result[2] = a[2];
 747                result[3] = b[3];
 748                store_vector4( inst, machine, result );
 749             }
 750             break;
 751          case FP_OPCODE_EX2: /* Exponential base 2 */
 752             {
 753                GLfloat a[4], result[4];
 754                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 755                result[0] = result[1] = result[2] = result[3] =
 756                   (GLfloat) _mesa_pow(2.0, a[0]);
 757                store_vector4( inst, machine, result );
 758             }
 759             break;
 760          case FP_OPCODE_FLR:
 761             {
 762                GLfloat a[4], result[4];
 763                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 764                result[0] = FLOORF(a[0]);
 765                result[1] = FLOORF(a[1]);
 766                result[2] = FLOORF(a[2]);
 767                result[3] = FLOORF(a[3]);
 768                store_vector4( inst, machine, result );
 769             }
 770             break;
 771          case FP_OPCODE_FRC:
 772             {
 773                GLfloat a[4], result[4];
 774                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 775                result[0] = a[0] - FLOORF(a[0]);
 776                result[1] = a[1] - FLOORF(a[1]);
 777                result[2] = a[2] - FLOORF(a[2]);
 778                result[3] = a[3] - FLOORF(a[3]);
 779                store_vector4( inst, machine, result );
 780             }
 781             break;
 782          case FP_OPCODE_KIL_NV: /* NV_f_p only */
 783             {
 784                const GLuint swizzle = inst->DstReg.CondSwizzle;
 785                const GLuint condMask = inst->DstReg.CondMask;
 786                if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 787                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 788                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 789                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 790                   return GL_FALSE;
 791                }
 792             }
 793             break;
 794          case FP_OPCODE_KIL: /* ARB_f_p only */
 795             {
 796                GLfloat a[4];
 797                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 798                if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 799                   return GL_FALSE;
 800                }
 801             }
 802             break;
 803          case FP_OPCODE_LG2:  /* log base 2 */
 804             {
 805                GLfloat a[4], result[4];
 806                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 807                result[0] = result[1] = result[2] = result[3]
 808                   = LOG2(a[0]);
 809                store_vector4( inst, machine, result );
 810             }
 811             break;
 812          case FP_OPCODE_LIT:
 813             {
 814                const GLfloat epsilon = 1.0F / 256.0F; /* from NV VP spec */
 815                GLfloat a[4], result[4];
 816                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 817                a[0] = MAX2(a[0], 0.0F);
 818                a[1] = MAX2(a[1], 0.0F);
 819                /* XXX ARB version clamps a[3], NV version doesn't */
 820                a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 821                result[0] = 1.0F;
 822                result[1] = a[0];
 823                /* XXX we could probably just use pow() here */
 824                result[2] = (a[0] > 0.0F) ? (GLfloat) exp(a[3] * log(a[1])) : 0.0F;
 825                result[3] = 1.0F;
 826                store_vector4( inst, machine, result );
 827             }
 828             break;
 829          case FP_OPCODE_LRP:
 830             {
 831                GLfloat a[4], b[4], c[4], result[4];
 832                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 833                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 834                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 835                result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 836                result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 837                result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 838                result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 839                store_vector4( inst, machine, result );
 840             }
 841             break;
 842          case FP_OPCODE_MAD:
 843             {
 844                GLfloat a[4], b[4], c[4], result[4];
 845                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 846                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 847                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 848                result[0] = a[0] * b[0] + c[0];
 849                result[1] = a[1] * b[1] + c[1];
 850                result[2] = a[2] * b[2] + c[2];
 851                result[3] = a[3] * b[3] + c[3];
 852                store_vector4( inst, machine, result );
 853             }
 854             break;
 855          case FP_OPCODE_MAX:
 856             {
 857                GLfloat a[4], b[4], result[4];
 858                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 859                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 860                result[0] = MAX2(a[0], b[0]);
 861                result[1] = MAX2(a[1], b[1]);
 862                result[2] = MAX2(a[2], b[2]);
 863                result[3] = MAX2(a[3], b[3]);
 864                store_vector4( inst, machine, result );
 865 #if DEBUG_FRAG
 866                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
 867                       result[0], result[1], result[2], result[3],
 868                       a[0], a[1], a[2], a[3],
 869                       b[0], b[1], b[2], b[3]);
 870 #endif
 871             }
 872             break;
 873          case FP_OPCODE_MIN:
 874             {
 875                GLfloat a[4], b[4], result[4];
 876                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 877                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 878                result[0] = MIN2(a[0], b[0]);
 879                result[1] = MIN2(a[1], b[1]);
 880                result[2] = MIN2(a[2], b[2]);
 881                result[3] = MIN2(a[3], b[3]);
 882                store_vector4( inst, machine, result );
 883             }
 884             break;
 885          case FP_OPCODE_MOV:
 886             {
 887                GLfloat result[4];
 888                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, result );
 889                store_vector4( inst, machine, result );
 890 #if DEBUG_FRAG
 891                printf("MOV (%g %g %g %g)\n",
 892                       result[0], result[1], result[2], result[3]);
 893 #endif
 894             }
 895             break;
 896          case FP_OPCODE_MUL:
 897             {
 898                GLfloat a[4], b[4], result[4];
 899                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 900                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 901                result[0] = a[0] * b[0];
 902                result[1] = a[1] * b[1];
 903                result[2] = a[2] * b[2];
 904                result[3] = a[3] * b[3];
 905                store_vector4( inst, machine, result );
 906 #if DEBUG_FRAG
 907                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 908                       result[0], result[1], result[2], result[3],
 909                       a[0], a[1], a[2], a[3],
 910                       b[0], b[1], b[2], b[3]);
 911 #endif
 912             }
 913             break;
 914          case FP_OPCODE_PK2H: /* pack two 16-bit floats in one 32-bit float */
 915             {
 916                GLfloat a[4], result[4];
 917                GLhalfNV hx, hy;
 918                GLuint *rawResult = (GLuint *) result;
 919                GLuint twoHalves;
 920                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 921                hx = _mesa_float_to_half(a[0]);
 922                hy = _mesa_float_to_half(a[1]);
 923                twoHalves = hx | (hy << 16);
 924                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 925                   = twoHalves;
 926                store_vector4( inst, machine, result );
 927             }
 928             break;
 929          case FP_OPCODE_PK2US: /* pack two GLushorts into one 32-bit float */
 930             {
 931                GLfloat a[4], result[4];
 932                GLuint usx, usy, *rawResult = (GLuint *) result;
 933                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 934                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 935                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 936                usx = IROUND(a[0] * 65535.0F);
 937                usy = IROUND(a[1] * 65535.0F);
 938                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 939                   = usx | (usy << 16);
 940                store_vector4( inst, machine, result );
 941             }
 942             break;
 943          case FP_OPCODE_PK4B: /* pack four GLbytes into one 32-bit float */
 944             {
 945                GLfloat a[4], result[4];
 946                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 947                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 948                a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
 949                a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
 950                a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
 951                a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
 952                ubx = IROUND(127.0F * a[0] + 128.0F);
 953                uby = IROUND(127.0F * a[1] + 128.0F);
 954                ubz = IROUND(127.0F * a[2] + 128.0F);
 955                ubw = IROUND(127.0F * a[3] + 128.0F);
 956                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 957                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 958                store_vector4( inst, machine, result );
 959             }
 960             break;
 961          case FP_OPCODE_PK4UB: /* pack four GLubytes into one 32-bit float */
 962             {
 963                GLfloat a[4], result[4];
 964                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 965                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 966                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 967                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 968                a[2] = CLAMP(a[2], 0.0F, 1.0F);
 969                a[3] = CLAMP(a[3], 0.0F, 1.0F);
 970                ubx = IROUND(255.0F * a[0]);
 971                uby = IROUND(255.0F * a[1]);
 972                ubz = IROUND(255.0F * a[2]);
 973                ubw = IROUND(255.0F * a[3]);
 974                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 975                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 976                store_vector4( inst, machine, result );
 977             }
 978             break;
 979          case FP_OPCODE_POW:
 980             {
 981                GLfloat a[4], b[4], result[4];
 982                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 983                fetch_vector1( ctx, &inst->SrcReg[1], machine, program, b );
 984                result[0] = result[1] = result[2] = result[3]
 985                   = (GLfloat)_mesa_pow(a[0], b[0]);
 986                store_vector4( inst, machine, result );
 987             }
 988             break;
 989          case FP_OPCODE_RCP:
 990             {
 991                GLfloat a[4], result[4];
 992                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 993 #if DEBUG_FRAG
 994                if (a[0] == 0)
 995                   printf("RCP(0)\n");
 996                else if (IS_INF_OR_NAN(a[0]))
 997                   printf("RCP(inf)\n");
 998 #endif
 999                result[0] = result[1] = result[2] = result[3]
1000                   = 1.0F / a[0];
1001                store_vector4( inst, machine, result );
1002             }
1003             break;
1004          case FP_OPCODE_RFL:
1005             {
1006                GLfloat axis[4], dir[4], result[4], tmp[4];
1007                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, axis );
1008                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dir );
1009                tmp[3] = axis[0] * axis[0]
1010                       + axis[1] * axis[1]
1011                       + axis[2] * axis[2];
1012                tmp[0] = (2.0F * (axis[0] * dir[0] +
1013                                  axis[1] * dir[1] +
1014                                  axis[2] * dir[2])) / tmp[3];
1015                result[0] = tmp[0] * axis[0] - dir[0];
1016                result[1] = tmp[0] * axis[1] - dir[1];
1017                result[2] = tmp[0] * axis[2] - dir[2];
1018                /* result[3] is never written! XXX enforce in parser! */
1019                store_vector4( inst, machine, result );
1020             }
1021             break;
1022          case FP_OPCODE_RSQ: /* 1 / sqrt() */
1023             {
1024                GLfloat a[4], result[4];
1025                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1026                result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1027                store_vector4( inst, machine, result );
1028 #if DEBUG_FRAG
1029                printf("RSQ %g = 1/sqrt(%g)\n", result[0], a[0]);
1030 #endif
1031             }
1032             break;
1033          case FP_OPCODE_SCS: /* sine and cos */
1034             {
1035                GLfloat a[4], result[4];
1036                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1037                result[0] = (GLfloat)cos(a[0]);
1038                result[1] = (GLfloat)sin(a[0]);
1039                result[2] = 0.0;  /* undefined! */
1040                result[3] = 0.0;  /* undefined! */
1041                store_vector4( inst, machine, result );
1042             }
1043             break;
1044          case FP_OPCODE_SEQ: /* set on equal */
1045             {
1046                GLfloat a[4], b[4], result[4];
1047                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1048                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1049                result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1050                result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1051                result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1052                result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1053                store_vector4( inst, machine, result );
1054             }
1055             break;
1056          case FP_OPCODE_SFL: /* set false, operands ignored */
1057             {
1058                static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1059                store_vector4( inst, machine, result );
1060             }
1061             break;
1062          case FP_OPCODE_SGE: /* set on greater or equal */
1063             {
1064                GLfloat a[4], b[4], result[4];
1065                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1066                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1067                result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1068                result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1069                result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1070                result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1071                store_vector4( inst, machine, result );
1072             }
1073             break;
1074          case FP_OPCODE_SGT: /* set on greater */
1075             {
1076                GLfloat a[4], b[4], result[4];
1077                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1078                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1079                result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1080                result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1081                result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1082                result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1083                store_vector4( inst, machine, result );
1084             }
1085             break;
1086          case FP_OPCODE_SIN:
1087             {
1088                GLfloat a[4], result[4];
1089                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1090                result[0] = result[1] = result[2] =
1091                        result[3] = (GLfloat)_mesa_sin(a[0]);
1092                store_vector4( inst, machine, result );
1093             }
1094             break;
1095          case FP_OPCODE_SLE: /* set on less or equal */
1096             {
1097                GLfloat a[4], b[4], result[4];
1098                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1099                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1100                result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1101                result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1102                result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1103                result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1104                store_vector4( inst, machine, result );
1105             }
1106             break;
1107          case FP_OPCODE_SLT: /* set on less */
1108             {
1109                GLfloat a[4], b[4], result[4];
1110                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1111                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1112                result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1113                result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1114                result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1115                result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1116                store_vector4( inst, machine, result );
1117             }
1118             break;
1119          case FP_OPCODE_SNE: /* set on not equal */
1120             {
1121                GLfloat a[4], b[4], result[4];
1122                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1123                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1124                result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1125                result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1126                result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1127                result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1128                store_vector4( inst, machine, result );
1129             }
1130             break;
1131          case FP_OPCODE_STR: /* set true, operands ignored */
1132             {
1133                static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1134                store_vector4( inst, machine, result );
1135             }
1136             break;
1137          case FP_OPCODE_SUB:
1138             {
1139                GLfloat a[4], b[4], result[4];
1140                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1141                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1142                result[0] = a[0] - b[0];
1143                result[1] = a[1] - b[1];
1144                result[2] = a[2] - b[2];
1145                result[3] = a[3] - b[3];
1146                store_vector4( inst, machine, result );
1147             }
1148             break;
1149          case FP_OPCODE_SWZ:
1150             {
1151                const struct fp_src_register *source = &inst->SrcReg[0];
1152                const GLfloat *src = get_register_pointer(ctx, source,
1153                                                          machine, program);
1154                GLfloat result[4];
1155                GLuint i;
1156
1157                /* do extended swizzling here */
1158                for (i = 0; i < 3; i++) {
1159                   if (GET_SWZ(source->Swizzle, i) == SWIZZLE_ZERO)
1160                      result[i] = 0.0;
1161                   else if (GET_SWZ(source->Swizzle, i) == SWIZZLE_ONE)
1162                      result[i] = -1.0;
1163                   else
1164                      result[i] = -src[GET_SWZ(source->Swizzle, i)];
1165
1166                   if (source->NegateBase)
1167                      result[i] = -result[i];
1168                }
1169                store_vector4( inst, machine, result );
1170             }
1171             break;
1172          case FP_OPCODE_TEX: /* Both ARB and NV frag prog */
1173             /* Texel lookup */
1174             {
1175                GLfloat texcoord[4], color[4];
1176                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1177                /* Note: we pass 0 for LOD.  The ARB extension requires it
1178                 * while the NV extension says it's implementation dependent.
1179                 */
1180                /* KW: Previously lambda was passed as zero, but I
1181                 * believe this is incorrect, the spec seems to
1182                 * indicate rather that lambda should not be
1183                 * changed/biased, unlike TXB where texcoord[3] is
1184                 * added to the lambda calculations.  The lambda should
1185                 * still be calculated normally for TEX & TXP though,
1186                 * not set to zero.  Otherwise it's very difficult to
1187                 * implement normal GL semantics through the fragment
1188                 * shader.
1189                 */
1190                fetch_texel( ctx, texcoord,
1191                             span->array->lambda[inst->TexSrcUnit][column],
1192                             inst->TexSrcUnit, color );
1193 #if DEBUG_FRAG
1194                if (color[3])
1195                   printf("color[3] = %f\n", color[3]);
1196 #endif
1197                store_vector4( inst, machine, color );
1198             }
1199             break;
1200          case FP_OPCODE_TXB: /* GL_ARB_fragment_program only */
1201             /* Texel lookup with LOD bias */
1202             {
1203                GLfloat texcoord[4], color[4], bias, lambda;
1204
1205                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1206                /* texcoord[3] is the bias to add to lambda */
1207                bias = ctx->Texture.Unit[inst->TexSrcUnit].LodBias
1208                     + ctx->Texture.Unit[inst->TexSrcUnit]._Current->LodBias
1209                     + texcoord[3];
1210                lambda = span->array->lambda[inst->TexSrcUnit][column] + bias;
1211                fetch_texel( ctx, texcoord, lambda,
1212                             inst->TexSrcUnit, color );
1213                store_vector4( inst, machine, color );
1214             }
1215             break;
1216          case FP_OPCODE_TXD: /* GL_NV_fragment_program only */
1217             /* Texture lookup w/ partial derivatives for LOD */
1218             {
1219                GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1220                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1221                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dtdx );
1222                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, dtdy );
1223                fetch_texel_deriv( ctx, texcoord, dtdx, dtdy, inst->TexSrcUnit,
1224                                   color );
1225                store_vector4( inst, machine, color );
1226             }
1227             break;
1228          case FP_OPCODE_TXP: /* GL_ARB_fragment_program only */
1229             /* Texture lookup w/ projective divide */
1230             {
1231                GLfloat texcoord[4], color[4];
1232                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1233                /* Not so sure about this test - if texcoord[3] is
1234                 * zero, we'd probably be fine except for an ASSERT in
1235                 * IROUND_POS() which gets triggered by the inf values created.
1236                 */
1237                if (texcoord[3] != 0.0) {
1238                   texcoord[0] /= texcoord[3];
1239                   texcoord[1] /= texcoord[3];
1240                   texcoord[2] /= texcoord[3];
1241                }
1242                /* KW: Previously lambda was passed as zero, but I
1243                 * believe this is incorrect, the spec seems to
1244                 * indicate rather that lambda should not be
1245                 * changed/biased, unlike TXB where texcoord[3] is
1246                 * added to the lambda calculations.  The lambda should
1247                 * still be calculated normally for TEX & TXP though,
1248                 * not set to zero.
1249                 */
1250                fetch_texel( ctx, texcoord,
1251                             span->array->lambda[inst->TexSrcUnit][column],
1252                             inst->TexSrcUnit, color );
1253                store_vector4( inst, machine, color );
1254             }
1255             break;
1256          case FP_OPCODE_TXP_NV: /* GL_NV_fragment_program only */
1257             /* Texture lookup w/ projective divide */
1258             {
1259                GLfloat texcoord[4], color[4];
1260                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1261                if (inst->TexSrcIdx != TEXTURE_CUBE_INDEX &&
1262                    texcoord[3] != 0.0) {
1263                   texcoord[0] /= texcoord[3];
1264                   texcoord[1] /= texcoord[3];
1265                   texcoord[2] /= texcoord[3];
1266                }
1267                fetch_texel( ctx, texcoord,
1268                             span->array->lambda[inst->TexSrcUnit][column],
1269                             inst->TexSrcUnit, color );
1270                store_vector4( inst, machine, color );
1271             }
1272             break;
1273          case FP_OPCODE_UP2H: /* unpack two 16-bit floats */
1274             {
1275                GLfloat a[4], result[4];
1276                const GLuint *rawBits = (const GLuint *) a;
1277                GLhalfNV hx, hy;
1278                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1279                hx = rawBits[0] & 0xffff;
1280                hy = rawBits[0] >> 16;
1281                result[0] = result[2] = _mesa_half_to_float(hx);
1282                result[1] = result[3] = _mesa_half_to_float(hy);
1283                store_vector4( inst, machine, result );
1284             }
1285             break;
1286          case FP_OPCODE_UP2US: /* unpack two GLushorts */
1287             {
1288                GLfloat a[4], result[4];
1289                const GLuint *rawBits = (const GLuint *) a;
1290                GLushort usx, usy;
1291                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1292                usx = rawBits[0] & 0xffff;
1293                usy = rawBits[0] >> 16;
1294                result[0] = result[2] = usx * (1.0f / 65535.0f);
1295                result[1] = result[3] = usy * (1.0f / 65535.0f);
1296                store_vector4( inst, machine, result );
1297             }
1298             break;
1299          case FP_OPCODE_UP4B: /* unpack four GLbytes */
1300             {
1301                GLfloat a[4], result[4];
1302                const GLuint *rawBits = (const GLuint *) a;
1303                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1304                result[0] = (((rawBits[0] >>  0) & 0xff) - 128) / 127.0F;
1305                result[1] = (((rawBits[0] >>  8) & 0xff) - 128) / 127.0F;
1306                result[2] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1307                result[3] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1308                store_vector4( inst, machine, result );
1309             }
1310             break;
1311          case FP_OPCODE_UP4UB: /* unpack four GLubytes */
1312             {
1313                GLfloat a[4], result[4];
1314                const GLuint *rawBits = (const GLuint *) a;
1315                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1316                result[0] = ((rawBits[0] >>  0) & 0xff) / 255.0F;
1317                result[1] = ((rawBits[0] >>  8) & 0xff) / 255.0F;
1318                result[2] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1319                result[3] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1320                store_vector4( inst, machine, result );
1321             }
1322             break;
1323          case FP_OPCODE_XPD: /* cross product */
1324             {
1325                GLfloat a[4], b[4], result[4];
1326                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1327                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1328                result[0] = a[1] * b[2] - a[2] * b[1];
1329                result[1] = a[2] * b[0] - a[0] * b[2];
1330                result[2] = a[0] * b[1] - a[1] * b[0];
1331                result[3] = 1.0;
1332                store_vector4( inst, machine, result );
1333             }
1334             break;
1335          case FP_OPCODE_X2D: /* 2-D matrix transform */
1336             {
1337                GLfloat a[4], b[4], c[4], result[4];
1338                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1339                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1340                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
1341                result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1342                result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1343                result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1344                result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1345                store_vector4( inst, machine, result );
1346             }
1347             break;
1348          case FP_OPCODE_PRINT:
1349             {
1350                if (inst->SrcReg[0].File != -1) {
1351                   GLfloat a[4];
1352                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
1353                   _mesa_printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
1354                                a[0], a[1], a[2], a[3]);
1355                }
1356                else {
1357                   _mesa_printf("%s\n", (const char *) inst->Data);
1358                }
1359             }
1360             break;
1361          case FP_OPCODE_END:
1362             return GL_TRUE;
1363          default:
1364             _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1365                           inst->Opcode);
1366             return GL_TRUE; /* return value doesn't matter */
1367       }
1368    }
1369    return GL_TRUE;
1370 }
1371
1372
1373 static void
1374 init_machine( GLcontext *ctx, struct fp_machine *machine,
1375               const struct fragment_program *program,
1376               const struct sw_span *span, GLuint col )
1377 {
1378    GLuint inputsRead = program->InputsRead;
1379    GLuint u;
1380
1381    if (ctx->FragmentProgram.CallbackEnabled)
1382       inputsRead = ~0;
1383
1384    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
1385       /* Clear temporary registers (undefined for ARB_f_p) */
1386       _mesa_bzero(machine->Temporaries,
1387                   MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
1388    }
1389
1390    /* Load input registers */
1391    if (inputsRead & (1 << FRAG_ATTRIB_WPOS)) {
1392       GLfloat *wpos = machine->Inputs[FRAG_ATTRIB_WPOS];
1393       wpos[0] = (GLfloat) span->x + col;
1394       wpos[1] = (GLfloat) span->y;
1395       wpos[2] = (GLfloat) span->array->z[col] / ctx->DrawBuffer->_DepthMaxF;
1396       wpos[3] = span->w + col * span->dwdx;
1397    }
1398    if (inputsRead & (1 << FRAG_ATTRIB_COL0)) {
1399       GLfloat *col0 = machine->Inputs[FRAG_ATTRIB_COL0];
1400       col0[0] = CHAN_TO_FLOAT(span->array->rgba[col][RCOMP]);
1401       col0[1] = CHAN_TO_FLOAT(span->array->rgba[col][GCOMP]);
1402       col0[2] = CHAN_TO_FLOAT(span->array->rgba[col][BCOMP]);
1403       col0[3] = CHAN_TO_FLOAT(span->array->rgba[col][ACOMP]);
1404    }
1405    if (inputsRead & (1 << FRAG_ATTRIB_COL1)) {
1406       GLfloat *col1 = machine->Inputs[FRAG_ATTRIB_COL1];
1407       col1[0] = CHAN_TO_FLOAT(span->array->spec[col][RCOMP]);
1408       col1[1] = CHAN_TO_FLOAT(span->array->spec[col][GCOMP]);
1409       col1[2] = CHAN_TO_FLOAT(span->array->spec[col][BCOMP]);
1410       col1[3] = CHAN_TO_FLOAT(span->array->spec[col][ACOMP]);
1411    }
1412    if (inputsRead & (1 << FRAG_ATTRIB_FOGC)) {
1413       GLfloat *fogc = machine->Inputs[FRAG_ATTRIB_FOGC];
1414       fogc[0] = span->array->fog[col];
1415       fogc[1] = 0.0F;
1416       fogc[2] = 0.0F;
1417       fogc[3] = 0.0F;
1418    }
1419    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
1420       if (inputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
1421          GLfloat *tex = machine->Inputs[FRAG_ATTRIB_TEX0 + u];
1422          /*ASSERT(ctx->Texture._EnabledCoordUnits & (1 << u));*/
1423          COPY_4V(tex, span->array->texcoords[u][col]);
1424          /*ASSERT(tex[0] != 0 || tex[1] != 0 || tex[2] != 0);*/
1425       }
1426    }
1427
1428    /* init condition codes */
1429    machine->CondCodes[0] = COND_EQ;
1430    machine->CondCodes[1] = COND_EQ;
1431    machine->CondCodes[2] = COND_EQ;
1432    machine->CondCodes[3] = COND_EQ;
1433 }
1434
1435
1436
1437 /**
1438  * Execute the current fragment program, operating on the given span.
1439  */
1440 void
1441 _swrast_exec_fragment_program( GLcontext *ctx, struct sw_span *span )
1442 {
1443    const struct fragment_program *program = ctx->FragmentProgram._Current;
1444    GLuint i;
1445
1446    ctx->_CurrentProgram = GL_FRAGMENT_PROGRAM_ARB; /* or NV, doesn't matter */
1447
1448    if (program->Parameters) {
1449       _mesa_load_state_parameters(ctx, program->Parameters);
1450    }
1451
1452    for (i = 0; i < span->end; i++) {
1453       if (span->array->mask[i]) {
1454          init_machine(ctx, &ctx->FragmentProgram.Machine,
1455                       ctx->FragmentProgram._Current, span, i);
1456
1457 #ifdef USE_TCC
1458          if (!_swrast_execute_codegen_program(ctx, program, ~0,
1459                                               &ctx->FragmentProgram.Machine,
1460                                               span, i)) {
1461             span->array->mask[i] = GL_FALSE;  /* killed fragment */
1462          }
1463 #else
1464          if (!execute_program(ctx, program, ~0,
1465                               &ctx->FragmentProgram.Machine, span, i)) {
1466             span->array->mask[i] = GL_FALSE;  /* killed fragment */
1467          }
1468 #endif
1469
1470          /* Store output registers */
1471          {
1472             const GLfloat *colOut
1473                = ctx->FragmentProgram.Machine.Outputs[FRAG_OUTPUT_COLR];
1474             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][RCOMP], colOut[0]);
1475             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][GCOMP], colOut[1]);
1476             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][BCOMP], colOut[2]);
1477             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][ACOMP], colOut[3]);
1478          }
1479          /* depth value */
1480          if (program->OutputsWritten & (1 << FRAG_OUTPUT_DEPR))
1481             span->array->z[i] = IROUND(ctx->FragmentProgram.Machine.Outputs[FRAG_OUTPUT_DEPR][0] * ctx->DrawBuffer->_DepthMaxF);
1482       }
1483    }
1484
1485    ctx->_CurrentProgram = 0;
1486 }
1487