src/mesa/swrast/s_nvfragprog.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.3
   4  *
   5  * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /*
  26  * Regarding GL_NV_fragment_program:
  27  *
  28  * Portions of this software may use or implement intellectual
  29  * property owned and licensed by NVIDIA Corporation. NVIDIA disclaims
  30  * any and all warranties with respect to such intellectual property,
  31  * including any use thereof or modifications thereto.
  32  */
  33
  34 #include "glheader.h"
  35 #include "colormac.h"
  36 #include "context.h"
  37 #include "nvfragprog.h"
  38 #include "macros.h"
  39 #include "program.h"
  40
  41 #include "s_nvfragprog.h"
  42 #include "s_span.h"
  43 #include "s_texture.h"
  44
  45
  46 /* if 1, print some debugging info */
  47 #define DEBUG_FRAG 0
  48
  49 /**
  50  * Fetch a texel.
  51  */
  52 static void
  53 fetch_texel( GLcontext *ctx, const GLfloat texcoord[4], GLfloat lambda,
  54              GLuint unit, GLfloat color[4] )
  55 {
  56    GLchan rgba[4];
  57    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  58
  59    /* XXX use a float-valued TextureSample routine here!!! */
  60    swrast->TextureSample[unit](ctx, unit, ctx->Texture.Unit[unit]._Current,
  61                                1, (const GLfloat (*)[4]) texcoord,
  62                                &lambda, &rgba);
  63    color[0] = CHAN_TO_FLOAT(rgba[0]);
  64    color[1] = CHAN_TO_FLOAT(rgba[1]);
  65    color[2] = CHAN_TO_FLOAT(rgba[2]);
  66    color[3] = CHAN_TO_FLOAT(rgba[3]);
  67 }
  68
  69
  70 /**
  71  * Fetch a texel with the given partial derivatives to compute a level
  72  * of detail in the mipmap.
  73  */
  74 static void
  75 fetch_texel_deriv( GLcontext *ctx, const GLfloat texcoord[4],
  76                    const GLfloat texdx[4], const GLfloat texdy[4],
  77                    GLuint unit, GLfloat color[4] )
  78 {
  79    SWcontext *swrast = SWRAST_CONTEXT(ctx);
  80    const struct gl_texture_object *texObj = ctx->Texture.Unit[unit]._Current;
  81    const struct gl_texture_image *texImg = texObj->Image[0][texObj->BaseLevel];
  82    const GLfloat texW = (GLfloat) texImg->WidthScale;
  83    const GLfloat texH = (GLfloat) texImg->HeightScale;
  84    GLchan rgba[4];
  85
  86    GLfloat lambda = _swrast_compute_lambda(texdx[0], texdy[0], /* ds/dx, ds/dy */
  87                                          texdx[1], texdy[1], /* dt/dx, dt/dy */
  88                                          texdx[3], texdy[2], /* dq/dx, dq/dy */
  89                                          texW, texH,
  90                                          texcoord[0], texcoord[1], texcoord[3],
  91                                          1.0F / texcoord[3]);
  92
  93    swrast->TextureSample[unit](ctx, unit, ctx->Texture.Unit[unit]._Current,
  94                                1, (const GLfloat (*)[4]) texcoord,
  95                                &lambda, &rgba);
  96    color[0] = CHAN_TO_FLOAT(rgba[0]);
  97    color[1] = CHAN_TO_FLOAT(rgba[1]);
  98    color[2] = CHAN_TO_FLOAT(rgba[2]);
  99    color[3] = CHAN_TO_FLOAT(rgba[3]);
 100 }
 101
 102
 103 /**
 104  * Return a pointer to the 4-element float vector specified by the given
 105  * source register.
 106  */
 107 static INLINE const GLfloat *
 108 get_register_pointer( GLcontext *ctx,
 109                       const struct fp_src_register *source,
 110                       const struct fp_machine *machine,
 111                       const struct fragment_program *program )
 112 {
 113    const GLfloat *src;
 114    switch (source->File) {
 115       case PROGRAM_TEMPORARY:
 116          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_TEMPS);
 117          src = machine->Temporaries[source->Index];
 118          break;
 119       case PROGRAM_INPUT:
 120          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_INPUTS);
 121          src = machine->Inputs[source->Index];
 122          break;
 123       case PROGRAM_OUTPUT:
 124          /* This is only for PRINT */
 125          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_OUTPUTS);
 126          src = machine->Outputs[source->Index];
 127          break;
 128       case PROGRAM_LOCAL_PARAM:
 129          ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 130          src = program->Base.LocalParams[source->Index];
 131          break;
 132       case PROGRAM_ENV_PARAM:
 133          ASSERT(source->Index < MAX_NV_FRAGMENT_PROGRAM_PARAMS);
 134          src = ctx->FragmentProgram.Parameters[source->Index];
 135          break;
 136       case PROGRAM_STATE_VAR:
 137          /* Fallthrough */
 138       case PROGRAM_NAMED_PARAM:
 139          ASSERT(source->Index < (GLint) program->Parameters->NumParameters);
 140          src = program->Parameters->ParameterValues[source->Index];
 141          break;
 142       default:
 143          _mesa_problem(ctx, "Invalid input register file %d in fetch_vector4", source->File);
 144          src = NULL;
 145    }
 146    return src;
 147 }
 148
 149
 150 /**
 151  * Fetch a 4-element float vector from the given source register.
 152  * Apply swizzling and negating as needed.
 153  */
 154 static void
 155 fetch_vector4( GLcontext *ctx,
 156                const struct fp_src_register *source,
 157                const struct fp_machine *machine,
 158                const struct fragment_program *program,
 159                GLfloat result[4] )
 160 {
 161    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 162    ASSERT(src);
 163
 164    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 165    result[1] = src[GET_SWZ(source->Swizzle, 1)];
 166    result[2] = src[GET_SWZ(source->Swizzle, 2)];
 167    result[3] = src[GET_SWZ(source->Swizzle, 3)];
 168
 169    if (source->NegateBase) {
 170       result[0] = -result[0];
 171       result[1] = -result[1];
 172       result[2] = -result[2];
 173       result[3] = -result[3];
 174    }
 175    if (source->Abs) {
 176       result[0] = FABSF(result[0]);
 177       result[1] = FABSF(result[1]);
 178       result[2] = FABSF(result[2]);
 179       result[3] = FABSF(result[3]);
 180    }
 181    if (source->NegateAbs) {
 182       result[0] = -result[0];
 183       result[1] = -result[1];
 184       result[2] = -result[2];
 185       result[3] = -result[3];
 186    }
 187 }
 188
 189
 190 /**
 191  * Fetch the derivative with respect to X for the given register.
 192  * \return GL_TRUE if it was easily computed or GL_FALSE if we
 193  * need to execute another instance of the program (ugh)!
 194  */
 195 static GLboolean
 196 fetch_vector4_deriv( GLcontext *ctx,
 197                      const struct fp_src_register *source,
 198                      const struct sw_span *span,
 199                      char xOrY, GLint column, GLfloat result[4] )
 200 {
 201    GLfloat src[4];
 202
 203    ASSERT(xOrY == 'X' || xOrY == 'Y');
 204
 205    switch (source->Index) {
 206    case FRAG_ATTRIB_WPOS:
 207       if (xOrY == 'X') {
 208          src[0] = 1.0;
 209          src[1] = 0.0;
 210          src[2] = span->dzdx / ctx->DrawBuffer->_DepthMaxF;
 211          src[3] = span->dwdx;
 212       }
 213       else {
 214          src[0] = 0.0;
 215          src[1] = 1.0;
 216          src[2] = span->dzdy / ctx->DrawBuffer->_DepthMaxF;
 217          src[3] = span->dwdy;
 218       }
 219       break;
 220    case FRAG_ATTRIB_COL0:
 221       if (xOrY == 'X') {
 222          src[0] = span->drdx * (1.0F / CHAN_MAXF);
 223          src[1] = span->dgdx * (1.0F / CHAN_MAXF);
 224          src[2] = span->dbdx * (1.0F / CHAN_MAXF);
 225          src[3] = span->dadx * (1.0F / CHAN_MAXF);
 226       }
 227       else {
 228          src[0] = span->drdy * (1.0F / CHAN_MAXF);
 229          src[1] = span->dgdy * (1.0F / CHAN_MAXF);
 230          src[2] = span->dbdy * (1.0F / CHAN_MAXF);
 231          src[3] = span->dady * (1.0F / CHAN_MAXF);
 232       }
 233       break;
 234    case FRAG_ATTRIB_COL1:
 235       if (xOrY == 'X') {
 236          src[0] = span->dsrdx * (1.0F / CHAN_MAXF);
 237          src[1] = span->dsgdx * (1.0F / CHAN_MAXF);
 238          src[2] = span->dsbdx * (1.0F / CHAN_MAXF);
 239          src[3] = 0.0; /* XXX need this */
 240       }
 241       else {
 242          src[0] = span->dsrdy * (1.0F / CHAN_MAXF);
 243          src[1] = span->dsgdy * (1.0F / CHAN_MAXF);
 244          src[2] = span->dsbdy * (1.0F / CHAN_MAXF);
 245          src[3] = 0.0; /* XXX need this */
 246       }
 247       break;
 248    case FRAG_ATTRIB_FOGC:
 249       if (xOrY == 'X') {
 250          src[0] = span->dfogdx;
 251          src[1] = 0.0;
 252          src[2] = 0.0;
 253          src[3] = 0.0;
 254       }
 255       else {
 256          src[0] = span->dfogdy;
 257          src[1] = 0.0;
 258          src[2] = 0.0;
 259          src[3] = 0.0;
 260       }
 261       break;
 262    case FRAG_ATTRIB_TEX0:
 263    case FRAG_ATTRIB_TEX1:
 264    case FRAG_ATTRIB_TEX2:
 265    case FRAG_ATTRIB_TEX3:
 266    case FRAG_ATTRIB_TEX4:
 267    case FRAG_ATTRIB_TEX5:
 268    case FRAG_ATTRIB_TEX6:
 269    case FRAG_ATTRIB_TEX7:
 270       if (xOrY == 'X') {
 271          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 272          /* this is a little tricky - I think I've got it right */
 273          const GLfloat invQ = 1.0f / (span->tex[u][3]
 274                                       + span->texStepX[u][3] * column);
 275          src[0] = span->texStepX[u][0] * invQ;
 276          src[1] = span->texStepX[u][1] * invQ;
 277          src[2] = span->texStepX[u][2] * invQ;
 278          src[3] = span->texStepX[u][3] * invQ;
 279       }
 280       else {
 281          const GLuint u = source->Index - FRAG_ATTRIB_TEX0;
 282          /* Tricky, as above, but in Y direction */
 283          const GLfloat invQ = 1.0f / (span->tex[u][3] + span->texStepY[u][3]);
 284          src[0] = span->texStepY[u][0] * invQ;
 285          src[1] = span->texStepY[u][1] * invQ;
 286          src[2] = span->texStepY[u][2] * invQ;
 287          src[3] = span->texStepY[u][3] * invQ;
 288       }
 289       break;
 290    default:
 291       return GL_FALSE;
 292    }
 293
 294    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 295    result[1] = src[GET_SWZ(source->Swizzle, 1)];
 296    result[2] = src[GET_SWZ(source->Swizzle, 2)];
 297    result[3] = src[GET_SWZ(source->Swizzle, 3)];
 298
 299    if (source->NegateBase) {
 300       result[0] = -result[0];
 301       result[1] = -result[1];
 302       result[2] = -result[2];
 303       result[3] = -result[3];
 304    }
 305    if (source->Abs) {
 306       result[0] = FABSF(result[0]);
 307       result[1] = FABSF(result[1]);
 308       result[2] = FABSF(result[2]);
 309       result[3] = FABSF(result[3]);
 310    }
 311    if (source->NegateAbs) {
 312       result[0] = -result[0];
 313       result[1] = -result[1];
 314       result[2] = -result[2];
 315       result[3] = -result[3];
 316    }
 317    return GL_TRUE;
 318 }
 319
 320
 321 /**
 322  * As above, but only return result[0] element.
 323  */
 324 static void
 325 fetch_vector1( GLcontext *ctx,
 326                const struct fp_src_register *source,
 327                const struct fp_machine *machine,
 328                const struct fragment_program *program,
 329                GLfloat result[4] )
 330 {
 331    const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 332    ASSERT(src);
 333
 334    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 335
 336    if (source->NegateBase) {
 337       result[0] = -result[0];
 338    }
 339    if (source->Abs) {
 340       result[0] = FABSF(result[0]);
 341    }
 342    if (source->NegateAbs) {
 343       result[0] = -result[0];
 344    }
 345 }
 346
 347
 348 /**
 349  * Test value against zero and return GT, LT, EQ or UN if NaN.
 350  */
 351 static INLINE GLuint
 352 generate_cc( float value )
 353 {
 354    if (value != value)
 355       return COND_UN;  /* NaN */
 356    if (value > 0.0F)
 357       return COND_GT;
 358    if (value < 0.0F)
 359       return COND_LT;
 360    return COND_EQ;
 361 }
 362
 363
 364 /**
 365  * Test if the ccMaskRule is satisfied by the given condition code.
 366  * Used to mask destination writes according to the current condition codee.
 367  */
 368 static INLINE GLboolean
 369 test_cc(GLuint condCode, GLuint ccMaskRule)
 370 {
 371    switch (ccMaskRule) {
 372    case COND_EQ: return (condCode == COND_EQ);
 373    case COND_NE: return (condCode != COND_EQ);
 374    case COND_LT: return (condCode == COND_LT);
 375    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 376    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 377    case COND_GT: return (condCode == COND_GT);
 378    case COND_TR: return GL_TRUE;
 379    case COND_FL: return GL_FALSE;
 380    default:      return GL_TRUE;
 381    }
 382 }
 383
 384
 385 /**
 386  * Store 4 floats into a register.  Observe the instructions saturate and
 387  * set-condition-code flags.
 388  */
 389 static void
 390 store_vector4( const struct fp_instruction *inst,
 391                struct fp_machine *machine,
 392                const GLfloat value[4] )
 393 {
 394    const struct fp_dst_register *dest = &(inst->DstReg);
 395    const GLboolean clamp = inst->Saturate;
 396    const GLboolean updateCC = inst->UpdateCondRegister;
 397    GLfloat *dstReg;
 398    GLfloat dummyReg[4];
 399    GLfloat clampedValue[4];
 400    GLboolean condWriteMask[4];
 401    GLuint writeMask = dest->WriteMask;
 402
 403    switch (dest->File) {
 404       case PROGRAM_OUTPUT:
 405          dstReg = machine->Outputs[dest->Index];
 406          break;
 407       case PROGRAM_TEMPORARY:
 408          dstReg = machine->Temporaries[dest->Index];
 409          break;
 410       case PROGRAM_WRITE_ONLY:
 411          dstReg = dummyReg;
 412          return;
 413       default:
 414          _mesa_problem(NULL, "bad register file in store_vector4(fp)");
 415          return;
 416    }
 417
 418 #if DEBUG_FRAG
 419    if (value[0] > 1.0e10 ||
 420        IS_INF_OR_NAN(value[0]) ||
 421        IS_INF_OR_NAN(value[1]) ||
 422        IS_INF_OR_NAN(value[2]) ||
 423        IS_INF_OR_NAN(value[3])  )
 424       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 425 #endif
 426
 427    if (clamp) {
 428       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 429       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 430       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 431       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 432       value = clampedValue;
 433    }
 434
 435    if (dest->CondMask != COND_TR) {
 436       condWriteMask[0] = GET_BIT(writeMask, 0)
 437          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 0)], dest->CondMask);
 438       condWriteMask[1] = GET_BIT(writeMask, 1)
 439          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 1)], dest->CondMask);
 440       condWriteMask[2] = GET_BIT(writeMask, 2)
 441          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 2)], dest->CondMask);
 442       condWriteMask[3] = GET_BIT(writeMask, 3)
 443          && test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 3)], dest->CondMask);
 444
 445       writeMask = ((condWriteMask[0] << 0) |
 446                    (condWriteMask[1] << 1) |
 447                    (condWriteMask[2] << 2) |
 448                    (condWriteMask[3] << 3));
 449    }
 450
 451    if (GET_BIT(writeMask, 0)) {
 452       dstReg[0] = value[0];
 453       if (updateCC)
 454          machine->CondCodes[0] = generate_cc(value[0]);
 455    }
 456    if (GET_BIT(writeMask, 1)) {
 457       dstReg[1] = value[1];
 458       if (updateCC)
 459          machine->CondCodes[1] = generate_cc(value[1]);
 460    }
 461    if (GET_BIT(writeMask, 2)) {
 462       dstReg[2] = value[2];
 463       if (updateCC)
 464          machine->CondCodes[2] = generate_cc(value[2]);
 465    }
 466    if (GET_BIT(writeMask, 3)) {
 467       dstReg[3] = value[3];
 468       if (updateCC)
 469          machine->CondCodes[3] = generate_cc(value[3]);
 470    }
 471 }
 472
 473
 474 /**
 475  * Initialize a new machine state instance from an existing one, adding
 476  * the partial derivatives onto the input registers.
 477  * Used to implement DDX and DDY instructions in non-trivial cases.
 478  */
 479 static void
 480 init_machine_deriv( GLcontext *ctx,
 481                     const struct fp_machine *machine,
 482                     const struct fragment_program *program,
 483                     const struct sw_span *span, char xOrY,
 484                     struct fp_machine *dMachine )
 485 {
 486    GLuint u;
 487
 488    ASSERT(xOrY == 'X' || xOrY == 'Y');
 489
 490    /* copy existing machine */
 491    _mesa_memcpy(dMachine, machine, sizeof(struct fp_machine));
 492
 493    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
 494       /* Clear temporary registers (undefined for ARB_f_p) */
 495       _mesa_bzero( (void*) machine->Temporaries,
 496                    MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
 497    }
 498
 499    /* Add derivatives */
 500    if (program->InputsRead & (1 << FRAG_ATTRIB_WPOS)) {
 501       GLfloat *wpos = (GLfloat*) machine->Inputs[FRAG_ATTRIB_WPOS];
 502       if (xOrY == 'X') {
 503          wpos[0] += 1.0F;
 504          wpos[1] += 0.0F;
 505          wpos[2] += span->dzdx;
 506          wpos[3] += span->dwdx;
 507       }
 508       else {
 509          wpos[0] += 0.0F;
 510          wpos[1] += 1.0F;
 511          wpos[2] += span->dzdy;
 512          wpos[3] += span->dwdy;
 513       }
 514    }
 515    if (program->InputsRead & (1 << FRAG_ATTRIB_COL0)) {
 516       GLfloat *col0 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL0];
 517       if (xOrY == 'X') {
 518          col0[0] += span->drdx * (1.0F / CHAN_MAXF);
 519          col0[1] += span->dgdx * (1.0F / CHAN_MAXF);
 520          col0[2] += span->dbdx * (1.0F / CHAN_MAXF);
 521          col0[3] += span->dadx * (1.0F / CHAN_MAXF);
 522       }
 523       else {
 524          col0[0] += span->drdy * (1.0F / CHAN_MAXF);
 525          col0[1] += span->dgdy * (1.0F / CHAN_MAXF);
 526          col0[2] += span->dbdy * (1.0F / CHAN_MAXF);
 527          col0[3] += span->dady * (1.0F / CHAN_MAXF);
 528       }
 529    }
 530    if (program->InputsRead & (1 << FRAG_ATTRIB_COL1)) {
 531       GLfloat *col1 = (GLfloat*) machine->Inputs[FRAG_ATTRIB_COL1];
 532       if (xOrY == 'X') {
 533          col1[0] += span->dsrdx * (1.0F / CHAN_MAXF);
 534          col1[1] += span->dsgdx * (1.0F / CHAN_MAXF);
 535          col1[2] += span->dsbdx * (1.0F / CHAN_MAXF);
 536          col1[3] += 0.0; /*XXX fix */
 537       }
 538       else {
 539          col1[0] += span->dsrdy * (1.0F / CHAN_MAXF);
 540          col1[1] += span->dsgdy * (1.0F / CHAN_MAXF);
 541          col1[2] += span->dsbdy * (1.0F / CHAN_MAXF);
 542          col1[3] += 0.0; /*XXX fix */
 543       }
 544    }
 545    if (program->InputsRead & (1 << FRAG_ATTRIB_FOGC)) {
 546       GLfloat *fogc = (GLfloat*) machine->Inputs[FRAG_ATTRIB_FOGC];
 547       if (xOrY == 'X') {
 548          fogc[0] += span->dfogdx;
 549       }
 550       else {
 551          fogc[0] += span->dfogdy;
 552       }
 553    }
 554    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
 555       if (program->InputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
 556          GLfloat *tex = (GLfloat*) machine->Inputs[FRAG_ATTRIB_TEX0 + u];
 557          /* XXX perspective-correct interpolation */
 558          if (xOrY == 'X') {
 559             tex[0] += span->texStepX[u][0];
 560             tex[1] += span->texStepX[u][1];
 561             tex[2] += span->texStepX[u][2];
 562             tex[3] += span->texStepX[u][3];
 563          }
 564          else {
 565             tex[0] += span->texStepY[u][0];
 566             tex[1] += span->texStepY[u][1];
 567             tex[2] += span->texStepY[u][2];
 568             tex[3] += span->texStepY[u][3];
 569          }
 570       }
 571    }
 572
 573    /* init condition codes */
 574    dMachine->CondCodes[0] = COND_EQ;
 575    dMachine->CondCodes[1] = COND_EQ;
 576    dMachine->CondCodes[2] = COND_EQ;
 577    dMachine->CondCodes[3] = COND_EQ;
 578 }
 579
 580
 581 /**
  582  * Execute the given fragment program.
 583  * NOTE: we do everything in single-precision floating point; we don't
 584  * currently observe the single/half/fixed-precision qualifiers.
 585  * \param ctx - rendering context
 586  * \param program - the fragment program to execute
 587  * \param machine - machine state (register file)
 588  * \param maxInst - max number of instructions to execute
 589  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 590  */
 591 static GLboolean
 592 execute_program( GLcontext *ctx,
 593                  const struct fragment_program *program, GLuint maxInst,
 594                  struct fp_machine *machine, const struct sw_span *span,
 595                  GLuint column )
 596 {
 597    GLuint pc;
 598
 599 #if DEBUG_FRAG
 600    printf("execute fragment program --------------------\n");
 601 #endif
 602
 603    for (pc = 0; pc < maxInst; pc++) {
 604       const struct fp_instruction *inst = program->Instructions + pc;
 605
 606       if (ctx->FragmentProgram.CallbackEnabled &&
 607           ctx->FragmentProgram.Callback) {
 608          ctx->FragmentProgram.CurrentPosition = inst->StringPos;
 609          ctx->FragmentProgram.Callback(program->Base.Target,
 610                                        ctx->FragmentProgram.CallbackData);
 611       }
 612
 613       switch (inst->Opcode) {
 614          case FP_OPCODE_ABS:
 615             {
 616                GLfloat a[4], result[4];
 617                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 618                result[0] = FABSF(a[0]);
 619                result[1] = FABSF(a[1]);
 620                result[2] = FABSF(a[2]);
 621                result[3] = FABSF(a[3]);
 622                store_vector4( inst, machine, result );
 623             }
 624             break;
 625          case FP_OPCODE_ADD:
 626             {
 627                GLfloat a[4], b[4], result[4];
 628                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 629                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 630                result[0] = a[0] + b[0];
 631                result[1] = a[1] + b[1];
 632                result[2] = a[2] + b[2];
 633                result[3] = a[3] + b[3];
 634                store_vector4( inst, machine, result );
 635             }
 636             break;
 637          case FP_OPCODE_CMP:
 638             {
 639                GLfloat a[4], b[4], c[4], result[4];
 640                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 641                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 642                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 643                result[0] = a[0] < 0.0F ? b[0] : c[0];
 644                result[1] = a[1] < 0.0F ? b[1] : c[1];
 645                result[2] = a[2] < 0.0F ? b[2] : c[2];
 646                result[3] = a[3] < 0.0F ? b[3] : c[3];
 647                store_vector4( inst, machine, result );
 648             }
 649             break;
 650          case FP_OPCODE_COS:
 651             {
 652                GLfloat a[4], result[4];
 653                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 654                result[0] = result[1] = result[2] = result[3] = (GLfloat)_mesa_cos(a[0]);
 655                store_vector4( inst, machine, result );
 656             }
 657             break;
 658          case FP_OPCODE_DDX: /* Partial derivative with respect to X */
 659             {
 660                GLfloat a[4], aNext[4], result[4];
 661                struct fp_machine dMachine;
 662                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'X',
 663                                         column, result)) {
 664                   /* This is tricky.  Make a copy of the current machine state,
 665                    * increment the input registers by the dx or dy partial
 666                    * derivatives, then re-execute the program up to the
 667                    * preceeding instruction, then fetch the source register.
 668                    * Finally, find the difference in the register values for
 669                    * the original and derivative runs.
 670                    */
 671                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 672                   init_machine_deriv(ctx, machine, program, span,
 673                                      'X', &dMachine);
 674                   execute_program(ctx, program, pc, &dMachine, span, column);
 675                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 676                   result[0] = aNext[0] - a[0];
 677                   result[1] = aNext[1] - a[1];
 678                   result[2] = aNext[2] - a[2];
 679                   result[3] = aNext[3] - a[3];
 680                }
 681                store_vector4( inst, machine, result );
 682             }
 683             break;
 684          case FP_OPCODE_DDY: /* Partial derivative with respect to Y */
 685             {
 686                GLfloat a[4], aNext[4], result[4];
 687                struct fp_machine dMachine;
 688                if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'Y',
 689                                         column, result)) {
 690                   init_machine_deriv(ctx, machine, program, span,
 691                                      'Y', &dMachine);
 692                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
 693                   execute_program(ctx, program, pc, &dMachine, span, column);
 694                   fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
 695                   result[0] = aNext[0] - a[0];
 696                   result[1] = aNext[1] - a[1];
 697                   result[2] = aNext[2] - a[2];
 698                   result[3] = aNext[3] - a[3];
 699                }
 700                store_vector4( inst, machine, result );
 701             }
 702             break;
 703          case FP_OPCODE_DP3:
 704             {
 705                GLfloat a[4], b[4], result[4];
 706                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 707                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 708                result[0] = result[1] = result[2] = result[3] =
 709                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
 710                store_vector4( inst, machine, result );
 711 #if DEBUG_FRAG
 712                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 713                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 714 #endif
 715             }
 716             break;
 717          case FP_OPCODE_DP4:
 718             {
 719                GLfloat a[4], b[4], result[4];
 720                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 721                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 722                result[0] = result[1] = result[2] = result[3] =
 723                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 724                store_vector4( inst, machine, result );
 725 #if DEBUG_FRAG
 726                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 727                       result[0], a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 728 #endif
 729             }
 730             break;
 731          case FP_OPCODE_DPH:
 732             {
 733                GLfloat a[4], b[4], result[4];
 734                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 735                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 736                result[0] = result[1] = result[2] = result[3] =
 737                   a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
 738                store_vector4( inst, machine, result );
 739             }
 740             break;
 741          case FP_OPCODE_DST: /* Distance vector */
 742             {
 743                GLfloat a[4], b[4], result[4];
 744                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 745                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 746                result[0] = 1.0F;
 747                result[1] = a[1] * b[1];
 748                result[2] = a[2];
 749                result[3] = b[3];
 750                store_vector4( inst, machine, result );
 751             }
 752             break;
 753          case FP_OPCODE_EX2: /* Exponential base 2 */
 754             {
 755                GLfloat a[4], result[4];
 756                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 757                result[0] = result[1] = result[2] = result[3] =
 758                   (GLfloat) _mesa_pow(2.0, a[0]);
 759                store_vector4( inst, machine, result );
 760             }
 761             break;
 762          case FP_OPCODE_FLR:
 763             {
 764                GLfloat a[4], result[4];
 765                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 766                result[0] = FLOORF(a[0]);
 767                result[1] = FLOORF(a[1]);
 768                result[2] = FLOORF(a[2]);
 769                result[3] = FLOORF(a[3]);
 770                store_vector4( inst, machine, result );
 771             }
 772             break;
 773          case FP_OPCODE_FRC:
 774             {
 775                GLfloat a[4], result[4];
 776                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 777                result[0] = a[0] - FLOORF(a[0]);
 778                result[1] = a[1] - FLOORF(a[1]);
 779                result[2] = a[2] - FLOORF(a[2]);
 780                result[3] = a[3] - FLOORF(a[3]);
 781                store_vector4( inst, machine, result );
 782             }
 783             break;
 784          case FP_OPCODE_KIL_NV: /* NV_f_p only */
 785             {
 786                const GLuint swizzle = inst->DstReg.CondSwizzle;
 787                const GLuint condMask = inst->DstReg.CondMask;
 788                if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 789                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 790                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 791                    test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 792                   return GL_FALSE;
 793                }
 794             }
 795             break;
 796          case FP_OPCODE_KIL: /* ARB_f_p only */
 797             {
 798                GLfloat a[4];
 799                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 800                if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 801                   return GL_FALSE;
 802                }
 803             }
 804             break;
 805          case FP_OPCODE_LG2:  /* log base 2 */
 806             {
 807                GLfloat a[4], result[4];
 808                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 809                result[0] = result[1] = result[2] = result[3]
 810                   = LOG2(a[0]);
 811                store_vector4( inst, machine, result );
 812             }
 813             break;
 814          case FP_OPCODE_LIT:
 815             {
 816                const GLfloat epsilon = 1.0F / 256.0F; /* from NV VP spec */
 817                GLfloat a[4], result[4];
 818                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 819                a[0] = MAX2(a[0], 0.0F);
 820                a[1] = MAX2(a[1], 0.0F);
 821                /* XXX ARB version clamps a[3], NV version doesn't */
 822                a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 823                result[0] = 1.0F;
 824                result[1] = a[0];
 825                /* XXX we could probably just use pow() here */
 826                result[2] = (a[0] > 0.0F) ? (GLfloat) exp(a[3] * log(a[1])) : 0.0F;
 827                result[3] = 1.0F;
 828                store_vector4( inst, machine, result );
 829             }
 830             break;
 831          case FP_OPCODE_LRP:
 832             {
 833                GLfloat a[4], b[4], c[4], result[4];
 834                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 835                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 836                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 837                result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
 838                result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
 839                result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
 840                result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
 841                store_vector4( inst, machine, result );
 842             }
 843             break;
 844          case FP_OPCODE_MAD:
 845             {
 846                GLfloat a[4], b[4], c[4], result[4];
 847                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 848                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 849                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
 850                result[0] = a[0] * b[0] + c[0];
 851                result[1] = a[1] * b[1] + c[1];
 852                result[2] = a[2] * b[2] + c[2];
 853                result[3] = a[3] * b[3] + c[3];
 854                store_vector4( inst, machine, result );
 855             }
 856             break;
 857          case FP_OPCODE_MAX:
 858             {
 859                GLfloat a[4], b[4], result[4];
 860                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 861                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 862                result[0] = MAX2(a[0], b[0]);
 863                result[1] = MAX2(a[1], b[1]);
 864                result[2] = MAX2(a[2], b[2]);
 865                result[3] = MAX2(a[3], b[3]);
 866                store_vector4( inst, machine, result );
 867 #if DEBUG_FRAG
 868                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
 869                       result[0], result[1], result[2], result[3],
 870                       a[0], a[1], a[2], a[3],
 871                       b[0], b[1], b[2], b[3]);
 872 #endif
 873             }
 874             break;
 875          case FP_OPCODE_MIN:
 876             {
 877                GLfloat a[4], b[4], result[4];
 878                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 879                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 880                result[0] = MIN2(a[0], b[0]);
 881                result[1] = MIN2(a[1], b[1]);
 882                result[2] = MIN2(a[2], b[2]);
 883                result[3] = MIN2(a[3], b[3]);
 884                store_vector4( inst, machine, result );
 885             }
 886             break;
 887          case FP_OPCODE_MOV:
 888             {
 889                GLfloat result[4];
 890                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, result );
 891                store_vector4( inst, machine, result );
 892 #if DEBUG_FRAG
 893                printf("MOV (%g %g %g %g)\n",
 894                       result[0], result[1], result[2], result[3]);
 895 #endif
 896             }
 897             break;
 898          case FP_OPCODE_MUL:
 899             {
 900                GLfloat a[4], b[4], result[4];
 901                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 902                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
 903                result[0] = a[0] * b[0];
 904                result[1] = a[1] * b[1];
 905                result[2] = a[2] * b[2];
 906                result[3] = a[3] * b[3];
 907                store_vector4( inst, machine, result );
 908 #if DEBUG_FRAG
 909                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
 910                       result[0], result[1], result[2], result[3],
 911                       a[0], a[1], a[2], a[3],
 912                       b[0], b[1], b[2], b[3]);
 913 #endif
 914             }
 915             break;
 916          case FP_OPCODE_PK2H: /* pack two 16-bit floats in one 32-bit float */
 917             {
 918                GLfloat a[4], result[4];
 919                GLhalfNV hx, hy;
 920                GLuint *rawResult = (GLuint *) result;
 921                GLuint twoHalves;
 922                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 923                hx = _mesa_float_to_half(a[0]);
 924                hy = _mesa_float_to_half(a[1]);
 925                twoHalves = hx | (hy << 16);
 926                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 927                   = twoHalves;
 928                store_vector4( inst, machine, result );
 929             }
 930             break;
 931          case FP_OPCODE_PK2US: /* pack two GLushorts into one 32-bit float */
 932             {
 933                GLfloat a[4], result[4];
 934                GLuint usx, usy, *rawResult = (GLuint *) result;
 935                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 936                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 937                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 938                usx = IROUND(a[0] * 65535.0F);
 939                usy = IROUND(a[1] * 65535.0F);
 940                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 941                   = usx | (usy << 16);
 942                store_vector4( inst, machine, result );
 943             }
 944             break;
 945          case FP_OPCODE_PK4B: /* pack four GLbytes into one 32-bit float */
 946             {
 947                GLfloat a[4], result[4];
 948                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 949                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 950                a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
 951                a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
 952                a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
 953                a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
 954                ubx = IROUND(127.0F * a[0] + 128.0F);
 955                uby = IROUND(127.0F * a[1] + 128.0F);
 956                ubz = IROUND(127.0F * a[2] + 128.0F);
 957                ubw = IROUND(127.0F * a[3] + 128.0F);
 958                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 959                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 960                store_vector4( inst, machine, result );
 961             }
 962             break;
 963          case FP_OPCODE_PK4UB: /* pack four GLubytes into one 32-bit float */
 964             {
 965                GLfloat a[4], result[4];
 966                GLuint ubx, uby, ubz, ubw, *rawResult = (GLuint *) result;
 967                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
 968                a[0] = CLAMP(a[0], 0.0F, 1.0F);
 969                a[1] = CLAMP(a[1], 0.0F, 1.0F);
 970                a[2] = CLAMP(a[2], 0.0F, 1.0F);
 971                a[3] = CLAMP(a[3], 0.0F, 1.0F);
 972                ubx = IROUND(255.0F * a[0]);
 973                uby = IROUND(255.0F * a[1]);
 974                ubz = IROUND(255.0F * a[2]);
 975                ubw = IROUND(255.0F * a[3]);
 976                rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
 977                   = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
 978                store_vector4( inst, machine, result );
 979             }
 980             break;
 981          case FP_OPCODE_POW:
 982             {
 983                GLfloat a[4], b[4], result[4];
 984                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 985                fetch_vector1( ctx, &inst->SrcReg[1], machine, program, b );
 986                result[0] = result[1] = result[2] = result[3]
 987                   = (GLfloat)_mesa_pow(a[0], b[0]);
 988                store_vector4( inst, machine, result );
 989             }
 990             break;
 991          case FP_OPCODE_RCP:
 992             {
 993                GLfloat a[4], result[4];
 994                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
 995 #if DEBUG_FRAG
 996                if (a[0] == 0)
 997                   printf("RCP(0)\n");
 998                else if (IS_INF_OR_NAN(a[0]))
 999                   printf("RCP(inf)\n");
1000 #endif
1001                result[0] = result[1] = result[2] = result[3]
1002                   = 1.0F / a[0];
1003                store_vector4( inst, machine, result );
1004             }
1005             break;
1006          case FP_OPCODE_RFL:
1007             {
1008                GLfloat axis[4], dir[4], result[4], tmp[4];
1009                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, axis );
1010                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dir );
1011                tmp[3] = axis[0] * axis[0]
1012                       + axis[1] * axis[1]
1013                       + axis[2] * axis[2];
1014                tmp[0] = (2.0F * (axis[0] * dir[0] +
1015                                  axis[1] * dir[1] +
1016                                  axis[2] * dir[2])) / tmp[3];
1017                result[0] = tmp[0] * axis[0] - dir[0];
1018                result[1] = tmp[0] * axis[1] - dir[1];
1019                result[2] = tmp[0] * axis[2] - dir[2];
1020                /* result[3] is never written! XXX enforce in parser! */
1021                store_vector4( inst, machine, result );
1022             }
1023             break;
1024          case FP_OPCODE_RSQ: /* 1 / sqrt() */
1025             {
1026                GLfloat a[4], result[4];
1027                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1028                result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1029                store_vector4( inst, machine, result );
1030 #if DEBUG_FRAG
1031                printf("RSQ %g = 1/sqrt(%g)\n", result[0], a[0]);
1032 #endif
1033             }
1034             break;
1035          case FP_OPCODE_SCS: /* sine and cos */
1036             {
1037                GLfloat a[4], result[4];
1038                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1039                result[0] = (GLfloat)cos(a[0]);
1040                result[1] = (GLfloat)sin(a[0]);
1041                result[2] = 0.0;  /* undefined! */
1042                result[3] = 0.0;  /* undefined! */
1043                store_vector4( inst, machine, result );
1044             }
1045             break;
1046          case FP_OPCODE_SEQ: /* set on equal */
1047             {
1048                GLfloat a[4], b[4], result[4];
1049                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1050                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1051                result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1052                result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1053                result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1054                result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1055                store_vector4( inst, machine, result );
1056             }
1057             break;
1058          case FP_OPCODE_SFL: /* set false, operands ignored */
1059             {
1060                static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1061                store_vector4( inst, machine, result );
1062             }
1063             break;
1064          case FP_OPCODE_SGE: /* set on greater or equal */
1065             {
1066                GLfloat a[4], b[4], result[4];
1067                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1068                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1069                result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1070                result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1071                result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1072                result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1073                store_vector4( inst, machine, result );
1074             }
1075             break;
1076          case FP_OPCODE_SGT: /* set on greater */
1077             {
1078                GLfloat a[4], b[4], result[4];
1079                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1080                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1081                result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1082                result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1083                result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1084                result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1085                store_vector4( inst, machine, result );
1086             }
1087             break;
1088          case FP_OPCODE_SIN:
1089             {
1090                GLfloat a[4], result[4];
1091                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1092                result[0] = result[1] = result[2] =
1093                        result[3] = (GLfloat)_mesa_sin(a[0]);
1094                store_vector4( inst, machine, result );
1095             }
1096             break;
1097          case FP_OPCODE_SLE: /* set on less or equal */
1098             {
1099                GLfloat a[4], b[4], result[4];
1100                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1101                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1102                result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1103                result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1104                result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1105                result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1106                store_vector4( inst, machine, result );
1107             }
1108             break;
1109          case FP_OPCODE_SLT: /* set on less */
1110             {
1111                GLfloat a[4], b[4], result[4];
1112                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1113                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1114                result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1115                result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1116                result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1117                result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1118                store_vector4( inst, machine, result );
1119             }
1120             break;
1121          case FP_OPCODE_SNE: /* set on not equal */
1122             {
1123                GLfloat a[4], b[4], result[4];
1124                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1125                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1126                result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1127                result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1128                result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1129                result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1130                store_vector4( inst, machine, result );
1131             }
1132             break;
1133          case FP_OPCODE_STR: /* set true, operands ignored */
1134             {
1135                static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1136                store_vector4( inst, machine, result );
1137             }
1138             break;
1139          case FP_OPCODE_SUB:
1140             {
1141                GLfloat a[4], b[4], result[4];
1142                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1143                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1144                result[0] = a[0] - b[0];
1145                result[1] = a[1] - b[1];
1146                result[2] = a[2] - b[2];
1147                result[3] = a[3] - b[3];
1148                store_vector4( inst, machine, result );
1149             }
1150             break;
1151          case FP_OPCODE_SWZ:
1152             {
1153                const struct fp_src_register *source = &inst->SrcReg[0];
1154                const GLfloat *src = get_register_pointer(ctx, source,
1155                                                          machine, program);
1156                GLfloat result[4];
1157                GLuint i;
1158
1159                /* do extended swizzling here */
1160                for (i = 0; i < 3; i++) {
1161                   if (GET_SWZ(source->Swizzle, i) == SWIZZLE_ZERO)
1162                      result[i] = 0.0;
1163                   else if (GET_SWZ(source->Swizzle, i) == SWIZZLE_ONE)
1164                      result[i] = -1.0;
1165                   else
1166                      result[i] = -src[GET_SWZ(source->Swizzle, i)];
1167
1168                   if (source->NegateBase)
1169                      result[i] = -result[i];
1170                }
1171                store_vector4( inst, machine, result );
1172             }
1173             break;
1174          case FP_OPCODE_TEX: /* Both ARB and NV frag prog */
1175             /* Texel lookup */
1176             {
1177                GLfloat texcoord[4], color[4];
1178                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1179                /* Note: we pass 0 for LOD.  The ARB extension requires it
1180                 * while the NV extension says it's implementation dependant.
1181                 */
1182                /* KW: Previously lambda was passed as zero, but I
1183                 * believe this is incorrect, the spec seems to
1184                 * indicate rather that lambda should not be
1185                 * changed/biased, unlike TXB where texcoord[3] is
1186                 * added to the lambda calculations.  The lambda should
1187                 * still be calculated normally for TEX & TXP though,
1188                 * not set to zero.  Otherwise it's very difficult to
1189                 * implement normal GL semantics through the fragment
1190                 * shader.
1191                 */
1192                fetch_texel( ctx, texcoord,
1193                             span->array->lambda[inst->TexSrcUnit][column],
1194                             inst->TexSrcUnit, color );
1195 #if DEBUG_FRAG
1196                if (color[3])
1197                   printf("color[3] = %f\n", color[3]);
1198 #endif
1199                store_vector4( inst, machine, color );
1200             }
1201             break;
1202          case FP_OPCODE_TXB: /* GL_ARB_fragment_program only */
1203             /* Texel lookup with LOD bias */
1204             {
1205                GLfloat texcoord[4], color[4], bias, lambda;
1206
1207                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1208                /* texcoord[3] is the bias to add to lambda */
1209                bias = ctx->Texture.Unit[inst->TexSrcUnit].LodBias
1210                     + ctx->Texture.Unit[inst->TexSrcUnit]._Current->LodBias
1211                     + texcoord[3];
1212                lambda = span->array->lambda[inst->TexSrcUnit][column] + bias;
1213                fetch_texel( ctx, texcoord, lambda,
1214                             inst->TexSrcUnit, color );
1215                store_vector4( inst, machine, color );
1216             }
1217             break;
1218          case FP_OPCODE_TXD: /* GL_NV_fragment_program only */
1219             /* Texture lookup w/ partial derivatives for LOD */
1220             {
1221                GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1222                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1223                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dtdx );
1224                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, dtdy );
1225                fetch_texel_deriv( ctx, texcoord, dtdx, dtdy, inst->TexSrcUnit,
1226                                   color );
1227                store_vector4( inst, machine, color );
1228             }
1229             break;
1230          case FP_OPCODE_TXP: /* GL_ARB_fragment_program only */
1231             /* Texture lookup w/ projective divide */
1232             {
1233                GLfloat texcoord[4], color[4];
1234                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1235                /* Not so sure about this test - if texcoord[3] is
1236                 * zero, we'd probably be fine except for an ASSERT in
1237                 * IROUND_POS() which gets triggered by the inf values created.
1238                 */
1239                if (texcoord[3] != 0.0) {
1240                   texcoord[0] /= texcoord[3];
1241                   texcoord[1] /= texcoord[3];
1242                   texcoord[2] /= texcoord[3];
1243                }
1244                /* KW: Previously lambda was passed as zero, but I
1245                 * believe this is incorrect, the spec seems to
1246                 * indicate rather that lambda should not be
1247                 * changed/biased, unlike TXB where texcoord[3] is
1248                 * added to the lambda calculations.  The lambda should
1249                 * still be calculated normally for TEX & TXP though,
1250                 * not set to zero.
1251                 */
1252                fetch_texel( ctx, texcoord,
1253                             span->array->lambda[inst->TexSrcUnit][column],
1254                             inst->TexSrcUnit, color );
1255                store_vector4( inst, machine, color );
1256             }
1257             break;
1258          case FP_OPCODE_TXP_NV: /* GL_NV_fragment_program only */
1259             /* Texture lookup w/ projective divide */
1260             {
1261                GLfloat texcoord[4], color[4];
1262                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
1263                if (inst->TexSrcIdx != TEXTURE_CUBE_INDEX &&
1264                    texcoord[3] != 0.0) {
1265                   texcoord[0] /= texcoord[3];
1266                   texcoord[1] /= texcoord[3];
1267                   texcoord[2] /= texcoord[3];
1268                }
1269                fetch_texel( ctx, texcoord,
1270                             span->array->lambda[inst->TexSrcUnit][column],
1271                             inst->TexSrcUnit, color );
1272                store_vector4( inst, machine, color );
1273             }
1274             break;
1275          case FP_OPCODE_UP2H: /* unpack two 16-bit floats */
1276             {
1277                GLfloat a[4], result[4];
1278                const GLuint *rawBits = (const GLuint *) a;
1279                GLhalfNV hx, hy;
1280                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1281                hx = rawBits[0] & 0xffff;
1282                hy = rawBits[0] >> 16;
1283                result[0] = result[2] = _mesa_half_to_float(hx);
1284                result[1] = result[3] = _mesa_half_to_float(hy);
1285                store_vector4( inst, machine, result );
1286             }
1287             break;
1288          case FP_OPCODE_UP2US: /* unpack two GLushorts */
1289             {
1290                GLfloat a[4], result[4];
1291                const GLuint *rawBits = (const GLuint *) a;
1292                GLushort usx, usy;
1293                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1294                usx = rawBits[0] & 0xffff;
1295                usy = rawBits[0] >> 16;
1296                result[0] = result[2] = usx * (1.0f / 65535.0f);
1297                result[1] = result[3] = usy * (1.0f / 65535.0f);
1298                store_vector4( inst, machine, result );
1299             }
1300             break;
1301          case FP_OPCODE_UP4B: /* unpack four GLbytes */
1302             {
1303                GLfloat a[4], result[4];
1304                const GLuint *rawBits = (const GLuint *) a;
1305                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1306                result[0] = (((rawBits[0] >>  0) & 0xff) - 128) / 127.0F;
1307                result[1] = (((rawBits[0] >>  8) & 0xff) - 128) / 127.0F;
1308                result[2] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
1309                result[3] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
1310                store_vector4( inst, machine, result );
1311             }
1312             break;
1313          case FP_OPCODE_UP4UB: /* unpack four GLubytes */
1314             {
1315                GLfloat a[4], result[4];
1316                const GLuint *rawBits = (const GLuint *) a;
1317                fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
1318                result[0] = ((rawBits[0] >>  0) & 0xff) / 255.0F;
1319                result[1] = ((rawBits[0] >>  8) & 0xff) / 255.0F;
1320                result[2] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
1321                result[3] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
1322                store_vector4( inst, machine, result );
1323             }
1324             break;
1325          case FP_OPCODE_XPD: /* cross product */
1326             {
1327                GLfloat a[4], b[4], result[4];
1328                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1329                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1330                result[0] = a[1] * b[2] - a[2] * b[1];
1331                result[1] = a[2] * b[0] - a[0] * b[2];
1332                result[2] = a[0] * b[1] - a[1] * b[0];
1333                result[3] = 1.0;
1334                store_vector4( inst, machine, result );
1335             }
1336             break;
1337          case FP_OPCODE_X2D: /* 2-D matrix transform */
1338             {
1339                GLfloat a[4], b[4], c[4], result[4];
1340                fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
1341                fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
1342                fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
1343                result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1344                result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1345                result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1346                result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1347                store_vector4( inst, machine, result );
1348             }
1349             break;
1350          case FP_OPCODE_PRINT:
1351             {
1352                if (inst->SrcReg[0].File != -1) {
1353                   GLfloat a[4];
1354                   fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
1355                   _mesa_printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
1356                                a[0], a[1], a[2], a[3]);
1357                }
1358                else {
1359                   _mesa_printf("%s\n", (const char *) inst->Data);
1360                }
1361             }
1362             break;
1363          case FP_OPCODE_END:
1364             return GL_TRUE;
1365          default:
1366             _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
1367                           inst->Opcode);
1368             return GL_TRUE; /* return value doesn't matter */
1369       }
1370    }
1371    return GL_TRUE;
1372 }
1373
1374
1375 static void
1376 init_machine( GLcontext *ctx, struct fp_machine *machine,
1377               const struct fragment_program *program,
1378               const struct sw_span *span, GLuint col )
1379 {
1380    GLuint inputsRead = program->InputsRead;
1381    GLuint u;
1382
1383    if (ctx->FragmentProgram.CallbackEnabled)
1384       inputsRead = ~0;
1385
1386    if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
1387       /* Clear temporary registers (undefined for ARB_f_p) */
1388       _mesa_bzero(machine->Temporaries,
1389                   MAX_NV_FRAGMENT_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
1390    }
1391
1392    /* Load input registers */
1393    if (inputsRead & (1 << FRAG_ATTRIB_WPOS)) {
1394       GLfloat *wpos = machine->Inputs[FRAG_ATTRIB_WPOS];
1395       wpos[0] = (GLfloat) span->x + col;
1396       wpos[1] = (GLfloat) span->y;
1397       wpos[2] = (GLfloat) span->array->z[col] / ctx->DrawBuffer->_DepthMaxF;
1398       wpos[3] = span->w + col * span->dwdx;
1399    }
1400    if (inputsRead & (1 << FRAG_ATTRIB_COL0)) {
1401       GLfloat *col0 = machine->Inputs[FRAG_ATTRIB_COL0];
1402       col0[0] = CHAN_TO_FLOAT(span->array->rgba[col][RCOMP]);
1403       col0[1] = CHAN_TO_FLOAT(span->array->rgba[col][GCOMP]);
1404       col0[2] = CHAN_TO_FLOAT(span->array->rgba[col][BCOMP]);
1405       col0[3] = CHAN_TO_FLOAT(span->array->rgba[col][ACOMP]);
1406    }
1407    if (inputsRead & (1 << FRAG_ATTRIB_COL1)) {
1408       GLfloat *col1 = machine->Inputs[FRAG_ATTRIB_COL1];
1409       col1[0] = CHAN_TO_FLOAT(span->array->spec[col][RCOMP]);
1410       col1[1] = CHAN_TO_FLOAT(span->array->spec[col][GCOMP]);
1411       col1[2] = CHAN_TO_FLOAT(span->array->spec[col][BCOMP]);
1412       col1[3] = CHAN_TO_FLOAT(span->array->spec[col][ACOMP]);
1413    }
1414    if (inputsRead & (1 << FRAG_ATTRIB_FOGC)) {
1415       GLfloat *fogc = machine->Inputs[FRAG_ATTRIB_FOGC];
1416       fogc[0] = span->array->fog[col];
1417       fogc[1] = 0.0F;
1418       fogc[2] = 0.0F;
1419       fogc[3] = 0.0F;
1420    }
1421    for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
1422       if (inputsRead & (1 << (FRAG_ATTRIB_TEX0 + u))) {
1423          GLfloat *tex = machine->Inputs[FRAG_ATTRIB_TEX0 + u];
1424          /*ASSERT(ctx->Texture._EnabledCoordUnits & (1 << u));*/
1425          COPY_4V(tex, span->array->texcoords[u][col]);
1426          /*ASSERT(tex[0] != 0 || tex[1] != 0 || tex[2] != 0);*/
1427       }
1428    }
1429
1430    /* init condition codes */
1431    machine->CondCodes[0] = COND_EQ;
1432    machine->CondCodes[1] = COND_EQ;
1433    machine->CondCodes[2] = COND_EQ;
1434    machine->CondCodes[3] = COND_EQ;
1435 }
1436
1437
1438
1439 /**
1440  * Execute the current fragment program, operating on the given span.
1441  */
1442 void
1443 _swrast_exec_fragment_program( GLcontext *ctx, struct sw_span *span )
1444 {
1445    const struct fragment_program *program = ctx->FragmentProgram._Current;
1446    GLuint i;
1447
1448    ctx->_CurrentProgram = GL_FRAGMENT_PROGRAM_ARB; /* or NV, doesn't matter */
1449
1450    if (program->Parameters) {
1451       _mesa_load_state_parameters(ctx, program->Parameters);
1452    }
1453
1454    for (i = 0; i < span->end; i++) {
1455       if (span->array->mask[i]) {
1456          init_machine(ctx, &ctx->FragmentProgram.Machine,
1457                       ctx->FragmentProgram._Current, span, i);
1458
1459 #ifdef USE_TCC
1460          if (!_swrast_execute_codegen_program(ctx, program, ~0,
1461                                               &ctx->FragmentProgram.Machine,
1462                                               span, i)) {
1463             span->array->mask[i] = GL_FALSE;  /* killed fragment */
1464          }
1465 #else
1466          if (!execute_program(ctx, program, ~0,
1467                               &ctx->FragmentProgram.Machine, span, i)) {
1468             span->array->mask[i] = GL_FALSE;  /* killed fragment */
1469          }
1470 #endif
1471
1472          /* Store output registers */
1473          {
1474             const GLfloat *colOut
1475                = ctx->FragmentProgram.Machine.Outputs[FRAG_OUTPUT_COLR];
1476             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][RCOMP], colOut[0]);
1477             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][GCOMP], colOut[1]);
1478             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][BCOMP], colOut[2]);
1479             UNCLAMPED_FLOAT_TO_CHAN(span->array->rgba[i][ACOMP], colOut[3]);
1480          }
1481          /* depth value */
1482          if (program->OutputsWritten & (1 << FRAG_OUTPUT_DEPR))
1483             span->array->z[i] = IROUND(ctx->FragmentProgram.Machine.Outputs[FRAG_OUTPUT_DEPR][0] * ctx->DrawBuffer->_DepthMaxF);
1484       }
1485    }
1486
1487    ctx->_CurrentProgram = 0;
1488 }
1489