src/mesa/shader/nvvertexec.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.3
   4  *
   5  * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file nvvertexec.c
  27  * Code to execute vertex programs.
  28  * \author Brian Paul
  29  */
  30
  31 #include "glheader.h"
  32 #include "context.h"
  33 #include "imports.h"
  34 #include "macros.h"
  35 #include "mtypes.h"
  36 #include "nvvertexec.h"
  37 #include "nvvertprog.h"
  38 #include "program.h"
  39 #include "math/m_matrix.h"
  40
  41
  42 static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
  43
  44
  45 /**
  46  * Load/initialize the vertex program registers which need to be set
  47  * per-vertex.
  48  */
  49 void
  50 _mesa_init_vp_per_vertex_registers(GLcontext *ctx)
  51 {
  52    /* Input registers get initialized from the current vertex attribs */
  53    MEMCPY(ctx->VertexProgram.Inputs, ctx->Current.Attrib,
  54           VERT_ATTRIB_MAX * 4 * sizeof(GLfloat));
  55
  56    if (ctx->VertexProgram.Current->IsNVProgram) {
  57       GLuint i;
  58       /* Output/result regs are initialized to [0,0,0,1] */
  59       for (i = 0; i < MAX_NV_VERTEX_PROGRAM_OUTPUTS; i++) {
  60          ASSIGN_4V(ctx->VertexProgram.Outputs[i], 0.0F, 0.0F, 0.0F, 1.0F);
  61       }
  62       /* Temp regs are initialized to [0,0,0,0] */
  63       for (i = 0; i < MAX_NV_VERTEX_PROGRAM_TEMPS; i++) {
  64          ASSIGN_4V(ctx->VertexProgram.Temporaries[i], 0.0F, 0.0F, 0.0F, 0.0F);
  65       }
  66       ASSIGN_4V(ctx->VertexProgram.AddressReg, 0, 0, 0, 0);
  67    }
  68 }
  69
  70
  71
  72 /**
  73  * Copy the 16 elements of a matrix into four consecutive program
  74  * registers starting at 'pos'.
  75  */
  76 static void
  77 load_matrix(GLfloat registers[][4], GLuint pos, const GLfloat mat[16])
  78 {
  79    GLuint i;
  80    for (i = 0; i < 4; i++) {
  81       registers[pos + i][0] = mat[0 + i];
  82       registers[pos + i][1] = mat[4 + i];
  83       registers[pos + i][2] = mat[8 + i];
  84       registers[pos + i][3] = mat[12 + i];
  85    }
  86 }
  87
  88
  89 /**
  90  * As above, but transpose the matrix.
  91  */
  92 static void
  93 load_transpose_matrix(GLfloat registers[][4], GLuint pos,
  94                       const GLfloat mat[16])
  95 {
  96    MEMCPY(registers[pos], mat, 16 * sizeof(GLfloat));
  97 }
  98
  99
 100 /**
 101  * Load program parameter registers with tracked matrices (if NV program)
 102  * or GL state values (if ARB program).
 103  * This needs to be done per glBegin/glEnd, not per-vertex.
 104  */
 105 void
 106 _mesa_init_vp_per_primitive_registers(GLcontext *ctx)
 107 {
 108    if (ctx->VertexProgram.Current->IsNVProgram) {
 109       GLuint i;
 110
 111       for (i = 0; i < MAX_NV_VERTEX_PROGRAM_PARAMS / 4; i++) {
 112          /* point 'mat' at source matrix */
 113          GLmatrix *mat;
 114          if (ctx->VertexProgram.TrackMatrix[i] == GL_MODELVIEW) {
 115             mat = ctx->ModelviewMatrixStack.Top;
 116          }
 117          else if (ctx->VertexProgram.TrackMatrix[i] == GL_PROJECTION) {
 118             mat = ctx->ProjectionMatrixStack.Top;
 119          }
 120          else if (ctx->VertexProgram.TrackMatrix[i] == GL_TEXTURE) {
 121             mat = ctx->TextureMatrixStack[ctx->Texture.CurrentUnit].Top;
 122          }
 123          else if (ctx->VertexProgram.TrackMatrix[i] == GL_COLOR) {
 124             mat = ctx->ColorMatrixStack.Top;
 125          }
 126          else if (ctx->VertexProgram.TrackMatrix[i]==GL_MODELVIEW_PROJECTION_NV) {
 127             /* XXX verify the combined matrix is up to date */
 128             mat = &ctx->_ModelProjectMatrix;
 129          }
 130          else if (ctx->VertexProgram.TrackMatrix[i] >= GL_MATRIX0_NV &&
 131                   ctx->VertexProgram.TrackMatrix[i] <= GL_MATRIX7_NV) {
 132             GLuint n = ctx->VertexProgram.TrackMatrix[i] - GL_MATRIX0_NV;
 133             ASSERT(n < MAX_PROGRAM_MATRICES);
 134             mat = ctx->ProgramMatrixStack[n].Top;
 135          }
 136          else {
 137             /* no matrix is tracked, but we leave the register values as-is */
 138             assert(ctx->VertexProgram.TrackMatrix[i] == GL_NONE);
 139             continue;
 140          }
 141
 142          /* load the matrix */
 143          if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
 144             load_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
 145          }
 146          else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_INVERSE_NV) {
 147             _math_matrix_analyse(mat); /* update the inverse */
 148             assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 149             load_matrix(ctx->VertexProgram.Parameters, i*4, mat->inv);
 150          }
 151          else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_TRANSPOSE_NV) {
 152             load_transpose_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
 153          }
 154          else {
 155             assert(ctx->VertexProgram.TrackMatrixTransform[i]
 156                    == GL_INVERSE_TRANSPOSE_NV);
 157             _math_matrix_analyse(mat); /* update the inverse */
 158             assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 159             load_transpose_matrix(ctx->VertexProgram.Parameters, i*4, mat->inv);
 160          }
 161       }
 162    }
 163    else {
 164       /* Using and ARB vertex program */
 165       if (ctx->VertexProgram.Current->Parameters) {
 166          /* Grab the state GL state and put into registers */
 167          _mesa_load_state_parameters(ctx,
 168                                      ctx->VertexProgram.Current->Parameters);
 169       }
 170    }
 171 }
 172
 173
 174
 175 /**
 176  * For debugging.  Dump the current vertex program machine registers.
 177  */
 178 void
 179 _mesa_dump_vp_state( const struct gl_vertex_program_state *state )
 180 {
 181    int i;
 182    _mesa_printf("VertexIn:\n");
 183    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_INPUTS; i++) {
 184       _mesa_printf("%d: %f %f %f %f   ", i,
 185                    state->Inputs[i][0],
 186                    state->Inputs[i][1],
 187                    state->Inputs[i][2],
 188                    state->Inputs[i][3]);
 189    }
 190    _mesa_printf("\n");
 191
 192    _mesa_printf("VertexOut:\n");
 193    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_OUTPUTS; i++) {
 194       _mesa_printf("%d: %f %f %f %f   ", i,
 195                   state->Outputs[i][0],
 196                   state->Outputs[i][1],
 197                   state->Outputs[i][2],
 198                   state->Outputs[i][3]);
 199    }
 200    _mesa_printf("\n");
 201
 202    _mesa_printf("Registers:\n");
 203    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_TEMPS; i++) {
 204       _mesa_printf("%d: %f %f %f %f   ", i,
 205                   state->Temporaries[i][0],
 206                   state->Temporaries[i][1],
 207                   state->Temporaries[i][2],
 208                   state->Temporaries[i][3]);
 209    }
 210    _mesa_printf("\n");
 211
 212    _mesa_printf("Parameters:\n");
 213    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_PARAMS; i++) {
 214       _mesa_printf("%d: %f %f %f %f   ", i,
 215                   state->Parameters[i][0],
 216                   state->Parameters[i][1],
 217                   state->Parameters[i][2],
 218                   state->Parameters[i][3]);
 219    }
 220    _mesa_printf("\n");
 221 }
 222
 223
 224
 225 /**
 226  * Return a pointer to the 4-element float vector specified by the given
 227  * source register.
 228  */
 229 static INLINE const GLfloat *
 230 get_register_pointer( const struct vp_src_register *source,
 231                       const struct gl_vertex_program_state *state )
 232 {
 233    if (source->RelAddr) {
 234       const GLint reg = source->Index + state->AddressReg[0];
 235       ASSERT( (source->File == PROGRAM_ENV_PARAM) ||
 236         (source->File == PROGRAM_STATE_VAR) );
 237       if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
 238          return ZeroVec;
 239       else if (source->File == PROGRAM_ENV_PARAM)
 240          return state->Parameters[reg];
 241       else
 242          return state->Current->Parameters->Parameters[reg].Values;
 243    }
 244    else {
 245       switch (source->File) {
 246          case PROGRAM_TEMPORARY:
 247             ASSERT(source->Index < MAX_NV_VERTEX_PROGRAM_TEMPS);
 248             return state->Temporaries[source->Index];
 249          case PROGRAM_INPUT:
 250             ASSERT(source->Index < MAX_NV_VERTEX_PROGRAM_INPUTS);
 251             return state->Inputs[source->Index];
 252          case PROGRAM_OUTPUT:
 253             /* This is only needed for the PRINT instruction */
 254             ASSERT(source->Index < MAX_NV_VERTEX_PROGRAM_OUTPUTS);
 255             return state->Outputs[source->Index];
 256          case PROGRAM_LOCAL_PARAM:
 257             ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 258             return state->Current->Base.LocalParams[source->Index];
 259          case PROGRAM_ENV_PARAM:
 260             ASSERT(source->Index < MAX_NV_VERTEX_PROGRAM_PARAMS);
 261             return state->Parameters[source->Index];
 262          case PROGRAM_STATE_VAR:
 263             ASSERT(source->Index < state->Current->Parameters->NumParameters);
 264             return state->Current->Parameters->Parameters[source->Index].Values;
 265          default:
 266             _mesa_problem(NULL,
 267                           "Bad source register file in get_register_pointer");
 268             return NULL;
 269       }
 270    }
 271    return NULL;
 272 }
 273
 274
 275 /**
 276  * Fetch a 4-element float vector from the given source register.
 277  * Apply swizzling and negating as needed.
 278  */
 279 static INLINE void
 280 fetch_vector4( const struct vp_src_register *source,
 281                const struct gl_vertex_program_state *state,
 282                GLfloat result[4] )
 283 {
 284    const GLfloat *src = get_register_pointer(source, state);
 285
 286    if (source->Negate) {
 287       result[0] = -src[source->Swizzle[0]];
 288       result[1] = -src[source->Swizzle[1]];
 289       result[2] = -src[source->Swizzle[2]];
 290       result[3] = -src[source->Swizzle[3]];
 291    }
 292    else {
 293       result[0] = src[source->Swizzle[0]];
 294       result[1] = src[source->Swizzle[1]];
 295       result[2] = src[source->Swizzle[2]];
 296       result[3] = src[source->Swizzle[3]];
 297    }
 298 }
 299
 300
 301
 302 /**
 303  * As above, but only return result[0] element.
 304  */
 305 static INLINE void
 306 fetch_vector1( const struct vp_src_register *source,
 307                const struct gl_vertex_program_state *state,
 308                GLfloat result[4] )
 309 {
 310    const GLfloat *src = get_register_pointer(source, state);
 311
 312    if (source->Negate) {
 313       result[0] = -src[source->Swizzle[0]];
 314    }
 315    else {
 316       result[0] = src[source->Swizzle[0]];
 317    }
 318 }
 319
 320
 321 /**
 322  * Store 4 floats into a register.
 323  */
 324 static void
 325 store_vector4( const struct vp_dst_register *dest,
 326                struct gl_vertex_program_state *state,
 327                const GLfloat value[4] )
 328 {
 329    GLfloat *dst;
 330    switch (dest->File) {
 331       case PROGRAM_TEMPORARY:
 332          dst = state->Temporaries[dest->Index];
 333          break;
 334       case PROGRAM_OUTPUT:
 335          dst = state->Outputs[dest->Index];
 336          break;
 337       case PROGRAM_ENV_PARAM:
 338          {
 339             /* a slight hack */
 340             GET_CURRENT_CONTEXT(ctx);
 341             dst = ctx->VertexProgram.Parameters[dest->Index];
 342          }
 343          break;
 344       default:
 345          _mesa_problem(NULL, "Invalid register file in store_vector4(file=%d)",
 346                        dest->File);
 347          return;
 348    }
 349
 350    if (dest->WriteMask[0])
 351       dst[0] = value[0];
 352    if (dest->WriteMask[1])
 353       dst[1] = value[1];
 354    if (dest->WriteMask[2])
 355       dst[2] = value[2];
 356    if (dest->WriteMask[3])
 357       dst[3] = value[3];
 358 }
 359
 360
 361 /**
 362  * Set x to positive or negative infinity.
 363  */
 364 #if defined(USE_IEEE) || defined(_WIN32)
 365 #define SET_POS_INFINITY(x)  ( *((GLuint *) (void *)&x) = 0x7F800000 )
 366 #define SET_NEG_INFINITY(x)  ( *((GLuint *) (void *)&x) = 0xFF800000 )
 367 #elif defined(VMS)
 368 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
 369 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
 370 #else
 371 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
 372 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
 373 #endif
 374
 375 #define SET_FLOAT_BITS(x, bits) ((fi_type *) (void *) &(x))->i = bits
 376
 377
 378 /**
 379  * Execute the given vertex program
 380  */
 381 void
 382 _mesa_exec_vertex_program(GLcontext *ctx, const struct vertex_program *program)
 383 {
 384    struct gl_vertex_program_state *state = &ctx->VertexProgram;
 385    const struct vp_instruction *inst;
 386
 387    ctx->_CurrentProgram = GL_VERTEX_PROGRAM_ARB; /* or NV, doesn't matter */
 388
 389    /* If the program is position invariant, multiply the input
 390     * position and the MVP matrix and stick it into the output pos slot
 391     */
 392    if (ctx->VertexProgram.Current->IsPositionInvariant) {
 393       TRANSFORM_POINT( ctx->VertexProgram.Outputs[0],
 394                        ctx->_ModelProjectMatrix.m,
 395                        ctx->VertexProgram.Inputs[0]);
 396
 397       /* XXX: This could go elsewhere */
 398       ctx->VertexProgram.Current->OutputsWritten |= 0x1;
 399    }
 400    for (inst = program->Instructions; ; inst++) {
 401
 402       if (ctx->VertexProgram.CallbackEnabled &&
 403           ctx->VertexProgram.Callback) {
 404          ctx->VertexProgram.CurrentPosition = inst->StringPos;
 405          ctx->VertexProgram.Callback(program->Base.Target,
 406                                      ctx->VertexProgram.CallbackData);
 407       }
 408
 409       switch (inst->Opcode) {
 410          case VP_OPCODE_MOV:
 411             {
 412                GLfloat t[4];
 413                fetch_vector4( &inst->SrcReg[0], state, t );
 414                store_vector4( &inst->DstReg, state, t );
 415             }
 416             break;
 417          case VP_OPCODE_LIT:
 418             {
 419                const GLfloat epsilon = 1.0F / 256.0F; /* per NV spec */
 420                GLfloat t[4], lit[4];
 421                fetch_vector4( &inst->SrcReg[0], state, t );
 422                t[0] = MAX2(t[0], 0.0F);
 423                t[1] = MAX2(t[1], 0.0F);
 424                t[3] = CLAMP(t[3], -(128.0F - epsilon), (128.0F - epsilon));
 425                lit[0] = 1.0;
 426                lit[1] = t[0];
 427                lit[2] = (t[0] > 0.0) ? (GLfloat) exp(t[3] * log(t[1])) : 0.0F;
 428                lit[3] = 1.0;
 429                store_vector4( &inst->DstReg, state, lit );
 430             }
 431             break;
 432          case VP_OPCODE_RCP:
 433             {
 434                GLfloat t[4];
 435                fetch_vector1( &inst->SrcReg[0], state, t );
 436                if (t[0] != 1.0F)
 437                   t[0] = 1.0F / t[0];  /* div by zero is infinity! */
 438                t[1] = t[2] = t[3] = t[0];
 439                store_vector4( &inst->DstReg, state, t );
 440             }
 441             break;
 442          case VP_OPCODE_RSQ:
 443             {
 444                GLfloat t[4];
 445                fetch_vector1( &inst->SrcReg[0], state, t );
 446                t[0] = INV_SQRTF(FABSF(t[0]));
 447                t[1] = t[2] = t[3] = t[0];
 448                store_vector4( &inst->DstReg, state, t );
 449             }
 450             break;
 451          case VP_OPCODE_EXP:
 452             {
 453                GLfloat t[4], q[4], floor_t0;
 454                fetch_vector1( &inst->SrcReg[0], state, t );
 455                floor_t0 = (float) floor(t[0]);
 456                if (floor_t0 > FLT_MAX_EXP) {
 457                   SET_POS_INFINITY(q[0]);
 458                   SET_POS_INFINITY(q[2]);
 459                }
 460                else if (floor_t0 < FLT_MIN_EXP) {
 461                   q[0] = 0.0F;
 462                   q[2] = 0.0F;
 463                }
 464                else {
 465 #ifdef USE_IEEE
 466                   GLint ii = (GLint) floor_t0;
 467                   ii = (ii < 23) + 0x3f800000;
 468                   SET_FLOAT_BITS(q[0], ii);
 469                   q[0] = *((GLfloat *) (void *)&ii);
 470 #else
 471                   q[0] = (GLfloat) pow(2.0, floor_t0);
 472 #endif
 473                   q[2] = (GLfloat) (q[0] * LOG2(q[1]));
 474                }
 475                q[1] = t[0] - floor_t0;
 476                q[3] = 1.0F;
 477                store_vector4( &inst->DstReg, state, q );
 478             }
 479             break;
 480          case VP_OPCODE_LOG:
 481             {
 482                GLfloat t[4], q[4], abs_t0;
 483                fetch_vector1( &inst->SrcReg[0], state, t );
 484                abs_t0 = (GLfloat) fabs(t[0]);
 485                if (abs_t0 != 0.0F) {
 486                   /* Since we really can't handle infinite values on VMS
 487                    * like other OSes we'll use __MAXFLOAT to represent
 488                    * infinity.  This may need some tweaking.
 489                    */
 490 #ifdef VMS
 491                   if (abs_t0 == __MAXFLOAT)
 492 #else
 493                   if (IS_INF_OR_NAN(abs_t0))
 494 #endif
 495                   {
 496                      SET_POS_INFINITY(q[0]);
 497                      q[1] = 1.0F;
 498                      SET_POS_INFINITY(q[2]);
 499                   }
 500                   else {
 501                      int exponent;
 502                      double mantissa = frexp(t[0], &exponent);
 503                      q[0] = (GLfloat) (exponent - 1);
 504                      q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 505                      q[2] = (GLfloat) (q[0] + LOG2(q[1]));
 506                   }
 507                   }
 508                else {
 509                   SET_NEG_INFINITY(q[0]);
 510                   q[1] = 1.0F;
 511                   SET_NEG_INFINITY(q[2]);
 512                }
 513                q[3] = 1.0;
 514                store_vector4( &inst->DstReg, state, q );
 515             }
 516             break;
 517          case VP_OPCODE_MUL:
 518             {
 519                GLfloat t[4], u[4], prod[4];
 520                fetch_vector4( &inst->SrcReg[0], state, t );
 521                fetch_vector4( &inst->SrcReg[1], state, u );
 522                prod[0] = t[0] * u[0];
 523                prod[1] = t[1] * u[1];
 524                prod[2] = t[2] * u[2];
 525                prod[3] = t[3] * u[3];
 526                store_vector4( &inst->DstReg, state, prod );
 527             }
 528             break;
 529          case VP_OPCODE_ADD:
 530             {
 531                GLfloat t[4], u[4], sum[4];
 532                fetch_vector4( &inst->SrcReg[0], state, t );
 533                fetch_vector4( &inst->SrcReg[1], state, u );
 534                sum[0] = t[0] + u[0];
 535                sum[1] = t[1] + u[1];
 536                sum[2] = t[2] + u[2];
 537                sum[3] = t[3] + u[3];
 538                store_vector4( &inst->DstReg, state, sum );
 539             }
 540             break;
 541          case VP_OPCODE_DP3:
 542             {
 543                GLfloat t[4], u[4], dot[4];
 544                fetch_vector4( &inst->SrcReg[0], state, t );
 545                fetch_vector4( &inst->SrcReg[1], state, u );
 546                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
 547                dot[1] = dot[2] = dot[3] = dot[0];
 548                store_vector4( &inst->DstReg, state, dot );
 549             }
 550             break;
 551          case VP_OPCODE_DP4:
 552             {
 553                GLfloat t[4], u[4], dot[4];
 554                fetch_vector4( &inst->SrcReg[0], state, t );
 555                fetch_vector4( &inst->SrcReg[1], state, u );
 556                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + t[3] * u[3];
 557                dot[1] = dot[2] = dot[3] = dot[0];
 558                store_vector4( &inst->DstReg, state, dot );
 559             }
 560             break;
 561          case VP_OPCODE_DST:
 562             {
 563                GLfloat t[4], u[4], dst[4];
 564                fetch_vector4( &inst->SrcReg[0], state, t );
 565                fetch_vector4( &inst->SrcReg[1], state, u );
 566                dst[0] = 1.0F;
 567                dst[1] = t[1] * u[1];
 568                dst[2] = t[2];
 569                dst[3] = u[3];
 570                store_vector4( &inst->DstReg, state, dst );
 571             }
 572             break;
 573          case VP_OPCODE_MIN:
 574             {
 575                GLfloat t[4], u[4], min[4];
 576                fetch_vector4( &inst->SrcReg[0], state, t );
 577                fetch_vector4( &inst->SrcReg[1], state, u );
 578                min[0] = (t[0] < u[0]) ? t[0] : u[0];
 579                min[1] = (t[1] < u[1]) ? t[1] : u[1];
 580                min[2] = (t[2] < u[2]) ? t[2] : u[2];
 581                min[3] = (t[3] < u[3]) ? t[3] : u[3];
 582                store_vector4( &inst->DstReg, state, min );
 583             }
 584             break;
 585          case VP_OPCODE_MAX:
 586             {
 587                GLfloat t[4], u[4], max[4];
 588                fetch_vector4( &inst->SrcReg[0], state, t );
 589                fetch_vector4( &inst->SrcReg[1], state, u );
 590                max[0] = (t[0] > u[0]) ? t[0] : u[0];
 591                max[1] = (t[1] > u[1]) ? t[1] : u[1];
 592                max[2] = (t[2] > u[2]) ? t[2] : u[2];
 593                max[3] = (t[3] > u[3]) ? t[3] : u[3];
 594                store_vector4( &inst->DstReg, state, max );
 595             }
 596             break;
 597          case VP_OPCODE_SLT:
 598             {
 599                GLfloat t[4], u[4], slt[4];
 600                fetch_vector4( &inst->SrcReg[0], state, t );
 601                fetch_vector4( &inst->SrcReg[1], state, u );
 602                slt[0] = (t[0] < u[0]) ? 1.0F : 0.0F;
 603                slt[1] = (t[1] < u[1]) ? 1.0F : 0.0F;
 604                slt[2] = (t[2] < u[2]) ? 1.0F : 0.0F;
 605                slt[3] = (t[3] < u[3]) ? 1.0F : 0.0F;
 606                store_vector4( &inst->DstReg, state, slt );
 607             }
 608             break;
 609          case VP_OPCODE_SGE:
 610             {
 611                GLfloat t[4], u[4], sge[4];
 612                fetch_vector4( &inst->SrcReg[0], state, t );
 613                fetch_vector4( &inst->SrcReg[1], state, u );
 614                sge[0] = (t[0] >= u[0]) ? 1.0F : 0.0F;
 615                sge[1] = (t[1] >= u[1]) ? 1.0F : 0.0F;
 616                sge[2] = (t[2] >= u[2]) ? 1.0F : 0.0F;
 617                sge[3] = (t[3] >= u[3]) ? 1.0F : 0.0F;
 618                store_vector4( &inst->DstReg, state, sge );
 619             }
 620             break;
 621          case VP_OPCODE_MAD:
 622             {
 623                GLfloat t[4], u[4], v[4], sum[4];
 624                fetch_vector4( &inst->SrcReg[0], state, t );
 625                fetch_vector4( &inst->SrcReg[1], state, u );
 626                fetch_vector4( &inst->SrcReg[2], state, v );
 627                sum[0] = t[0] * u[0] + v[0];
 628                sum[1] = t[1] * u[1] + v[1];
 629                sum[2] = t[2] * u[2] + v[2];
 630                sum[3] = t[3] * u[3] + v[3];
 631                store_vector4( &inst->DstReg, state, sum );
 632             }
 633             break;
 634          case VP_OPCODE_ARL:
 635             {
 636                GLfloat t[4];
 637                fetch_vector4( &inst->SrcReg[0], state, t );
 638                state->AddressReg[0] = (GLint) floor(t[0]);
 639             }
 640             break;
 641          case VP_OPCODE_DPH:
 642             {
 643                GLfloat t[4], u[4], dot[4];
 644                fetch_vector4( &inst->SrcReg[0], state, t );
 645                fetch_vector4( &inst->SrcReg[1], state, u );
 646                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + u[3];
 647                dot[1] = dot[2] = dot[3] = dot[0];
 648                store_vector4( &inst->DstReg, state, dot );
 649             }
 650             break;
 651          case VP_OPCODE_RCC:
 652             {
 653                GLfloat t[4], u;
 654                fetch_vector1( &inst->SrcReg[0], state, t );
 655                if (t[0] == 1.0F)
 656                   u = 1.0F;
 657                else
 658                   u = 1.0F / t[0];
 659                if (u > 0.0F) {
 660                   if (u > 1.884467e+019F) {
 661                      u = 1.884467e+019F;  /* IEEE 32-bit binary value 0x5F800000 */
 662                   }
 663                   else if (u < 5.42101e-020F) {
 664                      u = 5.42101e-020F;   /* IEEE 32-bit binary value 0x1F800000 */
 665                   }
 666                }
 667                else {
 668                   if (u < -1.884467e+019F) {
 669                      u = -1.884467e+019F; /* IEEE 32-bit binary value 0xDF800000 */
 670                   }
 671                   else if (u > -5.42101e-020F) {
 672                      u = -5.42101e-020F;  /* IEEE 32-bit binary value 0x9F800000 */
 673                   }
 674                }
 675                t[0] = t[1] = t[2] = t[3] = u;
 676                store_vector4( &inst->DstReg, state, t );
 677             }
 678             break;
 679          case VP_OPCODE_SUB: /* GL_NV_vertex_program1_1 */
 680             {
 681                GLfloat t[4], u[4], sum[4];
 682                fetch_vector4( &inst->SrcReg[0], state, t );
 683                fetch_vector4( &inst->SrcReg[1], state, u );
 684                sum[0] = t[0] - u[0];
 685                sum[1] = t[1] - u[1];
 686                sum[2] = t[2] - u[2];
 687                sum[3] = t[3] - u[3];
 688                store_vector4( &inst->DstReg, state, sum );
 689             }
 690             break;
 691          case VP_OPCODE_ABS: /* GL_NV_vertex_program1_1 */
 692             {
 693                GLfloat t[4];
 694                fetch_vector4( &inst->SrcReg[0], state, t );
 695                if (t[0] < 0.0)  t[0] = -t[0];
 696                if (t[1] < 0.0)  t[1] = -t[1];
 697                if (t[2] < 0.0)  t[2] = -t[2];
 698                if (t[3] < 0.0)  t[3] = -t[3];
 699                store_vector4( &inst->DstReg, state, t );
 700             }
 701             break;
 702          case VP_OPCODE_FLR: /* GL_ARB_vertex_program */
 703             {
 704                GLfloat t[4];
 705                fetch_vector4( &inst->SrcReg[0], state, t );
 706                t[0] = FLOORF(t[0]);
 707                t[1] = FLOORF(t[1]);
 708                t[2] = FLOORF(t[2]);
 709                t[3] = FLOORF(t[3]);
 710                store_vector4( &inst->DstReg, state, t );
 711             }
 712             break;
 713          case VP_OPCODE_FRC: /* GL_ARB_vertex_program */
 714             {
 715                GLfloat t[4];
 716                fetch_vector4( &inst->SrcReg[0], state, t );
 717                t[0] = t[0] - FLOORF(t[0]);
 718                t[1] = t[1] - FLOORF(t[1]);
 719                t[2] = t[2] - FLOORF(t[2]);
 720                t[3] = t[3] - FLOORF(t[3]);
 721                store_vector4( &inst->DstReg, state, t );
 722             }
 723             break;
 724          case VP_OPCODE_EX2: /* GL_ARB_vertex_program */
 725             {
 726                GLfloat t[4];
 727                fetch_vector1( &inst->SrcReg[0], state, t );
 728                t[0] = t[1] = t[2] = t[3] = (GLfloat)_mesa_pow(2.0, t[0]);
 729                store_vector4( &inst->DstReg, state, t );
 730             }
 731             break;
 732          case VP_OPCODE_LG2: /* GL_ARB_vertex_program */
 733             {
 734                GLfloat t[4];
 735                fetch_vector1( &inst->SrcReg[0], state, t );
 736                t[0] = t[1] = t[2] = t[3] = LOG2(t[0]);
 737                store_vector4( &inst->DstReg, state, t );
 738             }
 739             break;
 740          case VP_OPCODE_POW: /* GL_ARB_vertex_program */
 741             {
 742                GLfloat t[4], u[4];
 743                fetch_vector1( &inst->SrcReg[0], state, t );
 744                fetch_vector1( &inst->SrcReg[1], state, u );
 745                t[0] = t[1] = t[2] = t[3] = (GLfloat)_mesa_pow(t[0], u[0]);
 746                store_vector4( &inst->DstReg, state, t );
 747             }
 748             break;
 749          case VP_OPCODE_XPD: /* GL_ARB_vertex_program */
 750             {
 751                GLfloat t[4], u[4], cross[4];
 752                fetch_vector4( &inst->SrcReg[0], state, t );
 753                fetch_vector4( &inst->SrcReg[1], state, u );
 754                cross[0] = t[1] * u[2] - t[2] * u[1];
 755                cross[1] = t[2] * u[0] - t[0] * u[2];
 756                cross[2] = t[0] * u[1] - t[1] * u[0];
 757                store_vector4( &inst->DstReg, state, cross );
 758             }
 759             break;
 760          case VP_OPCODE_SWZ: /* GL_ARB_vertex_program */
 761             {
 762                const struct vp_src_register *source = &inst->SrcReg[0];
 763                const GLfloat *src = get_register_pointer(source, state);
 764                GLfloat result[4];
 765                GLuint i;
 766
 767                /* do extended swizzling here */
 768                for (i = 0; i < 3; i++) {
 769                   if (source->Swizzle[i] == SWIZZLE_ZERO)
 770                      result[i] = 0.0;
 771                   else if (source->Swizzle[i] == SWIZZLE_ONE)
 772                      result[i] = -1.0;
 773                   else
 774                      result[i] = -src[source->Swizzle[i]];
 775                   if (source->Negate)
 776                      result[i] = -result[i];
 777                }
 778                store_vector4( &inst->DstReg, state, result );
 779             }
 780             break;
 781          case VP_OPCODE_PRINT:
 782             if (inst->SrcReg[0].File) {
 783                GLfloat t[4];
 784                fetch_vector4( &inst->SrcReg[0], state, t );
 785                _mesa_printf("%s%g, %g, %g, %g\n",
 786                             (char *) inst->Data, t[0], t[1], t[2], t[3]);
 787             }
 788             else {
 789                _mesa_printf("%s\n", (char *) inst->Data);
 790             }
 791             break;
 792          case VP_OPCODE_END:
 793             ctx->_CurrentProgram = 0;
 794             return;
 795          default:
 796             /* bad instruction opcode */
 797             _mesa_problem(ctx, "Bad VP Opcode in _mesa_exec_vertex_program");
 798             ctx->_CurrentProgram = 0;
 799             return;
 800       } /* switch */
 801    } /* for */
 802
 803    ctx->_CurrentProgram = 0;
 804 }
 805
 806
 807
 808 /**
 809 Thoughts on vertex program optimization:
 810
 811 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
 812 assembly code.  That will probably be a lot of work.
 813
 814 Another approach might be to replace the vp_instruction->Opcode field with
 815 a pointer to a specialized C function which executes the instruction.
 816 In particular we can write functions which skip swizzling, negating,
 817 masking, relative addressing, etc. when they're not needed.
 818
 819 For example:
 820
 821 void simple_add( struct vp_instruction *inst )
 822 {
 823    GLfloat *sum = machine->Registers[inst->DstReg.Register];
 824    GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
 825    GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
 826    sum[0] = a[0] + b[0];
 827    sum[1] = a[1] + b[1];
 828    sum[2] = a[2] + b[2];
 829    sum[3] = a[3] + b[3];
 830 }
 831
 832 */
 833
 834 /*
 835
 836 KW:
 837
 838 A first step would be to 'vectorize' the programs in the same way as
 839 the normal transformation code in the tnl module.  Thus each opcode
 840 takes zero or more input vectors (registers) and produces one or more
 841 output vectors.
 842
 843 These operations would initially be coded in C, with machine-specific
 844 assembly following, as is currently the case for matrix
 845 transformations in the math/ directory.  The preprocessing scheme for
 846 selecting simpler operations Brian describes above would also work
 847 here.
 848
 849 This should give reasonable performance without excessive effort.
 850
 851 */