src/mesa/main/nvvertexec.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.0.1
   4  *
   5  * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file nvvertexec.c
  27  * Code to execute vertex programs.
  28  * \author Brian Paul
  29  */
  30
  31 #include "glheader.h"
  32 #include "context.h"
  33 #include "imports.h"
  34 #include "macros.h"
  35 #include "mtypes.h"
  36 #include "nvvertexec.h"
  37 #include "nvvertprog.h"
  38 #include "program.h"
  39 #include "math/m_matrix.h"
  40
  41
  42 static const GLfloat zeroVec[4] = { 0, 0, 0, 0 };
  43
  44
  45 /**
  46  * Load/initialize the vertex program registers.
  47  * This needs to be done per vertex.
  48  */
  49 void
  50 _mesa_init_vp_registers(GLcontext *ctx)
  51 {
  52    GLuint i;
  53
  54    /* Input registers get initialized from the current vertex attribs */
  55    MEMCPY(ctx->VertexProgram.Inputs, ctx->Current.Attrib,
  56           VERT_ATTRIB_MAX * 4 * sizeof(GLfloat));
  57
  58    /* Output and temp regs are initialized to [0,0,0,1] */
  59    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_OUTPUTS; i++) {
  60       ASSIGN_4V(ctx->VertexProgram.Outputs[i], 0.0F, 0.0F, 0.0F, 1.0F);
  61    }
  62    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_TEMPS; i++) {
  63       ASSIGN_4V(ctx->VertexProgram.Temporaries[i], 0.0F, 0.0F, 0.0F, 1.0F);
  64    }
  65
  66    /* The program parameters aren't touched */
  67    /* XXX: This should be moved to glBegin() time, but its safe (and slow!)
  68     * here - Karl
  69     */
  70    if (ctx->VertexProgram.Current->Parameters) {
  71       /* Grab the state */
  72       _mesa_load_state_parameters(ctx, ctx->VertexProgram.Current->Parameters);
  73
  74       /* And copy it into the program state */
  75       for (i=0; i<ctx->VertexProgram.Current->Parameters->NumParameters; i++) {
  76          MEMCPY(ctx->VertexProgram.Parameters[i],
  77                 &ctx->VertexProgram.Current->Parameters->Parameters[i].Values,
  78                 4*sizeof(GLfloat));
  79       }
  80    }
  81 }
  82
  83
  84
  85 /**
  86  * Copy the 16 elements of a matrix into four consecutive program
  87  * registers starting at 'pos'.
  88  */
  89 static void
  90 load_matrix(GLfloat registers[][4], GLuint pos, const GLfloat mat[16])
  91 {
  92    GLuint i;
  93    for (i = 0; i < 4; i++) {
  94       registers[pos + i][0] = mat[0 + i];
  95       registers[pos + i][1] = mat[4 + i];
  96       registers[pos + i][2] = mat[8 + i];
  97       registers[pos + i][3] = mat[12 + i];
  98    }
  99 }
 100
 101
 102 /**
 103  * As above, but transpose the matrix.
 104  */
 105 static void
 106 load_transpose_matrix(GLfloat registers[][4], GLuint pos,
 107                       const GLfloat mat[16])
 108 {
 109    MEMCPY(registers[pos], mat, 16 * sizeof(GLfloat));
 110 }
 111
 112
 113 /**
 114  * Load all currently tracked matrices into the program registers.
 115  * This needs to be done per glBegin/glEnd.
 116  */
 117 void
 118 _mesa_init_tracked_matrices(GLcontext *ctx)
 119 {
 120    GLuint i;
 121
 122    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_PARAMS / 4; i++) {
 123       /* point 'mat' at source matrix */
 124       GLmatrix *mat;
 125       if (ctx->VertexProgram.TrackMatrix[i] == GL_MODELVIEW) {
 126          mat = ctx->ModelviewMatrixStack.Top;
 127       }
 128       else if (ctx->VertexProgram.TrackMatrix[i] == GL_PROJECTION) {
 129          mat = ctx->ProjectionMatrixStack.Top;
 130       }
 131       else if (ctx->VertexProgram.TrackMatrix[i] == GL_TEXTURE) {
 132          mat = ctx->TextureMatrixStack[ctx->Texture.CurrentUnit].Top;
 133       }
 134       else if (ctx->VertexProgram.TrackMatrix[i] == GL_COLOR) {
 135          mat = ctx->ColorMatrixStack.Top;
 136       }
 137       else if (ctx->VertexProgram.TrackMatrix[i]==GL_MODELVIEW_PROJECTION_NV) {
 138          /* XXX verify the combined matrix is up to date */
 139          mat = &ctx->_ModelProjectMatrix;
 140       }
 141       else if (ctx->VertexProgram.TrackMatrix[i] >= GL_MATRIX0_NV &&
 142                ctx->VertexProgram.TrackMatrix[i] <= GL_MATRIX7_NV) {
 143          GLuint n = ctx->VertexProgram.TrackMatrix[i] - GL_MATRIX0_NV;
 144          ASSERT(n < MAX_PROGRAM_MATRICES);
 145          mat = ctx->ProgramMatrixStack[n].Top;
 146       }
 147       else {
 148          /* no matrix is tracked, but we leave the register values as-is */
 149          assert(ctx->VertexProgram.TrackMatrix[i] == GL_NONE);
 150          continue;
 151       }
 152
 153       /* load the matrix */
 154       if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
 155          load_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
 156       }
 157       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_INVERSE_NV) {
 158          _math_matrix_analyse(mat); /* update the inverse */
 159          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 160          load_matrix(ctx->VertexProgram.Parameters, i*4, mat->inv);
 161       }
 162       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_TRANSPOSE_NV) {
 163          load_transpose_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
 164       }
 165       else {
 166          assert(ctx->VertexProgram.TrackMatrixTransform[i]
 167                 == GL_INVERSE_TRANSPOSE_NV);
 168          _math_matrix_analyse(mat); /* update the inverse */
 169          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 170          load_transpose_matrix(ctx->VertexProgram.Parameters, i*4, mat->inv);
 171       }
 172    }
 173 }
 174
 175
 176
 177 /**
 178  * For debugging.  Dump the current vertex program machine registers.
 179  */
 180 void
 181 _mesa_dump_vp_state( const struct vertex_program_state *state )
 182 {
 183    int i;
 184    _mesa_printf("VertexIn:\n");
 185    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_INPUTS; i++) {
 186       _mesa_printf("%d: %f %f %f %f   ", i,
 187                    state->Inputs[i][0],
 188                    state->Inputs[i][1],
 189                    state->Inputs[i][2],
 190                    state->Inputs[i][3]);
 191    }
 192    _mesa_printf("\n");
 193
 194    _mesa_printf("VertexOut:\n");
 195    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_OUTPUTS; i++) {
 196       _mesa_printf("%d: %f %f %f %f   ", i,
 197                   state->Outputs[i][0],
 198                   state->Outputs[i][1],
 199                   state->Outputs[i][2],
 200                   state->Outputs[i][3]);
 201    }
 202    _mesa_printf("\n");
 203
 204    _mesa_printf("Registers:\n");
 205    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_TEMPS; i++) {
 206       _mesa_printf("%d: %f %f %f %f   ", i,
 207                   state->Temporaries[i][0],
 208                   state->Temporaries[i][1],
 209                   state->Temporaries[i][2],
 210                   state->Temporaries[i][3]);
 211    }
 212    _mesa_printf("\n");
 213
 214    _mesa_printf("Parameters:\n");
 215    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_PARAMS; i++) {
 216       _mesa_printf("%d: %f %f %f %f   ", i,
 217                   state->Parameters[i][0],
 218                   state->Parameters[i][1],
 219                   state->Parameters[i][2],
 220                   state->Parameters[i][3]);
 221    }
 222    _mesa_printf("\n");
 223 }
 224
 225
 226
 227 /**
 228  * Return a pointer to the 4-element float vector specified by the given
 229  * source register.
 230  */
 231 static INLINE const GLfloat *
 232 get_register_pointer( const struct vp_src_register *source,
 233                       const struct vertex_program_state *state )
 234 {
 235    if (source->RelAddr) {
 236       const GLint reg = source->Index + state->AddressReg[0];
 237       ASSERT( (source->File == PROGRAM_ENV_PARAM) ||
 238         (source->File == PROGRAM_STATE_VAR) );
 239       if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
 240          return zeroVec;
 241       else
 242          return state->Parameters[reg];
 243    }
 244    else {
 245       switch (source->File) {
 246          case PROGRAM_TEMPORARY:
 247             return state->Temporaries[source->Index];
 248          case PROGRAM_INPUT:
 249             return state->Inputs[source->Index];
 250          case PROGRAM_LOCAL_PARAM:
 251             /* XXX fix */
 252             return state->Temporaries[source->Index];
 253          case PROGRAM_ENV_PARAM:
 254             return state->Parameters[source->Index];
 255          case PROGRAM_STATE_VAR:
 256             return state->Parameters[source->Index];
 257          default:
 258             _mesa_problem(NULL,
 259                           "Bad source register file in fetch_vector4(vp)");
 260             return NULL;
 261       }
 262    }
 263    return NULL;
 264 }
 265
 266
 267 /**
 268  * Fetch a 4-element float vector from the given source register.
 269  * Apply swizzling and negating as needed.
 270  */
 271 static INLINE void
 272 fetch_vector4( const struct vp_src_register *source,
 273                const struct vertex_program_state *state,
 274                GLfloat result[4] )
 275 {
 276    const GLfloat *src = get_register_pointer(source, state);
 277
 278    if (source->Negate) {
 279       result[0] = -src[source->Swizzle[0]];
 280       result[1] = -src[source->Swizzle[1]];
 281       result[2] = -src[source->Swizzle[2]];
 282       result[3] = -src[source->Swizzle[3]];
 283    }
 284    else {
 285       result[0] = src[source->Swizzle[0]];
 286       result[1] = src[source->Swizzle[1]];
 287       result[2] = src[source->Swizzle[2]];
 288       result[3] = src[source->Swizzle[3]];
 289    }
 290 }
 291
 292
 293
 294 /**
 295  * As above, but only return result[0] element.
 296  */
 297 static INLINE void
 298 fetch_vector1( const struct vp_src_register *source,
 299                const struct vertex_program_state *state,
 300                GLfloat result[4] )
 301 {
 302    const GLfloat *src = get_register_pointer(source, state);
 303
 304    if (source->Negate) {
 305       result[0] = -src[source->Swizzle[0]];
 306    }
 307    else {
 308       result[0] = src[source->Swizzle[0]];
 309    }
 310 }
 311
 312
 313 /**
 314  * Store 4 floats into a register.
 315  */
 316 static void
 317 store_vector4( const struct vp_dst_register *dest,
 318                struct vertex_program_state *state,
 319                const GLfloat value[4] )
 320 {
 321    GLfloat *dst;
 322    switch (dest->File) {
 323       case PROGRAM_TEMPORARY:
 324          dst = state->Temporaries[dest->Index];
 325          break;
 326       case PROGRAM_OUTPUT:
 327          dst = state->Outputs[dest->Index];
 328          break;
 329       case PROGRAM_ENV_PARAM:
 330          {
 331             /* a slight hack */
 332             GET_CURRENT_CONTEXT(ctx);
 333             dst = ctx->VertexProgram.Parameters[dest->Index];
 334          }
 335          break;
 336       default:
 337          _mesa_problem(NULL, "Invalid register file in store_vector4(file=%d)",
 338                        dest->File);
 339          return;
 340    }
 341
 342    if (dest->WriteMask[0])
 343       dst[0] = value[0];
 344    if (dest->WriteMask[1])
 345       dst[1] = value[1];
 346    if (dest->WriteMask[2])
 347       dst[2] = value[2];
 348    if (dest->WriteMask[3])
 349       dst[3] = value[3];
 350 }
 351
 352
 353 /**
 354  * Set x to positive or negative infinity.
 355  */
 356 #if defined(USE_IEEE) || defined(_WIN32)
 357 #define SET_POS_INFINITY(x)  ( *((GLuint *) &x) = 0x7F800000 )
 358 #define SET_NEG_INFINITY(x)  ( *((GLuint *) &x) = 0xFF800000 )
 359 #elif defined(VMS)
 360 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
 361 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
 362 #else
 363 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
 364 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
 365 #endif
 366
 367 #define SET_FLOAT_BITS(x, bits) ((fi_type *) &(x))->i = bits
 368
 369
 370 /**
 371  * Execute the given vertex program
 372  */
 373 void
 374 _mesa_exec_vertex_program(GLcontext *ctx, const struct vertex_program *program)
 375 {
 376    struct vertex_program_state *state = &ctx->VertexProgram;
 377    const struct vp_instruction *inst;
 378
 379    ctx->_CurrentProgram = GL_VERTEX_PROGRAM_ARB; /* or NV, doesn't matter */
 380
 381    /* If the program is position invariant, multiply the input
 382     * position and the MVP matrix and stick it into the output pos slot
 383     */
 384    if (ctx->VertexProgram.Current->IsPositionInvariant) {
 385       TRANSFORM_POINT( ctx->VertexProgram.Outputs[0],
 386                        ctx->_ModelProjectMatrix.m,
 387                        ctx->VertexProgram.Inputs[0]);
 388
 389       /* XXX: This could go elsewhere */
 390       ctx->VertexProgram.Current->OutputsWritten |= 0x1;
 391    }
 392
 393    for (inst = program->Instructions; /*inst->Opcode != VP_OPCODE_END*/; inst++) {
 394
 395       if (ctx->VertexProgram.CallbackEnabled &&
 396           ctx->VertexProgram.Callback) {
 397          ctx->VertexProgram.CurrentPosition = inst->StringPos;
 398          ctx->VertexProgram.Callback(program->Base.Target,
 399                                      ctx->VertexProgram.CallbackData);
 400       }
 401
 402       switch (inst->Opcode) {
 403          case VP_OPCODE_MOV:
 404             {
 405                GLfloat t[4];
 406                fetch_vector4( &inst->SrcReg[0], state, t );
 407                store_vector4( &inst->DstReg, state, t );
 408             }
 409             break;
 410          case VP_OPCODE_LIT:
 411             {
 412                const GLfloat epsilon = 1.0e-5F; /* XXX fix? */
 413                GLfloat t[4], lit[4];
 414                fetch_vector4( &inst->SrcReg[0], state, t );
 415                if (t[3] < -(128.0F - epsilon))
 416                    t[3] = - (128.0F - epsilon);
 417                else if (t[3] > 128.0F - epsilon)
 418                   t[3] = 128.0F - epsilon;
 419                if (t[0] < 0.0)
 420                   t[0] = 0.0;
 421                if (t[1] < 0.0)
 422                   t[1] = 0.0;
 423                lit[0] = 1.0;
 424                lit[1] = t[0];
 425                lit[2] = (t[0] > 0.0) ? (GLfloat) exp(t[3] * log(t[1])) : 0.0F;
 426                lit[3] = 1.0;
 427                store_vector4( &inst->DstReg, state, lit );
 428             }
 429             break;
 430          case VP_OPCODE_RCP:
 431             {
 432                GLfloat t[4];
 433                fetch_vector1( &inst->SrcReg[0], state, t );
 434                if (t[0] != 1.0F)
 435                   t[0] = 1.0F / t[0];  /* div by zero is infinity! */
 436                t[1] = t[2] = t[3] = t[0];
 437                store_vector4( &inst->DstReg, state, t );
 438             }
 439             break;
 440          case VP_OPCODE_RSQ:
 441             {
 442                GLfloat t[4];
 443                fetch_vector1( &inst->SrcReg[0], state, t );
 444                t[0] = INV_SQRTF(FABSF(t[0]));
 445                t[1] = t[2] = t[3] = t[0];
 446                store_vector4( &inst->DstReg, state, t );
 447             }
 448             break;
 449          case VP_OPCODE_EXP:
 450             {
 451                GLfloat t[4], q[4], floor_t0;
 452                fetch_vector1( &inst->SrcReg[0], state, t );
 453                floor_t0 = (float) floor(t[0]);
 454                if (floor_t0 > FLT_MAX_EXP) {
 455                   SET_POS_INFINITY(q[0]);
 456                   SET_POS_INFINITY(q[2]);
 457                }
 458                else if (floor_t0 < FLT_MIN_EXP) {
 459                   q[0] = 0.0F;
 460                   q[2] = 0.0F;
 461                }
 462                else {
 463 #ifdef USE_IEEE
 464                   GLint ii = (GLint) floor_t0;
 465                   ii = (ii < 23) + 0x3f800000;
 466                   SET_FLOAT_BITS(q[0], ii);
 467                   q[0] = *((GLfloat *) &ii);
 468 #else
 469                   q[0] = (GLfloat) pow(2.0, floor_t0);
 470 #endif
 471                   q[2] = (GLfloat) (q[0] * LOG2(q[1]));
 472                }
 473                q[1] = t[0] - floor_t0;
 474                q[3] = 1.0F;
 475                store_vector4( &inst->DstReg, state, q );
 476             }
 477             break;
 478          case VP_OPCODE_LOG:
 479             {
 480                GLfloat t[4], q[4], abs_t0;
 481                fetch_vector1( &inst->SrcReg[0], state, t );
 482                abs_t0 = (GLfloat) fabs(t[0]);
 483                if (abs_t0 != 0.0F) {
 484                   /* Since we really can't handle infinite values on VMS
 485                    * like other OSes we'll use __MAXFLOAT to represent
 486                    * infinity.  This may need some tweaking.
 487                    */
 488 #ifdef VMS
 489                   if (abs_t0 == __MAXFLOAT)
 490 #else
 491                   if (IS_INF_OR_NAN(abs_t0))
 492 #endif
 493                   {
 494                      SET_POS_INFINITY(q[0]);
 495                      q[1] = 1.0F;
 496                      SET_POS_INFINITY(q[2]);
 497                   }
 498                   else {
 499                      int exponent;
 500                      double mantissa = frexp(t[0], &exponent);
 501                      q[0] = (GLfloat) (exponent - 1);
 502                      q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 503                      q[2] = (GLfloat) (q[0] + LOG2(q[1]));
 504                   }
 505                   }
 506                else {
 507                   SET_NEG_INFINITY(q[0]);
 508                   q[1] = 1.0F;
 509                   SET_NEG_INFINITY(q[2]);
 510                }
 511                q[3] = 1.0;
 512                store_vector4( &inst->DstReg, state, q );
 513             }
 514             break;
 515          case VP_OPCODE_MUL:
 516             {
 517                GLfloat t[4], u[4], prod[4];
 518                fetch_vector4( &inst->SrcReg[0], state, t );
 519                fetch_vector4( &inst->SrcReg[1], state, u );
 520                prod[0] = t[0] * u[0];
 521                prod[1] = t[1] * u[1];
 522                prod[2] = t[2] * u[2];
 523                prod[3] = t[3] * u[3];
 524                store_vector4( &inst->DstReg, state, prod );
 525             }
 526             break;
 527          case VP_OPCODE_ADD:
 528             {
 529                GLfloat t[4], u[4], sum[4];
 530                fetch_vector4( &inst->SrcReg[0], state, t );
 531                fetch_vector4( &inst->SrcReg[1], state, u );
 532                sum[0] = t[0] + u[0];
 533                sum[1] = t[1] + u[1];
 534                sum[2] = t[2] + u[2];
 535                sum[3] = t[3] + u[3];
 536                store_vector4( &inst->DstReg, state, sum );
 537             }
 538             break;
 539          case VP_OPCODE_DP3:
 540             {
 541                GLfloat t[4], u[4], dot[4];
 542                fetch_vector4( &inst->SrcReg[0], state, t );
 543                fetch_vector4( &inst->SrcReg[1], state, u );
 544                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
 545                dot[1] = dot[2] = dot[3] = dot[0];
 546                store_vector4( &inst->DstReg, state, dot );
 547             }
 548             break;
 549          case VP_OPCODE_DP4:
 550             {
 551                GLfloat t[4], u[4], dot[4];
 552                fetch_vector4( &inst->SrcReg[0], state, t );
 553                fetch_vector4( &inst->SrcReg[1], state, u );
 554                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + t[3] * u[3];
 555                dot[1] = dot[2] = dot[3] = dot[0];
 556                store_vector4( &inst->DstReg, state, dot );
 557             }
 558             break;
 559          case VP_OPCODE_DST:
 560             {
 561                GLfloat t[4], u[4], dst[4];
 562                fetch_vector4( &inst->SrcReg[0], state, t );
 563                fetch_vector4( &inst->SrcReg[1], state, u );
 564                dst[0] = 1.0F;
 565                dst[1] = t[1] * u[1];
 566                dst[2] = t[2];
 567                dst[3] = u[3];
 568                store_vector4( &inst->DstReg, state, dst );
 569             }
 570             break;
 571          case VP_OPCODE_MIN:
 572             {
 573                GLfloat t[4], u[4], min[4];
 574                fetch_vector4( &inst->SrcReg[0], state, t );
 575                fetch_vector4( &inst->SrcReg[1], state, u );
 576                min[0] = (t[0] < u[0]) ? t[0] : u[0];
 577                min[1] = (t[1] < u[1]) ? t[1] : u[1];
 578                min[2] = (t[2] < u[2]) ? t[2] : u[2];
 579                min[3] = (t[3] < u[3]) ? t[3] : u[3];
 580                store_vector4( &inst->DstReg, state, min );
 581             }
 582             break;
 583          case VP_OPCODE_MAX:
 584             {
 585                GLfloat t[4], u[4], max[4];
 586                fetch_vector4( &inst->SrcReg[0], state, t );
 587                fetch_vector4( &inst->SrcReg[1], state, u );
 588                max[0] = (t[0] > u[0]) ? t[0] : u[0];
 589                max[1] = (t[1] > u[1]) ? t[1] : u[1];
 590                max[2] = (t[2] > u[2]) ? t[2] : u[2];
 591                max[3] = (t[3] > u[3]) ? t[3] : u[3];
 592                store_vector4( &inst->DstReg, state, max );
 593             }
 594             break;
 595          case VP_OPCODE_SLT:
 596             {
 597                GLfloat t[4], u[4], slt[4];
 598                fetch_vector4( &inst->SrcReg[0], state, t );
 599                fetch_vector4( &inst->SrcReg[1], state, u );
 600                slt[0] = (t[0] < u[0]) ? 1.0F : 0.0F;
 601                slt[1] = (t[1] < u[1]) ? 1.0F : 0.0F;
 602                slt[2] = (t[2] < u[2]) ? 1.0F : 0.0F;
 603                slt[3] = (t[3] < u[3]) ? 1.0F : 0.0F;
 604                store_vector4( &inst->DstReg, state, slt );
 605             }
 606             break;
 607          case VP_OPCODE_SGE:
 608             {
 609                GLfloat t[4], u[4], sge[4];
 610                fetch_vector4( &inst->SrcReg[0], state, t );
 611                fetch_vector4( &inst->SrcReg[1], state, u );
 612                sge[0] = (t[0] >= u[0]) ? 1.0F : 0.0F;
 613                sge[1] = (t[1] >= u[1]) ? 1.0F : 0.0F;
 614                sge[2] = (t[2] >= u[2]) ? 1.0F : 0.0F;
 615                sge[3] = (t[3] >= u[3]) ? 1.0F : 0.0F;
 616                store_vector4( &inst->DstReg, state, sge );
 617             }
 618             break;
 619          case VP_OPCODE_MAD:
 620             {
 621                GLfloat t[4], u[4], v[4], sum[4];
 622                fetch_vector4( &inst->SrcReg[0], state, t );
 623                fetch_vector4( &inst->SrcReg[1], state, u );
 624                fetch_vector4( &inst->SrcReg[2], state, v );
 625                sum[0] = t[0] * u[0] + v[0];
 626                sum[1] = t[1] * u[1] + v[1];
 627                sum[2] = t[2] * u[2] + v[2];
 628                sum[3] = t[3] * u[3] + v[3];
 629                store_vector4( &inst->DstReg, state, sum );
 630             }
 631             break;
 632          case VP_OPCODE_ARL:
 633             {
 634                GLfloat t[4];
 635                fetch_vector4( &inst->SrcReg[0], state, t );
 636                state->AddressReg[0] = (GLint) floor(t[0]);
 637             }
 638             break;
 639          case VP_OPCODE_DPH:
 640             {
 641                GLfloat t[4], u[4], dot[4];
 642                fetch_vector4( &inst->SrcReg[0], state, t );
 643                fetch_vector4( &inst->SrcReg[1], state, u );
 644                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + u[3];
 645                dot[1] = dot[2] = dot[3] = dot[0];
 646                store_vector4( &inst->DstReg, state, dot );
 647             }
 648             break;
 649          case VP_OPCODE_RCC:
 650             {
 651                GLfloat t[4], u;
 652                fetch_vector1( &inst->SrcReg[0], state, t );
 653                if (t[0] == 1.0F)
 654                   u = 1.0F;
 655                else
 656                   u = 1.0F / t[0];
 657                if (u > 0.0F) {
 658                   if (u > 1.884467e+019F) {
 659                      u = 1.884467e+019F;  /* IEEE 32-bit binary value 0x5F800000 */
 660                   }
 661                   else if (u < 5.42101e-020F) {
 662                      u = 5.42101e-020F;   /* IEEE 32-bit binary value 0x1F800000 */
 663                   }
 664                }
 665                else {
 666                   if (u < -1.884467e+019F) {
 667                      u = -1.884467e+019F; /* IEEE 32-bit binary value 0xDF800000 */
 668                   }
 669                   else if (u > -5.42101e-020F) {
 670                      u = -5.42101e-020F;  /* IEEE 32-bit binary value 0x9F800000 */
 671                   }
 672                }
 673                t[0] = t[1] = t[2] = t[3] = u;
 674                store_vector4( &inst->DstReg, state, t );
 675             }
 676             break;
 677          case VP_OPCODE_SUB: /* GL_NV_vertex_program1_1 */
 678             {
 679                GLfloat t[4], u[4], sum[4];
 680                fetch_vector4( &inst->SrcReg[0], state, t );
 681                fetch_vector4( &inst->SrcReg[1], state, u );
 682                sum[0] = t[0] - u[0];
 683                sum[1] = t[1] - u[1];
 684                sum[2] = t[2] - u[2];
 685                sum[3] = t[3] - u[3];
 686                store_vector4( &inst->DstReg, state, sum );
 687             }
 688             break;
 689          case VP_OPCODE_ABS: /* GL_NV_vertex_program1_1 */
 690             {
 691                GLfloat t[4];
 692                fetch_vector4( &inst->SrcReg[0], state, t );
 693                if (t[0] < 0.0)  t[0] = -t[0];
 694                if (t[1] < 0.0)  t[1] = -t[1];
 695                if (t[2] < 0.0)  t[2] = -t[2];
 696                if (t[3] < 0.0)  t[3] = -t[3];
 697                store_vector4( &inst->DstReg, state, t );
 698             }
 699             break;
 700          case VP_OPCODE_FLR: /* GL_ARB_vertex_program */
 701             {
 702                GLfloat t[4];
 703                fetch_vector4( &inst->SrcReg[0], state, t );
 704                t[0] = FLOORF(t[0]);
 705                t[1] = FLOORF(t[1]);
 706                t[2] = FLOORF(t[2]);
 707                t[3] = FLOORF(t[3]);
 708                store_vector4( &inst->DstReg, state, t );
 709             }
 710             break;
 711          case VP_OPCODE_FRC: /* GL_ARB_vertex_program */
 712             {
 713                GLfloat t[4];
 714                fetch_vector4( &inst->SrcReg[0], state, t );
 715                t[0] = t[0] - FLOORF(t[0]);
 716                t[1] = t[1] - FLOORF(t[1]);
 717                t[2] = t[2] - FLOORF(t[2]);
 718                t[3] = t[3] - FLOORF(t[3]);
 719                store_vector4( &inst->DstReg, state, t );
 720             }
 721             break;
 722          case VP_OPCODE_EX2: /* GL_ARB_vertex_program */
 723             {
 724                GLfloat t[4];
 725                fetch_vector1( &inst->SrcReg[0], state, t );
 726                t[0] = t[1] = t[2] = t[3] = (GLfloat)_mesa_pow(2.0, t[0]);
 727                store_vector4( &inst->DstReg, state, t );
 728             }
 729             break;
 730          case VP_OPCODE_LG2: /* GL_ARB_vertex_program */
 731             {
 732                GLfloat t[4];
 733                fetch_vector1( &inst->SrcReg[0], state, t );
 734                t[0] = t[1] = t[2] = t[3] = LOG2(t[0]);
 735                store_vector4( &inst->DstReg, state, t );
 736             }
 737             break;
 738          case VP_OPCODE_POW: /* GL_ARB_vertex_program */
 739             {
 740                GLfloat t[4], u[4];
 741                fetch_vector1( &inst->SrcReg[0], state, t );
 742                fetch_vector1( &inst->SrcReg[1], state, u );
 743                t[0] = t[1] = t[2] = t[3] = (GLfloat)_mesa_pow(t[0], u[0]);
 744                store_vector4( &inst->DstReg, state, t );
 745             }
 746             break;
 747          case VP_OPCODE_XPD: /* GL_ARB_vertex_program */
 748             {
 749                GLfloat t[4], u[4], cross[4];
 750                fetch_vector4( &inst->SrcReg[0], state, t );
 751                fetch_vector4( &inst->SrcReg[1], state, u );
 752                cross[0] = t[1] * u[2] - t[2] * u[1];
 753                cross[1] = t[2] * u[0] - t[0] * u[2];
 754                cross[2] = t[0] * u[1] - t[1] * u[0];
 755                store_vector4( &inst->DstReg, state, cross );
 756             }
 757             break;
 758          case VP_OPCODE_SWZ: /* GL_ARB_vertex_program */
 759             {
 760                const struct vp_src_register *source = &inst->SrcReg[0];
 761                const GLfloat *src = get_register_pointer(source, state);
 762                GLfloat result[4];
 763                GLuint i;
 764
 765                /* do extended swizzling here */
 766                for (i = 0; i < 3; i++) {
 767                   if (source->Swizzle[i] == SWIZZLE_ZERO)
 768                      result[i] = 0.0;
 769                   else if (source->Swizzle[i] == SWIZZLE_ONE)
 770                      result[i] = -1.0;
 771                   else
 772                      result[i] = -src[source->Swizzle[i]];
 773                   if (source->Negate)
 774                      result[i] = -result[i];
 775                }
 776                store_vector4( &inst->DstReg, state, result );
 777             }
 778             break;
 779
 780          case VP_OPCODE_END:
 781             ctx->_CurrentProgram = 0;
 782             return;
 783          default:
 784             /* bad instruction opcode */
 785             _mesa_problem(ctx, "Bad VP Opcode in _mesa_exec_vertex_program");
 786             ctx->_CurrentProgram = 0;
 787             return;
 788       } /* switch */
 789    } /* for */
 790
 791    ctx->_CurrentProgram = 0;
 792 }
 793
 794
 795
 796 /**
 797 Thoughts on vertex program optimization:
 798
 799 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
 800 assembly code.  That will probably be a lot of work.
 801
 802 Another approach might be to replace the vp_instruction->Opcode field with
 803 a pointer to a specialized C function which executes the instruction.
 804 In particular we can write functions which skip swizzling, negating,
 805 masking, relative addressing, etc. when they're not needed.
 806
 807 For example:
 808
 809 void simple_add( struct vp_instruction *inst )
 810 {
 811    GLfloat *sum = machine->Registers[inst->DstReg.Register];
 812    GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
 813    GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
 814    sum[0] = a[0] + b[0];
 815    sum[1] = a[1] + b[1];
 816    sum[2] = a[2] + b[2];
 817    sum[3] = a[3] + b[3];
 818 }
 819
 820 */
 821
 822 /*
 823
 824 KW:
 825
 826 A first step would be to 'vectorize' the programs in the same way as
 827 the normal transformation code in the tnl module.  Thus each opcode
 828 takes zero or more input vectors (registers) and produces one or more
 829 output vectors.
 830
  831 These operations would initially be coded in C, with machine-specific
 832 assembly following, as is currently the case for matrix
 833 transformations in the math/ directory.  The preprocessing scheme for
 834 selecting simpler operations Brian describes above would also work
 835 here.
 836
 837 This should give reasonable performance without excessive effort.
 838
 839 */