src/mesa/shader/nvvertexec.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.1
   4  *
   5  * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file nvvertexec.c
  27  * Code to execute vertex programs.
  28  * \author Brian Paul
  29  */
  30
  31 #include "glheader.h"
  32 #include "context.h"
  33 #include "imports.h"
  34 #include "macros.h"
  35 #include "mtypes.h"
  36 #include "nvvertexec.h"
  37 #include "nvvertprog.h"
  38 #include "program.h"
  39 #include "math/m_matrix.h"
  40
  41
/* All-zero vector.  get_register_pointer() returns a pointer to it when a
 * relative-addressed register index falls outside the valid range.
 */
static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
  43
  44
  45 /**
  46  * Load/initialize the vertex program registers which need to be set
  47  * per-vertex.
  48  */
  49 void
  50 _mesa_init_vp_per_vertex_registers(GLcontext *ctx)
  51 {
  52    /* Input registers get initialized from the current vertex attribs */
  53    MEMCPY(ctx->VertexProgram.Inputs, ctx->Current.Attrib,
  54           VERT_ATTRIB_MAX * 4 * sizeof(GLfloat));
  55
  56    if (ctx->VertexProgram.Current->IsNVProgram) {
  57       GLuint i;
  58       /* Output/result regs are initialized to [0,0,0,1] */
  59       for (i = 0; i < MAX_NV_VERTEX_PROGRAM_OUTPUTS; i++) {
  60          ASSIGN_4V(ctx->VertexProgram.Outputs[i], 0.0F, 0.0F, 0.0F, 1.0F);
  61       }
  62       /* Temp regs are initialized to [0,0,0,0] */
  63       for (i = 0; i < MAX_NV_VERTEX_PROGRAM_TEMPS; i++) {
  64          ASSIGN_4V(ctx->VertexProgram.Temporaries[i], 0.0F, 0.0F, 0.0F, 0.0F);
  65       }
  66       ASSIGN_4V(ctx->VertexProgram.AddressReg, 0, 0, 0, 0);
  67    }
  68 }
  69
  70
  71
  72 /**
  73  * Copy the 16 elements of a matrix into four consecutive program
  74  * registers starting at 'pos'.
  75  */
  76 static void
  77 load_matrix(GLfloat registers[][4], GLuint pos, const GLfloat mat[16])
  78 {
  79    GLuint i;
  80    for (i = 0; i < 4; i++) {
  81       registers[pos + i][0] = mat[0 + i];
  82       registers[pos + i][1] = mat[4 + i];
  83       registers[pos + i][2] = mat[8 + i];
  84       registers[pos + i][3] = mat[12 + i];
  85    }
  86 }
  87
  88
  89 /**
  90  * As above, but transpose the matrix.
  91  */
  92 static void
  93 load_transpose_matrix(GLfloat registers[][4], GLuint pos,
  94                       const GLfloat mat[16])
  95 {
  96    MEMCPY(registers[pos], mat, 16 * sizeof(GLfloat));
  97 }
  98
  99
 100 /**
 101  * Load program parameter registers with tracked matrices (if NV program)
 102  * or GL state values (if ARB program).
 103  * This needs to be done per glBegin/glEnd, not per-vertex.
 104  */
 105 void
 106 _mesa_init_vp_per_primitive_registers(GLcontext *ctx)
 107 {
 108    if (ctx->VertexProgram.Current->IsNVProgram) {
 109       GLuint i;
 110
 111       for (i = 0; i < MAX_NV_VERTEX_PROGRAM_PARAMS / 4; i++) {
 112          /* point 'mat' at source matrix */
 113          GLmatrix *mat;
 114          if (ctx->VertexProgram.TrackMatrix[i] == GL_MODELVIEW) {
 115             mat = ctx->ModelviewMatrixStack.Top;
 116          }
 117          else if (ctx->VertexProgram.TrackMatrix[i] == GL_PROJECTION) {
 118             mat = ctx->ProjectionMatrixStack.Top;
 119          }
 120          else if (ctx->VertexProgram.TrackMatrix[i] == GL_TEXTURE) {
 121             mat = ctx->TextureMatrixStack[ctx->Texture.CurrentUnit].Top;
 122          }
 123          else if (ctx->VertexProgram.TrackMatrix[i] == GL_COLOR) {
 124             mat = ctx->ColorMatrixStack.Top;
 125          }
 126          else if (ctx->VertexProgram.TrackMatrix[i]==GL_MODELVIEW_PROJECTION_NV) {
 127             /* XXX verify the combined matrix is up to date */
 128             mat = &ctx->_ModelProjectMatrix;
 129          }
 130          else if (ctx->VertexProgram.TrackMatrix[i] >= GL_MATRIX0_NV &&
 131                   ctx->VertexProgram.TrackMatrix[i] <= GL_MATRIX7_NV) {
 132             GLuint n = ctx->VertexProgram.TrackMatrix[i] - GL_MATRIX0_NV;
 133             ASSERT(n < MAX_PROGRAM_MATRICES);
 134             mat = ctx->ProgramMatrixStack[n].Top;
 135          }
 136          else {
 137             /* no matrix is tracked, but we leave the register values as-is */
 138             assert(ctx->VertexProgram.TrackMatrix[i] == GL_NONE);
 139             continue;
 140          }
 141
 142          /* load the matrix */
 143          if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
 144             load_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
 145          }
 146          else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_INVERSE_NV) {
 147             _math_matrix_analyse(mat); /* update the inverse */
 148             assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 149             load_matrix(ctx->VertexProgram.Parameters, i*4, mat->inv);
 150          }
 151          else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_TRANSPOSE_NV) {
 152             load_transpose_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
 153          }
 154          else {
 155             assert(ctx->VertexProgram.TrackMatrixTransform[i]
 156                    == GL_INVERSE_TRANSPOSE_NV);
 157             _math_matrix_analyse(mat); /* update the inverse */
 158             assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 159             load_transpose_matrix(ctx->VertexProgram.Parameters, i*4, mat->inv);
 160          }
 161       }
 162    }
 163    else {
 164       /* Using and ARB vertex program */
 165       if (ctx->VertexProgram.Current->Parameters) {
 166          /* Grab the state GL state and put into registers */
 167          _mesa_load_state_parameters(ctx,
 168                                      ctx->VertexProgram.Current->Parameters);
 169       }
 170    }
 171 }
 172
 173
 174
 175 /**
 176  * For debugging.  Dump the current vertex program machine registers.
 177  */
 178 void
 179 _mesa_dump_vp_state( const struct vertex_program_state *state )
 180 {
 181    int i;
 182    _mesa_printf("VertexIn:\n");
 183    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_INPUTS; i++) {
 184       _mesa_printf("%d: %f %f %f %f   ", i,
 185                    state->Inputs[i][0],
 186                    state->Inputs[i][1],
 187                    state->Inputs[i][2],
 188                    state->Inputs[i][3]);
 189    }
 190    _mesa_printf("\n");
 191
 192    _mesa_printf("VertexOut:\n");
 193    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_OUTPUTS; i++) {
 194       _mesa_printf("%d: %f %f %f %f   ", i,
 195                   state->Outputs[i][0],
 196                   state->Outputs[i][1],
 197                   state->Outputs[i][2],
 198                   state->Outputs[i][3]);
 199    }
 200    _mesa_printf("\n");
 201
 202    _mesa_printf("Registers:\n");
 203    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_TEMPS; i++) {
 204       _mesa_printf("%d: %f %f %f %f   ", i,
 205                   state->Temporaries[i][0],
 206                   state->Temporaries[i][1],
 207                   state->Temporaries[i][2],
 208                   state->Temporaries[i][3]);
 209    }
 210    _mesa_printf("\n");
 211
 212    _mesa_printf("Parameters:\n");
 213    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_PARAMS; i++) {
 214       _mesa_printf("%d: %f %f %f %f   ", i,
 215                   state->Parameters[i][0],
 216                   state->Parameters[i][1],
 217                   state->Parameters[i][2],
 218                   state->Parameters[i][3]);
 219    }
 220    _mesa_printf("\n");
 221 }
 222
 223
 224
 225 /**
 226  * Return a pointer to the 4-element float vector specified by the given
 227  * source register.
 228  */
 229 static INLINE const GLfloat *
 230 get_register_pointer( const struct vp_src_register *source,
 231                       const struct vertex_program_state *state )
 232 {
 233    if (source->RelAddr) {
 234       const GLint reg = source->Index + state->AddressReg[0];
 235       ASSERT( (source->File == PROGRAM_ENV_PARAM) ||
 236         (source->File == PROGRAM_STATE_VAR) );
 237       if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
 238          return ZeroVec;
 239       else if (source->File == PROGRAM_ENV_PARAM)
 240          return state->Parameters[reg];
 241       else
 242          return state->Current->Parameters->Parameters[reg].Values;
 243    }
 244    else {
 245       switch (source->File) {
 246          case PROGRAM_TEMPORARY:
 247             ASSERT(source->Index < MAX_NV_VERTEX_PROGRAM_TEMPS);
 248             return state->Temporaries[source->Index];
 249          case PROGRAM_INPUT:
 250             ASSERT(source->Index < MAX_NV_VERTEX_PROGRAM_INPUTS);
 251             return state->Inputs[source->Index];
 252          case PROGRAM_LOCAL_PARAM:
 253             ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
 254             return state->Current->Base.LocalParams[source->Index];
 255          case PROGRAM_ENV_PARAM:
 256             ASSERT(source->Index < MAX_NV_VERTEX_PROGRAM_PARAMS);
 257             return state->Parameters[source->Index];
 258          case PROGRAM_STATE_VAR:
 259             ASSERT(source->Index < state->Current->Parameters->NumParameters);
 260             return state->Current->Parameters->Parameters[source->Index].Values;
 261          default:
 262             _mesa_problem(NULL,
 263                           "Bad source register file in get_register_pointer");
 264             return NULL;
 265       }
 266    }
 267    return NULL;
 268 }
 269
 270
 271 /**
 272  * Fetch a 4-element float vector from the given source register.
 273  * Apply swizzling and negating as needed.
 274  */
 275 static INLINE void
 276 fetch_vector4( const struct vp_src_register *source,
 277                const struct vertex_program_state *state,
 278                GLfloat result[4] )
 279 {
 280    const GLfloat *src = get_register_pointer(source, state);
 281
 282    if (source->Negate) {
 283       result[0] = -src[source->Swizzle[0]];
 284       result[1] = -src[source->Swizzle[1]];
 285       result[2] = -src[source->Swizzle[2]];
 286       result[3] = -src[source->Swizzle[3]];
 287    }
 288    else {
 289       result[0] = src[source->Swizzle[0]];
 290       result[1] = src[source->Swizzle[1]];
 291       result[2] = src[source->Swizzle[2]];
 292       result[3] = src[source->Swizzle[3]];
 293    }
 294 }
 295
 296
 297
 298 /**
 299  * As above, but only return result[0] element.
 300  */
 301 static INLINE void
 302 fetch_vector1( const struct vp_src_register *source,
 303                const struct vertex_program_state *state,
 304                GLfloat result[4] )
 305 {
 306    const GLfloat *src = get_register_pointer(source, state);
 307
 308    if (source->Negate) {
 309       result[0] = -src[source->Swizzle[0]];
 310    }
 311    else {
 312       result[0] = src[source->Swizzle[0]];
 313    }
 314 }
 315
 316
 317 /**
 318  * Store 4 floats into a register.
 319  */
 320 static void
 321 store_vector4( const struct vp_dst_register *dest,
 322                struct vertex_program_state *state,
 323                const GLfloat value[4] )
 324 {
 325    GLfloat *dst;
 326    switch (dest->File) {
 327       case PROGRAM_TEMPORARY:
 328          dst = state->Temporaries[dest->Index];
 329          break;
 330       case PROGRAM_OUTPUT:
 331          dst = state->Outputs[dest->Index];
 332          break;
 333       case PROGRAM_ENV_PARAM:
 334          {
 335             /* a slight hack */
 336             GET_CURRENT_CONTEXT(ctx);
 337             dst = ctx->VertexProgram.Parameters[dest->Index];
 338          }
 339          break;
 340       default:
 341          _mesa_problem(NULL, "Invalid register file in store_vector4(file=%d)",
 342                        dest->File);
 343          return;
 344    }
 345
 346    if (dest->WriteMask[0])
 347       dst[0] = value[0];
 348    if (dest->WriteMask[1])
 349       dst[1] = value[1];
 350    if (dest->WriteMask[2])
 351       dst[2] = value[2];
 352    if (dest->WriteMask[3])
 353       dst[3] = value[3];
 354 }
 355
 356
 357 /**
 358  * Set x to positive or negative infinity.
 359  */
 360 #if defined(USE_IEEE) || defined(_WIN32)
 361 #define SET_POS_INFINITY(x)  ( *((GLuint *) (void *)&x) = 0x7F800000 )
 362 #define SET_NEG_INFINITY(x)  ( *((GLuint *) (void *)&x) = 0xFF800000 )
 363 #elif defined(VMS)
 364 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
 365 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
 366 #else
 367 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
 368 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
 369 #endif
 370
 371 #define SET_FLOAT_BITS(x, bits) ((fi_type *) (void *) &(x))->i = bits
 372
 373
 374 /**
 375  * Execute the given vertex program
 376  */
 377 void
 378 _mesa_exec_vertex_program(GLcontext *ctx, const struct vertex_program *program)
 379 {
 380    struct vertex_program_state *state = &ctx->VertexProgram;
 381    const struct vp_instruction *inst;
 382
 383    ctx->_CurrentProgram = GL_VERTEX_PROGRAM_ARB; /* or NV, doesn't matter */
 384
 385    /* If the program is position invariant, multiply the input
 386     * position and the MVP matrix and stick it into the output pos slot
 387     */
 388    if (ctx->VertexProgram.Current->IsPositionInvariant) {
 389       TRANSFORM_POINT( ctx->VertexProgram.Outputs[0],
 390                        ctx->_ModelProjectMatrix.m,
 391                        ctx->VertexProgram.Inputs[0]);
 392
 393       /* XXX: This could go elsewhere */
 394       ctx->VertexProgram.Current->OutputsWritten |= 0x1;
 395    }
 396    for (inst = program->Instructions; ; inst++) {
 397
 398       if (ctx->VertexProgram.CallbackEnabled &&
 399           ctx->VertexProgram.Callback) {
 400          ctx->VertexProgram.CurrentPosition = inst->StringPos;
 401          ctx->VertexProgram.Callback(program->Base.Target,
 402                                      ctx->VertexProgram.CallbackData);
 403       }
 404
 405       switch (inst->Opcode) {
 406          case VP_OPCODE_MOV:
 407             {
 408                GLfloat t[4];
 409                fetch_vector4( &inst->SrcReg[0], state, t );
 410                store_vector4( &inst->DstReg, state, t );
 411             }
 412             break;
 413          case VP_OPCODE_LIT:
 414             {
 415                const GLfloat epsilon = 1.0e-5F; /* XXX fix? */
 416                GLfloat t[4], lit[4];
 417                fetch_vector4( &inst->SrcReg[0], state, t );
 418                if (t[3] < -(128.0F - epsilon))
 419                    t[3] = - (128.0F - epsilon);
 420                else if (t[3] > 128.0F - epsilon)
 421                   t[3] = 128.0F - epsilon;
 422                if (t[0] < 0.0)
 423                   t[0] = 0.0;
 424                if (t[1] < 0.0)
 425                   t[1] = 0.0;
 426                lit[0] = 1.0;
 427                lit[1] = t[0];
 428                lit[2] = (t[0] > 0.0) ? (GLfloat) exp(t[3] * log(t[1])) : 0.0F;
 429                lit[3] = 1.0;
 430                store_vector4( &inst->DstReg, state, lit );
 431             }
 432             break;
 433          case VP_OPCODE_RCP:
 434             {
 435                GLfloat t[4];
 436                fetch_vector1( &inst->SrcReg[0], state, t );
 437                if (t[0] != 1.0F)
 438                   t[0] = 1.0F / t[0];  /* div by zero is infinity! */
 439                t[1] = t[2] = t[3] = t[0];
 440                store_vector4( &inst->DstReg, state, t );
 441             }
 442             break;
 443          case VP_OPCODE_RSQ:
 444             {
 445                GLfloat t[4];
 446                fetch_vector1( &inst->SrcReg[0], state, t );
 447                t[0] = INV_SQRTF(FABSF(t[0]));
 448                t[1] = t[2] = t[3] = t[0];
 449                store_vector4( &inst->DstReg, state, t );
 450             }
 451             break;
 452          case VP_OPCODE_EXP:
 453             {
 454                GLfloat t[4], q[4], floor_t0;
 455                fetch_vector1( &inst->SrcReg[0], state, t );
 456                floor_t0 = (float) floor(t[0]);
 457                if (floor_t0 > FLT_MAX_EXP) {
 458                   SET_POS_INFINITY(q[0]);
 459                   SET_POS_INFINITY(q[2]);
 460                }
 461                else if (floor_t0 < FLT_MIN_EXP) {
 462                   q[0] = 0.0F;
 463                   q[2] = 0.0F;
 464                }
 465                else {
 466 #ifdef USE_IEEE
 467                   GLint ii = (GLint) floor_t0;
 468                   ii = (ii < 23) + 0x3f800000;
 469                   SET_FLOAT_BITS(q[0], ii);
 470                   q[0] = *((GLfloat *) (void *)&ii);
 471 #else
 472                   q[0] = (GLfloat) pow(2.0, floor_t0);
 473 #endif
 474                   q[2] = (GLfloat) (q[0] * LOG2(q[1]));
 475                }
 476                q[1] = t[0] - floor_t0;
 477                q[3] = 1.0F;
 478                store_vector4( &inst->DstReg, state, q );
 479             }
 480             break;
 481          case VP_OPCODE_LOG:
 482             {
 483                GLfloat t[4], q[4], abs_t0;
 484                fetch_vector1( &inst->SrcReg[0], state, t );
 485                abs_t0 = (GLfloat) fabs(t[0]);
 486                if (abs_t0 != 0.0F) {
 487                   /* Since we really can't handle infinite values on VMS
 488                    * like other OSes we'll use __MAXFLOAT to represent
 489                    * infinity.  This may need some tweaking.
 490                    */
 491 #ifdef VMS
 492                   if (abs_t0 == __MAXFLOAT)
 493 #else
 494                   if (IS_INF_OR_NAN(abs_t0))
 495 #endif
 496                   {
 497                      SET_POS_INFINITY(q[0]);
 498                      q[1] = 1.0F;
 499                      SET_POS_INFINITY(q[2]);
 500                   }
 501                   else {
 502                      int exponent;
 503                      double mantissa = frexp(t[0], &exponent);
 504                      q[0] = (GLfloat) (exponent - 1);
 505                      q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 506                      q[2] = (GLfloat) (q[0] + LOG2(q[1]));
 507                   }
 508                   }
 509                else {
 510                   SET_NEG_INFINITY(q[0]);
 511                   q[1] = 1.0F;
 512                   SET_NEG_INFINITY(q[2]);
 513                }
 514                q[3] = 1.0;
 515                store_vector4( &inst->DstReg, state, q );
 516             }
 517             break;
 518          case VP_OPCODE_MUL:
 519             {
 520                GLfloat t[4], u[4], prod[4];
 521                fetch_vector4( &inst->SrcReg[0], state, t );
 522                fetch_vector4( &inst->SrcReg[1], state, u );
 523                prod[0] = t[0] * u[0];
 524                prod[1] = t[1] * u[1];
 525                prod[2] = t[2] * u[2];
 526                prod[3] = t[3] * u[3];
 527                store_vector4( &inst->DstReg, state, prod );
 528             }
 529             break;
 530          case VP_OPCODE_ADD:
 531             {
 532                GLfloat t[4], u[4], sum[4];
 533                fetch_vector4( &inst->SrcReg[0], state, t );
 534                fetch_vector4( &inst->SrcReg[1], state, u );
 535                sum[0] = t[0] + u[0];
 536                sum[1] = t[1] + u[1];
 537                sum[2] = t[2] + u[2];
 538                sum[3] = t[3] + u[3];
 539                store_vector4( &inst->DstReg, state, sum );
 540             }
 541             break;
 542          case VP_OPCODE_DP3:
 543             {
 544                GLfloat t[4], u[4], dot[4];
 545                fetch_vector4( &inst->SrcReg[0], state, t );
 546                fetch_vector4( &inst->SrcReg[1], state, u );
 547                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
 548                dot[1] = dot[2] = dot[3] = dot[0];
 549                store_vector4( &inst->DstReg, state, dot );
 550             }
 551             break;
 552          case VP_OPCODE_DP4:
 553             {
 554                GLfloat t[4], u[4], dot[4];
 555                fetch_vector4( &inst->SrcReg[0], state, t );
 556                fetch_vector4( &inst->SrcReg[1], state, u );
 557                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + t[3] * u[3];
 558                dot[1] = dot[2] = dot[3] = dot[0];
 559                store_vector4( &inst->DstReg, state, dot );
 560             }
 561             break;
 562          case VP_OPCODE_DST:
 563             {
 564                GLfloat t[4], u[4], dst[4];
 565                fetch_vector4( &inst->SrcReg[0], state, t );
 566                fetch_vector4( &inst->SrcReg[1], state, u );
 567                dst[0] = 1.0F;
 568                dst[1] = t[1] * u[1];
 569                dst[2] = t[2];
 570                dst[3] = u[3];
 571                store_vector4( &inst->DstReg, state, dst );
 572             }
 573             break;
 574          case VP_OPCODE_MIN:
 575             {
 576                GLfloat t[4], u[4], min[4];
 577                fetch_vector4( &inst->SrcReg[0], state, t );
 578                fetch_vector4( &inst->SrcReg[1], state, u );
 579                min[0] = (t[0] < u[0]) ? t[0] : u[0];
 580                min[1] = (t[1] < u[1]) ? t[1] : u[1];
 581                min[2] = (t[2] < u[2]) ? t[2] : u[2];
 582                min[3] = (t[3] < u[3]) ? t[3] : u[3];
 583                store_vector4( &inst->DstReg, state, min );
 584             }
 585             break;
 586          case VP_OPCODE_MAX:
 587             {
 588                GLfloat t[4], u[4], max[4];
 589                fetch_vector4( &inst->SrcReg[0], state, t );
 590                fetch_vector4( &inst->SrcReg[1], state, u );
 591                max[0] = (t[0] > u[0]) ? t[0] : u[0];
 592                max[1] = (t[1] > u[1]) ? t[1] : u[1];
 593                max[2] = (t[2] > u[2]) ? t[2] : u[2];
 594                max[3] = (t[3] > u[3]) ? t[3] : u[3];
 595                store_vector4( &inst->DstReg, state, max );
 596             }
 597             break;
 598          case VP_OPCODE_SLT:
 599             {
 600                GLfloat t[4], u[4], slt[4];
 601                fetch_vector4( &inst->SrcReg[0], state, t );
 602                fetch_vector4( &inst->SrcReg[1], state, u );
 603                slt[0] = (t[0] < u[0]) ? 1.0F : 0.0F;
 604                slt[1] = (t[1] < u[1]) ? 1.0F : 0.0F;
 605                slt[2] = (t[2] < u[2]) ? 1.0F : 0.0F;
 606                slt[3] = (t[3] < u[3]) ? 1.0F : 0.0F;
 607                store_vector4( &inst->DstReg, state, slt );
 608             }
 609             break;
 610          case VP_OPCODE_SGE:
 611             {
 612                GLfloat t[4], u[4], sge[4];
 613                fetch_vector4( &inst->SrcReg[0], state, t );
 614                fetch_vector4( &inst->SrcReg[1], state, u );
 615                sge[0] = (t[0] >= u[0]) ? 1.0F : 0.0F;
 616                sge[1] = (t[1] >= u[1]) ? 1.0F : 0.0F;
 617                sge[2] = (t[2] >= u[2]) ? 1.0F : 0.0F;
 618                sge[3] = (t[3] >= u[3]) ? 1.0F : 0.0F;
 619                store_vector4( &inst->DstReg, state, sge );
 620             }
 621             break;
 622          case VP_OPCODE_MAD:
 623             {
 624                GLfloat t[4], u[4], v[4], sum[4];
 625                fetch_vector4( &inst->SrcReg[0], state, t );
 626                fetch_vector4( &inst->SrcReg[1], state, u );
 627                fetch_vector4( &inst->SrcReg[2], state, v );
 628                sum[0] = t[0] * u[0] + v[0];
 629                sum[1] = t[1] * u[1] + v[1];
 630                sum[2] = t[2] * u[2] + v[2];
 631                sum[3] = t[3] * u[3] + v[3];
 632                store_vector4( &inst->DstReg, state, sum );
 633             }
 634             break;
 635          case VP_OPCODE_ARL:
 636             {
 637                GLfloat t[4];
 638                fetch_vector4( &inst->SrcReg[0], state, t );
 639                state->AddressReg[0] = (GLint) floor(t[0]);
 640             }
 641             break;
 642          case VP_OPCODE_DPH:
 643             {
 644                GLfloat t[4], u[4], dot[4];
 645                fetch_vector4( &inst->SrcReg[0], state, t );
 646                fetch_vector4( &inst->SrcReg[1], state, u );
 647                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + u[3];
 648                dot[1] = dot[2] = dot[3] = dot[0];
 649                store_vector4( &inst->DstReg, state, dot );
 650             }
 651             break;
 652          case VP_OPCODE_RCC:
 653             {
 654                GLfloat t[4], u;
 655                fetch_vector1( &inst->SrcReg[0], state, t );
 656                if (t[0] == 1.0F)
 657                   u = 1.0F;
 658                else
 659                   u = 1.0F / t[0];
 660                if (u > 0.0F) {
 661                   if (u > 1.884467e+019F) {
 662                      u = 1.884467e+019F;  /* IEEE 32-bit binary value 0x5F800000 */
 663                   }
 664                   else if (u < 5.42101e-020F) {
 665                      u = 5.42101e-020F;   /* IEEE 32-bit binary value 0x1F800000 */
 666                   }
 667                }
 668                else {
 669                   if (u < -1.884467e+019F) {
 670                      u = -1.884467e+019F; /* IEEE 32-bit binary value 0xDF800000 */
 671                   }
 672                   else if (u > -5.42101e-020F) {
 673                      u = -5.42101e-020F;  /* IEEE 32-bit binary value 0x9F800000 */
 674                   }
 675                }
 676                t[0] = t[1] = t[2] = t[3] = u;
 677                store_vector4( &inst->DstReg, state, t );
 678             }
 679             break;
 680          case VP_OPCODE_SUB: /* GL_NV_vertex_program1_1 */
 681             {
 682                GLfloat t[4], u[4], sum[4];
 683                fetch_vector4( &inst->SrcReg[0], state, t );
 684                fetch_vector4( &inst->SrcReg[1], state, u );
 685                sum[0] = t[0] - u[0];
 686                sum[1] = t[1] - u[1];
 687                sum[2] = t[2] - u[2];
 688                sum[3] = t[3] - u[3];
 689                store_vector4( &inst->DstReg, state, sum );
 690             }
 691             break;
 692          case VP_OPCODE_ABS: /* GL_NV_vertex_program1_1 */
 693             {
 694                GLfloat t[4];
 695                fetch_vector4( &inst->SrcReg[0], state, t );
 696                if (t[0] < 0.0)  t[0] = -t[0];
 697                if (t[1] < 0.0)  t[1] = -t[1];
 698                if (t[2] < 0.0)  t[2] = -t[2];
 699                if (t[3] < 0.0)  t[3] = -t[3];
 700                store_vector4( &inst->DstReg, state, t );
 701             }
 702             break;
 703          case VP_OPCODE_FLR: /* GL_ARB_vertex_program */
 704             {
 705                GLfloat t[4];
 706                fetch_vector4( &inst->SrcReg[0], state, t );
 707                t[0] = FLOORF(t[0]);
 708                t[1] = FLOORF(t[1]);
 709                t[2] = FLOORF(t[2]);
 710                t[3] = FLOORF(t[3]);
 711                store_vector4( &inst->DstReg, state, t );
 712             }
 713             break;
 714          case VP_OPCODE_FRC: /* GL_ARB_vertex_program */
 715             {
 716                GLfloat t[4];
 717                fetch_vector4( &inst->SrcReg[0], state, t );
 718                t[0] = t[0] - FLOORF(t[0]);
 719                t[1] = t[1] - FLOORF(t[1]);
 720                t[2] = t[2] - FLOORF(t[2]);
 721                t[3] = t[3] - FLOORF(t[3]);
 722                store_vector4( &inst->DstReg, state, t );
 723             }
 724             break;
 725          case VP_OPCODE_EX2: /* GL_ARB_vertex_program */
 726             {
 727                GLfloat t[4];
 728                fetch_vector1( &inst->SrcReg[0], state, t );
 729                t[0] = t[1] = t[2] = t[3] = (GLfloat)_mesa_pow(2.0, t[0]);
 730                store_vector4( &inst->DstReg, state, t );
 731             }
 732             break;
 733          case VP_OPCODE_LG2: /* GL_ARB_vertex_program */
 734             {
 735                GLfloat t[4];
 736                fetch_vector1( &inst->SrcReg[0], state, t );
 737                t[0] = t[1] = t[2] = t[3] = LOG2(t[0]);
 738                store_vector4( &inst->DstReg, state, t );
 739             }
 740             break;
 741          case VP_OPCODE_POW: /* GL_ARB_vertex_program */
 742             {
 743                GLfloat t[4], u[4];
 744                fetch_vector1( &inst->SrcReg[0], state, t );
 745                fetch_vector1( &inst->SrcReg[1], state, u );
 746                t[0] = t[1] = t[2] = t[3] = (GLfloat)_mesa_pow(t[0], u[0]);
 747                store_vector4( &inst->DstReg, state, t );
 748             }
 749             break;
 750          case VP_OPCODE_XPD: /* GL_ARB_vertex_program */
 751             {
 752                GLfloat t[4], u[4], cross[4];
 753                fetch_vector4( &inst->SrcReg[0], state, t );
 754                fetch_vector4( &inst->SrcReg[1], state, u );
 755                cross[0] = t[1] * u[2] - t[2] * u[1];
 756                cross[1] = t[2] * u[0] - t[0] * u[2];
 757                cross[2] = t[0] * u[1] - t[1] * u[0];
 758                store_vector4( &inst->DstReg, state, cross );
 759             }
 760             break;
 761          case VP_OPCODE_SWZ: /* GL_ARB_vertex_program */
 762             {
 763                const struct vp_src_register *source = &inst->SrcReg[0];
 764                const GLfloat *src = get_register_pointer(source, state);
 765                GLfloat result[4];
 766                GLuint i;
 767
 768                /* do extended swizzling here */
 769                for (i = 0; i < 3; i++) {
 770                   if (source->Swizzle[i] == SWIZZLE_ZERO)
 771                      result[i] = 0.0;
 772                   else if (source->Swizzle[i] == SWIZZLE_ONE)
 773                      result[i] = -1.0;
 774                   else
 775                      result[i] = -src[source->Swizzle[i]];
 776                   if (source->Negate)
 777                      result[i] = -result[i];
 778                }
 779                store_vector4( &inst->DstReg, state, result );
 780             }
 781             break;
 782
 783          case VP_OPCODE_END:
 784             ctx->_CurrentProgram = 0;
 785             return;
 786          default:
 787             /* bad instruction opcode */
 788             _mesa_problem(ctx, "Bad VP Opcode in _mesa_exec_vertex_program");
 789             ctx->_CurrentProgram = 0;
 790             return;
 791       } /* switch */
 792    } /* for */
 793
 794    ctx->_CurrentProgram = 0;
 795 }
 796
 797
 798
 799 /**
 800 Thoughts on vertex program optimization:
 801
 802 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
 803 assembly code.  That will probably be a lot of work.
 804
 805 Another approach might be to replace the vp_instruction->Opcode field with
 806 a pointer to a specialized C function which executes the instruction.
 807 In particular we can write functions which skip swizzling, negating,
 808 masking, relative addressing, etc. when they're not needed.
 809
 810 For example:
 811
 812 void simple_add( struct vp_instruction *inst )
 813 {
 814    GLfloat *sum = machine->Registers[inst->DstReg.Register];
 815    GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
 816    GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
 817    sum[0] = a[0] + b[0];
 818    sum[1] = a[1] + b[1];
 819    sum[2] = a[2] + b[2];
 820    sum[3] = a[3] + b[3];
 821 }
 822
 823 */
 824
 825 /*
 826
 827 KW:
 828
 829 A first step would be to 'vectorize' the programs in the same way as
 830 the normal transformation code in the tnl module.  Thus each opcode
 831 takes zero or more input vectors (registers) and produces one or more
 832 output vectors.
 833
  834 These operations would initially be coded in C, with machine-specific
 835 assembly following, as is currently the case for matrix
 836 transformations in the math/ directory.  The preprocessing scheme for
 837 selecting simpler operations Brian describes above would also work
 838 here.
 839
 840 This should give reasonable performance without excessive effort.
 841
 842 */