src/mesa/main/nvvertexec.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  5.1
   4  *
   5  * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file nvvertexec.c
  27  * Code to execute vertex programs.
  28  * \author Brian Paul
  29  */
  30
  31 #include "glheader.h"
  32 #include "context.h"
  33 #include "imports.h"
  34 #include "macros.h"
  35 #include "mtypes.h"
  36 #include "nvvertexec.h"
  37 #include "nvvertprog.h"
  38 #include "program.h"
  39 #include "math/m_matrix.h"
  40
  41
  42 static const GLfloat zeroVec[4] = { 0, 0, 0, 0 };
  43
  44
  45 /**
  46  * Load/initialize the vertex program registers.
  47  * This needs to be done per vertex.
  48  */
  49 void
  50 _mesa_init_vp_registers(GLcontext *ctx)
  51 {
  52    GLuint i;
  53
  54    /* Input registers get initialized from the current vertex attribs */
  55    MEMCPY(ctx->VertexProgram.Inputs, ctx->Current.Attrib,
  56           VERT_ATTRIB_MAX * 4 * sizeof(GLfloat));
  57
  58    /* Output and temp regs are initialized to [0,0,0,1] */
  59    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_OUTPUTS; i++) {
  60       ASSIGN_4V(ctx->VertexProgram.Outputs[i], 0.0F, 0.0F, 0.0F, 1.0F);
  61    }
  62    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_TEMPS; i++) {
  63       ASSIGN_4V(ctx->VertexProgram.Temporaries[i], 0.0F, 0.0F, 0.0F, 1.0F);
  64    }
  65
  66    /* The program parameters aren't touched */
  67         /* XXX: This should be moved to glBegin() time, but its safe (and slow!)
  68          *       here - Karl
  69          */
  70    if (ctx->VertexProgram.Current->Parameters) {
  71
  72       /* Grab the state */
  73       _mesa_load_state_parameters(ctx, ctx->VertexProgram.Current->Parameters);
  74
  75                 /* And copy it into the program state */
  76       for (i=0; i<ctx->VertexProgram.Current->Parameters->NumParameters; i++) {
  77          MEMCPY(ctx->VertexProgram.Parameters[i],
  78             &ctx->VertexProgram.Current->Parameters->Parameters[i].Values,
  79             4*sizeof(GLfloat));
  80       }
  81
  82    }
  83
  84 }
  85
  86
  87
  88 /**
  89  * Copy the 16 elements of a matrix into four consecutive program
  90  * registers starting at 'pos'.
  91  */
  92 static void
  93 load_matrix(GLfloat registers[][4], GLuint pos, const GLfloat mat[16])
  94 {
  95    GLuint i;
  96    for (i = 0; i < 4; i++) {
  97       registers[pos + i][0] = mat[0 + i];
  98       registers[pos + i][1] = mat[4 + i];
  99       registers[pos + i][2] = mat[8 + i];
 100       registers[pos + i][3] = mat[12 + i];
 101    }
 102 }
 103
 104
 105 /**
 106  * As above, but transpose the matrix.
 107  */
 108 static void
 109 load_transpose_matrix(GLfloat registers[][4], GLuint pos,
 110                       const GLfloat mat[16])
 111 {
 112    MEMCPY(registers[pos], mat, 16 * sizeof(GLfloat));
 113 }
 114
 115
 116 /**
 117  * Load all currently tracked matrices into the program registers.
 118  * This needs to be done per glBegin/glEnd.
 119  */
 120 void
 121 _mesa_init_tracked_matrices(GLcontext *ctx)
 122 {
 123    GLuint i;
 124
 125    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_PARAMS / 4; i++) {
 126       /* point 'mat' at source matrix */
 127       GLmatrix *mat;
 128       if (ctx->VertexProgram.TrackMatrix[i] == GL_MODELVIEW) {
 129          mat = ctx->ModelviewMatrixStack.Top;
 130       }
 131       else if (ctx->VertexProgram.TrackMatrix[i] == GL_PROJECTION) {
 132          mat = ctx->ProjectionMatrixStack.Top;
 133       }
 134       else if (ctx->VertexProgram.TrackMatrix[i] == GL_TEXTURE) {
 135          mat = ctx->TextureMatrixStack[ctx->Texture.CurrentUnit].Top;
 136       }
 137       else if (ctx->VertexProgram.TrackMatrix[i] == GL_COLOR) {
 138          mat = ctx->ColorMatrixStack.Top;
 139       }
 140       else if (ctx->VertexProgram.TrackMatrix[i]==GL_MODELVIEW_PROJECTION_NV) {
 141          /* XXX verify the combined matrix is up to date */
 142          mat = &ctx->_ModelProjectMatrix;
 143       }
 144       else if (ctx->VertexProgram.TrackMatrix[i] >= GL_MATRIX0_NV &&
 145                ctx->VertexProgram.TrackMatrix[i] <= GL_MATRIX7_NV) {
 146          GLuint n = ctx->VertexProgram.TrackMatrix[i] - GL_MATRIX0_NV;
 147          ASSERT(n < MAX_PROGRAM_MATRICES);
 148          mat = ctx->ProgramMatrixStack[n].Top;
 149       }
 150       else {
 151          /* no matrix is tracked, but we leave the register values as-is */
 152          assert(ctx->VertexProgram.TrackMatrix[i] == GL_NONE);
 153          continue;
 154       }
 155
 156       /* load the matrix */
 157       if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
 158          load_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
 159       }
 160       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_INVERSE_NV) {
 161          _math_matrix_analyse(mat); /* update the inverse */
 162          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 163          load_matrix(ctx->VertexProgram.Parameters, i*4, mat->inv);
 164       }
 165       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_TRANSPOSE_NV) {
 166          load_transpose_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
 167       }
 168       else {
 169          assert(ctx->VertexProgram.TrackMatrixTransform[i]
 170                 == GL_INVERSE_TRANSPOSE_NV);
 171          _math_matrix_analyse(mat); /* update the inverse */
 172          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 173          load_transpose_matrix(ctx->VertexProgram.Parameters, i*4, mat->inv);
 174       }
 175    }
 176 }
 177
 178
 179
 180 /**
 181  * For debugging.  Dump the current vertex program machine registers.
 182  */
 183 void
 184 _mesa_dump_vp_state( const struct vertex_program_state *state )
 185 {
 186    int i;
 187    _mesa_printf("VertexIn:\n");
 188    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_INPUTS; i++) {
 189       _mesa_printf("%d: %f %f %f %f   ", i,
 190                    state->Inputs[i][0],
 191                    state->Inputs[i][1],
 192                    state->Inputs[i][2],
 193                    state->Inputs[i][3]);
 194    }
 195    _mesa_printf("\n");
 196
 197    _mesa_printf("VertexOut:\n");
 198    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_OUTPUTS; i++) {
 199       _mesa_printf("%d: %f %f %f %f   ", i,
 200                   state->Outputs[i][0],
 201                   state->Outputs[i][1],
 202                   state->Outputs[i][2],
 203                   state->Outputs[i][3]);
 204    }
 205    _mesa_printf("\n");
 206
 207    _mesa_printf("Registers:\n");
 208    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_TEMPS; i++) {
 209       _mesa_printf("%d: %f %f %f %f   ", i,
 210                   state->Temporaries[i][0],
 211                   state->Temporaries[i][1],
 212                   state->Temporaries[i][2],
 213                   state->Temporaries[i][3]);
 214    }
 215    _mesa_printf("\n");
 216
 217    _mesa_printf("Parameters:\n");
 218    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_PARAMS; i++) {
 219       _mesa_printf("%d: %f %f %f %f   ", i,
 220                   state->Parameters[i][0],
 221                   state->Parameters[i][1],
 222                   state->Parameters[i][2],
 223                   state->Parameters[i][3]);
 224    }
 225    _mesa_printf("\n");
 226 }
 227
 228
 229
 230 /**
 231  * Return a pointer to the 4-element float vector specified by the given
 232  * source register.
 233  */
 234 static INLINE const GLfloat *
 235 get_register_pointer( const struct vp_src_register *source,
 236                       const struct vertex_program_state *state )
 237 {
 238    if (source->RelAddr) {
 239       const GLint reg = source->Index + state->AddressReg[0];
 240       ASSERT(source->File == PROGRAM_ENV_PARAM);
 241       if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
 242          return zeroVec;
 243       else
 244          return state->Parameters[reg];
 245    }
 246    else {
 247       switch (source->File) {
 248          case PROGRAM_TEMPORARY:
 249             return state->Temporaries[source->Index];
 250          case PROGRAM_INPUT:
 251             return state->Inputs[source->Index];
 252          case PROGRAM_LOCAL_PARAM:
 253             /* XXX fix */
 254             return state->Temporaries[source->Index];
 255          case PROGRAM_ENV_PARAM:
 256             return state->Parameters[source->Index];
 257          case PROGRAM_STATE_VAR:
 258             return state->Parameters[source->Index];
 259          default:
 260             _mesa_problem(NULL,
 261                           "Bad source register file in fetch_vector4(vp)");
 262             return NULL;
 263       }
 264    }
 265    return NULL;
 266 }
 267
 268
 269 /**
 270  * Fetch a 4-element float vector from the given source register.
 271  * Apply swizzling and negating as needed.
 272  */
 273 static INLINE void
 274 fetch_vector4( const struct vp_src_register *source,
 275                const struct vertex_program_state *state,
 276                GLfloat result[4] )
 277 {
 278    const GLfloat *src = get_register_pointer(source, state);
 279
 280    if (source->Negate) {
 281       result[0] = -src[source->Swizzle[0]];
 282       result[1] = -src[source->Swizzle[1]];
 283       result[2] = -src[source->Swizzle[2]];
 284       result[3] = -src[source->Swizzle[3]];
 285    }
 286    else {
 287       result[0] = src[source->Swizzle[0]];
 288       result[1] = src[source->Swizzle[1]];
 289       result[2] = src[source->Swizzle[2]];
 290       result[3] = src[source->Swizzle[3]];
 291    }
 292 }
 293
 294
 295
 296 /**
 297  * As above, but only return result[0] element.
 298  */
 299 static INLINE void
 300 fetch_vector1( const struct vp_src_register *source,
 301                const struct vertex_program_state *state,
 302                GLfloat result[4] )
 303 {
 304    const GLfloat *src = get_register_pointer(source, state);
 305
 306    if (source->Negate) {
 307       result[0] = -src[source->Swizzle[0]];
 308    }
 309    else {
 310       result[0] = src[source->Swizzle[0]];
 311    }
 312 }
 313
 314
 315 /**
 316  * Store 4 floats into a register.
 317  */
 318 static void
 319 store_vector4( const struct vp_dst_register *dest,
 320                struct vertex_program_state *state,
 321                const GLfloat value[4] )
 322 {
 323    GLfloat *dst;
 324    switch (dest->File) {
 325       case PROGRAM_TEMPORARY:
 326          dst = state->Temporaries[dest->Index];
 327          break;
 328       case PROGRAM_OUTPUT:
 329          dst = state->Outputs[dest->Index];
 330          break;
 331       default:
 332          _mesa_problem(NULL, "Invalid register file in fetch_vector1(vp)");
 333          return;
 334    }
 335
 336    if (dest->WriteMask[0])
 337       dst[0] = value[0];
 338    if (dest->WriteMask[1])
 339       dst[1] = value[1];
 340    if (dest->WriteMask[2])
 341       dst[2] = value[2];
 342    if (dest->WriteMask[3])
 343       dst[3] = value[3];
 344 }
 345
 346
 347 /**
 348  * Set x to positive or negative infinity.
 349  */
 350 #if defined(USE_IEEE) || defined(_WIN32)
 351 #define SET_POS_INFINITY(x)  ( *((GLuint *) &x) = 0x7F800000 )
 352 #define SET_NEG_INFINITY(x)  ( *((GLuint *) &x) = 0xFF800000 )
 353 #elif defined(VMS)
 354 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
 355 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
 356 #else
 357 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
 358 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
 359 #endif
 360
 361 #define SET_FLOAT_BITS(x, bits) ((fi_type *) &(x))->i = bits
 362
 363
 364 /**
 365  * Execute the given vertex program
 366  */
 367 void
 368 _mesa_exec_vertex_program(GLcontext *ctx, const struct vertex_program *program)
 369 {
 370    struct vertex_program_state *state = &ctx->VertexProgram;
 371    const struct vp_instruction *inst;
 372
 373    ctx->_CurrentProgram = GL_VERTEX_PROGRAM_ARB; /* or NV, doesn't matter */
 374
 375    for (inst = program->Instructions; inst->Opcode != VP_OPCODE_END; inst++) {
 376
 377       if (ctx->VertexProgram.CallbackEnabled &&
 378           ctx->VertexProgram.Callback) {
 379          ctx->VertexProgram.CurrentPosition = inst->StringPos;
 380          ctx->VertexProgram.Callback(program->Base.Target,
 381                                      ctx->VertexProgram.CallbackData);
 382       }
 383
 384       switch (inst->Opcode) {
 385          case VP_OPCODE_MOV:
 386             {
 387                GLfloat t[4];
 388                fetch_vector4( &inst->SrcReg[0], state, t );
 389                store_vector4( &inst->DstReg, state, t );
 390             }
 391             break;
 392          case VP_OPCODE_LIT:
 393             {
 394                const GLfloat epsilon = 1.0e-5F; /* XXX fix? */
 395                GLfloat t[4], lit[4];
 396                fetch_vector4( &inst->SrcReg[0], state, t );
 397                if (t[3] < -(128.0F - epsilon))
 398                    t[3] = - (128.0F - epsilon);
 399                else if (t[3] > 128.0F - epsilon)
 400                   t[3] = 128.0F - epsilon;
 401                if (t[0] < 0.0)
 402                   t[0] = 0.0;
 403                if (t[1] < 0.0)
 404                   t[1] = 0.0;
 405                lit[0] = 1.0;
 406                lit[1] = t[0];
 407                lit[2] = (t[0] > 0.0) ? (GLfloat) exp(t[3] * log(t[1])) : 0.0F;
 408                lit[3] = 1.0;
 409                store_vector4( &inst->DstReg, state, lit );
 410             }
 411             break;
 412          case VP_OPCODE_RCP:
 413             {
 414                GLfloat t[4];
 415                fetch_vector1( &inst->SrcReg[0], state, t );
 416                if (t[0] != 1.0F)
 417                   t[0] = 1.0F / t[0];  /* div by zero is infinity! */
 418                t[1] = t[2] = t[3] = t[0];
 419                store_vector4( &inst->DstReg, state, t );
 420             }
 421             break;
 422          case VP_OPCODE_RSQ:
 423             {
 424                GLfloat t[4];
 425                fetch_vector1( &inst->SrcReg[0], state, t );
 426                t[0] = INV_SQRTF(FABSF(t[0]));
 427                t[1] = t[2] = t[3] = t[0];
 428                store_vector4( &inst->DstReg, state, t );
 429             }
 430             break;
 431          case VP_OPCODE_EXP:
 432             {
 433                GLfloat t[4], q[4], floor_t0;
 434                fetch_vector1( &inst->SrcReg[0], state, t );
 435                floor_t0 = (float) floor(t[0]);
 436                if (floor_t0 > FLT_MAX_EXP) {
 437                   SET_POS_INFINITY(q[0]);
 438                   SET_POS_INFINITY(q[2]);
 439                }
 440                else if (floor_t0 < FLT_MIN_EXP) {
 441                   q[0] = 0.0F;
 442                   q[2] = 0.0F;
 443                }
 444                else {
 445 #ifdef USE_IEEE
 446                   GLint ii = (GLint) floor_t0;
 447                   ii = (ii < 23) + 0x3f800000;
 448                   SET_FLOAT_BITS(q[0], ii);
 449                   q[0] = *((GLfloat *) &ii);
 450 #else
 451                   q[0] = (GLfloat) pow(2.0, floor_t0);
 452 #endif
 453                   q[2] = (GLfloat) (q[0] * LOG2(q[1]));
 454                }
 455                q[1] = t[0] - floor_t0;
 456                q[3] = 1.0F;
 457                store_vector4( &inst->DstReg, state, q );
 458             }
 459             break;
 460          case VP_OPCODE_LOG:
 461             {
 462                GLfloat t[4], q[4], abs_t0;
 463                fetch_vector1( &inst->SrcReg[0], state, t );
 464                abs_t0 = (GLfloat) fabs(t[0]);
 465                if (abs_t0 != 0.0F) {
 466                   /* Since we really can't handle infinite values on VMS
 467                    * like other OSes we'll use __MAXFLOAT to represent
 468                    * infinity.  This may need some tweaking.
 469                    */
 470 #ifdef VMS
 471                   if (abs_t0 == __MAXFLOAT)
 472 #else
 473                   if (IS_INF_OR_NAN(abs_t0))
 474 #endif
 475                   {
 476                      SET_POS_INFINITY(q[0]);
 477                      q[1] = 1.0F;
 478                      SET_POS_INFINITY(q[2]);
 479                   }
 480                   else {
 481                      int exponent;
 482                      double mantissa = frexp(t[0], &exponent);
 483                      q[0] = (GLfloat) (exponent - 1);
 484                      q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 485                      q[2] = (GLfloat) (q[0] + LOG2(q[1]));
 486                   }
 487                   }
 488                else {
 489                   SET_NEG_INFINITY(q[0]);
 490                   q[1] = 1.0F;
 491                   SET_NEG_INFINITY(q[2]);
 492                }
 493                q[3] = 1.0;
 494                store_vector4( &inst->DstReg, state, q );
 495             }
 496             break;
 497          case VP_OPCODE_MUL:
 498             {
 499                GLfloat t[4], u[4], prod[4];
 500                fetch_vector4( &inst->SrcReg[0], state, t );
 501                fetch_vector4( &inst->SrcReg[1], state, u );
 502                prod[0] = t[0] * u[0];
 503                prod[1] = t[1] * u[1];
 504                prod[2] = t[2] * u[2];
 505                prod[3] = t[3] * u[3];
 506                store_vector4( &inst->DstReg, state, prod );
 507             }
 508             break;
 509          case VP_OPCODE_ADD:
 510             {
 511                GLfloat t[4], u[4], sum[4];
 512                fetch_vector4( &inst->SrcReg[0], state, t );
 513                fetch_vector4( &inst->SrcReg[1], state, u );
 514                sum[0] = t[0] + u[0];
 515                sum[1] = t[1] + u[1];
 516                sum[2] = t[2] + u[2];
 517                sum[3] = t[3] + u[3];
 518                store_vector4( &inst->DstReg, state, sum );
 519             }
 520             break;
 521          case VP_OPCODE_DP3:
 522             {
 523                GLfloat t[4], u[4], dot[4];
 524                fetch_vector4( &inst->SrcReg[0], state, t );
 525                fetch_vector4( &inst->SrcReg[1], state, u );
 526                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
 527                dot[1] = dot[2] = dot[3] = dot[0];
 528                store_vector4( &inst->DstReg, state, dot );
 529             }
 530             break;
 531          case VP_OPCODE_DP4:
 532             {
 533                GLfloat t[4], u[4], dot[4];
 534                fetch_vector4( &inst->SrcReg[0], state, t );
 535                fetch_vector4( &inst->SrcReg[1], state, u );
 536                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + t[3] * u[3];
 537                dot[1] = dot[2] = dot[3] = dot[0];
 538                store_vector4( &inst->DstReg, state, dot );
 539             }
 540             break;
 541          case VP_OPCODE_DST:
 542             {
 543                GLfloat t[4], u[4], dst[4];
 544                fetch_vector4( &inst->SrcReg[0], state, t );
 545                fetch_vector4( &inst->SrcReg[1], state, u );
 546                dst[0] = 1.0F;
 547                dst[1] = t[1] * u[1];
 548                dst[2] = t[2];
 549                dst[3] = u[3];
 550                store_vector4( &inst->DstReg, state, dst );
 551             }
 552             break;
 553          case VP_OPCODE_MIN:
 554             {
 555                GLfloat t[4], u[4], min[4];
 556                fetch_vector4( &inst->SrcReg[0], state, t );
 557                fetch_vector4( &inst->SrcReg[1], state, u );
 558                min[0] = (t[0] < u[0]) ? t[0] : u[0];
 559                min[1] = (t[1] < u[1]) ? t[1] : u[1];
 560                min[2] = (t[2] < u[2]) ? t[2] : u[2];
 561                min[3] = (t[3] < u[3]) ? t[3] : u[3];
 562                store_vector4( &inst->DstReg, state, min );
 563             }
 564             break;
 565          case VP_OPCODE_MAX:
 566             {
 567                GLfloat t[4], u[4], max[4];
 568                fetch_vector4( &inst->SrcReg[0], state, t );
 569                fetch_vector4( &inst->SrcReg[1], state, u );
 570                max[0] = (t[0] > u[0]) ? t[0] : u[0];
 571                max[1] = (t[1] > u[1]) ? t[1] : u[1];
 572                max[2] = (t[2] > u[2]) ? t[2] : u[2];
 573                max[3] = (t[3] > u[3]) ? t[3] : u[3];
 574                store_vector4( &inst->DstReg, state, max );
 575             }
 576             break;
 577          case VP_OPCODE_SLT:
 578             {
 579                GLfloat t[4], u[4], slt[4];
 580                fetch_vector4( &inst->SrcReg[0], state, t );
 581                fetch_vector4( &inst->SrcReg[1], state, u );
 582                slt[0] = (t[0] < u[0]) ? 1.0F : 0.0F;
 583                slt[1] = (t[1] < u[1]) ? 1.0F : 0.0F;
 584                slt[2] = (t[2] < u[2]) ? 1.0F : 0.0F;
 585                slt[3] = (t[3] < u[3]) ? 1.0F : 0.0F;
 586                store_vector4( &inst->DstReg, state, slt );
 587             }
 588             break;
 589          case VP_OPCODE_SGE:
 590             {
 591                GLfloat t[4], u[4], sge[4];
 592                fetch_vector4( &inst->SrcReg[0], state, t );
 593                fetch_vector4( &inst->SrcReg[1], state, u );
 594                sge[0] = (t[0] >= u[0]) ? 1.0F : 0.0F;
 595                sge[1] = (t[1] >= u[1]) ? 1.0F : 0.0F;
 596                sge[2] = (t[2] >= u[2]) ? 1.0F : 0.0F;
 597                sge[3] = (t[3] >= u[3]) ? 1.0F : 0.0F;
 598                store_vector4( &inst->DstReg, state, sge );
 599             }
 600             break;
 601          case VP_OPCODE_MAD:
 602             {
 603                GLfloat t[4], u[4], v[4], sum[4];
 604                fetch_vector4( &inst->SrcReg[0], state, t );
 605                fetch_vector4( &inst->SrcReg[1], state, u );
 606                fetch_vector4( &inst->SrcReg[2], state, v );
 607                sum[0] = t[0] * u[0] + v[0];
 608                sum[1] = t[1] * u[1] + v[1];
 609                sum[2] = t[2] * u[2] + v[2];
 610                sum[3] = t[3] * u[3] + v[3];
 611                store_vector4( &inst->DstReg, state, sum );
 612             }
 613             break;
 614          case VP_OPCODE_ARL:
 615             {
 616                GLfloat t[4];
 617                fetch_vector4( &inst->SrcReg[0], state, t );
 618                state->AddressReg[0] = (GLint) floor(t[0]);
 619             }
 620             break;
 621          case VP_OPCODE_DPH:
 622             {
 623                GLfloat t[4], u[4], dot[4];
 624                fetch_vector4( &inst->SrcReg[0], state, t );
 625                fetch_vector4( &inst->SrcReg[1], state, u );
 626                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + u[3];
 627                dot[1] = dot[2] = dot[3] = dot[0];
 628                store_vector4( &inst->DstReg, state, dot );
 629             }
 630             break;
 631          case VP_OPCODE_RCC:
 632             {
 633                GLfloat t[4], u;
 634                fetch_vector1( &inst->SrcReg[0], state, t );
 635                if (t[0] == 1.0F)
 636                   u = 1.0F;
 637                else
 638                   u = 1.0F / t[0];
 639                if (u > 0.0F) {
 640                   if (u > 1.884467e+019F) {
 641                      u = 1.884467e+019F;  /* IEEE 32-bit binary value 0x5F800000 */
 642                   }
 643                   else if (u < 5.42101e-020F) {
 644                      u = 5.42101e-020F;   /* IEEE 32-bit binary value 0x1F800000 */
 645                   }
 646                }
 647                else {
 648                   if (u < -1.884467e+019F) {
 649                      u = -1.884467e+019F; /* IEEE 32-bit binary value 0xDF800000 */
 650                   }
 651                   else if (u > -5.42101e-020F) {
 652                      u = -5.42101e-020F;  /* IEEE 32-bit binary value 0x9F800000 */
 653                   }
 654                }
 655                t[0] = t[1] = t[2] = t[3] = u;
 656                store_vector4( &inst->DstReg, state, t );
 657             }
 658             break;
 659          case VP_OPCODE_SUB: /* GL_NV_vertex_program1_1 */
 660             {
 661                GLfloat t[4], u[4], sum[4];
 662                fetch_vector4( &inst->SrcReg[0], state, t );
 663                fetch_vector4( &inst->SrcReg[1], state, u );
 664                sum[0] = t[0] - u[0];
 665                sum[1] = t[1] - u[1];
 666                sum[2] = t[2] - u[2];
 667                sum[3] = t[3] - u[3];
 668                store_vector4( &inst->DstReg, state, sum );
 669             }
 670             break;
 671          case VP_OPCODE_ABS: /* GL_NV_vertex_program1_1 */
 672             {
 673                GLfloat t[4];
 674                fetch_vector4( &inst->SrcReg[0], state, t );
 675                if (t[0] < 0.0)  t[0] = -t[0];
 676                if (t[1] < 0.0)  t[1] = -t[1];
 677                if (t[2] < 0.0)  t[2] = -t[2];
 678                if (t[3] < 0.0)  t[3] = -t[3];
 679                store_vector4( &inst->DstReg, state, t );
 680             }
 681             break;
 682          case VP_OPCODE_FLR: /* GL_ARB_vertex_program */
 683             {
 684                GLfloat t[4];
 685                fetch_vector4( &inst->SrcReg[0], state, t );
 686                t[0] = FLOORF(t[0]);
 687                t[1] = FLOORF(t[1]);
 688                t[2] = FLOORF(t[2]);
 689                t[3] = FLOORF(t[3]);
 690                store_vector4( &inst->DstReg, state, t );
 691             }
 692             break;
 693          case VP_OPCODE_FRC: /* GL_ARB_vertex_program */
 694             {
 695                GLfloat t[4];
 696                fetch_vector4( &inst->SrcReg[0], state, t );
 697                t[0] = t[0] - FLOORF(t[0]);
 698                t[1] = t[1] - FLOORF(t[1]);
 699                t[2] = t[2] - FLOORF(t[2]);
 700                t[3] = t[3] - FLOORF(t[3]);
 701                store_vector4( &inst->DstReg, state, t );
 702             }
 703             break;
 704          case VP_OPCODE_EX2: /* GL_ARB_vertex_program */
 705             {
 706                GLfloat t[4];
 707                fetch_vector1( &inst->SrcReg[0], state, t );
 708                t[0] = t[1] = t[2] = t[3] = (GLfloat)_mesa_pow(2.0, t[0]);
 709                store_vector4( &inst->DstReg, state, t );
 710             }
 711             break;
 712          case VP_OPCODE_LG2: /* GL_ARB_vertex_program */
 713             {
 714                GLfloat t[4];
 715                fetch_vector1( &inst->SrcReg[0], state, t );
 716                t[0] = t[1] = t[2] = t[3] = LOG2(t[0]);
 717                store_vector4( &inst->DstReg, state, t );
 718             }
 719             break;
 720          case VP_OPCODE_POW: /* GL_ARB_vertex_program */
 721             {
 722                GLfloat t[4], u[4];
 723                fetch_vector1( &inst->SrcReg[0], state, t );
 724                fetch_vector1( &inst->SrcReg[1], state, u );
 725                t[0] = t[1] = t[2] = t[3] = (GLfloat)_mesa_pow(t[0], u[0]);
 726                store_vector4( &inst->DstReg, state, t );
 727             }
 728             break;
 729          case VP_OPCODE_XPD: /* GL_ARB_vertex_program */
 730             {
 731                GLfloat t[4], u[4], cross[4];
 732                fetch_vector4( &inst->SrcReg[0], state, t );
 733                fetch_vector4( &inst->SrcReg[1], state, u );
 734                cross[0] = t[1] * u[2] - t[2] * u[1];
 735                cross[1] = t[2] * u[0] - t[0] * u[2];
 736                cross[2] = t[0] * u[1] - t[1] * u[0];
 737                store_vector4( &inst->DstReg, state, cross );
 738             }
 739             break;
 740          case VP_OPCODE_SWZ: /* GL_ARB_vertex_program */
 741             {
 742                const struct vp_src_register *source = &inst->SrcReg[0];
 743                const GLfloat *src = get_register_pointer(source, state);
 744                GLfloat result[4];
 745                GLuint i;
 746
 747                /* do extended swizzling here */
 748                for (i = 0; i < 3; i++) {
 749                   if (source->Swizzle[i] == SWIZZLE_ZERO)
 750                      result[i] = 0.0;
 751                   else if (source->Swizzle[i] == SWIZZLE_ONE)
 752                      result[i] = -1.0;
 753                   else
 754                      result[i] = -src[source->Swizzle[i]];
 755                   if (source->Negate)
 756                      result[i] = -result[i];
 757                }
 758                store_vector4( &inst->DstReg, state, result );
 759             }
 760             break;
 761
 762          case VP_OPCODE_END:
 763             ctx->_CurrentProgram = 0;
 764             return;
 765          default:
 766             /* bad instruction opcode */
 767             _mesa_problem(ctx, "Bad VP Opcode in _mesa_exec_vertex_program");
 768             ctx->_CurrentProgram = 0;
 769             return;
 770       } /* switch */
 771    } /* for */
 772
 773    ctx->_CurrentProgram = 0;
 774 }
 775
 776
 777
 778 /**
 779 Thoughts on vertex program optimization:
 780
 781 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
 782 assembly code.  That will probably be a lot of work.
 783
 784 Another approach might be to replace the vp_instruction->Opcode field with
 785 a pointer to a specialized C function which executes the instruction.
 786 In particular we can write functions which skip swizzling, negating,
 787 masking, relative addressing, etc. when they're not needed.
 788
 789 For example:
 790
 791 void simple_add( struct vp_instruction *inst )
 792 {
 793    GLfloat *sum = machine->Registers[inst->DstReg.Register];
 794    GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
 795    GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
 796    sum[0] = a[0] + b[0];
 797    sum[1] = a[1] + b[1];
 798    sum[2] = a[2] + b[2];
 799    sum[3] = a[3] + b[3];
 800 }
 801
 802 */
 803
 804 /*
 805
 806 KW:
 807
 808 A first step would be to 'vectorize' the programs in the same way as
 809 the normal transformation code in the tnl module.  Thus each opcode
 810 takes zero or more input vectors (registers) and produces one or more
 811 output vectors.
 812
These operations would initially be coded in C, with machine-specific
 814 assembly following, as is currently the case for matrix
 815 transformations in the math/ directory.  The preprocessing scheme for
 816 selecting simpler operations Brian describes above would also work
 817 here.
 818
 819 This should give reasonable performance without excessive effort.
 820
 821 */