src/mesa/main/nvvertexec.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  5.1
   4  *
   5  * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file nvvertexec.c
  27  * Code to execute vertex programs.
  28  * \author Brian Paul
  29  */
  30
  31 #include "glheader.h"
  32 #include "context.h"
  33 #include "imports.h"
  34 #include "macros.h"
  35 #include "mtypes.h"
  36 #include "nvvertexec.h"
  37 #include "nvvertprog.h"
  38 #include "math/m_matrix.h"
  39
  40
  41 static const GLfloat zeroVec[4] = { 0, 0, 0, 0 };
  42
  43
  44 /**
  45  * Load/initialize the vertex program registers.
  46  * This needs to be done per vertex.
  47  */
  48 void
  49 _mesa_init_vp_registers(GLcontext *ctx)
  50 {
  51    struct vp_machine *machine = &(ctx->VertexProgram.Machine);
  52    GLuint i;
  53
  54    /* Input registers get initialized from the current vertex attribs */
  55    MEMCPY(machine->Registers[VP_INPUT_REG_START],
  56           ctx->Current.Attrib,
  57           16 * 4 * sizeof(GLfloat));
  58
  59    /* Output and temp regs are initialized to [0,0,0,1] */
  60    for (i = VP_OUTPUT_REG_START; i <= VP_OUTPUT_REG_END; i++) {
  61       machine->Registers[i][0] = 0.0F;
  62       machine->Registers[i][1] = 0.0F;
  63       machine->Registers[i][2] = 0.0F;
  64       machine->Registers[i][3] = 1.0F;
  65    }
  66    for (i = VP_TEMP_REG_START; i <= VP_TEMP_REG_END; i++) {
  67       machine->Registers[i][0] = 0.0F;
  68       machine->Registers[i][1] = 0.0F;
  69       machine->Registers[i][2] = 0.0F;
  70       machine->Registers[i][3] = 1.0F;
  71    }
  72
  73    /* The program regs aren't touched */
  74 }
  75
  76
  77
  78 /**
  79  * Copy the 16 elements of a matrix into four consecutive program
  80  * registers starting at 'pos'.
  81  */
  82 static void
  83 load_matrix(GLfloat registers[][4], GLuint pos, const GLfloat mat[16])
  84 {
  85    GLuint i;
  86    pos += VP_PROG_REG_START;
  87    for (i = 0; i < 4; i++) {
  88       registers[pos + i][0] = mat[0 + i];
  89       registers[pos + i][1] = mat[4 + i];
  90       registers[pos + i][2] = mat[8 + i];
  91       registers[pos + i][3] = mat[12 + i];
  92    }
  93 }
  94
  95
  96 /**
  97  * As above, but transpose the matrix.
  98  */
  99 static void
 100 load_transpose_matrix(GLfloat registers[][4], GLuint pos,
 101                       const GLfloat mat[16])
 102 {
 103    pos += VP_PROG_REG_START;
 104    MEMCPY(registers[pos], mat, 16 * sizeof(GLfloat));
 105 }
 106
 107
 108 /**
 109  * Load all currently tracked matrices into the program registers.
 110  * This needs to be done per glBegin/glEnd.
 111  */
 112 void
 113 _mesa_init_tracked_matrices(GLcontext *ctx)
 114 {
 115    GLuint i;
 116
 117    for (i = 0; i < VP_NUM_PROG_REGS / 4; i++) {
 118       /* point 'mat' at source matrix */
 119       GLmatrix *mat;
 120       if (ctx->VertexProgram.TrackMatrix[i] == GL_MODELVIEW) {
 121          mat = ctx->ModelviewMatrixStack.Top;
 122       }
 123       else if (ctx->VertexProgram.TrackMatrix[i] == GL_PROJECTION) {
 124          mat = ctx->ProjectionMatrixStack.Top;
 125       }
 126       else if (ctx->VertexProgram.TrackMatrix[i] == GL_TEXTURE) {
 127          mat = ctx->TextureMatrixStack[ctx->Texture.CurrentUnit].Top;
 128       }
 129       else if (ctx->VertexProgram.TrackMatrix[i] == GL_COLOR) {
 130          mat = ctx->ColorMatrixStack.Top;
 131       }
 132       else if (ctx->VertexProgram.TrackMatrix[i]==GL_MODELVIEW_PROJECTION_NV) {
 133          /* XXX verify the combined matrix is up to date */
 134          mat = &ctx->_ModelProjectMatrix;
 135       }
 136       else if (ctx->VertexProgram.TrackMatrix[i] >= GL_MATRIX0_NV &&
 137                ctx->VertexProgram.TrackMatrix[i] <= GL_MATRIX7_NV) {
 138          GLuint n = ctx->VertexProgram.TrackMatrix[i] - GL_MATRIX0_NV;
 139          ASSERT(n < MAX_PROGRAM_MATRICES);
 140          mat = ctx->ProgramMatrixStack[n].Top;
 141       }
 142       else {
 143          /* no matrix is tracked, but we leave the register values as-is */
 144          assert(ctx->VertexProgram.TrackMatrix[i] == GL_NONE);
 145          continue;
 146       }
 147
 148       /* load the matrix */
 149       if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
 150          load_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->m);
 151       }
 152       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_INVERSE_NV) {
 153          _math_matrix_analyse(mat); /* update the inverse */
 154          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 155          load_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->inv);
 156       }
 157       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_TRANSPOSE_NV) {
 158          load_transpose_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->m);
 159       }
 160       else {
 161          assert(ctx->VertexProgram.TrackMatrixTransform[i]
 162                 == GL_INVERSE_TRANSPOSE_NV);
 163          _math_matrix_analyse(mat); /* update the inverse */
 164          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 165          load_transpose_matrix(ctx->VertexProgram.Machine.Registers,
 166                                i*4, mat->inv);
 167       }
 168    }
 169 }
 170
 171
 172
 173 /**
 174  * For debugging.  Dump the current vertex program machine registers.
 175  */
 176 void
 177 _mesa_dump_vp_machine( const struct vp_machine *machine )
 178 {
 179    int i;
 180    _mesa_printf("VertexIn:\n");
 181    for (i = 0; i < VP_NUM_INPUT_REGS; i++) {
 182       _mesa_printf("%d: %f %f %f %f   ", i,
 183              machine->Registers[i + VP_INPUT_REG_START][0],
 184              machine->Registers[i + VP_INPUT_REG_START][1],
 185              machine->Registers[i + VP_INPUT_REG_START][2],
 186              machine->Registers[i + VP_INPUT_REG_START][3]);
 187    }
 188    _mesa_printf("\n");
 189
 190    _mesa_printf("VertexOut:\n");
 191    for (i = 0; i < VP_NUM_OUTPUT_REGS; i++) {
 192       _mesa_printf("%d: %f %f %f %f   ", i,
 193              machine->Registers[i + VP_OUTPUT_REG_START][0],
 194              machine->Registers[i + VP_OUTPUT_REG_START][1],
 195              machine->Registers[i + VP_OUTPUT_REG_START][2],
 196              machine->Registers[i + VP_OUTPUT_REG_START][3]);
 197    }
 198    _mesa_printf("\n");
 199
 200    _mesa_printf("Registers:\n");
 201    for (i = 0; i < VP_NUM_TEMP_REGS; i++) {
 202       _mesa_printf("%d: %f %f %f %f   ", i,
 203              machine->Registers[i + VP_TEMP_REG_START][0],
 204              machine->Registers[i + VP_TEMP_REG_START][1],
 205              machine->Registers[i + VP_TEMP_REG_START][2],
 206              machine->Registers[i + VP_TEMP_REG_START][3]);
 207    }
 208    _mesa_printf("\n");
 209
 210    _mesa_printf("Parameters:\n");
 211    for (i = 0; i < VP_NUM_PROG_REGS; i++) {
 212       _mesa_printf("%d: %f %f %f %f   ", i,
 213              machine->Registers[i + VP_PROG_REG_START][0],
 214              machine->Registers[i + VP_PROG_REG_START][1],
 215              machine->Registers[i + VP_PROG_REG_START][2],
 216              machine->Registers[i + VP_PROG_REG_START][3]);
 217    }
 218    _mesa_printf("\n");
 219 }
 220
 221
 222 /**
 223  * Fetch a 4-element float vector from the given source register.
 224  * Apply swizzling and negating as needed.
 225  */
 226 static void
 227 fetch_vector4( const struct vp_src_register *source,
 228                const struct vp_machine *machine,
 229                GLfloat result[4] )
 230 {
 231    const GLfloat *src;
 232
 233    if (source->RelAddr) {
 234       const GLint reg = source->Register + machine->AddressReg;
 235       if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
 236          src = zeroVec;
 237       else
 238          src = machine->Registers[VP_PROG_REG_START + reg];
 239    }
 240    else {
 241       src = machine->Registers[source->Register];
 242    }
 243
 244    if (source->Negate) {
 245       result[0] = -src[source->Swizzle[0]];
 246       result[1] = -src[source->Swizzle[1]];
 247       result[2] = -src[source->Swizzle[2]];
 248       result[3] = -src[source->Swizzle[3]];
 249    }
 250    else {
 251       result[0] = src[source->Swizzle[0]];
 252       result[1] = src[source->Swizzle[1]];
 253       result[2] = src[source->Swizzle[2]];
 254       result[3] = src[source->Swizzle[3]];
 255    }
 256 }
 257
 258
 259 /**
 260  * As above, but only return result[0] element.
 261  */
 262 static void
 263 fetch_vector1( const struct vp_src_register *source,
 264                const struct vp_machine *machine,
 265                GLfloat result[4] )
 266 {
 267    const GLfloat *src;
 268
 269    if (source->RelAddr) {
 270       const GLint reg = source->Register + machine->AddressReg;
 271       if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
 272          src = zeroVec;
 273       else
 274          src = machine->Registers[VP_PROG_REG_START + reg];
 275    }
 276    else {
 277       src = machine->Registers[source->Register];
 278    }
 279
 280    if (source->Negate) {
 281       result[0] = -src[source->Swizzle[0]];
 282    }
 283    else {
 284       result[0] = src[source->Swizzle[0]];
 285    }
 286 }
 287
 288
 289 /**
 290  * Store 4 floats into a register.
 291  */
 292 static void
 293 store_vector4( const struct vp_dst_register *dest, struct vp_machine *machine,
 294                const GLfloat value[4] )
 295 {
 296    GLfloat *dst = machine->Registers[dest->Register];
 297
 298    if (dest->WriteMask[0])
 299       dst[0] = value[0];
 300    if (dest->WriteMask[1])
 301       dst[1] = value[1];
 302    if (dest->WriteMask[2])
 303       dst[2] = value[2];
 304    if (dest->WriteMask[3])
 305       dst[3] = value[3];
 306 }
 307
 308
 309 /**
 310  * Set x to positive or negative infinity.
 311  */
 312 #ifdef USE_IEEE
 313 #define SET_POS_INFINITY(x)  ( *((GLuint *) &x) = 0x7F800000 )
 314 #define SET_NEG_INFINITY(x)  ( *((GLuint *) &x) = 0xFF800000 )
 315 #elif defined(VMS)
 316 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
 317 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
 318 #else
 319 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
 320 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
 321 #endif
 322
 323 #define SET_FLOAT_BITS(x, bits) ((fi_type *) &(x))->i = bits
 324
 325
 326 /**
 327  * Execute the given vertex program
 328  */
 329 void
 330 _mesa_exec_vertex_program(GLcontext *ctx, const struct vertex_program *program)
 331 {
 332    struct vp_machine *machine = &ctx->VertexProgram.Machine;
 333    const struct vp_instruction *inst;
 334
 335    ctx->_CurrentProgram = GL_VERTEX_PROGRAM_ARB; /* or NV, doesn't matter */
 336
 337    for (inst = program->Instructions; inst->Opcode != VP_OPCODE_END; inst++) {
 338
 339       if (ctx->VertexProgram.CallbackEnabled &&
 340           ctx->VertexProgram.Callback) {
 341          ctx->VertexProgram.CurrentPosition = inst->StringPos;
 342          ctx->VertexProgram.Callback(program->Base.Target,
 343                                      ctx->VertexProgram.CallbackData);
 344       }
 345
 346       switch (inst->Opcode) {
 347          case VP_OPCODE_MOV:
 348             {
 349                GLfloat t[4];
 350                fetch_vector4( &inst->SrcReg[0], machine, t );
 351                store_vector4( &inst->DstReg, machine, t );
 352             }
 353             break;
 354          case VP_OPCODE_LIT:
 355             {
 356                const GLfloat epsilon = 1.0e-5F; /* XXX fix? */
 357                GLfloat t[4], lit[4];
 358                fetch_vector4( &inst->SrcReg[0], machine, t );
 359                if (t[3] < -(128.0F - epsilon))
 360                    t[3] = - (128.0F - epsilon);
 361                else if (t[3] > 128.0F - epsilon)
 362                   t[3] = 128.0F - epsilon;
 363                if (t[0] < 0.0)
 364                   t[0] = 0.0;
 365                if (t[1] < 0.0)
 366                   t[1] = 0.0;
 367                lit[0] = 1.0;
 368                lit[1] = t[0];
 369                lit[2] = (t[0] > 0.0) ? (GLfloat) exp(t[3] * log(t[1])) : 0.0F;
 370                lit[3] = 1.0;
 371                store_vector4( &inst->DstReg, machine, lit );
 372             }
 373             break;
 374          case VP_OPCODE_RCP:
 375             {
 376                GLfloat t[4];
 377                fetch_vector1( &inst->SrcReg[0], machine, t );
 378                if (t[0] != 1.0F)
 379                   t[0] = 1.0F / t[0];  /* div by zero is infinity! */
 380                t[1] = t[2] = t[3] = t[0];
 381                store_vector4( &inst->DstReg, machine, t );
 382             }
 383             break;
 384          case VP_OPCODE_RSQ:
 385             {
 386                GLfloat t[4];
 387                fetch_vector1( &inst->SrcReg[0], machine, t );
 388                t[0] = INV_SQRTF(FABSF(t[0]));
 389                t[1] = t[2] = t[3] = t[0];
 390                store_vector4( &inst->DstReg, machine, t );
 391             }
 392             break;
 393          case VP_OPCODE_EXP:
 394             {
 395                GLfloat t[4], q[4], floor_t0;
 396                fetch_vector1( &inst->SrcReg[0], machine, t );
 397                floor_t0 = (float) floor(t[0]);
 398                if (floor_t0 > FLT_MAX_EXP) {
 399                   SET_POS_INFINITY(q[0]);
 400                   SET_POS_INFINITY(q[2]);
 401                }
 402                else if (floor_t0 < FLT_MIN_EXP) {
 403                   q[0] = 0.0F;
 404                   q[2] = 0.0F;
 405                }
 406                else {
 407 #ifdef USE_IEEE
 408                   GLint ii = (GLint) floor_t0;
 409                   ii = (ii < 23) + 0x3f800000;
 410                   SET_FLOAT_BITS(q[0], ii);
 411                   q[0] = *((GLfloat *) &ii);
 412 #else
 413                   q[0] = (GLfloat) pow(2.0, floor_t0);
 414 #endif
 415                   q[2] = (GLfloat) (q[0] * LOG2(q[1]));
 416                }
 417                q[1] = t[0] - floor_t0;
 418                q[3] = 1.0F;
 419                store_vector4( &inst->DstReg, machine, q );
 420             }
 421             break;
 422          case VP_OPCODE_LOG:
 423             {
 424                GLfloat t[4], q[4], abs_t0;
 425                fetch_vector1( &inst->SrcReg[0], machine, t );
 426                abs_t0 = (GLfloat) fabs(t[0]);
 427                if (abs_t0 != 0.0F) {
 428                   /* Since we really can't handle infinite values on VMS
 429                    * like other OSes we'll use __MAXFLOAT to represent
 430                    * infinity.  This may need some tweaking.
 431                    */
 432 #ifdef VMS
 433                   if (abs_t0 == __MAXFLOAT)
 434 #else
 435                   if (IS_INF_OR_NAN(abs_t0))
 436 #endif
 437                   {
 438                      SET_POS_INFINITY(q[0]);
 439                      q[1] = 1.0F;
 440                      SET_POS_INFINITY(q[2]);
 441                   }
 442                   else {
 443                      int exponent;
 444                      double mantissa = frexp(t[0], &exponent);
 445                      q[0] = (GLfloat) (exponent - 1);
 446                      q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 447                      q[2] = (GLfloat) (q[0] + LOG2(q[1]));
 448                   }
 449                   }
 450                else {
 451                   SET_NEG_INFINITY(q[0]);
 452                   q[1] = 1.0F;
 453                   SET_NEG_INFINITY(q[2]);
 454                }
 455                q[3] = 1.0;
 456                store_vector4( &inst->DstReg, machine, q );
 457             }
 458             break;
 459          case VP_OPCODE_MUL:
 460             {
 461                GLfloat t[4], u[4], prod[4];
 462                fetch_vector4( &inst->SrcReg[0], machine, t );
 463                fetch_vector4( &inst->SrcReg[1], machine, u );
 464                prod[0] = t[0] * u[0];
 465                prod[1] = t[1] * u[1];
 466                prod[2] = t[2] * u[2];
 467                prod[3] = t[3] * u[3];
 468                store_vector4( &inst->DstReg, machine, prod );
 469             }
 470             break;
 471          case VP_OPCODE_ADD:
 472             {
 473                GLfloat t[4], u[4], sum[4];
 474                fetch_vector4( &inst->SrcReg[0], machine, t );
 475                fetch_vector4( &inst->SrcReg[1], machine, u );
 476                sum[0] = t[0] + u[0];
 477                sum[1] = t[1] + u[1];
 478                sum[2] = t[2] + u[2];
 479                sum[3] = t[3] + u[3];
 480                store_vector4( &inst->DstReg, machine, sum );
 481             }
 482             break;
 483          case VP_OPCODE_DP3:
 484             {
 485                GLfloat t[4], u[4], dot[4];
 486                fetch_vector4( &inst->SrcReg[0], machine, t );
 487                fetch_vector4( &inst->SrcReg[1], machine, u );
 488                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
 489                dot[1] = dot[2] = dot[3] = dot[0];
 490                store_vector4( &inst->DstReg, machine, dot );
 491             }
 492             break;
 493          case VP_OPCODE_DP4:
 494             {
 495                GLfloat t[4], u[4], dot[4];
 496                fetch_vector4( &inst->SrcReg[0], machine, t );
 497                fetch_vector4( &inst->SrcReg[1], machine, u );
 498                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + t[3] * u[3];
 499                dot[1] = dot[2] = dot[3] = dot[0];
 500                store_vector4( &inst->DstReg, machine, dot );
 501             }
 502             break;
 503          case VP_OPCODE_DST:
 504             {
 505                GLfloat t[4], u[4], dst[4];
 506                fetch_vector4( &inst->SrcReg[0], machine, t );
 507                fetch_vector4( &inst->SrcReg[1], machine, u );
 508                dst[0] = 1.0F;
 509                dst[1] = t[1] * u[1];
 510                dst[2] = t[2];
 511                dst[3] = u[3];
 512                store_vector4( &inst->DstReg, machine, dst );
 513             }
 514             break;
 515          case VP_OPCODE_MIN:
 516             {
 517                GLfloat t[4], u[4], min[4];
 518                fetch_vector4( &inst->SrcReg[0], machine, t );
 519                fetch_vector4( &inst->SrcReg[1], machine, u );
 520                min[0] = (t[0] < u[0]) ? t[0] : u[0];
 521                min[1] = (t[1] < u[1]) ? t[1] : u[1];
 522                min[2] = (t[2] < u[2]) ? t[2] : u[2];
 523                min[3] = (t[3] < u[3]) ? t[3] : u[3];
 524                store_vector4( &inst->DstReg, machine, min );
 525             }
 526             break;
 527          case VP_OPCODE_MAX:
 528             {
 529                GLfloat t[4], u[4], max[4];
 530                fetch_vector4( &inst->SrcReg[0], machine, t );
 531                fetch_vector4( &inst->SrcReg[1], machine, u );
 532                max[0] = (t[0] > u[0]) ? t[0] : u[0];
 533                max[1] = (t[1] > u[1]) ? t[1] : u[1];
 534                max[2] = (t[2] > u[2]) ? t[2] : u[2];
 535                max[3] = (t[3] > u[3]) ? t[3] : u[3];
 536                store_vector4( &inst->DstReg, machine, max );
 537             }
 538             break;
 539          case VP_OPCODE_SLT:
 540             {
 541                GLfloat t[4], u[4], slt[4];
 542                fetch_vector4( &inst->SrcReg[0], machine, t );
 543                fetch_vector4( &inst->SrcReg[1], machine, u );
 544                slt[0] = (t[0] < u[0]) ? 1.0F : 0.0F;
 545                slt[1] = (t[1] < u[1]) ? 1.0F : 0.0F;
 546                slt[2] = (t[2] < u[2]) ? 1.0F : 0.0F;
 547                slt[3] = (t[3] < u[3]) ? 1.0F : 0.0F;
 548                store_vector4( &inst->DstReg, machine, slt );
 549             }
 550             break;
 551          case VP_OPCODE_SGE:
 552             {
 553                GLfloat t[4], u[4], sge[4];
 554                fetch_vector4( &inst->SrcReg[0], machine, t );
 555                fetch_vector4( &inst->SrcReg[1], machine, u );
 556                sge[0] = (t[0] >= u[0]) ? 1.0F : 0.0F;
 557                sge[1] = (t[1] >= u[1]) ? 1.0F : 0.0F;
 558                sge[2] = (t[2] >= u[2]) ? 1.0F : 0.0F;
 559                sge[3] = (t[3] >= u[3]) ? 1.0F : 0.0F;
 560                store_vector4( &inst->DstReg, machine, sge );
 561             }
 562             break;
 563          case VP_OPCODE_MAD:
 564             {
 565                GLfloat t[4], u[4], v[4], sum[4];
 566                fetch_vector4( &inst->SrcReg[0], machine, t );
 567                fetch_vector4( &inst->SrcReg[1], machine, u );
 568                fetch_vector4( &inst->SrcReg[2], machine, v );
 569                sum[0] = t[0] * u[0] + v[0];
 570                sum[1] = t[1] * u[1] + v[1];
 571                sum[2] = t[2] * u[2] + v[2];
 572                sum[3] = t[3] * u[3] + v[3];
 573                store_vector4( &inst->DstReg, machine, sum );
 574             }
 575             break;
 576          case VP_OPCODE_ARL:
 577             {
 578                GLfloat t[4];
 579                fetch_vector4( &inst->SrcReg[0], machine, t );
 580                machine->AddressReg = (GLint) floor(t[0]);
 581             }
 582             break;
 583          case VP_OPCODE_DPH:
 584             {
 585                GLfloat t[4], u[4], dot[4];
 586                fetch_vector4( &inst->SrcReg[0], machine, t );
 587                fetch_vector4( &inst->SrcReg[1], machine, u );
 588                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + u[3];
 589                dot[1] = dot[2] = dot[3] = dot[0];
 590                store_vector4( &inst->DstReg, machine, dot );
 591             }
 592             break;
 593          case VP_OPCODE_RCC:
 594             {
 595                GLfloat t[4], u;
 596                fetch_vector1( &inst->SrcReg[0], machine, t );
 597                if (t[0] == 1.0F)
 598                   u = 1.0F;
 599                else
 600                   u = 1.0F / t[0];
 601                if (u > 0.0F) {
 602                   if (u > 1.884467e+019F) {
 603                      u = 1.884467e+019F;  /* IEEE 32-bit binary value 0x5F800000 */
 604                   }
 605                   else if (u < 5.42101e-020F) {
 606                      u = 5.42101e-020F;   /* IEEE 32-bit binary value 0x1F800000 */
 607                   }
 608                }
 609                else {
 610                   if (u < -1.884467e+019F) {
 611                      u = -1.884467e+019F; /* IEEE 32-bit binary value 0xDF800000 */
 612                   }
 613                   else if (u > -5.42101e-020F) {
 614                      u = -5.42101e-020F;  /* IEEE 32-bit binary value 0x9F800000 */
 615                   }
 616                }
 617                t[0] = t[1] = t[2] = t[3] = u;
 618                store_vector4( &inst->DstReg, machine, t );
 619             }
 620             break;
 621          case VP_OPCODE_SUB: /* GL_NV_vertex_program1_1 */
 622             {
 623                GLfloat t[4], u[4], sum[4];
 624                fetch_vector4( &inst->SrcReg[0], machine, t );
 625                fetch_vector4( &inst->SrcReg[1], machine, u );
 626                sum[0] = t[0] - u[0];
 627                sum[1] = t[1] - u[1];
 628                sum[2] = t[2] - u[2];
 629                sum[3] = t[3] - u[3];
 630                store_vector4( &inst->DstReg, machine, sum );
 631             }
 632             break;
 633          case VP_OPCODE_ABS: /* GL_NV_vertex_program1_1 */
 634             {
 635                GLfloat t[4];
 636                fetch_vector4( &inst->SrcReg[0], machine, t );
 637                if (t[0] < 0.0)  t[0] = -t[0];
 638                if (t[1] < 0.0)  t[1] = -t[1];
 639                if (t[2] < 0.0)  t[2] = -t[2];
 640                if (t[3] < 0.0)  t[3] = -t[3];
 641                store_vector4( &inst->DstReg, machine, t );
 642             }
 643             break;
 644          case VP_OPCODE_FLR: /* GL_ARB_vertex_program */
 645             {
 646                GLfloat t[4];
 647                fetch_vector4( &inst->SrcReg[0], machine, t );
 648                t[0] = FLOORF(t[0]);
 649                t[1] = FLOORF(t[1]);
 650                t[2] = FLOORF(t[2]);
 651                t[3] = FLOORF(t[3]);
 652                store_vector4( &inst->DstReg, machine, t );
 653             }
 654             break;
 655          case VP_OPCODE_FRC: /* GL_ARB_vertex_program */
 656             {
 657                GLfloat t[4];
 658                fetch_vector4( &inst->SrcReg[0], machine, t );
 659                t[0] = t[0] - FLOORF(t[0]);
 660                t[1] = t[1] - FLOORF(t[1]);
 661                t[2] = t[2] - FLOORF(t[2]);
 662                t[3] = t[3] - FLOORF(t[3]);
 663                store_vector4( &inst->DstReg, machine, t );
 664             }
 665             break;
 666          case VP_OPCODE_EX2: /* GL_ARB_vertex_program */
 667             {
 668                GLfloat t[4];
 669                fetch_vector1( &inst->SrcReg[0], machine, t );
 670                t[0] = t[1] = t[2] = t[3] = _mesa_pow(2.0, t[0]);
 671                store_vector4( &inst->DstReg, machine, t );
 672             }
 673             break;
 674          case VP_OPCODE_LG2: /* GL_ARB_vertex_program */
 675             {
 676                GLfloat t[4];
 677                fetch_vector1( &inst->SrcReg[0], machine, t );
 678                t[0] = t[1] = t[2] = t[3] = LOG2(t[0]);
 679                store_vector4( &inst->DstReg, machine, t );
 680             }
 681             break;
 682          case VP_OPCODE_POW: /* GL_ARB_vertex_program */
 683             {
 684                GLfloat t[4], u[4];
 685                fetch_vector1( &inst->SrcReg[0], machine, t );
 686                fetch_vector1( &inst->SrcReg[1], machine, u );
 687                t[0] = t[1] = t[2] = t[3] = _mesa_pow(t[0], u[0]);
 688                store_vector4( &inst->DstReg, machine, t );
 689             }
 690             break;
 691          case VP_OPCODE_XPD: /* GL_ARB_vertex_program */
 692             {
 693                GLfloat t[4], u[4], cross[4];
 694                fetch_vector4( &inst->SrcReg[0], machine, t );
 695                fetch_vector4( &inst->SrcReg[1], machine, u );
 696                cross[0] = t[1] * u[2] - t[2] * u[1];
 697                cross[1] = t[2] * u[0] - t[0] * u[2];
 698                cross[2] = t[0] * u[1] - t[1] * u[0];
 699                store_vector4( &inst->DstReg, machine, cross );
 700             }
 701             break;
 702          case VP_OPCODE_SWZ: /* GL_ARB_vertex_program */
 703             {
 704                const struct vp_src_register *source = &inst->SrcReg[0];
 705                const GLfloat *src;
 706                GLfloat result[4];
 707                GLuint i;
 708
 709                /* Code similar to fetch_vector4() */
 710                if (source->RelAddr) {
 711                   const GLint reg = source->Register + machine->AddressReg;
 712                   if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
 713                      src = zeroVec;
 714                   else
 715                      src = machine->Registers[VP_PROG_REG_START + reg];
 716                }
 717                else {
 718                   src = machine->Registers[source->Register];
 719                }
 720
 721                /* extended swizzling here */
 722                for (i = 0; i < 3; i++) {
 723                   if (source->Swizzle[i] == SWIZZLE_ZERO)
 724                      result[i] = 0.0;
 725                   else if (source->Swizzle[i] == SWIZZLE_ONE)
 726                      result[i] = -1.0;
 727                   else
 728                      result[i] = -src[source->Swizzle[i]];
 729                   if (source->Negate)
 730                      result[i] = -result[i];
 731                }
 732                store_vector4( &inst->DstReg, machine, result );
 733             }
 734             break;
 735
 736          case VP_OPCODE_END:
 737             ctx->_CurrentProgram = 0;
 738             return;
 739          default:
 740             /* bad instruction opcode */
 741             _mesa_problem(ctx, "Bad VP Opcode in _mesa_exec_vertex_program");
 742             ctx->_CurrentProgram = 0;
 743             return;
 744       } /* switch */
 745    } /* for */
 746
 747    ctx->_CurrentProgram = 0;
 748 }
 749
 750
 751
 752 /**
 753 Thoughts on vertex program optimization:
 754
 755 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
 756 assembly code.  That will probably be a lot of work.
 757
 758 Another approach might be to replace the vp_instruction->Opcode field with
 759 a pointer to a specialized C function which executes the instruction.
 760 In particular we can write functions which skip swizzling, negating,
 761 masking, relative addressing, etc. when they're not needed.
 762
 763 For example:
 764
 765 void simple_add( struct vp_instruction *inst )
 766 {
 767    GLfloat *sum = machine->Registers[inst->DstReg.Register];
 768    GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
 769    GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
 770    sum[0] = a[0] + b[0];
 771    sum[1] = a[1] + b[1];
 772    sum[2] = a[2] + b[2];
 773    sum[3] = a[3] + b[3];
 774 }
 775
 776 */
 777
 778 /*
 779
 780 KW:
 781
 782 A first step would be to 'vectorize' the programs in the same way as
 783 the normal transformation code in the tnl module.  Thus each opcode
 784 takes zero or more input vectors (registers) and produces one or more
 785 output vectors.
 786
  787 These operations would initially be coded in C, with machine-specific
 788 assembly following, as is currently the case for matrix
 789 transformations in the math/ directory.  The preprocessing scheme for
 790 selecting simpler operations Brian describes above would also work
 791 here.
 792
 793 This should give reasonable performance without excessive effort.
 794
 795 */