src/mesa/main/nvvertexec.c

   1 /* $Id: nvvertexec.c,v 1.1 2003/01/14 04:55:46 brianp Exp $ */
   2
   3 /*
   4  * Mesa 3-D graphics library
   5  * Version:  5.1
   6  *
   7  * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
   8  *
   9  * Permission is hereby granted, free of charge, to any person obtaining a
  10  * copy of this software and associated documentation files (the "Software"),
  11  * to deal in the Software without restriction, including without limitation
  12  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  13  * and/or sell copies of the Software, and to permit persons to whom the
  14  * Software is furnished to do so, subject to the following conditions:
  15  *
  16  * The above copyright notice and this permission notice shall be included
  17  * in all copies or substantial portions of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  22  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  23  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  24  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  */
  26
  27 /**
  28  * \file nvvertexec.c
  29  * \brief Code to execute vertex programs.
  30  * \author Brian Paul
  31  */
  32
  33 #include "glheader.h"
  34 #include "context.h"
  35 #include "imports.h"
  36 #include "macros.h"
  37 #include "mtypes.h"
  38 #include "nvvertexec.h"
  39 #include "nvvertprog.h"
  40 #include "mmath.h"
  41 #include "math/m_matrix.h"
  42
  43
  44 /**
  45  * Load/initialize the vertex program registers.
  46  * This needs to be done per vertex.
  47  */
  48 void
  49 _mesa_init_vp_registers(GLcontext *ctx)
  50 {
  51    struct vp_machine *machine = &(ctx->VertexProgram.Machine);
  52    GLuint i;
  53
  54    /* Input registers get initialized from the current vertex attribs */
  55    MEMCPY(machine->Registers[VP_INPUT_REG_START],
  56           ctx->Current.Attrib,
  57           16 * 4 * sizeof(GLfloat));
  58
  59    /* Output and temp regs are initialized to [0,0,0,1] */
  60    for (i = VP_OUTPUT_REG_START; i <= VP_OUTPUT_REG_END; i++) {
  61       machine->Registers[i][0] = 0.0F;
  62       machine->Registers[i][1] = 0.0F;
  63       machine->Registers[i][2] = 0.0F;
  64       machine->Registers[i][3] = 1.0F;
  65    }
  66    for (i = VP_TEMP_REG_START; i <= VP_TEMP_REG_END; i++) {
  67       machine->Registers[i][0] = 0.0F;
  68       machine->Registers[i][1] = 0.0F;
  69       machine->Registers[i][2] = 0.0F;
  70       machine->Registers[i][3] = 1.0F;
  71    }
  72
  73    /* The program regs aren't touched */
  74 }
  75
  76
  77
  78 /**
  79  * Copy the 16 elements of a matrix into four consecutive program
  80  * registers starting at 'pos'.
  81  */
  82 static void
  83 load_matrix(GLfloat registers[][4], GLuint pos, const GLfloat mat[16])
  84 {
  85    GLuint i;
  86    pos += VP_PROG_REG_START;
  87    for (i = 0; i < 4; i++) {
  88       registers[pos + i][0] = mat[0 + i];
  89       registers[pos + i][1] = mat[4 + i];
  90       registers[pos + i][2] = mat[8 + i];
  91       registers[pos + i][3] = mat[12 + i];
  92    }
  93 }
  94
  95
  96 /**
  97  * As above, but transpose the matrix.
  98  */
  99 static void
 100 load_transpose_matrix(GLfloat registers[][4], GLuint pos,
 101                       const GLfloat mat[16])
 102 {
 103    pos += VP_PROG_REG_START;
 104    MEMCPY(registers[pos], mat, 16 * sizeof(GLfloat));
 105 }
 106
 107
 108 /**
 109  * Load all currently tracked matrices into the program registers.
 110  * This needs to be done per glBegin/glEnd.
 111  */
 112 void
 113 _mesa_init_tracked_matrices(GLcontext *ctx)
 114 {
 115    GLuint i;
 116
 117    for (i = 0; i < VP_NUM_PROG_REGS / 4; i++) {
 118       /* point 'mat' at source matrix */
 119       GLmatrix *mat;
 120       if (ctx->VertexProgram.TrackMatrix[i] == GL_MODELVIEW) {
 121          mat = ctx->ModelviewMatrixStack.Top;
 122       }
 123       else if (ctx->VertexProgram.TrackMatrix[i] == GL_PROJECTION) {
 124          mat = ctx->ProjectionMatrixStack.Top;
 125       }
 126       else if (ctx->VertexProgram.TrackMatrix[i] == GL_TEXTURE) {
 127          mat = ctx->TextureMatrixStack[ctx->Texture.CurrentUnit].Top;
 128       }
 129       else if (ctx->VertexProgram.TrackMatrix[i] == GL_COLOR) {
 130          mat = ctx->ColorMatrixStack.Top;
 131       }
 132       else if (ctx->VertexProgram.TrackMatrix[i]==GL_MODELVIEW_PROJECTION_NV) {
 133          /* XXX verify the combined matrix is up to date */
 134          mat = &ctx->_ModelProjectMatrix;
 135       }
 136       else if (ctx->VertexProgram.TrackMatrix[i] >= GL_MATRIX0_NV &&
 137                ctx->VertexProgram.TrackMatrix[i] <= GL_MATRIX7_NV) {
 138          GLuint n = ctx->VertexProgram.TrackMatrix[i] - GL_MATRIX0_NV;
 139          ASSERT(n < MAX_PROGRAM_MATRICES);
 140          mat = ctx->ProgramMatrixStack[n].Top;
 141       }
 142       else {
 143          /* no matrix is tracked, but we leave the register values as-is */
 144          assert(ctx->VertexProgram.TrackMatrix[i] == GL_NONE);
 145          continue;
 146       }
 147
 148       /* load the matrix */
 149       if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
 150          load_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->m);
 151       }
 152       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_INVERSE_NV) {
 153          _math_matrix_analyse(mat); /* update the inverse */
 154          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 155          load_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->inv);
 156       }
 157       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_TRANSPOSE_NV) {
 158          load_transpose_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->m);
 159       }
 160       else {
 161          assert(ctx->VertexProgram.TrackMatrixTransform[i]
 162                 == GL_INVERSE_TRANSPOSE_NV);
 163          _math_matrix_analyse(mat); /* update the inverse */
 164          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 165          load_transpose_matrix(ctx->VertexProgram.Machine.Registers,
 166                                i*4, mat->inv);
 167       }
 168    }
 169 }
 170
 171
 172
 173 /**
 174  * For debugging.  Dump the current vertex program machine registers.
 175  */
 176 void
 177 _mesa_dump_vp_machine( const struct vp_machine *machine )
 178 {
 179    int i;
 180    _mesa_printf("VertexIn:\n");
 181    for (i = 0; i < VP_NUM_INPUT_REGS; i++) {
 182       _mesa_printf("%d: %f %f %f %f   ", i,
 183              machine->Registers[i + VP_INPUT_REG_START][0],
 184              machine->Registers[i + VP_INPUT_REG_START][1],
 185              machine->Registers[i + VP_INPUT_REG_START][2],
 186              machine->Registers[i + VP_INPUT_REG_START][3]);
 187    }
 188    _mesa_printf("\n");
 189
 190    _mesa_printf("VertexOut:\n");
 191    for (i = 0; i < VP_NUM_OUTPUT_REGS; i++) {
 192       _mesa_printf("%d: %f %f %f %f   ", i,
 193              machine->Registers[i + VP_OUTPUT_REG_START][0],
 194              machine->Registers[i + VP_OUTPUT_REG_START][1],
 195              machine->Registers[i + VP_OUTPUT_REG_START][2],
 196              machine->Registers[i + VP_OUTPUT_REG_START][3]);
 197    }
 198    _mesa_printf("\n");
 199
 200    _mesa_printf("Registers:\n");
 201    for (i = 0; i < VP_NUM_TEMP_REGS; i++) {
 202       _mesa_printf("%d: %f %f %f %f   ", i,
 203              machine->Registers[i + VP_TEMP_REG_START][0],
 204              machine->Registers[i + VP_TEMP_REG_START][1],
 205              machine->Registers[i + VP_TEMP_REG_START][2],
 206              machine->Registers[i + VP_TEMP_REG_START][3]);
 207    }
 208    _mesa_printf("\n");
 209
 210    _mesa_printf("Parameters:\n");
 211    for (i = 0; i < VP_NUM_PROG_REGS; i++) {
 212       _mesa_printf("%d: %f %f %f %f   ", i,
 213              machine->Registers[i + VP_PROG_REG_START][0],
 214              machine->Registers[i + VP_PROG_REG_START][1],
 215              machine->Registers[i + VP_PROG_REG_START][2],
 216              machine->Registers[i + VP_PROG_REG_START][3]);
 217    }
 218    _mesa_printf("\n");
 219 }
 220
 221
 222 /**
 223  * Fetch a 4-element float vector from the given source register.
 224  * Apply swizzling and negating as needed.
 225  */
 226 static void
 227 fetch_vector4( const struct vp_src_register *source,
 228                const struct vp_machine *machine,
 229                GLfloat result[4] )
 230 {
 231    static const GLfloat zero[4] = { 0, 0, 0, 0 };
 232    const GLfloat *src;
 233
 234    if (source->RelAddr) {
 235       GLint reg = source->Register + machine->AddressReg;
 236       if (reg < VP_PROG_REG_START || reg > VP_PROG_REG_END)
 237          src = zero;
 238       else
 239          src = machine->Registers[reg];
 240    }
 241    else {
 242       src = machine->Registers[source->Register];
 243    }
 244
 245    if (source->Negate) {
 246       result[0] = -src[source->Swizzle[0]];
 247       result[1] = -src[source->Swizzle[1]];
 248       result[2] = -src[source->Swizzle[2]];
 249       result[3] = -src[source->Swizzle[3]];
 250    }
 251    else {
 252       result[0] = src[source->Swizzle[0]];
 253       result[1] = src[source->Swizzle[1]];
 254       result[2] = src[source->Swizzle[2]];
 255       result[3] = src[source->Swizzle[3]];
 256    }
 257 }
 258
 259
 260 /**
 261  * As above, but only return result[0] element.
 262  */
 263 static void
 264 fetch_vector1( const struct vp_src_register *source,
 265                const struct vp_machine *machine,
 266                GLfloat result[4] )
 267 {
 268    static const GLfloat zero[4] = { 0, 0, 0, 0 };
 269    const GLfloat *src;
 270
 271    if (source->RelAddr) {
 272       GLint reg = source->Register + machine->AddressReg;
 273       if (reg < VP_PROG_REG_START || reg > VP_PROG_REG_END)
 274          src = zero;
 275       else
 276          src = machine->Registers[reg];
 277    }
 278    else {
 279       src = machine->Registers[source->Register];
 280    }
 281
 282    if (source->Negate) {
 283       result[0] = -src[source->Swizzle[0]];
 284    }
 285    else {
 286       result[0] = src[source->Swizzle[0]];
 287    }
 288 }
 289
 290
 291 /**
 292  * Store 4 floats into a register.
 293  */
 294 static void
 295 store_vector4( const struct vp_dst_register *dest, struct vp_machine *machine,
 296                const GLfloat value[4] )
 297 {
 298    GLfloat *dst = machine->Registers[dest->Register];
 299
 300    if (dest->WriteMask[0])
 301       dst[0] = value[0];
 302    if (dest->WriteMask[1])
 303       dst[1] = value[1];
 304    if (dest->WriteMask[2])
 305       dst[2] = value[2];
 306    if (dest->WriteMask[3])
 307       dst[3] = value[3];
 308 }
 309
 310
 311 /**
 312  * Set x to positive or negative infinity.
 313  */
 314 #ifdef USE_IEEE
 315 #define SET_POS_INFINITY(x)  ( *((GLuint *) &x) = 0x7F800000 )
 316 #define SET_NEG_INFINITY(x)  ( *((GLuint *) &x) = 0xFF800000 )
 317 #elif defined(VMS)
 318 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
 319 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
 320 #else
 321 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
 322 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
 323 #endif
 324
 325 #define SET_FLOAT_BITS(x, bits) ((fi_type *) &(x))->i = bits
 326
 327
 328 /**
 329  * Execute the given vertex program
 330  */
 331 void
 332 _mesa_exec_vertex_program(GLcontext *ctx, const struct vertex_program *program)
 333 {
 334    struct vp_machine *machine = &ctx->VertexProgram.Machine;
 335    const struct vp_instruction *inst;
 336
 337    /* XXX load vertex fields into input registers */
 338    /* and do other initialization */
 339
 340
 341    for (inst = program->Instructions; inst->Opcode != VP_OPCODE_END; inst++) {
 342       switch (inst->Opcode) {
 343          case VP_OPCODE_MOV:
 344             {
 345                GLfloat t[4];
 346                fetch_vector4( &inst->SrcReg[0], machine, t );
 347                store_vector4( &inst->DstReg, machine, t );
 348             }
 349             break;
 350          case VP_OPCODE_LIT:
 351             {
 352                const GLfloat epsilon = 1.0e-5F; /* XXX fix? */
 353                GLfloat t[4], lit[4];
 354                fetch_vector4( &inst->SrcReg[0], machine, t );
 355                if (t[3] < -(128.0F - epsilon))
 356                    t[3] = - (128.0F - epsilon);
 357                else if (t[3] > 128.0F - epsilon)
 358                   t[3] = 128.0F - epsilon;
 359                if (t[0] < 0.0)
 360                   t[0] = 0.0;
 361                if (t[1] < 0.0)
 362                   t[1] = 0.0;
 363                lit[0] = 1.0;
 364                lit[1] = t[0];
 365                lit[2] = (t[0] > 0.0) ? (GLfloat) exp(t[3] * log(t[1])) : 0.0F;
 366                lit[3] = 1.0;
 367                store_vector4( &inst->DstReg, machine, lit );
 368             }
 369             break;
 370          case VP_OPCODE_RCP:
 371             {
 372                GLfloat t[4];
 373                fetch_vector1( &inst->SrcReg[0], machine, t );
 374                if (t[0] != 1.0F)
 375                   t[0] = 1.0F / t[0];  /* div by zero is infinity! */
 376                t[1] = t[2] = t[3] = t[0];
 377                store_vector4( &inst->DstReg, machine, t );
 378             }
 379             break;
 380          case VP_OPCODE_RSQ:
 381             {
 382                GLfloat t[4];
 383                fetch_vector1( &inst->SrcReg[0], machine, t );
 384                t[0] = (float) (1.0 / sqrt(fabs(t[0])));
 385                t[1] = t[2] = t[3] = t[0];
 386                store_vector4( &inst->DstReg, machine, t );
 387             }
 388             break;
 389          case VP_OPCODE_EXP:
 390             {
 391                GLfloat t[4], q[4], floor_t0;
 392                fetch_vector1( &inst->SrcReg[0], machine, t );
 393                floor_t0 = (float) floor(t[0]);
 394                if (floor_t0 > FLT_MAX_EXP) {
 395                   SET_POS_INFINITY(q[0]);
 396                   q[1] = 0.0F;
 397                   SET_POS_INFINITY(q[2]);
 398                   q[3] = 1.0F;
 399                }
 400                else if (floor_t0 < FLT_MIN_EXP) {
 401                   q[0] = 0.0F;
 402                   q[1] = 0.0F;
 403                   q[2] = 0.0F;
 404                   q[3] = 0.0F;
 405                }
 406                else {
 407 #ifdef USE_IEEE
 408                   GLint ii = (GLint) floor_t0;
 409                   ii = (ii < 23) + 0x3f800000;
 410                   SET_FLOAT_BITS(q[0], ii);
 411                   q[0] = *((GLfloat *) &ii);
 412 #else
 413                   q[0] = (GLfloat) pow(2.0, floor_t0);
 414 #endif
 415                   q[1] = t[0] - floor_t0;
 416                   q[2] = (GLfloat) (q[0] * LOG2(q[1]));
 417                   q[3] = 1.0F;
 418                }
 419                store_vector4( &inst->DstReg, machine, t );
 420             }
 421             break;
 422          case VP_OPCODE_LOG:
 423             {
 424                GLfloat t[4], q[4], abs_t0;
 425                fetch_vector1( &inst->SrcReg[0], machine, t );
 426                abs_t0 = (GLfloat) fabs(t[0]);
 427                if (abs_t0 != 0.0F) {
 428                   /* Since we really can't handle infinite values on VMS
 429                    * like other OSes we'll use __MAXFLOAT to represent
 430                    * infinity.  This may need some tweaking.
 431                    */
 432 #ifdef VMS
 433                   if (abs_t0 == __MAXFLOAT) {
 434 #else
 435                   if (IS_INF_OR_NAN(abs_t0)) {
 436 #endif
 437                      SET_POS_INFINITY(q[0]);
 438                      q[1] = 1.0F;
 439                      SET_POS_INFINITY(q[2]);
 440                   }
 441                   else {
 442                      int exponent;
 443                      double mantissa = frexp(t[0], &exponent);
 444                      q[0] = (GLfloat) (exponent - 1);
 445                      q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 446                      q[2] = (GLfloat) (q[0] + LOG2(q[1]));
 447                   }
 448                }
 449                else {
 450                   SET_NEG_INFINITY(q[0]);
 451                   q[1] = 1.0F;
 452                   SET_NEG_INFINITY(q[2]);
 453                }
 454                q[3] = 1.0;
 455                store_vector4( &inst->DstReg, machine, q );
 456             }
 457             break;
 458          case VP_OPCODE_MUL:
 459             {
 460                GLfloat t[4], u[4], prod[4];
 461                fetch_vector4( &inst->SrcReg[0], machine, t );
 462                fetch_vector4( &inst->SrcReg[1], machine, u );
 463                prod[0] = t[0] * u[0];
 464                prod[1] = t[1] * u[1];
 465                prod[2] = t[2] * u[2];
 466                prod[3] = t[3] * u[3];
 467                store_vector4( &inst->DstReg, machine, prod );
 468             }
 469             break;
 470          case VP_OPCODE_ADD:
 471             {
 472                GLfloat t[4], u[4], sum[4];
 473                fetch_vector4( &inst->SrcReg[0], machine, t );
 474                fetch_vector4( &inst->SrcReg[1], machine, u );
 475                sum[0] = t[0] + u[0];
 476                sum[1] = t[1] + u[1];
 477                sum[2] = t[2] + u[2];
 478                sum[3] = t[3] + u[3];
 479                store_vector4( &inst->DstReg, machine, sum );
 480             }
 481             break;
 482          case VP_OPCODE_DP3:
 483             {
 484                GLfloat t[4], u[4], dot[4];
 485                fetch_vector4( &inst->SrcReg[0], machine, t );
 486                fetch_vector4( &inst->SrcReg[1], machine, u );
 487                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
 488                dot[1] = dot[2] = dot[3] = dot[0];
 489                store_vector4( &inst->DstReg, machine, dot );
 490             }
 491             break;
 492          case VP_OPCODE_DP4:
 493             {
 494                GLfloat t[4], u[4], dot[4];
 495                fetch_vector4( &inst->SrcReg[0], machine, t );
 496                fetch_vector4( &inst->SrcReg[1], machine, u );
 497                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + t[3] * u[3];
 498                dot[1] = dot[2] = dot[3] = dot[0];
 499                store_vector4( &inst->DstReg, machine, dot );
 500             }
 501             break;
 502          case VP_OPCODE_DST:
 503             {
 504                GLfloat t[4], u[4], dst[4];
 505                fetch_vector4( &inst->SrcReg[0], machine, t );
 506                fetch_vector4( &inst->SrcReg[1], machine, u );
 507                dst[0] = 1.0F;
 508                dst[1] = t[1] * u[1];
 509                dst[2] = t[2];
 510                dst[3] = u[3];
 511                store_vector4( &inst->DstReg, machine, dst );
 512             }
 513             break;
 514          case VP_OPCODE_MIN:
 515             {
 516                GLfloat t[4], u[4], min[4];
 517                fetch_vector4( &inst->SrcReg[0], machine, t );
 518                fetch_vector4( &inst->SrcReg[1], machine, u );
 519                min[0] = (t[0] < u[0]) ? t[0] : u[0];
 520                min[1] = (t[1] < u[1]) ? t[1] : u[1];
 521                min[2] = (t[2] < u[2]) ? t[2] : u[2];
 522                min[3] = (t[3] < u[3]) ? t[3] : u[3];
 523                store_vector4( &inst->DstReg, machine, min );
 524             }
 525             break;
 526          case VP_OPCODE_MAX:
 527             {
 528                GLfloat t[4], u[4], max[4];
 529                fetch_vector4( &inst->SrcReg[0], machine, t );
 530                fetch_vector4( &inst->SrcReg[1], machine, u );
 531                max[0] = (t[0] > u[0]) ? t[0] : u[0];
 532                max[1] = (t[1] > u[1]) ? t[1] : u[1];
 533                max[2] = (t[2] > u[2]) ? t[2] : u[2];
 534                max[3] = (t[3] > u[3]) ? t[3] : u[3];
 535                store_vector4( &inst->DstReg, machine, max );
 536             }
 537             break;
 538          case VP_OPCODE_SLT:
 539             {
 540                GLfloat t[4], u[4], slt[4];
 541                fetch_vector4( &inst->SrcReg[0], machine, t );
 542                fetch_vector4( &inst->SrcReg[1], machine, u );
 543                slt[0] = (t[0] < u[0]) ? 1.0F : 0.0F;
 544                slt[1] = (t[1] < u[1]) ? 1.0F : 0.0F;
 545                slt[2] = (t[2] < u[2]) ? 1.0F : 0.0F;
 546                slt[3] = (t[3] < u[3]) ? 1.0F : 0.0F;
 547                store_vector4( &inst->DstReg, machine, slt );
 548             }
 549             break;
 550          case VP_OPCODE_SGE:
 551             {
 552                GLfloat t[4], u[4], sge[4];
 553                fetch_vector4( &inst->SrcReg[0], machine, t );
 554                fetch_vector4( &inst->SrcReg[1], machine, u );
 555                sge[0] = (t[0] >= u[0]) ? 1.0F : 0.0F;
 556                sge[1] = (t[1] >= u[1]) ? 1.0F : 0.0F;
 557                sge[2] = (t[2] >= u[2]) ? 1.0F : 0.0F;
 558                sge[3] = (t[3] >= u[3]) ? 1.0F : 0.0F;
 559                store_vector4( &inst->DstReg, machine, sge );
 560             }
 561             break;
 562          case VP_OPCODE_MAD:
 563             {
 564                GLfloat t[4], u[4], v[4], sum[4];
 565                fetch_vector4( &inst->SrcReg[0], machine, t );
 566                fetch_vector4( &inst->SrcReg[1], machine, u );
 567                fetch_vector4( &inst->SrcReg[2], machine, v );
 568                sum[0] = t[0] * u[0] + v[0];
 569                sum[1] = t[1] * u[1] + v[1];
 570                sum[2] = t[2] * u[2] + v[2];
 571                sum[3] = t[3] * u[3] + v[3];
 572                store_vector4( &inst->DstReg, machine, sum );
 573             }
 574             break;
 575          case VP_OPCODE_ARL:
 576             {
 577                GLfloat t[4];
 578                fetch_vector4( &inst->SrcReg[0], machine, t );
 579                machine->AddressReg = (GLint) floor(t[0]);
 580             }
 581             break;
 582          case VP_OPCODE_DPH:
 583             {
 584                GLfloat t[4], u[4], dot[4];
 585                fetch_vector4( &inst->SrcReg[0], machine, t );
 586                fetch_vector4( &inst->SrcReg[1], machine, u );
 587                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + u[3];
 588                dot[1] = dot[2] = dot[3] = dot[0];
 589                store_vector4( &inst->DstReg, machine, dot );
 590             }
 591             break;
 592          case VP_OPCODE_RCC:
 593             {
 594                GLfloat t[4], u;
 595                fetch_vector1( &inst->SrcReg[0], machine, t );
 596                if (t[0] == 1.0F)
 597                   u = 1.0F;
 598                else
 599                   u = 1.0F / t[0];
 600                if (u > 0.0F) {
 601                   if (u > 1.884467e+019F) {
 602                      u = 1.884467e+019F;  /* IEEE 32-bit binary value 0x5F800000 */
 603                   }
 604                   else if (u < 5.42101e-020F) {
 605                      u = 5.42101e-020F;   /* IEEE 32-bit binary value 0x1F800000 */
 606                   }
 607                }
 608                else {
 609                   if (u < -1.884467e+019F) {
 610                      u = -1.884467e+019F; /* IEEE 32-bit binary value 0xDF800000 */
 611                   }
 612                   else if (u > -5.42101e-020F) {
 613                      u = -5.42101e-020F;  /* IEEE 32-bit binary value 0x9F800000 */
 614                   }
 615                }
 616                t[0] = t[1] = t[2] = t[3] = u;
 617                store_vector4( &inst->DstReg, machine, t );
 618             }
 619             break;
 620          case VP_OPCODE_SUB:
 621             {
 622                GLfloat t[4], u[4], sum[4];
 623                fetch_vector4( &inst->SrcReg[0], machine, t );
 624                fetch_vector4( &inst->SrcReg[1], machine, u );
 625                sum[0] = t[0] - u[0];
 626                sum[1] = t[1] - u[1];
 627                sum[2] = t[2] - u[2];
 628                sum[3] = t[3] - u[3];
 629                store_vector4( &inst->DstReg, machine, sum );
 630             }
 631             break;
 632          case VP_OPCODE_ABS:
 633             {
 634                GLfloat t[4];
 635                fetch_vector4( &inst->SrcReg[0], machine, t );
 636                if (t[0] < 0.0)  t[0] = -t[0];
 637                if (t[1] < 0.0)  t[1] = -t[1];
 638                if (t[2] < 0.0)  t[2] = -t[2];
 639                if (t[3] < 0.0)  t[3] = -t[3];
 640                store_vector4( &inst->DstReg, machine, t );
 641             }
 642             break;
 643
 644          case VP_OPCODE_END:
 645             return;
 646          default:
 647             /* bad instruction opcode */
 648             _mesa_problem(ctx, "Bad VP Opcode in _mesa_exec_vertex_program");
 649             return;
 650       }
 651    }
 652 }
 653
 654
 655
 656 /**
 657 Thoughts on vertex program optimization:
 658
 659 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
 660 assembly code.  That will probably be a lot of work.
 661
 662 Another approach might be to replace the vp_instruction->Opcode field with
 663 a pointer to a specialized C function which executes the instruction.
 664 In particular we can write functions which skip swizzling, negating,
 665 masking, relative addressing, etc. when they're not needed.
 666
 667 For example:
 668
 669 void simple_add( struct vp_instruction *inst )
 670 {
 671    GLfloat *sum = machine->Registers[inst->DstReg.Register];
 672    GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
 673    GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
 674    sum[0] = a[0] + b[0];
 675    sum[1] = a[1] + b[1];
 676    sum[2] = a[2] + b[2];
 677    sum[3] = a[3] + b[3];
 678 }
 679
 680 */
 681
 682 /*
 683
 684 KW:
 685
 686 A first step would be to 'vectorize' the programs in the same way as
 687 the normal transformation code in the tnl module.  Thus each opcode
 688 takes zero or more input vectors (registers) and produces one or more
 689 output vectors.
 690
These operations would initially be coded in C, with machine-specific
assembly following, as is currently the case for matrix
transformations in the math/ directory.  The preprocessing scheme for
selecting simpler operations Brian describes above would also work
here.
 696
 697 This should give reasonable performance without excessive effort.
 698
 699 */