src/mesa/main/nvvertexec.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  5.1
   4  *
   5  * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file nvvertexec.c
  27  * \brief Code to execute vertex programs.
  28  * \author Brian Paul
  29  */
  30
  31 #include "glheader.h"
  32 #include "context.h"
  33 #include "imports.h"
  34 #include "macros.h"
  35 #include "mtypes.h"
  36 #include "nvvertexec.h"
  37 #include "nvvertprog.h"
  38 #include "math/m_matrix.h"
  39
  40
  41 /**
  42  * Load/initialize the vertex program registers.
  43  * This needs to be done per vertex.
  44  */
  45 void
  46 _mesa_init_vp_registers(GLcontext *ctx)
  47 {
  48    struct vp_machine *machine = &(ctx->VertexProgram.Machine);
  49    GLuint i;
  50
  51    /* Input registers get initialized from the current vertex attribs */
  52    MEMCPY(machine->Registers[VP_INPUT_REG_START],
  53           ctx->Current.Attrib,
  54           16 * 4 * sizeof(GLfloat));
  55
  56    /* Output and temp regs are initialized to [0,0,0,1] */
  57    for (i = VP_OUTPUT_REG_START; i <= VP_OUTPUT_REG_END; i++) {
  58       machine->Registers[i][0] = 0.0F;
  59       machine->Registers[i][1] = 0.0F;
  60       machine->Registers[i][2] = 0.0F;
  61       machine->Registers[i][3] = 1.0F;
  62    }
  63    for (i = VP_TEMP_REG_START; i <= VP_TEMP_REG_END; i++) {
  64       machine->Registers[i][0] = 0.0F;
  65       machine->Registers[i][1] = 0.0F;
  66       machine->Registers[i][2] = 0.0F;
  67       machine->Registers[i][3] = 1.0F;
  68    }
  69
  70    /* The program regs aren't touched */
  71 }
  72
  73
  74
  75 /**
  76  * Copy the 16 elements of a matrix into four consecutive program
  77  * registers starting at 'pos'.
  78  */
  79 static void
  80 load_matrix(GLfloat registers[][4], GLuint pos, const GLfloat mat[16])
  81 {
  82    GLuint i;
  83    pos += VP_PROG_REG_START;
  84    for (i = 0; i < 4; i++) {
  85       registers[pos + i][0] = mat[0 + i];
  86       registers[pos + i][1] = mat[4 + i];
  87       registers[pos + i][2] = mat[8 + i];
  88       registers[pos + i][3] = mat[12 + i];
  89    }
  90 }
  91
  92
  93 /**
  94  * As above, but transpose the matrix.
  95  */
  96 static void
  97 load_transpose_matrix(GLfloat registers[][4], GLuint pos,
  98                       const GLfloat mat[16])
  99 {
 100    pos += VP_PROG_REG_START;
 101    MEMCPY(registers[pos], mat, 16 * sizeof(GLfloat));
 102 }
 103
 104
 105 /**
 106  * Load all currently tracked matrices into the program registers.
 107  * This needs to be done per glBegin/glEnd.
 108  */
 109 void
 110 _mesa_init_tracked_matrices(GLcontext *ctx)
 111 {
 112    GLuint i;
 113
 114    for (i = 0; i < VP_NUM_PROG_REGS / 4; i++) {
 115       /* point 'mat' at source matrix */
 116       GLmatrix *mat;
 117       if (ctx->VertexProgram.TrackMatrix[i] == GL_MODELVIEW) {
 118          mat = ctx->ModelviewMatrixStack.Top;
 119       }
 120       else if (ctx->VertexProgram.TrackMatrix[i] == GL_PROJECTION) {
 121          mat = ctx->ProjectionMatrixStack.Top;
 122       }
 123       else if (ctx->VertexProgram.TrackMatrix[i] == GL_TEXTURE) {
 124          mat = ctx->TextureMatrixStack[ctx->Texture.CurrentUnit].Top;
 125       }
 126       else if (ctx->VertexProgram.TrackMatrix[i] == GL_COLOR) {
 127          mat = ctx->ColorMatrixStack.Top;
 128       }
 129       else if (ctx->VertexProgram.TrackMatrix[i]==GL_MODELVIEW_PROJECTION_NV) {
 130          /* XXX verify the combined matrix is up to date */
 131          mat = &ctx->_ModelProjectMatrix;
 132       }
 133       else if (ctx->VertexProgram.TrackMatrix[i] >= GL_MATRIX0_NV &&
 134                ctx->VertexProgram.TrackMatrix[i] <= GL_MATRIX7_NV) {
 135          GLuint n = ctx->VertexProgram.TrackMatrix[i] - GL_MATRIX0_NV;
 136          ASSERT(n < MAX_PROGRAM_MATRICES);
 137          mat = ctx->ProgramMatrixStack[n].Top;
 138       }
 139       else {
 140          /* no matrix is tracked, but we leave the register values as-is */
 141          assert(ctx->VertexProgram.TrackMatrix[i] == GL_NONE);
 142          continue;
 143       }
 144
 145       /* load the matrix */
 146       if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
 147          load_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->m);
 148       }
 149       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_INVERSE_NV) {
 150          _math_matrix_analyse(mat); /* update the inverse */
 151          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 152          load_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->inv);
 153       }
 154       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_TRANSPOSE_NV) {
 155          load_transpose_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->m);
 156       }
 157       else {
 158          assert(ctx->VertexProgram.TrackMatrixTransform[i]
 159                 == GL_INVERSE_TRANSPOSE_NV);
 160          _math_matrix_analyse(mat); /* update the inverse */
 161          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 162          load_transpose_matrix(ctx->VertexProgram.Machine.Registers,
 163                                i*4, mat->inv);
 164       }
 165    }
 166 }
 167
 168
 169
 170 /**
 171  * For debugging.  Dump the current vertex program machine registers.
 172  */
 173 void
 174 _mesa_dump_vp_machine( const struct vp_machine *machine )
 175 {
 176    int i;
 177    _mesa_printf("VertexIn:\n");
 178    for (i = 0; i < VP_NUM_INPUT_REGS; i++) {
 179       _mesa_printf("%d: %f %f %f %f   ", i,
 180              machine->Registers[i + VP_INPUT_REG_START][0],
 181              machine->Registers[i + VP_INPUT_REG_START][1],
 182              machine->Registers[i + VP_INPUT_REG_START][2],
 183              machine->Registers[i + VP_INPUT_REG_START][3]);
 184    }
 185    _mesa_printf("\n");
 186
 187    _mesa_printf("VertexOut:\n");
 188    for (i = 0; i < VP_NUM_OUTPUT_REGS; i++) {
 189       _mesa_printf("%d: %f %f %f %f   ", i,
 190              machine->Registers[i + VP_OUTPUT_REG_START][0],
 191              machine->Registers[i + VP_OUTPUT_REG_START][1],
 192              machine->Registers[i + VP_OUTPUT_REG_START][2],
 193              machine->Registers[i + VP_OUTPUT_REG_START][3]);
 194    }
 195    _mesa_printf("\n");
 196
 197    _mesa_printf("Registers:\n");
 198    for (i = 0; i < VP_NUM_TEMP_REGS; i++) {
 199       _mesa_printf("%d: %f %f %f %f   ", i,
 200              machine->Registers[i + VP_TEMP_REG_START][0],
 201              machine->Registers[i + VP_TEMP_REG_START][1],
 202              machine->Registers[i + VP_TEMP_REG_START][2],
 203              machine->Registers[i + VP_TEMP_REG_START][3]);
 204    }
 205    _mesa_printf("\n");
 206
 207    _mesa_printf("Parameters:\n");
 208    for (i = 0; i < VP_NUM_PROG_REGS; i++) {
 209       _mesa_printf("%d: %f %f %f %f   ", i,
 210              machine->Registers[i + VP_PROG_REG_START][0],
 211              machine->Registers[i + VP_PROG_REG_START][1],
 212              machine->Registers[i + VP_PROG_REG_START][2],
 213              machine->Registers[i + VP_PROG_REG_START][3]);
 214    }
 215    _mesa_printf("\n");
 216 }
 217
 218
 219 /**
 220  * Fetch a 4-element float vector from the given source register.
 221  * Apply swizzling and negating as needed.
 222  */
 223 static void
 224 fetch_vector4( const struct vp_src_register *source,
 225                const struct vp_machine *machine,
 226                GLfloat result[4] )
 227 {
 228    static const GLfloat zero[4] = { 0, 0, 0, 0 };
 229    const GLfloat *src;
 230
 231    if (source->RelAddr) {
 232       const GLint reg = source->Register + machine->AddressReg;
 233       if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
 234          src = zero;
 235       else
 236          src = machine->Registers[VP_PROG_REG_START + reg];
 237    }
 238    else {
 239       src = machine->Registers[source->Register];
 240    }
 241
 242    if (source->Negate) {
 243       result[0] = -src[source->Swizzle[0]];
 244       result[1] = -src[source->Swizzle[1]];
 245       result[2] = -src[source->Swizzle[2]];
 246       result[3] = -src[source->Swizzle[3]];
 247    }
 248    else {
 249       result[0] = src[source->Swizzle[0]];
 250       result[1] = src[source->Swizzle[1]];
 251       result[2] = src[source->Swizzle[2]];
 252       result[3] = src[source->Swizzle[3]];
 253    }
 254 }
 255
 256
 257 /**
 258  * As above, but only return result[0] element.
 259  */
 260 static void
 261 fetch_vector1( const struct vp_src_register *source,
 262                const struct vp_machine *machine,
 263                GLfloat result[4] )
 264 {
 265    static const GLfloat zero[4] = { 0, 0, 0, 0 };
 266    const GLfloat *src;
 267
 268    if (source->RelAddr) {
 269       const GLint reg = source->Register + machine->AddressReg;
 270       if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
 271          src = zero;
 272       else
 273          src = machine->Registers[VP_PROG_REG_START + reg];
 274    }
 275    else {
 276       src = machine->Registers[source->Register];
 277    }
 278
 279    if (source->Negate) {
 280       result[0] = -src[source->Swizzle[0]];
 281    }
 282    else {
 283       result[0] = src[source->Swizzle[0]];
 284    }
 285 }
 286
 287
 288 /**
 289  * Store 4 floats into a register.
 290  */
 291 static void
 292 store_vector4( const struct vp_dst_register *dest, struct vp_machine *machine,
 293                const GLfloat value[4] )
 294 {
 295    GLfloat *dst = machine->Registers[dest->Register];
 296
 297    if (dest->WriteMask[0])
 298       dst[0] = value[0];
 299    if (dest->WriteMask[1])
 300       dst[1] = value[1];
 301    if (dest->WriteMask[2])
 302       dst[2] = value[2];
 303    if (dest->WriteMask[3])
 304       dst[3] = value[3];
 305 }
 306
 307
 308 /**
 309  * Set x to positive or negative infinity.
 310  */
 311 #ifdef USE_IEEE
 312 #define SET_POS_INFINITY(x)  ( *((GLuint *) &x) = 0x7F800000 )
 313 #define SET_NEG_INFINITY(x)  ( *((GLuint *) &x) = 0xFF800000 )
 314 #elif defined(VMS)
 315 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
 316 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
 317 #else
 318 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
 319 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
 320 #endif
 321
 322 #define SET_FLOAT_BITS(x, bits) ((fi_type *) &(x))->i = bits
 323
 324
 325 /**
 326  * Execute the given vertex program
 327  */
 328 void
 329 _mesa_exec_vertex_program(GLcontext *ctx, const struct vertex_program *program)
 330 {
 331    struct vp_machine *machine = &ctx->VertexProgram.Machine;
 332    const struct vp_instruction *inst;
 333
 334    for (inst = program->Instructions; inst->Opcode != VP_OPCODE_END; inst++) {
 335       switch (inst->Opcode) {
 336          case VP_OPCODE_MOV:
 337             {
 338                GLfloat t[4];
 339                fetch_vector4( &inst->SrcReg[0], machine, t );
 340                store_vector4( &inst->DstReg, machine, t );
 341             }
 342             break;
 343          case VP_OPCODE_LIT:
 344             {
 345                const GLfloat epsilon = 1.0e-5F; /* XXX fix? */
 346                GLfloat t[4], lit[4];
 347                fetch_vector4( &inst->SrcReg[0], machine, t );
 348                if (t[3] < -(128.0F - epsilon))
 349                    t[3] = - (128.0F - epsilon);
 350                else if (t[3] > 128.0F - epsilon)
 351                   t[3] = 128.0F - epsilon;
 352                if (t[0] < 0.0)
 353                   t[0] = 0.0;
 354                if (t[1] < 0.0)
 355                   t[1] = 0.0;
 356                lit[0] = 1.0;
 357                lit[1] = t[0];
 358                lit[2] = (t[0] > 0.0) ? (GLfloat) exp(t[3] * log(t[1])) : 0.0F;
 359                lit[3] = 1.0;
 360                store_vector4( &inst->DstReg, machine, lit );
 361             }
 362             break;
 363          case VP_OPCODE_RCP:
 364             {
 365                GLfloat t[4];
 366                fetch_vector1( &inst->SrcReg[0], machine, t );
 367                if (t[0] != 1.0F)
 368                   t[0] = 1.0F / t[0];  /* div by zero is infinity! */
 369                t[1] = t[2] = t[3] = t[0];
 370                store_vector4( &inst->DstReg, machine, t );
 371             }
 372             break;
 373          case VP_OPCODE_RSQ:
 374             {
 375                GLfloat t[4];
 376                fetch_vector1( &inst->SrcReg[0], machine, t );
 377                t[0] = INV_SQRTF(FABSF(t[0]));
 378                t[1] = t[2] = t[3] = t[0];
 379                store_vector4( &inst->DstReg, machine, t );
 380             }
 381             break;
 382          case VP_OPCODE_EXP:
 383             {
 384                GLfloat t[4], q[4], floor_t0;
 385                fetch_vector1( &inst->SrcReg[0], machine, t );
 386                floor_t0 = (float) floor(t[0]);
 387                if (floor_t0 > FLT_MAX_EXP) {
 388                   SET_POS_INFINITY(q[0]);
 389                   SET_POS_INFINITY(q[2]);
 390                }
 391                else if (floor_t0 < FLT_MIN_EXP) {
 392                   q[0] = 0.0F;
 393                   q[2] = 0.0F;
 394                }
 395                else {
 396 #ifdef USE_IEEE
 397                   GLint ii = (GLint) floor_t0;
 398                   ii = (ii < 23) + 0x3f800000;
 399                   SET_FLOAT_BITS(q[0], ii);
 400                   q[0] = *((GLfloat *) &ii);
 401 #else
 402                   q[0] = (GLfloat) pow(2.0, floor_t0);
 403 #endif
 404                   q[2] = (GLfloat) (q[0] * LOG2(q[1]));
 405                }
 406                q[1] = t[0] - floor_t0;
 407                q[3] = 1.0F;
 408                store_vector4( &inst->DstReg, machine, q );
 409             }
 410             break;
 411          case VP_OPCODE_LOG:
 412             {
 413                GLfloat t[4], q[4], abs_t0;
 414                fetch_vector1( &inst->SrcReg[0], machine, t );
 415                abs_t0 = (GLfloat) fabs(t[0]);
 416                if (abs_t0 != 0.0F) {
 417                   /* Since we really can't handle infinite values on VMS
 418                    * like other OSes we'll use __MAXFLOAT to represent
 419                    * infinity.  This may need some tweaking.
 420                    */
 421 #ifdef VMS
 422                   if (abs_t0 == __MAXFLOAT) {
 423 #else
 424                   if (IS_INF_OR_NAN(abs_t0)) {
 425 #endif
 426                      SET_POS_INFINITY(q[0]);
 427                      q[1] = 1.0F;
 428                      SET_POS_INFINITY(q[2]);
 429                   }
 430                   else {
 431                      int exponent;
 432                      double mantissa = frexp(t[0], &exponent);
 433                      q[0] = (GLfloat) (exponent - 1);
 434                      q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 435                      q[2] = (GLfloat) (q[0] + LOG2(q[1]));
 436                   }
 437                }
 438                else {
 439                   SET_NEG_INFINITY(q[0]);
 440                   q[1] = 1.0F;
 441                   SET_NEG_INFINITY(q[2]);
 442                }
 443                q[3] = 1.0;
 444                store_vector4( &inst->DstReg, machine, q );
 445             }
 446             break;
 447          case VP_OPCODE_MUL:
 448             {
 449                GLfloat t[4], u[4], prod[4];
 450                fetch_vector4( &inst->SrcReg[0], machine, t );
 451                fetch_vector4( &inst->SrcReg[1], machine, u );
 452                prod[0] = t[0] * u[0];
 453                prod[1] = t[1] * u[1];
 454                prod[2] = t[2] * u[2];
 455                prod[3] = t[3] * u[3];
 456                store_vector4( &inst->DstReg, machine, prod );
 457             }
 458             break;
 459          case VP_OPCODE_ADD:
 460             {
 461                GLfloat t[4], u[4], sum[4];
 462                fetch_vector4( &inst->SrcReg[0], machine, t );
 463                fetch_vector4( &inst->SrcReg[1], machine, u );
 464                sum[0] = t[0] + u[0];
 465                sum[1] = t[1] + u[1];
 466                sum[2] = t[2] + u[2];
 467                sum[3] = t[3] + u[3];
 468                store_vector4( &inst->DstReg, machine, sum );
 469             }
 470             break;
 471          case VP_OPCODE_DP3:
 472             {
 473                GLfloat t[4], u[4], dot[4];
 474                fetch_vector4( &inst->SrcReg[0], machine, t );
 475                fetch_vector4( &inst->SrcReg[1], machine, u );
 476                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
 477                dot[1] = dot[2] = dot[3] = dot[0];
 478                store_vector4( &inst->DstReg, machine, dot );
 479             }
 480             break;
 481          case VP_OPCODE_DP4:
 482             {
 483                GLfloat t[4], u[4], dot[4];
 484                fetch_vector4( &inst->SrcReg[0], machine, t );
 485                fetch_vector4( &inst->SrcReg[1], machine, u );
 486                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + t[3] * u[3];
 487                dot[1] = dot[2] = dot[3] = dot[0];
 488                store_vector4( &inst->DstReg, machine, dot );
 489             }
 490             break;
 491          case VP_OPCODE_DST:
 492             {
 493                GLfloat t[4], u[4], dst[4];
 494                fetch_vector4( &inst->SrcReg[0], machine, t );
 495                fetch_vector4( &inst->SrcReg[1], machine, u );
 496                dst[0] = 1.0F;
 497                dst[1] = t[1] * u[1];
 498                dst[2] = t[2];
 499                dst[3] = u[3];
 500                store_vector4( &inst->DstReg, machine, dst );
 501             }
 502             break;
 503          case VP_OPCODE_MIN:
 504             {
 505                GLfloat t[4], u[4], min[4];
 506                fetch_vector4( &inst->SrcReg[0], machine, t );
 507                fetch_vector4( &inst->SrcReg[1], machine, u );
 508                min[0] = (t[0] < u[0]) ? t[0] : u[0];
 509                min[1] = (t[1] < u[1]) ? t[1] : u[1];
 510                min[2] = (t[2] < u[2]) ? t[2] : u[2];
 511                min[3] = (t[3] < u[3]) ? t[3] : u[3];
 512                store_vector4( &inst->DstReg, machine, min );
 513             }
 514             break;
 515          case VP_OPCODE_MAX:
 516             {
 517                GLfloat t[4], u[4], max[4];
 518                fetch_vector4( &inst->SrcReg[0], machine, t );
 519                fetch_vector4( &inst->SrcReg[1], machine, u );
 520                max[0] = (t[0] > u[0]) ? t[0] : u[0];
 521                max[1] = (t[1] > u[1]) ? t[1] : u[1];
 522                max[2] = (t[2] > u[2]) ? t[2] : u[2];
 523                max[3] = (t[3] > u[3]) ? t[3] : u[3];
 524                store_vector4( &inst->DstReg, machine, max );
 525             }
 526             break;
 527          case VP_OPCODE_SLT:
 528             {
 529                GLfloat t[4], u[4], slt[4];
 530                fetch_vector4( &inst->SrcReg[0], machine, t );
 531                fetch_vector4( &inst->SrcReg[1], machine, u );
 532                slt[0] = (t[0] < u[0]) ? 1.0F : 0.0F;
 533                slt[1] = (t[1] < u[1]) ? 1.0F : 0.0F;
 534                slt[2] = (t[2] < u[2]) ? 1.0F : 0.0F;
 535                slt[3] = (t[3] < u[3]) ? 1.0F : 0.0F;
 536                store_vector4( &inst->DstReg, machine, slt );
 537             }
 538             break;
 539          case VP_OPCODE_SGE:
 540             {
 541                GLfloat t[4], u[4], sge[4];
 542                fetch_vector4( &inst->SrcReg[0], machine, t );
 543                fetch_vector4( &inst->SrcReg[1], machine, u );
 544                sge[0] = (t[0] >= u[0]) ? 1.0F : 0.0F;
 545                sge[1] = (t[1] >= u[1]) ? 1.0F : 0.0F;
 546                sge[2] = (t[2] >= u[2]) ? 1.0F : 0.0F;
 547                sge[3] = (t[3] >= u[3]) ? 1.0F : 0.0F;
 548                store_vector4( &inst->DstReg, machine, sge );
 549             }
 550             break;
 551          case VP_OPCODE_MAD:
 552             {
 553                GLfloat t[4], u[4], v[4], sum[4];
 554                fetch_vector4( &inst->SrcReg[0], machine, t );
 555                fetch_vector4( &inst->SrcReg[1], machine, u );
 556                fetch_vector4( &inst->SrcReg[2], machine, v );
 557                sum[0] = t[0] * u[0] + v[0];
 558                sum[1] = t[1] * u[1] + v[1];
 559                sum[2] = t[2] * u[2] + v[2];
 560                sum[3] = t[3] * u[3] + v[3];
 561                store_vector4( &inst->DstReg, machine, sum );
 562             }
 563             break;
 564          case VP_OPCODE_ARL:
 565             {
 566                GLfloat t[4];
 567                fetch_vector4( &inst->SrcReg[0], machine, t );
 568                machine->AddressReg = (GLint) floor(t[0]);
 569             }
 570             break;
 571          case VP_OPCODE_DPH:
 572             {
 573                GLfloat t[4], u[4], dot[4];
 574                fetch_vector4( &inst->SrcReg[0], machine, t );
 575                fetch_vector4( &inst->SrcReg[1], machine, u );
 576                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + u[3];
 577                dot[1] = dot[2] = dot[3] = dot[0];
 578                store_vector4( &inst->DstReg, machine, dot );
 579             }
 580             break;
 581          case VP_OPCODE_RCC:
 582             {
 583                GLfloat t[4], u;
 584                fetch_vector1( &inst->SrcReg[0], machine, t );
 585                if (t[0] == 1.0F)
 586                   u = 1.0F;
 587                else
 588                   u = 1.0F / t[0];
 589                if (u > 0.0F) {
 590                   if (u > 1.884467e+019F) {
 591                      u = 1.884467e+019F;  /* IEEE 32-bit binary value 0x5F800000 */
 592                   }
 593                   else if (u < 5.42101e-020F) {
 594                      u = 5.42101e-020F;   /* IEEE 32-bit binary value 0x1F800000 */
 595                   }
 596                }
 597                else {
 598                   if (u < -1.884467e+019F) {
 599                      u = -1.884467e+019F; /* IEEE 32-bit binary value 0xDF800000 */
 600                   }
 601                   else if (u > -5.42101e-020F) {
 602                      u = -5.42101e-020F;  /* IEEE 32-bit binary value 0x9F800000 */
 603                   }
 604                }
 605                t[0] = t[1] = t[2] = t[3] = u;
 606                store_vector4( &inst->DstReg, machine, t );
 607             }
 608             break;
 609          case VP_OPCODE_SUB:
 610             {
 611                GLfloat t[4], u[4], sum[4];
 612                fetch_vector4( &inst->SrcReg[0], machine, t );
 613                fetch_vector4( &inst->SrcReg[1], machine, u );
 614                sum[0] = t[0] - u[0];
 615                sum[1] = t[1] - u[1];
 616                sum[2] = t[2] - u[2];
 617                sum[3] = t[3] - u[3];
 618                store_vector4( &inst->DstReg, machine, sum );
 619             }
 620             break;
 621          case VP_OPCODE_ABS:
 622             {
 623                GLfloat t[4];
 624                fetch_vector4( &inst->SrcReg[0], machine, t );
 625                if (t[0] < 0.0)  t[0] = -t[0];
 626                if (t[1] < 0.0)  t[1] = -t[1];
 627                if (t[2] < 0.0)  t[2] = -t[2];
 628                if (t[3] < 0.0)  t[3] = -t[3];
 629                store_vector4( &inst->DstReg, machine, t );
 630             }
 631             break;
 632
 633          case VP_OPCODE_END:
 634             return;
 635          default:
 636             /* bad instruction opcode */
 637             _mesa_problem(ctx, "Bad VP Opcode in _mesa_exec_vertex_program");
 638             return;
 639       }
 640    }
 641 }
 642
 643
 644
 645 /**
 646 Thoughts on vertex program optimization:
 647
 648 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
 649 assembly code.  That will probably be a lot of work.
 650
 651 Another approach might be to replace the vp_instruction->Opcode field with
 652 a pointer to a specialized C function which executes the instruction.
 653 In particular we can write functions which skip swizzling, negating,
 654 masking, relative addressing, etc. when they're not needed.
 655
 656 For example:
 657
 658 void simple_add( struct vp_instruction *inst )
 659 {
 660    GLfloat *sum = machine->Registers[inst->DstReg.Register];
 661    GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
 662    GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
 663    sum[0] = a[0] + b[0];
 664    sum[1] = a[1] + b[1];
 665    sum[2] = a[2] + b[2];
 666    sum[3] = a[3] + b[3];
 667 }
 668
 669 */
 670
 671 /*
 672
 673 KW:
 674
 675 A first step would be to 'vectorize' the programs in the same way as
 676 the normal transformation code in the tnl module.  Thus each opcode
 677 takes zero or more input vectors (registers) and produces one or more
 678 output vectors.
 679
These operations would initially be coded in C, with machine-specific
 681 assembly following, as is currently the case for matrix
 682 transformations in the math/ directory.  The preprocessing scheme for
 683 selecting simpler operations Brian describes above would also work
 684 here.
 685
 686 This should give reasonable performance without excessive effort.
 687
 688 */