src/mesa/main/nvvertexec.c

   1 /* $Id: nvvertexec.c,v 1.4 2003/03/25 00:00:29 brianp Exp $ */
   2
   3 /*
   4  * Mesa 3-D graphics library
   5  * Version:  5.1
   6  *
   7  * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
   8  *
   9  * Permission is hereby granted, free of charge, to any person obtaining a
  10  * copy of this software and associated documentation files (the "Software"),
  11  * to deal in the Software without restriction, including without limitation
  12  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  13  * and/or sell copies of the Software, and to permit persons to whom the
  14  * Software is furnished to do so, subject to the following conditions:
  15  *
  16  * The above copyright notice and this permission notice shall be included
  17  * in all copies or substantial portions of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  22  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  23  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  24  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  */
  26
  27 /**
  28  * \file nvvertexec.c
  29  * \brief Code to execute vertex programs.
  30  * \author Brian Paul
  31  */
  32
  33 #include "glheader.h"
  34 #include "context.h"
  35 #include "imports.h"
  36 #include "macros.h"
  37 #include "mtypes.h"
  38 #include "nvvertexec.h"
  39 #include "nvvertprog.h"
  40 #include "math/m_matrix.h"
  41
  42
  43 /**
  44  * Load/initialize the vertex program registers.
  45  * This needs to be done per vertex.
  46  */
  47 void
  48 _mesa_init_vp_registers(GLcontext *ctx)
  49 {
  50    struct vp_machine *machine = &(ctx->VertexProgram.Machine);
  51    GLuint i;
  52
  53    /* Input registers get initialized from the current vertex attribs */
  54    MEMCPY(machine->Registers[VP_INPUT_REG_START],
  55           ctx->Current.Attrib,
  56           16 * 4 * sizeof(GLfloat));
  57
  58    /* Output and temp regs are initialized to [0,0,0,1] */
  59    for (i = VP_OUTPUT_REG_START; i <= VP_OUTPUT_REG_END; i++) {
  60       machine->Registers[i][0] = 0.0F;
  61       machine->Registers[i][1] = 0.0F;
  62       machine->Registers[i][2] = 0.0F;
  63       machine->Registers[i][3] = 1.0F;
  64    }
  65    for (i = VP_TEMP_REG_START; i <= VP_TEMP_REG_END; i++) {
  66       machine->Registers[i][0] = 0.0F;
  67       machine->Registers[i][1] = 0.0F;
  68       machine->Registers[i][2] = 0.0F;
  69       machine->Registers[i][3] = 1.0F;
  70    }
  71
  72    /* The program regs aren't touched */
  73 }
  74
  75
  76
  77 /**
  78  * Copy the 16 elements of a matrix into four consecutive program
  79  * registers starting at 'pos'.
  80  */
  81 static void
  82 load_matrix(GLfloat registers[][4], GLuint pos, const GLfloat mat[16])
  83 {
  84    GLuint i;
  85    pos += VP_PROG_REG_START;
  86    for (i = 0; i < 4; i++) {
  87       registers[pos + i][0] = mat[0 + i];
  88       registers[pos + i][1] = mat[4 + i];
  89       registers[pos + i][2] = mat[8 + i];
  90       registers[pos + i][3] = mat[12 + i];
  91    }
  92 }
  93
  94
  95 /**
  96  * As above, but transpose the matrix.
  97  */
  98 static void
  99 load_transpose_matrix(GLfloat registers[][4], GLuint pos,
 100                       const GLfloat mat[16])
 101 {
 102    pos += VP_PROG_REG_START;
 103    MEMCPY(registers[pos], mat, 16 * sizeof(GLfloat));
 104 }
 105
 106
 107 /**
 108  * Load all currently tracked matrices into the program registers.
 109  * This needs to be done per glBegin/glEnd.
 110  */
 111 void
 112 _mesa_init_tracked_matrices(GLcontext *ctx)
 113 {
 114    GLuint i;
 115
 116    for (i = 0; i < VP_NUM_PROG_REGS / 4; i++) {
 117       /* point 'mat' at source matrix */
 118       GLmatrix *mat;
 119       if (ctx->VertexProgram.TrackMatrix[i] == GL_MODELVIEW) {
 120          mat = ctx->ModelviewMatrixStack.Top;
 121       }
 122       else if (ctx->VertexProgram.TrackMatrix[i] == GL_PROJECTION) {
 123          mat = ctx->ProjectionMatrixStack.Top;
 124       }
 125       else if (ctx->VertexProgram.TrackMatrix[i] == GL_TEXTURE) {
 126          mat = ctx->TextureMatrixStack[ctx->Texture.CurrentUnit].Top;
 127       }
 128       else if (ctx->VertexProgram.TrackMatrix[i] == GL_COLOR) {
 129          mat = ctx->ColorMatrixStack.Top;
 130       }
 131       else if (ctx->VertexProgram.TrackMatrix[i]==GL_MODELVIEW_PROJECTION_NV) {
 132          /* XXX verify the combined matrix is up to date */
 133          mat = &ctx->_ModelProjectMatrix;
 134       }
 135       else if (ctx->VertexProgram.TrackMatrix[i] >= GL_MATRIX0_NV &&
 136                ctx->VertexProgram.TrackMatrix[i] <= GL_MATRIX7_NV) {
 137          GLuint n = ctx->VertexProgram.TrackMatrix[i] - GL_MATRIX0_NV;
 138          ASSERT(n < MAX_PROGRAM_MATRICES);
 139          mat = ctx->ProgramMatrixStack[n].Top;
 140       }
 141       else {
 142          /* no matrix is tracked, but we leave the register values as-is */
 143          assert(ctx->VertexProgram.TrackMatrix[i] == GL_NONE);
 144          continue;
 145       }
 146
 147       /* load the matrix */
 148       if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
 149          load_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->m);
 150       }
 151       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_INVERSE_NV) {
 152          _math_matrix_analyse(mat); /* update the inverse */
 153          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 154          load_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->inv);
 155       }
 156       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_TRANSPOSE_NV) {
 157          load_transpose_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->m);
 158       }
 159       else {
 160          assert(ctx->VertexProgram.TrackMatrixTransform[i]
 161                 == GL_INVERSE_TRANSPOSE_NV);
 162          _math_matrix_analyse(mat); /* update the inverse */
 163          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 164          load_transpose_matrix(ctx->VertexProgram.Machine.Registers,
 165                                i*4, mat->inv);
 166       }
 167    }
 168 }
 169
 170
 171
 172 /**
 173  * For debugging.  Dump the current vertex program machine registers.
 174  */
 175 void
 176 _mesa_dump_vp_machine( const struct vp_machine *machine )
 177 {
 178    int i;
 179    _mesa_printf("VertexIn:\n");
 180    for (i = 0; i < VP_NUM_INPUT_REGS; i++) {
 181       _mesa_printf("%d: %f %f %f %f   ", i,
 182              machine->Registers[i + VP_INPUT_REG_START][0],
 183              machine->Registers[i + VP_INPUT_REG_START][1],
 184              machine->Registers[i + VP_INPUT_REG_START][2],
 185              machine->Registers[i + VP_INPUT_REG_START][3]);
 186    }
 187    _mesa_printf("\n");
 188
 189    _mesa_printf("VertexOut:\n");
 190    for (i = 0; i < VP_NUM_OUTPUT_REGS; i++) {
 191       _mesa_printf("%d: %f %f %f %f   ", i,
 192              machine->Registers[i + VP_OUTPUT_REG_START][0],
 193              machine->Registers[i + VP_OUTPUT_REG_START][1],
 194              machine->Registers[i + VP_OUTPUT_REG_START][2],
 195              machine->Registers[i + VP_OUTPUT_REG_START][3]);
 196    }
 197    _mesa_printf("\n");
 198
 199    _mesa_printf("Registers:\n");
 200    for (i = 0; i < VP_NUM_TEMP_REGS; i++) {
 201       _mesa_printf("%d: %f %f %f %f   ", i,
 202              machine->Registers[i + VP_TEMP_REG_START][0],
 203              machine->Registers[i + VP_TEMP_REG_START][1],
 204              machine->Registers[i + VP_TEMP_REG_START][2],
 205              machine->Registers[i + VP_TEMP_REG_START][3]);
 206    }
 207    _mesa_printf("\n");
 208
 209    _mesa_printf("Parameters:\n");
 210    for (i = 0; i < VP_NUM_PROG_REGS; i++) {
 211       _mesa_printf("%d: %f %f %f %f   ", i,
 212              machine->Registers[i + VP_PROG_REG_START][0],
 213              machine->Registers[i + VP_PROG_REG_START][1],
 214              machine->Registers[i + VP_PROG_REG_START][2],
 215              machine->Registers[i + VP_PROG_REG_START][3]);
 216    }
 217    _mesa_printf("\n");
 218 }
 219
 220
 221 /**
 222  * Fetch a 4-element float vector from the given source register.
 223  * Apply swizzling and negating as needed.
 224  */
 225 static void
 226 fetch_vector4( const struct vp_src_register *source,
 227                const struct vp_machine *machine,
 228                GLfloat result[4] )
 229 {
 230    static const GLfloat zero[4] = { 0, 0, 0, 0 };
 231    const GLfloat *src;
 232
 233    if (source->RelAddr) {
 234       GLint reg = source->Register + machine->AddressReg;
 235       if (reg < VP_PROG_REG_START || reg > VP_PROG_REG_END)
 236          src = zero;
 237       else
 238          src = machine->Registers[reg];
 239    }
 240    else {
 241       src = machine->Registers[source->Register];
 242    }
 243
 244    if (source->Negate) {
 245       result[0] = -src[source->Swizzle[0]];
 246       result[1] = -src[source->Swizzle[1]];
 247       result[2] = -src[source->Swizzle[2]];
 248       result[3] = -src[source->Swizzle[3]];
 249    }
 250    else {
 251       result[0] = src[source->Swizzle[0]];
 252       result[1] = src[source->Swizzle[1]];
 253       result[2] = src[source->Swizzle[2]];
 254       result[3] = src[source->Swizzle[3]];
 255    }
 256 }
 257
 258
 259 /**
 260  * As above, but only return result[0] element.
 261  */
 262 static void
 263 fetch_vector1( const struct vp_src_register *source,
 264                const struct vp_machine *machine,
 265                GLfloat result[4] )
 266 {
 267    static const GLfloat zero[4] = { 0, 0, 0, 0 };
 268    const GLfloat *src;
 269
 270    if (source->RelAddr) {
 271       GLint reg = source->Register + machine->AddressReg;
 272       if (reg < VP_PROG_REG_START || reg > VP_PROG_REG_END)
 273          src = zero;
 274       else
 275          src = machine->Registers[reg];
 276    }
 277    else {
 278       src = machine->Registers[source->Register];
 279    }
 280
 281    if (source->Negate) {
 282       result[0] = -src[source->Swizzle[0]];
 283    }
 284    else {
 285       result[0] = src[source->Swizzle[0]];
 286    }
 287 }
 288
 289
 290 /**
 291  * Store 4 floats into a register.
 292  */
 293 static void
 294 store_vector4( const struct vp_dst_register *dest, struct vp_machine *machine,
 295                const GLfloat value[4] )
 296 {
 297    GLfloat *dst = machine->Registers[dest->Register];
 298
 299    if (dest->WriteMask[0])
 300       dst[0] = value[0];
 301    if (dest->WriteMask[1])
 302       dst[1] = value[1];
 303    if (dest->WriteMask[2])
 304       dst[2] = value[2];
 305    if (dest->WriteMask[3])
 306       dst[3] = value[3];
 307 }
 308
 309
 310 /**
 311  * Set x to positive or negative infinity.
 312  */
 313 #ifdef USE_IEEE
 314 #define SET_POS_INFINITY(x)  ( *((GLuint *) &x) = 0x7F800000 )
 315 #define SET_NEG_INFINITY(x)  ( *((GLuint *) &x) = 0xFF800000 )
 316 #elif defined(VMS)
 317 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
 318 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
 319 #else
 320 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
 321 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
 322 #endif
 323
 324 #define SET_FLOAT_BITS(x, bits) ((fi_type *) &(x))->i = bits
 325
 326
 327 /**
 328  * Execute the given vertex program
 329  */
 330 void
 331 _mesa_exec_vertex_program(GLcontext *ctx, const struct vertex_program *program)
 332 {
 333    struct vp_machine *machine = &ctx->VertexProgram.Machine;
 334    const struct vp_instruction *inst;
 335
 336    /* XXX load vertex fields into input registers */
 337    /* and do other initialization */
 338
 339
 340    for (inst = program->Instructions; inst->Opcode != VP_OPCODE_END; inst++) {
 341       switch (inst->Opcode) {
 342          case VP_OPCODE_MOV:
 343             {
 344                GLfloat t[4];
 345                fetch_vector4( &inst->SrcReg[0], machine, t );
 346                store_vector4( &inst->DstReg, machine, t );
 347             }
 348             break;
 349          case VP_OPCODE_LIT:
 350             {
 351                const GLfloat epsilon = 1.0e-5F; /* XXX fix? */
 352                GLfloat t[4], lit[4];
 353                fetch_vector4( &inst->SrcReg[0], machine, t );
 354                if (t[3] < -(128.0F - epsilon))
 355                    t[3] = - (128.0F - epsilon);
 356                else if (t[3] > 128.0F - epsilon)
 357                   t[3] = 128.0F - epsilon;
 358                if (t[0] < 0.0)
 359                   t[0] = 0.0;
 360                if (t[1] < 0.0)
 361                   t[1] = 0.0;
 362                lit[0] = 1.0;
 363                lit[1] = t[0];
 364                lit[2] = (t[0] > 0.0) ? (GLfloat) exp(t[3] * log(t[1])) : 0.0F;
 365                lit[3] = 1.0;
 366                store_vector4( &inst->DstReg, machine, lit );
 367             }
 368             break;
 369          case VP_OPCODE_RCP:
 370             {
 371                GLfloat t[4];
 372                fetch_vector1( &inst->SrcReg[0], machine, t );
 373                if (t[0] != 1.0F)
 374                   t[0] = 1.0F / t[0];  /* div by zero is infinity! */
 375                t[1] = t[2] = t[3] = t[0];
 376                store_vector4( &inst->DstReg, machine, t );
 377             }
 378             break;
 379          case VP_OPCODE_RSQ:
 380             {
 381                GLfloat t[4];
 382                fetch_vector1( &inst->SrcReg[0], machine, t );
 383                t[0] = INV_SQRTF(FABSF(t[0]));
 384                t[1] = t[2] = t[3] = t[0];
 385                store_vector4( &inst->DstReg, machine, t );
 386             }
 387             break;
 388          case VP_OPCODE_EXP:
 389             {
 390                GLfloat t[4], q[4], floor_t0;
 391                fetch_vector1( &inst->SrcReg[0], machine, t );
 392                floor_t0 = (float) floor(t[0]);
 393                if (floor_t0 > FLT_MAX_EXP) {
 394                   SET_POS_INFINITY(q[0]);
 395                   q[1] = 0.0F;
 396                   SET_POS_INFINITY(q[2]);
 397                   q[3] = 1.0F;
 398                }
 399                else if (floor_t0 < FLT_MIN_EXP) {
 400                   q[0] = 0.0F;
 401                   q[1] = 0.0F;
 402                   q[2] = 0.0F;
 403                   q[3] = 0.0F;
 404                }
 405                else {
 406 #ifdef USE_IEEE
 407                   GLint ii = (GLint) floor_t0;
 408                   ii = (ii < 23) + 0x3f800000;
 409                   SET_FLOAT_BITS(q[0], ii);
 410                   q[0] = *((GLfloat *) &ii);
 411 #else
 412                   q[0] = (GLfloat) pow(2.0, floor_t0);
 413 #endif
 414                   q[1] = t[0] - floor_t0;
 415                   q[2] = (GLfloat) (q[0] * LOG2(q[1]));
 416                   q[3] = 1.0F;
 417                }
 418                store_vector4( &inst->DstReg, machine, q );
 419             }
 420             break;
 421          case VP_OPCODE_LOG:
 422             {
 423                GLfloat t[4], q[4], abs_t0;
 424                fetch_vector1( &inst->SrcReg[0], machine, t );
 425                abs_t0 = (GLfloat) fabs(t[0]);
 426                if (abs_t0 != 0.0F) {
 427                   /* Since we really can't handle infinite values on VMS
 428                    * like other OSes we'll use __MAXFLOAT to represent
 429                    * infinity.  This may need some tweaking.
 430                    */
 431 #ifdef VMS
 432                   if (abs_t0 == __MAXFLOAT) {
 433 #else
 434                   if (IS_INF_OR_NAN(abs_t0)) {
 435 #endif
 436                      SET_POS_INFINITY(q[0]);
 437                      q[1] = 1.0F;
 438                      SET_POS_INFINITY(q[2]);
 439                   }
 440                   else {
 441                      int exponent;
 442                      double mantissa = frexp(t[0], &exponent);
 443                      q[0] = (GLfloat) (exponent - 1);
 444                      q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 445                      q[2] = (GLfloat) (q[0] + LOG2(q[1]));
 446                   }
 447                }
 448                else {
 449                   SET_NEG_INFINITY(q[0]);
 450                   q[1] = 1.0F;
 451                   SET_NEG_INFINITY(q[2]);
 452                }
 453                q[3] = 1.0;
 454                store_vector4( &inst->DstReg, machine, q );
 455             }
 456             break;
 457          case VP_OPCODE_MUL:
 458             {
 459                GLfloat t[4], u[4], prod[4];
 460                fetch_vector4( &inst->SrcReg[0], machine, t );
 461                fetch_vector4( &inst->SrcReg[1], machine, u );
 462                prod[0] = t[0] * u[0];
 463                prod[1] = t[1] * u[1];
 464                prod[2] = t[2] * u[2];
 465                prod[3] = t[3] * u[3];
 466                store_vector4( &inst->DstReg, machine, prod );
 467             }
 468             break;
 469          case VP_OPCODE_ADD:
 470             {
 471                GLfloat t[4], u[4], sum[4];
 472                fetch_vector4( &inst->SrcReg[0], machine, t );
 473                fetch_vector4( &inst->SrcReg[1], machine, u );
 474                sum[0] = t[0] + u[0];
 475                sum[1] = t[1] + u[1];
 476                sum[2] = t[2] + u[2];
 477                sum[3] = t[3] + u[3];
 478                store_vector4( &inst->DstReg, machine, sum );
 479             }
 480             break;
 481          case VP_OPCODE_DP3:
 482             {
 483                GLfloat t[4], u[4], dot[4];
 484                fetch_vector4( &inst->SrcReg[0], machine, t );
 485                fetch_vector4( &inst->SrcReg[1], machine, u );
 486                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
 487                dot[1] = dot[2] = dot[3] = dot[0];
 488                store_vector4( &inst->DstReg, machine, dot );
 489             }
 490             break;
 491          case VP_OPCODE_DP4:
 492             {
 493                GLfloat t[4], u[4], dot[4];
 494                fetch_vector4( &inst->SrcReg[0], machine, t );
 495                fetch_vector4( &inst->SrcReg[1], machine, u );
 496                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + t[3] * u[3];
 497                dot[1] = dot[2] = dot[3] = dot[0];
 498                store_vector4( &inst->DstReg, machine, dot );
 499             }
 500             break;
 501          case VP_OPCODE_DST:
 502             {
 503                GLfloat t[4], u[4], dst[4];
 504                fetch_vector4( &inst->SrcReg[0], machine, t );
 505                fetch_vector4( &inst->SrcReg[1], machine, u );
 506                dst[0] = 1.0F;
 507                dst[1] = t[1] * u[1];
 508                dst[2] = t[2];
 509                dst[3] = u[3];
 510                store_vector4( &inst->DstReg, machine, dst );
 511             }
 512             break;
 513          case VP_OPCODE_MIN:
 514             {
 515                GLfloat t[4], u[4], min[4];
 516                fetch_vector4( &inst->SrcReg[0], machine, t );
 517                fetch_vector4( &inst->SrcReg[1], machine, u );
 518                min[0] = (t[0] < u[0]) ? t[0] : u[0];
 519                min[1] = (t[1] < u[1]) ? t[1] : u[1];
 520                min[2] = (t[2] < u[2]) ? t[2] : u[2];
 521                min[3] = (t[3] < u[3]) ? t[3] : u[3];
 522                store_vector4( &inst->DstReg, machine, min );
 523             }
 524             break;
 525          case VP_OPCODE_MAX:
 526             {
 527                GLfloat t[4], u[4], max[4];
 528                fetch_vector4( &inst->SrcReg[0], machine, t );
 529                fetch_vector4( &inst->SrcReg[1], machine, u );
 530                max[0] = (t[0] > u[0]) ? t[0] : u[0];
 531                max[1] = (t[1] > u[1]) ? t[1] : u[1];
 532                max[2] = (t[2] > u[2]) ? t[2] : u[2];
 533                max[3] = (t[3] > u[3]) ? t[3] : u[3];
 534                store_vector4( &inst->DstReg, machine, max );
 535             }
 536             break;
 537          case VP_OPCODE_SLT:
 538             {
 539                GLfloat t[4], u[4], slt[4];
 540                fetch_vector4( &inst->SrcReg[0], machine, t );
 541                fetch_vector4( &inst->SrcReg[1], machine, u );
 542                slt[0] = (t[0] < u[0]) ? 1.0F : 0.0F;
 543                slt[1] = (t[1] < u[1]) ? 1.0F : 0.0F;
 544                slt[2] = (t[2] < u[2]) ? 1.0F : 0.0F;
 545                slt[3] = (t[3] < u[3]) ? 1.0F : 0.0F;
 546                store_vector4( &inst->DstReg, machine, slt );
 547             }
 548             break;
 549          case VP_OPCODE_SGE:
 550             {
 551                GLfloat t[4], u[4], sge[4];
 552                fetch_vector4( &inst->SrcReg[0], machine, t );
 553                fetch_vector4( &inst->SrcReg[1], machine, u );
 554                sge[0] = (t[0] >= u[0]) ? 1.0F : 0.0F;
 555                sge[1] = (t[1] >= u[1]) ? 1.0F : 0.0F;
 556                sge[2] = (t[2] >= u[2]) ? 1.0F : 0.0F;
 557                sge[3] = (t[3] >= u[3]) ? 1.0F : 0.0F;
 558                store_vector4( &inst->DstReg, machine, sge );
 559             }
 560             break;
 561          case VP_OPCODE_MAD:
 562             {
 563                GLfloat t[4], u[4], v[4], sum[4];
 564                fetch_vector4( &inst->SrcReg[0], machine, t );
 565                fetch_vector4( &inst->SrcReg[1], machine, u );
 566                fetch_vector4( &inst->SrcReg[2], machine, v );
 567                sum[0] = t[0] * u[0] + v[0];
 568                sum[1] = t[1] * u[1] + v[1];
 569                sum[2] = t[2] * u[2] + v[2];
 570                sum[3] = t[3] * u[3] + v[3];
 571                store_vector4( &inst->DstReg, machine, sum );
 572             }
 573             break;
 574          case VP_OPCODE_ARL:
 575             {
 576                GLfloat t[4];
 577                fetch_vector4( &inst->SrcReg[0], machine, t );
 578                machine->AddressReg = (GLint) floor(t[0]);
 579             }
 580             break;
 581          case VP_OPCODE_DPH:
 582             {
 583                GLfloat t[4], u[4], dot[4];
 584                fetch_vector4( &inst->SrcReg[0], machine, t );
 585                fetch_vector4( &inst->SrcReg[1], machine, u );
 586                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + u[3];
 587                dot[1] = dot[2] = dot[3] = dot[0];
 588                store_vector4( &inst->DstReg, machine, dot );
 589             }
 590             break;
 591          case VP_OPCODE_RCC:
 592             {
 593                GLfloat t[4], u;
 594                fetch_vector1( &inst->SrcReg[0], machine, t );
 595                if (t[0] == 1.0F)
 596                   u = 1.0F;
 597                else
 598                   u = 1.0F / t[0];
 599                if (u > 0.0F) {
 600                   if (u > 1.884467e+019F) {
 601                      u = 1.884467e+019F;  /* IEEE 32-bit binary value 0x5F800000 */
 602                   }
 603                   else if (u < 5.42101e-020F) {
 604                      u = 5.42101e-020F;   /* IEEE 32-bit binary value 0x1F800000 */
 605                   }
 606                }
 607                else {
 608                   if (u < -1.884467e+019F) {
 609                      u = -1.884467e+019F; /* IEEE 32-bit binary value 0xDF800000 */
 610                   }
 611                   else if (u > -5.42101e-020F) {
 612                      u = -5.42101e-020F;  /* IEEE 32-bit binary value 0x9F800000 */
 613                   }
 614                }
 615                t[0] = t[1] = t[2] = t[3] = u;
 616                store_vector4( &inst->DstReg, machine, t );
 617             }
 618             break;
 619          case VP_OPCODE_SUB:
 620             {
 621                GLfloat t[4], u[4], sum[4];
 622                fetch_vector4( &inst->SrcReg[0], machine, t );
 623                fetch_vector4( &inst->SrcReg[1], machine, u );
 624                sum[0] = t[0] - u[0];
 625                sum[1] = t[1] - u[1];
 626                sum[2] = t[2] - u[2];
 627                sum[3] = t[3] - u[3];
 628                store_vector4( &inst->DstReg, machine, sum );
 629             }
 630             break;
 631          case VP_OPCODE_ABS:
 632             {
 633                GLfloat t[4];
 634                fetch_vector4( &inst->SrcReg[0], machine, t );
 635                if (t[0] < 0.0)  t[0] = -t[0];
 636                if (t[1] < 0.0)  t[1] = -t[1];
 637                if (t[2] < 0.0)  t[2] = -t[2];
 638                if (t[3] < 0.0)  t[3] = -t[3];
 639                store_vector4( &inst->DstReg, machine, t );
 640             }
 641             break;
 642
 643          case VP_OPCODE_END:
 644             return;
 645          default:
 646             /* bad instruction opcode */
 647             _mesa_problem(ctx, "Bad VP Opcode in _mesa_exec_vertex_program");
 648             return;
 649       }
 650    }
 651 }
 652
 653
 654
 655 /**
 656 Thoughts on vertex program optimization:
 657
 658 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
 659 assembly code.  That will probably be a lot of work.
 660
 661 Another approach might be to replace the vp_instruction->Opcode field with
 662 a pointer to a specialized C function which executes the instruction.
 663 In particular we can write functions which skip swizzling, negating,
 664 masking, relative addressing, etc. when they're not needed.
 665
 666 For example:
 667
 668 void simple_add( struct vp_instruction *inst )
 669 {
 670    GLfloat *sum = machine->Registers[inst->DstReg.Register];
 671    GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
 672    GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
 673    sum[0] = a[0] + b[0];
 674    sum[1] = a[1] + b[1];
 675    sum[2] = a[2] + b[2];
 676    sum[3] = a[3] + b[3];
 677 }
 678
 679 */
 680
 681 /*
 682
 683 KW:
 684
 685 A first step would be to 'vectorize' the programs in the same way as
 686 the normal transformation code in the tnl module.  Thus each opcode
 687 takes zero or more input vectors (registers) and produces one or more
 688 output vectors.
 689
  690 These operations would initially be coded in C, with machine-specific
 691 assembly following, as is currently the case for matrix
 692 transformations in the math/ directory.  The preprocessing scheme for
 693 selecting simpler operations Brian describes above would also work
 694 here.
 695
 696 This should give reasonable performance without excessive effort.
 697
 698 */