src/mesa/main/nvvertexec.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  5.1
   4  *
   5  * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file nvvertexec.c
  27  * \brief Code to execute vertex programs.
  28  * \author Brian Paul
  29  */
  30
  31 #include "glheader.h"
  32 #include "context.h"
  33 #include "imports.h"
  34 #include "macros.h"
  35 #include "mtypes.h"
  36 #include "nvvertexec.h"
  37 #include "nvvertprog.h"
  38 #include "math/m_matrix.h"
  39
  40
  41 static const GLfloat zeroVec[4] = { 0, 0, 0, 0 };
  42
  43
  44 /**
  45  * Load/initialize the vertex program registers.
  46  * This needs to be done per vertex.
  47  */
  48 void
  49 _mesa_init_vp_registers(GLcontext *ctx)
  50 {
  51    struct vp_machine *machine = &(ctx->VertexProgram.Machine);
  52    GLuint i;
  53
  54    /* Input registers get initialized from the current vertex attribs */
  55    MEMCPY(machine->Registers[VP_INPUT_REG_START],
  56           ctx->Current.Attrib,
  57           16 * 4 * sizeof(GLfloat));
  58
  59    /* Output and temp regs are initialized to [0,0,0,1] */
  60    for (i = VP_OUTPUT_REG_START; i <= VP_OUTPUT_REG_END; i++) {
  61       machine->Registers[i][0] = 0.0F;
  62       machine->Registers[i][1] = 0.0F;
  63       machine->Registers[i][2] = 0.0F;
  64       machine->Registers[i][3] = 1.0F;
  65    }
  66    for (i = VP_TEMP_REG_START; i <= VP_TEMP_REG_END; i++) {
  67       machine->Registers[i][0] = 0.0F;
  68       machine->Registers[i][1] = 0.0F;
  69       machine->Registers[i][2] = 0.0F;
  70       machine->Registers[i][3] = 1.0F;
  71    }
  72
  73    /* The program regs aren't touched */
  74 }
  75
  76
  77
  78 /**
  79  * Copy the 16 elements of a matrix into four consecutive program
  80  * registers starting at 'pos'.
  81  */
  82 static void
  83 load_matrix(GLfloat registers[][4], GLuint pos, const GLfloat mat[16])
  84 {
  85    GLuint i;
  86    pos += VP_PROG_REG_START;
  87    for (i = 0; i < 4; i++) {
  88       registers[pos + i][0] = mat[0 + i];
  89       registers[pos + i][1] = mat[4 + i];
  90       registers[pos + i][2] = mat[8 + i];
  91       registers[pos + i][3] = mat[12 + i];
  92    }
  93 }
  94
  95
  96 /**
  97  * As above, but transpose the matrix.
  98  */
  99 static void
 100 load_transpose_matrix(GLfloat registers[][4], GLuint pos,
 101                       const GLfloat mat[16])
 102 {
 103    pos += VP_PROG_REG_START;
 104    MEMCPY(registers[pos], mat, 16 * sizeof(GLfloat));
 105 }
 106
 107
 108 /**
 109  * Load all currently tracked matrices into the program registers.
 110  * This needs to be done per glBegin/glEnd.
 111  */
 112 void
 113 _mesa_init_tracked_matrices(GLcontext *ctx)
 114 {
 115    GLuint i;
 116
 117    for (i = 0; i < VP_NUM_PROG_REGS / 4; i++) {
 118       /* point 'mat' at source matrix */
 119       GLmatrix *mat;
 120       if (ctx->VertexProgram.TrackMatrix[i] == GL_MODELVIEW) {
 121          mat = ctx->ModelviewMatrixStack.Top;
 122       }
 123       else if (ctx->VertexProgram.TrackMatrix[i] == GL_PROJECTION) {
 124          mat = ctx->ProjectionMatrixStack.Top;
 125       }
 126       else if (ctx->VertexProgram.TrackMatrix[i] == GL_TEXTURE) {
 127          mat = ctx->TextureMatrixStack[ctx->Texture.CurrentUnit].Top;
 128       }
 129       else if (ctx->VertexProgram.TrackMatrix[i] == GL_COLOR) {
 130          mat = ctx->ColorMatrixStack.Top;
 131       }
 132       else if (ctx->VertexProgram.TrackMatrix[i]==GL_MODELVIEW_PROJECTION_NV) {
 133          /* XXX verify the combined matrix is up to date */
 134          mat = &ctx->_ModelProjectMatrix;
 135       }
 136       else if (ctx->VertexProgram.TrackMatrix[i] >= GL_MATRIX0_NV &&
 137                ctx->VertexProgram.TrackMatrix[i] <= GL_MATRIX7_NV) {
 138          GLuint n = ctx->VertexProgram.TrackMatrix[i] - GL_MATRIX0_NV;
 139          ASSERT(n < MAX_PROGRAM_MATRICES);
 140          mat = ctx->ProgramMatrixStack[n].Top;
 141       }
 142       else {
 143          /* no matrix is tracked, but we leave the register values as-is */
 144          assert(ctx->VertexProgram.TrackMatrix[i] == GL_NONE);
 145          continue;
 146       }
 147
 148       /* load the matrix */
 149       if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
 150          load_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->m);
 151       }
 152       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_INVERSE_NV) {
 153          _math_matrix_analyse(mat); /* update the inverse */
 154          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 155          load_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->inv);
 156       }
 157       else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_TRANSPOSE_NV) {
 158          load_transpose_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->m);
 159       }
 160       else {
 161          assert(ctx->VertexProgram.TrackMatrixTransform[i]
 162                 == GL_INVERSE_TRANSPOSE_NV);
 163          _math_matrix_analyse(mat); /* update the inverse */
 164          assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
 165          load_transpose_matrix(ctx->VertexProgram.Machine.Registers,
 166                                i*4, mat->inv);
 167       }
 168    }
 169 }
 170
 171
 172
 173 /**
 174  * For debugging.  Dump the current vertex program machine registers.
 175  */
 176 void
 177 _mesa_dump_vp_machine( const struct vp_machine *machine )
 178 {
 179    int i;
 180    _mesa_printf("VertexIn:\n");
 181    for (i = 0; i < VP_NUM_INPUT_REGS; i++) {
 182       _mesa_printf("%d: %f %f %f %f   ", i,
 183              machine->Registers[i + VP_INPUT_REG_START][0],
 184              machine->Registers[i + VP_INPUT_REG_START][1],
 185              machine->Registers[i + VP_INPUT_REG_START][2],
 186              machine->Registers[i + VP_INPUT_REG_START][3]);
 187    }
 188    _mesa_printf("\n");
 189
 190    _mesa_printf("VertexOut:\n");
 191    for (i = 0; i < VP_NUM_OUTPUT_REGS; i++) {
 192       _mesa_printf("%d: %f %f %f %f   ", i,
 193              machine->Registers[i + VP_OUTPUT_REG_START][0],
 194              machine->Registers[i + VP_OUTPUT_REG_START][1],
 195              machine->Registers[i + VP_OUTPUT_REG_START][2],
 196              machine->Registers[i + VP_OUTPUT_REG_START][3]);
 197    }
 198    _mesa_printf("\n");
 199
 200    _mesa_printf("Registers:\n");
 201    for (i = 0; i < VP_NUM_TEMP_REGS; i++) {
 202       _mesa_printf("%d: %f %f %f %f   ", i,
 203              machine->Registers[i + VP_TEMP_REG_START][0],
 204              machine->Registers[i + VP_TEMP_REG_START][1],
 205              machine->Registers[i + VP_TEMP_REG_START][2],
 206              machine->Registers[i + VP_TEMP_REG_START][3]);
 207    }
 208    _mesa_printf("\n");
 209
 210    _mesa_printf("Parameters:\n");
 211    for (i = 0; i < VP_NUM_PROG_REGS; i++) {
 212       _mesa_printf("%d: %f %f %f %f   ", i,
 213              machine->Registers[i + VP_PROG_REG_START][0],
 214              machine->Registers[i + VP_PROG_REG_START][1],
 215              machine->Registers[i + VP_PROG_REG_START][2],
 216              machine->Registers[i + VP_PROG_REG_START][3]);
 217    }
 218    _mesa_printf("\n");
 219 }
 220
 221
 222 /**
 223  * Fetch a 4-element float vector from the given source register.
 224  * Apply swizzling and negating as needed.
 225  */
 226 static void
 227 fetch_vector4( const struct vp_src_register *source,
 228                const struct vp_machine *machine,
 229                GLfloat result[4] )
 230 {
 231    const GLfloat *src;
 232
 233    if (source->RelAddr) {
 234       const GLint reg = source->Register + machine->AddressReg;
 235       if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
 236          src = zeroVec;
 237       else
 238          src = machine->Registers[VP_PROG_REG_START + reg];
 239    }
 240    else {
 241       src = machine->Registers[source->Register];
 242    }
 243
 244    if (source->Negate) {
 245       result[0] = -src[source->Swizzle[0]];
 246       result[1] = -src[source->Swizzle[1]];
 247       result[2] = -src[source->Swizzle[2]];
 248       result[3] = -src[source->Swizzle[3]];
 249    }
 250    else {
 251       result[0] = src[source->Swizzle[0]];
 252       result[1] = src[source->Swizzle[1]];
 253       result[2] = src[source->Swizzle[2]];
 254       result[3] = src[source->Swizzle[3]];
 255    }
 256 }
 257
 258
 259 /**
 260  * As above, but only return result[0] element.
 261  */
 262 static void
 263 fetch_vector1( const struct vp_src_register *source,
 264                const struct vp_machine *machine,
 265                GLfloat result[4] )
 266 {
 267    const GLfloat *src;
 268
 269    if (source->RelAddr) {
 270       const GLint reg = source->Register + machine->AddressReg;
 271       if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
 272          src = zeroVec;
 273       else
 274          src = machine->Registers[VP_PROG_REG_START + reg];
 275    }
 276    else {
 277       src = machine->Registers[source->Register];
 278    }
 279
 280    if (source->Negate) {
 281       result[0] = -src[source->Swizzle[0]];
 282    }
 283    else {
 284       result[0] = src[source->Swizzle[0]];
 285    }
 286 }
 287
 288
 289 /**
 290  * Store 4 floats into a register.
 291  */
 292 static void
 293 store_vector4( const struct vp_dst_register *dest, struct vp_machine *machine,
 294                const GLfloat value[4] )
 295 {
 296    GLfloat *dst = machine->Registers[dest->Register];
 297
 298    if (dest->WriteMask[0])
 299       dst[0] = value[0];
 300    if (dest->WriteMask[1])
 301       dst[1] = value[1];
 302    if (dest->WriteMask[2])
 303       dst[2] = value[2];
 304    if (dest->WriteMask[3])
 305       dst[3] = value[3];
 306 }
 307
 308
 309 /**
 310  * Set x to positive or negative infinity.
 311  */
 312 #ifdef USE_IEEE
 313 #define SET_POS_INFINITY(x)  ( *((GLuint *) &x) = 0x7F800000 )
 314 #define SET_NEG_INFINITY(x)  ( *((GLuint *) &x) = 0xFF800000 )
 315 #elif defined(VMS)
 316 #define SET_POS_INFINITY(x)  x = __MAXFLOAT
 317 #define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
 318 #else
 319 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
 320 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
 321 #endif
 322
 323 #define SET_FLOAT_BITS(x, bits) ((fi_type *) &(x))->i = bits
 324
 325
 326 /**
 327  * Execute the given vertex program
 328  */
 329 void
 330 _mesa_exec_vertex_program(GLcontext *ctx, const struct vertex_program *program)
 331 {
 332    struct vp_machine *machine = &ctx->VertexProgram.Machine;
 333    const struct vp_instruction *inst;
 334
 335    for (inst = program->Instructions; inst->Opcode != VP_OPCODE_END; inst++) {
 336       switch (inst->Opcode) {
 337          case VP_OPCODE_MOV:
 338             {
 339                GLfloat t[4];
 340                fetch_vector4( &inst->SrcReg[0], machine, t );
 341                store_vector4( &inst->DstReg, machine, t );
 342             }
 343             break;
 344          case VP_OPCODE_LIT:
 345             {
 346                const GLfloat epsilon = 1.0e-5F; /* XXX fix? */
 347                GLfloat t[4], lit[4];
 348                fetch_vector4( &inst->SrcReg[0], machine, t );
 349                if (t[3] < -(128.0F - epsilon))
 350                    t[3] = - (128.0F - epsilon);
 351                else if (t[3] > 128.0F - epsilon)
 352                   t[3] = 128.0F - epsilon;
 353                if (t[0] < 0.0)
 354                   t[0] = 0.0;
 355                if (t[1] < 0.0)
 356                   t[1] = 0.0;
 357                lit[0] = 1.0;
 358                lit[1] = t[0];
 359                lit[2] = (t[0] > 0.0) ? (GLfloat) exp(t[3] * log(t[1])) : 0.0F;
 360                lit[3] = 1.0;
 361                store_vector4( &inst->DstReg, machine, lit );
 362             }
 363             break;
 364          case VP_OPCODE_RCP:
 365             {
 366                GLfloat t[4];
 367                fetch_vector1( &inst->SrcReg[0], machine, t );
 368                if (t[0] != 1.0F)
 369                   t[0] = 1.0F / t[0];  /* div by zero is infinity! */
 370                t[1] = t[2] = t[3] = t[0];
 371                store_vector4( &inst->DstReg, machine, t );
 372             }
 373             break;
 374          case VP_OPCODE_RSQ:
 375             {
 376                GLfloat t[4];
 377                fetch_vector1( &inst->SrcReg[0], machine, t );
 378                t[0] = INV_SQRTF(FABSF(t[0]));
 379                t[1] = t[2] = t[3] = t[0];
 380                store_vector4( &inst->DstReg, machine, t );
 381             }
 382             break;
 383          case VP_OPCODE_EXP:
 384             {
 385                GLfloat t[4], q[4], floor_t0;
 386                fetch_vector1( &inst->SrcReg[0], machine, t );
 387                floor_t0 = (float) floor(t[0]);
 388                if (floor_t0 > FLT_MAX_EXP) {
 389                   SET_POS_INFINITY(q[0]);
 390                   SET_POS_INFINITY(q[2]);
 391                }
 392                else if (floor_t0 < FLT_MIN_EXP) {
 393                   q[0] = 0.0F;
 394                   q[2] = 0.0F;
 395                }
 396                else {
 397 #ifdef USE_IEEE
 398                   GLint ii = (GLint) floor_t0;
 399                   ii = (ii < 23) + 0x3f800000;
 400                   SET_FLOAT_BITS(q[0], ii);
 401                   q[0] = *((GLfloat *) &ii);
 402 #else
 403                   q[0] = (GLfloat) pow(2.0, floor_t0);
 404 #endif
 405                   q[2] = (GLfloat) (q[0] * LOG2(q[1]));
 406                }
 407                q[1] = t[0] - floor_t0;
 408                q[3] = 1.0F;
 409                store_vector4( &inst->DstReg, machine, q );
 410             }
 411             break;
 412          case VP_OPCODE_LOG:
 413             {
 414                GLfloat t[4], q[4], abs_t0;
 415                fetch_vector1( &inst->SrcReg[0], machine, t );
 416                abs_t0 = (GLfloat) fabs(t[0]);
 417                if (abs_t0 != 0.0F) {
 418                   /* Since we really can't handle infinite values on VMS
 419                    * like other OSes we'll use __MAXFLOAT to represent
 420                    * infinity.  This may need some tweaking.
 421                    */
 422 #ifdef VMS
 423                   if (abs_t0 == __MAXFLOAT) {
 424 #else
 425                   if (IS_INF_OR_NAN(abs_t0)) {
 426 #endif
 427                      SET_POS_INFINITY(q[0]);
 428                      q[1] = 1.0F;
 429                      SET_POS_INFINITY(q[2]);
 430                   }
 431                   else {
 432                      int exponent;
 433                      double mantissa = frexp(t[0], &exponent);
 434                      q[0] = (GLfloat) (exponent - 1);
 435                      q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 436                      q[2] = (GLfloat) (q[0] + LOG2(q[1]));
 437                   }
 438                }
 439                else {
 440                   SET_NEG_INFINITY(q[0]);
 441                   q[1] = 1.0F;
 442                   SET_NEG_INFINITY(q[2]);
 443                }
 444                q[3] = 1.0;
 445                store_vector4( &inst->DstReg, machine, q );
 446             }
 447             break;
 448          case VP_OPCODE_MUL:
 449             {
 450                GLfloat t[4], u[4], prod[4];
 451                fetch_vector4( &inst->SrcReg[0], machine, t );
 452                fetch_vector4( &inst->SrcReg[1], machine, u );
 453                prod[0] = t[0] * u[0];
 454                prod[1] = t[1] * u[1];
 455                prod[2] = t[2] * u[2];
 456                prod[3] = t[3] * u[3];
 457                store_vector4( &inst->DstReg, machine, prod );
 458             }
 459             break;
 460          case VP_OPCODE_ADD:
 461             {
 462                GLfloat t[4], u[4], sum[4];
 463                fetch_vector4( &inst->SrcReg[0], machine, t );
 464                fetch_vector4( &inst->SrcReg[1], machine, u );
 465                sum[0] = t[0] + u[0];
 466                sum[1] = t[1] + u[1];
 467                sum[2] = t[2] + u[2];
 468                sum[3] = t[3] + u[3];
 469                store_vector4( &inst->DstReg, machine, sum );
 470             }
 471             break;
 472          case VP_OPCODE_DP3:
 473             {
 474                GLfloat t[4], u[4], dot[4];
 475                fetch_vector4( &inst->SrcReg[0], machine, t );
 476                fetch_vector4( &inst->SrcReg[1], machine, u );
 477                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
 478                dot[1] = dot[2] = dot[3] = dot[0];
 479                store_vector4( &inst->DstReg, machine, dot );
 480             }
 481             break;
 482          case VP_OPCODE_DP4:
 483             {
 484                GLfloat t[4], u[4], dot[4];
 485                fetch_vector4( &inst->SrcReg[0], machine, t );
 486                fetch_vector4( &inst->SrcReg[1], machine, u );
 487                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + t[3] * u[3];
 488                dot[1] = dot[2] = dot[3] = dot[0];
 489                store_vector4( &inst->DstReg, machine, dot );
 490             }
 491             break;
 492          case VP_OPCODE_DST:
 493             {
 494                GLfloat t[4], u[4], dst[4];
 495                fetch_vector4( &inst->SrcReg[0], machine, t );
 496                fetch_vector4( &inst->SrcReg[1], machine, u );
 497                dst[0] = 1.0F;
 498                dst[1] = t[1] * u[1];
 499                dst[2] = t[2];
 500                dst[3] = u[3];
 501                store_vector4( &inst->DstReg, machine, dst );
 502             }
 503             break;
 504          case VP_OPCODE_MIN:
 505             {
 506                GLfloat t[4], u[4], min[4];
 507                fetch_vector4( &inst->SrcReg[0], machine, t );
 508                fetch_vector4( &inst->SrcReg[1], machine, u );
 509                min[0] = (t[0] < u[0]) ? t[0] : u[0];
 510                min[1] = (t[1] < u[1]) ? t[1] : u[1];
 511                min[2] = (t[2] < u[2]) ? t[2] : u[2];
 512                min[3] = (t[3] < u[3]) ? t[3] : u[3];
 513                store_vector4( &inst->DstReg, machine, min );
 514             }
 515             break;
 516          case VP_OPCODE_MAX:
 517             {
 518                GLfloat t[4], u[4], max[4];
 519                fetch_vector4( &inst->SrcReg[0], machine, t );
 520                fetch_vector4( &inst->SrcReg[1], machine, u );
 521                max[0] = (t[0] > u[0]) ? t[0] : u[0];
 522                max[1] = (t[1] > u[1]) ? t[1] : u[1];
 523                max[2] = (t[2] > u[2]) ? t[2] : u[2];
 524                max[3] = (t[3] > u[3]) ? t[3] : u[3];
 525                store_vector4( &inst->DstReg, machine, max );
 526             }
 527             break;
 528          case VP_OPCODE_SLT:
 529             {
 530                GLfloat t[4], u[4], slt[4];
 531                fetch_vector4( &inst->SrcReg[0], machine, t );
 532                fetch_vector4( &inst->SrcReg[1], machine, u );
 533                slt[0] = (t[0] < u[0]) ? 1.0F : 0.0F;
 534                slt[1] = (t[1] < u[1]) ? 1.0F : 0.0F;
 535                slt[2] = (t[2] < u[2]) ? 1.0F : 0.0F;
 536                slt[3] = (t[3] < u[3]) ? 1.0F : 0.0F;
 537                store_vector4( &inst->DstReg, machine, slt );
 538             }
 539             break;
 540          case VP_OPCODE_SGE:
 541             {
 542                GLfloat t[4], u[4], sge[4];
 543                fetch_vector4( &inst->SrcReg[0], machine, t );
 544                fetch_vector4( &inst->SrcReg[1], machine, u );
 545                sge[0] = (t[0] >= u[0]) ? 1.0F : 0.0F;
 546                sge[1] = (t[1] >= u[1]) ? 1.0F : 0.0F;
 547                sge[2] = (t[2] >= u[2]) ? 1.0F : 0.0F;
 548                sge[3] = (t[3] >= u[3]) ? 1.0F : 0.0F;
 549                store_vector4( &inst->DstReg, machine, sge );
 550             }
 551             break;
 552          case VP_OPCODE_MAD:
 553             {
 554                GLfloat t[4], u[4], v[4], sum[4];
 555                fetch_vector4( &inst->SrcReg[0], machine, t );
 556                fetch_vector4( &inst->SrcReg[1], machine, u );
 557                fetch_vector4( &inst->SrcReg[2], machine, v );
 558                sum[0] = t[0] * u[0] + v[0];
 559                sum[1] = t[1] * u[1] + v[1];
 560                sum[2] = t[2] * u[2] + v[2];
 561                sum[3] = t[3] * u[3] + v[3];
 562                store_vector4( &inst->DstReg, machine, sum );
 563             }
 564             break;
 565          case VP_OPCODE_ARL:
 566             {
 567                GLfloat t[4];
 568                fetch_vector4( &inst->SrcReg[0], machine, t );
 569                machine->AddressReg = (GLint) floor(t[0]);
 570             }
 571             break;
 572          case VP_OPCODE_DPH:
 573             {
 574                GLfloat t[4], u[4], dot[4];
 575                fetch_vector4( &inst->SrcReg[0], machine, t );
 576                fetch_vector4( &inst->SrcReg[1], machine, u );
 577                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + u[3];
 578                dot[1] = dot[2] = dot[3] = dot[0];
 579                store_vector4( &inst->DstReg, machine, dot );
 580             }
 581             break;
 582          case VP_OPCODE_RCC:
 583             {
 584                GLfloat t[4], u;
 585                fetch_vector1( &inst->SrcReg[0], machine, t );
 586                if (t[0] == 1.0F)
 587                   u = 1.0F;
 588                else
 589                   u = 1.0F / t[0];
 590                if (u > 0.0F) {
 591                   if (u > 1.884467e+019F) {
 592                      u = 1.884467e+019F;  /* IEEE 32-bit binary value 0x5F800000 */
 593                   }
 594                   else if (u < 5.42101e-020F) {
 595                      u = 5.42101e-020F;   /* IEEE 32-bit binary value 0x1F800000 */
 596                   }
 597                }
 598                else {
 599                   if (u < -1.884467e+019F) {
 600                      u = -1.884467e+019F; /* IEEE 32-bit binary value 0xDF800000 */
 601                   }
 602                   else if (u > -5.42101e-020F) {
 603                      u = -5.42101e-020F;  /* IEEE 32-bit binary value 0x9F800000 */
 604                   }
 605                }
 606                t[0] = t[1] = t[2] = t[3] = u;
 607                store_vector4( &inst->DstReg, machine, t );
 608             }
 609             break;
 610          case VP_OPCODE_SUB: /* GL_NV_vertex_program1_1 */
 611             {
 612                GLfloat t[4], u[4], sum[4];
 613                fetch_vector4( &inst->SrcReg[0], machine, t );
 614                fetch_vector4( &inst->SrcReg[1], machine, u );
 615                sum[0] = t[0] - u[0];
 616                sum[1] = t[1] - u[1];
 617                sum[2] = t[2] - u[2];
 618                sum[3] = t[3] - u[3];
 619                store_vector4( &inst->DstReg, machine, sum );
 620             }
 621             break;
 622          case VP_OPCODE_ABS: /* GL_NV_vertex_program1_1 */
 623             {
 624                GLfloat t[4];
 625                fetch_vector4( &inst->SrcReg[0], machine, t );
 626                if (t[0] < 0.0)  t[0] = -t[0];
 627                if (t[1] < 0.0)  t[1] = -t[1];
 628                if (t[2] < 0.0)  t[2] = -t[2];
 629                if (t[3] < 0.0)  t[3] = -t[3];
 630                store_vector4( &inst->DstReg, machine, t );
 631             }
 632             break;
 633          case VP_OPCODE_FLR: /* GL_ARB_vertex_program */
 634             {
 635                GLfloat t[4];
 636                fetch_vector4( &inst->SrcReg[0], machine, t );
 637                t[0] = FLOORF(t[0]);
 638                t[1] = FLOORF(t[1]);
 639                t[2] = FLOORF(t[2]);
 640                t[3] = FLOORF(t[3]);
 641                store_vector4( &inst->DstReg, machine, t );
 642             }
 643             break;
 644          case VP_OPCODE_FRC: /* GL_ARB_vertex_program */
 645             {
 646                GLfloat t[4];
 647                fetch_vector4( &inst->SrcReg[0], machine, t );
 648                t[0] = t[0] - FLOORF(t[0]);
 649                t[1] = t[1] - FLOORF(t[1]);
 650                t[2] = t[2] - FLOORF(t[2]);
 651                t[3] = t[3] - FLOORF(t[3]);
 652                store_vector4( &inst->DstReg, machine, t );
 653             }
 654             break;
 655          case VP_OPCODE_EX2: /* GL_ARB_vertex_program */
 656             {
 657                GLfloat t[4];
 658                fetch_vector1( &inst->SrcReg[0], machine, t );
 659                t[0] = t[1] = t[2] = t[3] = _mesa_pow(2.0, t[0]);
 660                store_vector4( &inst->DstReg, machine, t );
 661             }
 662             break;
 663          case VP_OPCODE_LG2: /* GL_ARB_vertex_program */
 664             {
 665                GLfloat t[4];
 666                fetch_vector1( &inst->SrcReg[0], machine, t );
 667                t[0] = t[1] = t[2] = t[3] = LOG2(t[0]);
 668                store_vector4( &inst->DstReg, machine, t );
 669             }
 670             break;
 671          case VP_OPCODE_POW: /* GL_ARB_vertex_program */
 672             {
 673                GLfloat t[4], u[4];
 674                fetch_vector1( &inst->SrcReg[0], machine, t );
 675                fetch_vector1( &inst->SrcReg[1], machine, u );
 676                t[0] = t[1] = t[2] = t[3] = _mesa_pow(t[0], u[0]);
 677                store_vector4( &inst->DstReg, machine, t );
 678             }
 679             break;
 680          case VP_OPCODE_XPD: /* GL_ARB_vertex_program */
 681             {
 682                GLfloat t[4], u[4], cross[4];
 683                fetch_vector4( &inst->SrcReg[0], machine, t );
 684                fetch_vector4( &inst->SrcReg[1], machine, u );
 685                cross[0] = t[1] * u[2] - t[2] * u[1];
 686                cross[1] = t[2] * u[0] - t[0] * u[2];
 687                cross[2] = t[0] * u[1] - t[1] * u[0];
 688                store_vector4( &inst->DstReg, machine, cross );
 689             }
 690             break;
 691          case VP_OPCODE_SWZ: /* GL_ARB_vertex_program */
 692             {
 693                const struct vp_src_register *source = &inst->SrcReg[0];
 694                const GLfloat *src;
 695                GLfloat result[4];
 696                GLuint i;
 697
 698                /* Code similar to fetch_vector4() */
 699                if (source->RelAddr) {
 700                   const GLint reg = source->Register + machine->AddressReg;
 701                   if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
 702                      src = zeroVec;
 703                   else
 704                      src = machine->Registers[VP_PROG_REG_START + reg];
 705                }
 706                else {
 707                   src = machine->Registers[source->Register];
 708                }
 709
 710                /* extended swizzling here */
 711                for (i = 0; i < 3; i++) {
 712                   if (source->Swizzle[i] == SWIZZLE_ZERO)
 713                      result[i] = 0.0;
 714                   else if (source->Swizzle[i] == SWIZZLE_ONE)
 715                      result[i] = -1.0;
 716                   else
 717                      result[i] = -src[source->Swizzle[i]];
 718                   if (source->Negate)
 719                      result[i] = -result[i];
 720                }
 721                store_vector4( &inst->DstReg, machine, result );
 722             }
 723             break;
 724
 725          case VP_OPCODE_END:
 726             return;
 727          default:
 728             /* bad instruction opcode */
 729             _mesa_problem(ctx, "Bad VP Opcode in _mesa_exec_vertex_program");
 730             return;
 731       }
 732    }
 733 }
 734
 735
 736
 737 /**
 738 Thoughts on vertex program optimization:
 739
 740 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
 741 assembly code.  That will probably be a lot of work.
 742
 743 Another approach might be to replace the vp_instruction->Opcode field with
 744 a pointer to a specialized C function which executes the instruction.
 745 In particular we can write functions which skip swizzling, negating,
 746 masking, relative addressing, etc. when they're not needed.
 747
 748 For example:
 749
 750 void simple_add( struct vp_instruction *inst )
 751 {
 752    GLfloat *sum = machine->Registers[inst->DstReg.Register];
 753    GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
 754    GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
 755    sum[0] = a[0] + b[0];
 756    sum[1] = a[1] + b[1];
 757    sum[2] = a[2] + b[2];
 758    sum[3] = a[3] + b[3];
 759 }
 760
 761 */
 762
 763 /*
 764
 765 KW:
 766
 767 A first step would be to 'vectorize' the programs in the same way as
 768 the normal transformation code in the tnl module.  Thus each opcode
 769 takes zero or more input vectors (registers) and produces one or more
 770 output vectors.
 771
  772 These operations would initially be coded in C, with machine-specific
 773 assembly following, as is currently the case for matrix
 774 transformations in the math/ directory.  The preprocessing scheme for
 775 selecting simpler operations Brian describes above would also work
 776 here.
 777
 778 This should give reasonable performance without excessive effort.
 779
 780 */