2 * Mesa 3-D graphics library
5 * Copyright (C) 1999-2003 Brian Paul All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 * Code to execute vertex programs.
36 #include "nvvertexec.h"
37 #include "nvvertprog.h"
38 #include "math/m_matrix.h"
41 static const GLfloat zeroVec
[4] = { 0, 0, 0, 0 };
45 * Load/initialize the vertex program registers.
46 * This needs to be done per vertex.
49 _mesa_init_vp_registers(GLcontext
*ctx
)
51 struct vp_machine
*machine
= &(ctx
->VertexProgram
.Machine
);
54 /* Input registers get initialized from the current vertex attribs */
55 MEMCPY(machine
->Registers
[VP_INPUT_REG_START
],
57 16 * 4 * sizeof(GLfloat
));
59 /* Output and temp regs are initialized to [0,0,0,1] */
60 for (i
= VP_OUTPUT_REG_START
; i
<= VP_OUTPUT_REG_END
; i
++) {
61 machine
->Registers
[i
][0] = 0.0F
;
62 machine
->Registers
[i
][1] = 0.0F
;
63 machine
->Registers
[i
][2] = 0.0F
;
64 machine
->Registers
[i
][3] = 1.0F
;
66 for (i
= VP_TEMP_REG_START
; i
<= VP_TEMP_REG_END
; i
++) {
67 machine
->Registers
[i
][0] = 0.0F
;
68 machine
->Registers
[i
][1] = 0.0F
;
69 machine
->Registers
[i
][2] = 0.0F
;
70 machine
->Registers
[i
][3] = 1.0F
;
73 /* The program regs aren't touched */
79 * Copy the 16 elements of a matrix into four consecutive program
80 * registers starting at 'pos'.
83 load_matrix(GLfloat registers
[][4], GLuint pos
, const GLfloat mat
[16])
86 pos
+= VP_PROG_REG_START
;
87 for (i
= 0; i
< 4; i
++) {
88 registers
[pos
+ i
][0] = mat
[0 + i
];
89 registers
[pos
+ i
][1] = mat
[4 + i
];
90 registers
[pos
+ i
][2] = mat
[8 + i
];
91 registers
[pos
+ i
][3] = mat
[12 + i
];
97 * As above, but transpose the matrix.
100 load_transpose_matrix(GLfloat registers
[][4], GLuint pos
,
101 const GLfloat mat
[16])
103 pos
+= VP_PROG_REG_START
;
104 MEMCPY(registers
[pos
], mat
, 16 * sizeof(GLfloat
));
109 * Load all currently tracked matrices into the program registers.
110 * This needs to be done per glBegin/glEnd.
113 _mesa_init_tracked_matrices(GLcontext
*ctx
)
117 for (i
= 0; i
< VP_NUM_PROG_REGS
/ 4; i
++) {
118 /* point 'mat' at source matrix */
120 if (ctx
->VertexProgram
.TrackMatrix
[i
] == GL_MODELVIEW
) {
121 mat
= ctx
->ModelviewMatrixStack
.Top
;
123 else if (ctx
->VertexProgram
.TrackMatrix
[i
] == GL_PROJECTION
) {
124 mat
= ctx
->ProjectionMatrixStack
.Top
;
126 else if (ctx
->VertexProgram
.TrackMatrix
[i
] == GL_TEXTURE
) {
127 mat
= ctx
->TextureMatrixStack
[ctx
->Texture
.CurrentUnit
].Top
;
129 else if (ctx
->VertexProgram
.TrackMatrix
[i
] == GL_COLOR
) {
130 mat
= ctx
->ColorMatrixStack
.Top
;
132 else if (ctx
->VertexProgram
.TrackMatrix
[i
]==GL_MODELVIEW_PROJECTION_NV
) {
133 /* XXX verify the combined matrix is up to date */
134 mat
= &ctx
->_ModelProjectMatrix
;
136 else if (ctx
->VertexProgram
.TrackMatrix
[i
] >= GL_MATRIX0_NV
&&
137 ctx
->VertexProgram
.TrackMatrix
[i
] <= GL_MATRIX7_NV
) {
138 GLuint n
= ctx
->VertexProgram
.TrackMatrix
[i
] - GL_MATRIX0_NV
;
139 ASSERT(n
< MAX_PROGRAM_MATRICES
);
140 mat
= ctx
->ProgramMatrixStack
[n
].Top
;
143 /* no matrix is tracked, but we leave the register values as-is */
144 assert(ctx
->VertexProgram
.TrackMatrix
[i
] == GL_NONE
);
148 /* load the matrix */
149 if (ctx
->VertexProgram
.TrackMatrixTransform
[i
] == GL_IDENTITY_NV
) {
150 load_matrix(ctx
->VertexProgram
.Machine
.Registers
, i
*4, mat
->m
);
152 else if (ctx
->VertexProgram
.TrackMatrixTransform
[i
] == GL_INVERSE_NV
) {
153 _math_matrix_analyse(mat
); /* update the inverse */
154 assert((mat
->flags
& MAT_DIRTY_INVERSE
) == 0);
155 load_matrix(ctx
->VertexProgram
.Machine
.Registers
, i
*4, mat
->inv
);
157 else if (ctx
->VertexProgram
.TrackMatrixTransform
[i
] == GL_TRANSPOSE_NV
) {
158 load_transpose_matrix(ctx
->VertexProgram
.Machine
.Registers
, i
*4, mat
->m
);
161 assert(ctx
->VertexProgram
.TrackMatrixTransform
[i
]
162 == GL_INVERSE_TRANSPOSE_NV
);
163 _math_matrix_analyse(mat
); /* update the inverse */
164 assert((mat
->flags
& MAT_DIRTY_INVERSE
) == 0);
165 load_transpose_matrix(ctx
->VertexProgram
.Machine
.Registers
,
174 * For debugging. Dump the current vertex program machine registers.
177 _mesa_dump_vp_machine( const struct vp_machine
*machine
)
180 _mesa_printf("VertexIn:\n");
181 for (i
= 0; i
< VP_NUM_INPUT_REGS
; i
++) {
182 _mesa_printf("%d: %f %f %f %f ", i
,
183 machine
->Registers
[i
+ VP_INPUT_REG_START
][0],
184 machine
->Registers
[i
+ VP_INPUT_REG_START
][1],
185 machine
->Registers
[i
+ VP_INPUT_REG_START
][2],
186 machine
->Registers
[i
+ VP_INPUT_REG_START
][3]);
190 _mesa_printf("VertexOut:\n");
191 for (i
= 0; i
< VP_NUM_OUTPUT_REGS
; i
++) {
192 _mesa_printf("%d: %f %f %f %f ", i
,
193 machine
->Registers
[i
+ VP_OUTPUT_REG_START
][0],
194 machine
->Registers
[i
+ VP_OUTPUT_REG_START
][1],
195 machine
->Registers
[i
+ VP_OUTPUT_REG_START
][2],
196 machine
->Registers
[i
+ VP_OUTPUT_REG_START
][3]);
200 _mesa_printf("Registers:\n");
201 for (i
= 0; i
< VP_NUM_TEMP_REGS
; i
++) {
202 _mesa_printf("%d: %f %f %f %f ", i
,
203 machine
->Registers
[i
+ VP_TEMP_REG_START
][0],
204 machine
->Registers
[i
+ VP_TEMP_REG_START
][1],
205 machine
->Registers
[i
+ VP_TEMP_REG_START
][2],
206 machine
->Registers
[i
+ VP_TEMP_REG_START
][3]);
210 _mesa_printf("Parameters:\n");
211 for (i
= 0; i
< VP_NUM_PROG_REGS
; i
++) {
212 _mesa_printf("%d: %f %f %f %f ", i
,
213 machine
->Registers
[i
+ VP_PROG_REG_START
][0],
214 machine
->Registers
[i
+ VP_PROG_REG_START
][1],
215 machine
->Registers
[i
+ VP_PROG_REG_START
][2],
216 machine
->Registers
[i
+ VP_PROG_REG_START
][3]);
223 * Fetch a 4-element float vector from the given source register.
224 * Apply swizzling and negating as needed.
227 fetch_vector4( const struct vp_src_register
*source
,
228 const struct vp_machine
*machine
,
233 if (source
->RelAddr
) {
234 const GLint reg
= source
->Register
+ machine
->AddressReg
;
235 if (reg
< 0 || reg
> MAX_NV_VERTEX_PROGRAM_PARAMS
)
238 src
= machine
->Registers
[VP_PROG_REG_START
+ reg
];
241 src
= machine
->Registers
[source
->Register
];
244 if (source
->Negate
) {
245 result
[0] = -src
[source
->Swizzle
[0]];
246 result
[1] = -src
[source
->Swizzle
[1]];
247 result
[2] = -src
[source
->Swizzle
[2]];
248 result
[3] = -src
[source
->Swizzle
[3]];
251 result
[0] = src
[source
->Swizzle
[0]];
252 result
[1] = src
[source
->Swizzle
[1]];
253 result
[2] = src
[source
->Swizzle
[2]];
254 result
[3] = src
[source
->Swizzle
[3]];
260 * As above, but only return result[0] element.
263 fetch_vector1( const struct vp_src_register
*source
,
264 const struct vp_machine
*machine
,
269 if (source
->RelAddr
) {
270 const GLint reg
= source
->Register
+ machine
->AddressReg
;
271 if (reg
< 0 || reg
> MAX_NV_VERTEX_PROGRAM_PARAMS
)
274 src
= machine
->Registers
[VP_PROG_REG_START
+ reg
];
277 src
= machine
->Registers
[source
->Register
];
280 if (source
->Negate
) {
281 result
[0] = -src
[source
->Swizzle
[0]];
284 result
[0] = src
[source
->Swizzle
[0]];
290 * Store 4 floats into a register.
293 store_vector4( const struct vp_dst_register
*dest
, struct vp_machine
*machine
,
294 const GLfloat value
[4] )
296 GLfloat
*dst
= machine
->Registers
[dest
->Register
];
298 if (dest
->WriteMask
[0])
300 if (dest
->WriteMask
[1])
302 if (dest
->WriteMask
[2])
304 if (dest
->WriteMask
[3])
310 * Set x to positive or negative infinity.
312 #if defined(USE_IEEE) || defined(_WIN32)
313 #define SET_POS_INFINITY(x) ( *((GLuint *) &x) = 0x7F800000 )
314 #define SET_NEG_INFINITY(x) ( *((GLuint *) &x) = 0xFF800000 )
316 #define SET_POS_INFINITY(x) x = __MAXFLOAT
317 #define SET_NEG_INFINITY(x) x = -__MAXFLOAT
319 #define SET_POS_INFINITY(x) x = (GLfloat) HUGE_VAL
320 #define SET_NEG_INFINITY(x) x = (GLfloat) -HUGE_VAL
323 #define SET_FLOAT_BITS(x, bits) ((fi_type *) &(x))->i = bits
327 * Execute the given vertex program
330 _mesa_exec_vertex_program(GLcontext
*ctx
, const struct vertex_program
*program
)
332 struct vp_machine
*machine
= &ctx
->VertexProgram
.Machine
;
333 const struct vp_instruction
*inst
;
335 ctx
->_CurrentProgram
= GL_VERTEX_PROGRAM_ARB
; /* or NV, doesn't matter */
337 for (inst
= program
->Instructions
; inst
->Opcode
!= VP_OPCODE_END
; inst
++) {
339 if (ctx
->VertexProgram
.CallbackEnabled
&&
340 ctx
->VertexProgram
.Callback
) {
341 ctx
->VertexProgram
.CurrentPosition
= inst
->StringPos
;
342 ctx
->VertexProgram
.Callback(program
->Base
.Target
,
343 ctx
->VertexProgram
.CallbackData
);
346 switch (inst
->Opcode
) {
350 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
351 store_vector4( &inst
->DstReg
, machine
, t
);
356 const GLfloat epsilon
= 1.0e-5F
; /* XXX fix? */
357 GLfloat t
[4], lit
[4];
358 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
359 if (t
[3] < -(128.0F
- epsilon
))
360 t
[3] = - (128.0F
- epsilon
);
361 else if (t
[3] > 128.0F
- epsilon
)
362 t
[3] = 128.0F
- epsilon
;
369 lit
[2] = (t
[0] > 0.0) ? (GLfloat
) exp(t
[3] * log(t
[1])) : 0.0F
;
371 store_vector4( &inst
->DstReg
, machine
, lit
);
377 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
379 t
[0] = 1.0F
/ t
[0]; /* div by zero is infinity! */
380 t
[1] = t
[2] = t
[3] = t
[0];
381 store_vector4( &inst
->DstReg
, machine
, t
);
387 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
388 t
[0] = INV_SQRTF(FABSF(t
[0]));
389 t
[1] = t
[2] = t
[3] = t
[0];
390 store_vector4( &inst
->DstReg
, machine
, t
);
395 GLfloat t
[4], q
[4], floor_t0
;
396 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
397 floor_t0
= (float) floor(t
[0]);
398 if (floor_t0
> FLT_MAX_EXP
) {
399 SET_POS_INFINITY(q
[0]);
400 SET_POS_INFINITY(q
[2]);
402 else if (floor_t0
< FLT_MIN_EXP
) {
408 GLint ii
= (GLint
) floor_t0
;
409 ii
= (ii
< 23) + 0x3f800000;
410 SET_FLOAT_BITS(q
[0], ii
);
411 q
[0] = *((GLfloat
*) &ii
);
413 q
[0] = (GLfloat
) pow(2.0, floor_t0
);
415 q
[2] = (GLfloat
) (q
[0] * LOG2(q
[1]));
417 q
[1] = t
[0] - floor_t0
;
419 store_vector4( &inst
->DstReg
, machine
, q
);
424 GLfloat t
[4], q
[4], abs_t0
;
425 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
426 abs_t0
= (GLfloat
) fabs(t
[0]);
427 if (abs_t0
!= 0.0F
) {
428 /* Since we really can't handle infinite values on VMS
429 * like other OSes we'll use __MAXFLOAT to represent
430 * infinity. This may need some tweaking.
433 if (abs_t0
== __MAXFLOAT
)
435 if (IS_INF_OR_NAN(abs_t0
))
438 SET_POS_INFINITY(q
[0]);
440 SET_POS_INFINITY(q
[2]);
444 double mantissa
= frexp(t
[0], &exponent
);
445 q
[0] = (GLfloat
) (exponent
- 1);
446 q
[1] = (GLfloat
) (2.0 * mantissa
); /* map [.5, 1) -> [1, 2) */
447 q
[2] = (GLfloat
) (q
[0] + LOG2(q
[1]));
451 SET_NEG_INFINITY(q
[0]);
453 SET_NEG_INFINITY(q
[2]);
456 store_vector4( &inst
->DstReg
, machine
, q
);
461 GLfloat t
[4], u
[4], prod
[4];
462 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
463 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
464 prod
[0] = t
[0] * u
[0];
465 prod
[1] = t
[1] * u
[1];
466 prod
[2] = t
[2] * u
[2];
467 prod
[3] = t
[3] * u
[3];
468 store_vector4( &inst
->DstReg
, machine
, prod
);
473 GLfloat t
[4], u
[4], sum
[4];
474 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
475 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
476 sum
[0] = t
[0] + u
[0];
477 sum
[1] = t
[1] + u
[1];
478 sum
[2] = t
[2] + u
[2];
479 sum
[3] = t
[3] + u
[3];
480 store_vector4( &inst
->DstReg
, machine
, sum
);
485 GLfloat t
[4], u
[4], dot
[4];
486 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
487 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
488 dot
[0] = t
[0] * u
[0] + t
[1] * u
[1] + t
[2] * u
[2];
489 dot
[1] = dot
[2] = dot
[3] = dot
[0];
490 store_vector4( &inst
->DstReg
, machine
, dot
);
495 GLfloat t
[4], u
[4], dot
[4];
496 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
497 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
498 dot
[0] = t
[0] * u
[0] + t
[1] * u
[1] + t
[2] * u
[2] + t
[3] * u
[3];
499 dot
[1] = dot
[2] = dot
[3] = dot
[0];
500 store_vector4( &inst
->DstReg
, machine
, dot
);
505 GLfloat t
[4], u
[4], dst
[4];
506 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
507 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
509 dst
[1] = t
[1] * u
[1];
512 store_vector4( &inst
->DstReg
, machine
, dst
);
517 GLfloat t
[4], u
[4], min
[4];
518 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
519 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
520 min
[0] = (t
[0] < u
[0]) ? t
[0] : u
[0];
521 min
[1] = (t
[1] < u
[1]) ? t
[1] : u
[1];
522 min
[2] = (t
[2] < u
[2]) ? t
[2] : u
[2];
523 min
[3] = (t
[3] < u
[3]) ? t
[3] : u
[3];
524 store_vector4( &inst
->DstReg
, machine
, min
);
529 GLfloat t
[4], u
[4], max
[4];
530 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
531 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
532 max
[0] = (t
[0] > u
[0]) ? t
[0] : u
[0];
533 max
[1] = (t
[1] > u
[1]) ? t
[1] : u
[1];
534 max
[2] = (t
[2] > u
[2]) ? t
[2] : u
[2];
535 max
[3] = (t
[3] > u
[3]) ? t
[3] : u
[3];
536 store_vector4( &inst
->DstReg
, machine
, max
);
541 GLfloat t
[4], u
[4], slt
[4];
542 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
543 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
544 slt
[0] = (t
[0] < u
[0]) ? 1.0F
: 0.0F
;
545 slt
[1] = (t
[1] < u
[1]) ? 1.0F
: 0.0F
;
546 slt
[2] = (t
[2] < u
[2]) ? 1.0F
: 0.0F
;
547 slt
[3] = (t
[3] < u
[3]) ? 1.0F
: 0.0F
;
548 store_vector4( &inst
->DstReg
, machine
, slt
);
553 GLfloat t
[4], u
[4], sge
[4];
554 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
555 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
556 sge
[0] = (t
[0] >= u
[0]) ? 1.0F
: 0.0F
;
557 sge
[1] = (t
[1] >= u
[1]) ? 1.0F
: 0.0F
;
558 sge
[2] = (t
[2] >= u
[2]) ? 1.0F
: 0.0F
;
559 sge
[3] = (t
[3] >= u
[3]) ? 1.0F
: 0.0F
;
560 store_vector4( &inst
->DstReg
, machine
, sge
);
565 GLfloat t
[4], u
[4], v
[4], sum
[4];
566 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
567 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
568 fetch_vector4( &inst
->SrcReg
[2], machine
, v
);
569 sum
[0] = t
[0] * u
[0] + v
[0];
570 sum
[1] = t
[1] * u
[1] + v
[1];
571 sum
[2] = t
[2] * u
[2] + v
[2];
572 sum
[3] = t
[3] * u
[3] + v
[3];
573 store_vector4( &inst
->DstReg
, machine
, sum
);
579 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
580 machine
->AddressReg
= (GLint
) floor(t
[0]);
585 GLfloat t
[4], u
[4], dot
[4];
586 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
587 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
588 dot
[0] = t
[0] * u
[0] + t
[1] * u
[1] + t
[2] * u
[2] + u
[3];
589 dot
[1] = dot
[2] = dot
[3] = dot
[0];
590 store_vector4( &inst
->DstReg
, machine
, dot
);
596 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
602 if (u
> 1.884467e+019F
) {
603 u
= 1.884467e+019F
; /* IEEE 32-bit binary value 0x5F800000 */
605 else if (u
< 5.42101e-020F
) {
606 u
= 5.42101e-020F
; /* IEEE 32-bit binary value 0x1F800000 */
610 if (u
< -1.884467e+019F
) {
611 u
= -1.884467e+019F
; /* IEEE 32-bit binary value 0xDF800000 */
613 else if (u
> -5.42101e-020F
) {
614 u
= -5.42101e-020F
; /* IEEE 32-bit binary value 0x9F800000 */
617 t
[0] = t
[1] = t
[2] = t
[3] = u
;
618 store_vector4( &inst
->DstReg
, machine
, t
);
621 case VP_OPCODE_SUB
: /* GL_NV_vertex_program1_1 */
623 GLfloat t
[4], u
[4], sum
[4];
624 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
625 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
626 sum
[0] = t
[0] - u
[0];
627 sum
[1] = t
[1] - u
[1];
628 sum
[2] = t
[2] - u
[2];
629 sum
[3] = t
[3] - u
[3];
630 store_vector4( &inst
->DstReg
, machine
, sum
);
633 case VP_OPCODE_ABS
: /* GL_NV_vertex_program1_1 */
636 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
637 if (t
[0] < 0.0) t
[0] = -t
[0];
638 if (t
[1] < 0.0) t
[1] = -t
[1];
639 if (t
[2] < 0.0) t
[2] = -t
[2];
640 if (t
[3] < 0.0) t
[3] = -t
[3];
641 store_vector4( &inst
->DstReg
, machine
, t
);
644 case VP_OPCODE_FLR
: /* GL_ARB_vertex_program */
647 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
652 store_vector4( &inst
->DstReg
, machine
, t
);
655 case VP_OPCODE_FRC
: /* GL_ARB_vertex_program */
658 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
659 t
[0] = t
[0] - FLOORF(t
[0]);
660 t
[1] = t
[1] - FLOORF(t
[1]);
661 t
[2] = t
[2] - FLOORF(t
[2]);
662 t
[3] = t
[3] - FLOORF(t
[3]);
663 store_vector4( &inst
->DstReg
, machine
, t
);
666 case VP_OPCODE_EX2
: /* GL_ARB_vertex_program */
669 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
670 t
[0] = t
[1] = t
[2] = t
[3] = _mesa_pow(2.0, t
[0]);
671 store_vector4( &inst
->DstReg
, machine
, t
);
674 case VP_OPCODE_LG2
: /* GL_ARB_vertex_program */
677 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
678 t
[0] = t
[1] = t
[2] = t
[3] = LOG2(t
[0]);
679 store_vector4( &inst
->DstReg
, machine
, t
);
682 case VP_OPCODE_POW
: /* GL_ARB_vertex_program */
685 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
686 fetch_vector1( &inst
->SrcReg
[1], machine
, u
);
687 t
[0] = t
[1] = t
[2] = t
[3] = _mesa_pow(t
[0], u
[0]);
688 store_vector4( &inst
->DstReg
, machine
, t
);
691 case VP_OPCODE_XPD
: /* GL_ARB_vertex_program */
693 GLfloat t
[4], u
[4], cross
[4];
694 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
695 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
696 cross
[0] = t
[1] * u
[2] - t
[2] * u
[1];
697 cross
[1] = t
[2] * u
[0] - t
[0] * u
[2];
698 cross
[2] = t
[0] * u
[1] - t
[1] * u
[0];
699 store_vector4( &inst
->DstReg
, machine
, cross
);
702 case VP_OPCODE_SWZ
: /* GL_ARB_vertex_program */
704 const struct vp_src_register
*source
= &inst
->SrcReg
[0];
709 /* Code similar to fetch_vector4() */
710 if (source
->RelAddr
) {
711 const GLint reg
= source
->Register
+ machine
->AddressReg
;
712 if (reg
< 0 || reg
> MAX_NV_VERTEX_PROGRAM_PARAMS
)
715 src
= machine
->Registers
[VP_PROG_REG_START
+ reg
];
718 src
= machine
->Registers
[source
->Register
];
721 /* extended swizzling here */
722 for (i
= 0; i
< 3; i
++) {
723 if (source
->Swizzle
[i
] == SWIZZLE_ZERO
)
725 else if (source
->Swizzle
[i
] == SWIZZLE_ONE
)
728 result
[i
] = -src
[source
->Swizzle
[i
]];
730 result
[i
] = -result
[i
];
732 store_vector4( &inst
->DstReg
, machine
, result
);
737 ctx
->_CurrentProgram
= 0;
740 /* bad instruction opcode */
741 _mesa_problem(ctx
, "Bad VP Opcode in _mesa_exec_vertex_program");
742 ctx
->_CurrentProgram
= 0;
747 ctx
->_CurrentProgram
= 0;
753 Thoughts on vertex program optimization:
755 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
756 assembly code. That will probably be a lot of work.
758 Another approach might be to replace the vp_instruction->Opcode field with
759 a pointer to a specialized C function which executes the instruction.
760 In particular we can write functions which skip swizzling, negating,
761 masking, relative addressing, etc. when they're not needed.
765 void simple_add( struct vp_instruction *inst )
767 GLfloat *sum = machine->Registers[inst->DstReg.Register];
768 GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
769 GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
770 sum[0] = a[0] + b[0];
771 sum[1] = a[1] + b[1];
772 sum[2] = a[2] + b[2];
773 sum[3] = a[3] + b[3];
782 A first step would be to 'vectorize' the programs in the same way as
783 the normal transformation code in the tnl module. Thus each opcode
784 takes zero or more input vectors (registers) and produces one or more
787 These operations would initially be coded in C, with machine-specific
788 assembly following, as is currently the case for matrix
789 transformations in the math/ directory. The preprocessing scheme for
790 selecting simpler operations Brian describes above would also work
793 This should give reasonable performance without excessive effort.