fix stores to vertex state program registers
[mesa.git] / src / mesa / main / nvvertexec.c
1 /*
2 * Mesa 3-D graphics library
3 * Version: 6.0.1
4 *
5 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /**
26 * \file nvvertexec.c
27 * Code to execute vertex programs.
28 * \author Brian Paul
29 */
30
31 #include "glheader.h"
32 #include "context.h"
33 #include "imports.h"
34 #include "macros.h"
35 #include "mtypes.h"
36 #include "nvvertexec.h"
37 #include "nvvertprog.h"
38 #include "program.h"
39 #include "math/m_matrix.h"
40
41
42 static const GLfloat zeroVec[4] = { 0, 0, 0, 0 };
43
44
45 /**
46 * Load/initialize the vertex program registers.
47 * This needs to be done per vertex.
48 */
49 void
50 _mesa_init_vp_registers(GLcontext *ctx)
51 {
52 GLuint i;
53
54 /* Input registers get initialized from the current vertex attribs */
55 MEMCPY(ctx->VertexProgram.Inputs, ctx->Current.Attrib,
56 VERT_ATTRIB_MAX * 4 * sizeof(GLfloat));
57
58 /* Output and temp regs are initialized to [0,0,0,1] */
59 for (i = 0; i < MAX_NV_VERTEX_PROGRAM_OUTPUTS; i++) {
60 ASSIGN_4V(ctx->VertexProgram.Outputs[i], 0.0F, 0.0F, 0.0F, 1.0F);
61 }
62 for (i = 0; i < MAX_NV_VERTEX_PROGRAM_TEMPS; i++) {
63 ASSIGN_4V(ctx->VertexProgram.Temporaries[i], 0.0F, 0.0F, 0.0F, 1.0F);
64 }
65
66 /* The program parameters aren't touched */
67 /* XXX: This should be moved to glBegin() time, but its safe (and slow!)
68 * here - Karl
69 */
70 if (ctx->VertexProgram.Current->Parameters) {
71 /* Grab the state */
72 _mesa_load_state_parameters(ctx, ctx->VertexProgram.Current->Parameters);
73
74 /* And copy it into the program state */
75 for (i=0; i<ctx->VertexProgram.Current->Parameters->NumParameters; i++) {
76 MEMCPY(ctx->VertexProgram.Parameters[i],
77 &ctx->VertexProgram.Current->Parameters->Parameters[i].Values,
78 4*sizeof(GLfloat));
79 }
80 }
81 }
82
83
84
85 /**
86 * Copy the 16 elements of a matrix into four consecutive program
87 * registers starting at 'pos'.
88 */
89 static void
90 load_matrix(GLfloat registers[][4], GLuint pos, const GLfloat mat[16])
91 {
92 GLuint i;
93 for (i = 0; i < 4; i++) {
94 registers[pos + i][0] = mat[0 + i];
95 registers[pos + i][1] = mat[4 + i];
96 registers[pos + i][2] = mat[8 + i];
97 registers[pos + i][3] = mat[12 + i];
98 }
99 }
100
101
102 /**
103 * As above, but transpose the matrix.
104 */
105 static void
106 load_transpose_matrix(GLfloat registers[][4], GLuint pos,
107 const GLfloat mat[16])
108 {
109 MEMCPY(registers[pos], mat, 16 * sizeof(GLfloat));
110 }
111
112
113 /**
114 * Load all currently tracked matrices into the program registers.
115 * This needs to be done per glBegin/glEnd.
116 */
117 void
118 _mesa_init_tracked_matrices(GLcontext *ctx)
119 {
120 GLuint i;
121
122 for (i = 0; i < MAX_NV_VERTEX_PROGRAM_PARAMS / 4; i++) {
123 /* point 'mat' at source matrix */
124 GLmatrix *mat;
125 if (ctx->VertexProgram.TrackMatrix[i] == GL_MODELVIEW) {
126 mat = ctx->ModelviewMatrixStack.Top;
127 }
128 else if (ctx->VertexProgram.TrackMatrix[i] == GL_PROJECTION) {
129 mat = ctx->ProjectionMatrixStack.Top;
130 }
131 else if (ctx->VertexProgram.TrackMatrix[i] == GL_TEXTURE) {
132 mat = ctx->TextureMatrixStack[ctx->Texture.CurrentUnit].Top;
133 }
134 else if (ctx->VertexProgram.TrackMatrix[i] == GL_COLOR) {
135 mat = ctx->ColorMatrixStack.Top;
136 }
137 else if (ctx->VertexProgram.TrackMatrix[i]==GL_MODELVIEW_PROJECTION_NV) {
138 /* XXX verify the combined matrix is up to date */
139 mat = &ctx->_ModelProjectMatrix;
140 }
141 else if (ctx->VertexProgram.TrackMatrix[i] >= GL_MATRIX0_NV &&
142 ctx->VertexProgram.TrackMatrix[i] <= GL_MATRIX7_NV) {
143 GLuint n = ctx->VertexProgram.TrackMatrix[i] - GL_MATRIX0_NV;
144 ASSERT(n < MAX_PROGRAM_MATRICES);
145 mat = ctx->ProgramMatrixStack[n].Top;
146 }
147 else {
148 /* no matrix is tracked, but we leave the register values as-is */
149 assert(ctx->VertexProgram.TrackMatrix[i] == GL_NONE);
150 continue;
151 }
152
153 /* load the matrix */
154 if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
155 load_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
156 }
157 else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_INVERSE_NV) {
158 _math_matrix_analyse(mat); /* update the inverse */
159 assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
160 load_matrix(ctx->VertexProgram.Parameters, i*4, mat->inv);
161 }
162 else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_TRANSPOSE_NV) {
163 load_transpose_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
164 }
165 else {
166 assert(ctx->VertexProgram.TrackMatrixTransform[i]
167 == GL_INVERSE_TRANSPOSE_NV);
168 _math_matrix_analyse(mat); /* update the inverse */
169 assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
170 load_transpose_matrix(ctx->VertexProgram.Parameters, i*4, mat->inv);
171 }
172 }
173 }
174
175
176
177 /**
178 * For debugging. Dump the current vertex program machine registers.
179 */
180 void
181 _mesa_dump_vp_state( const struct vertex_program_state *state )
182 {
183 int i;
184 _mesa_printf("VertexIn:\n");
185 for (i = 0; i < MAX_NV_VERTEX_PROGRAM_INPUTS; i++) {
186 _mesa_printf("%d: %f %f %f %f ", i,
187 state->Inputs[i][0],
188 state->Inputs[i][1],
189 state->Inputs[i][2],
190 state->Inputs[i][3]);
191 }
192 _mesa_printf("\n");
193
194 _mesa_printf("VertexOut:\n");
195 for (i = 0; i < MAX_NV_VERTEX_PROGRAM_OUTPUTS; i++) {
196 _mesa_printf("%d: %f %f %f %f ", i,
197 state->Outputs[i][0],
198 state->Outputs[i][1],
199 state->Outputs[i][2],
200 state->Outputs[i][3]);
201 }
202 _mesa_printf("\n");
203
204 _mesa_printf("Registers:\n");
205 for (i = 0; i < MAX_NV_VERTEX_PROGRAM_TEMPS; i++) {
206 _mesa_printf("%d: %f %f %f %f ", i,
207 state->Temporaries[i][0],
208 state->Temporaries[i][1],
209 state->Temporaries[i][2],
210 state->Temporaries[i][3]);
211 }
212 _mesa_printf("\n");
213
214 _mesa_printf("Parameters:\n");
215 for (i = 0; i < MAX_NV_VERTEX_PROGRAM_PARAMS; i++) {
216 _mesa_printf("%d: %f %f %f %f ", i,
217 state->Parameters[i][0],
218 state->Parameters[i][1],
219 state->Parameters[i][2],
220 state->Parameters[i][3]);
221 }
222 _mesa_printf("\n");
223 }
224
225
226
227 /**
228 * Return a pointer to the 4-element float vector specified by the given
229 * source register.
230 */
231 static INLINE const GLfloat *
232 get_register_pointer( const struct vp_src_register *source,
233 const struct vertex_program_state *state )
234 {
235 if (source->RelAddr) {
236 const GLint reg = source->Index + state->AddressReg[0];
237 ASSERT( (source->File == PROGRAM_ENV_PARAM) ||
238 (source->File == PROGRAM_STATE_VAR) );
239 if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
240 return zeroVec;
241 else
242 return state->Parameters[reg];
243 }
244 else {
245 switch (source->File) {
246 case PROGRAM_TEMPORARY:
247 return state->Temporaries[source->Index];
248 case PROGRAM_INPUT:
249 return state->Inputs[source->Index];
250 case PROGRAM_LOCAL_PARAM:
251 /* XXX fix */
252 return state->Temporaries[source->Index];
253 case PROGRAM_ENV_PARAM:
254 return state->Parameters[source->Index];
255 case PROGRAM_STATE_VAR:
256 return state->Parameters[source->Index];
257 default:
258 _mesa_problem(NULL,
259 "Bad source register file in fetch_vector4(vp)");
260 return NULL;
261 }
262 }
263 return NULL;
264 }
265
266
267 /**
268 * Fetch a 4-element float vector from the given source register.
269 * Apply swizzling and negating as needed.
270 */
271 static INLINE void
272 fetch_vector4( const struct vp_src_register *source,
273 const struct vertex_program_state *state,
274 GLfloat result[4] )
275 {
276 const GLfloat *src = get_register_pointer(source, state);
277
278 if (source->Negate) {
279 result[0] = -src[source->Swizzle[0]];
280 result[1] = -src[source->Swizzle[1]];
281 result[2] = -src[source->Swizzle[2]];
282 result[3] = -src[source->Swizzle[3]];
283 }
284 else {
285 result[0] = src[source->Swizzle[0]];
286 result[1] = src[source->Swizzle[1]];
287 result[2] = src[source->Swizzle[2]];
288 result[3] = src[source->Swizzle[3]];
289 }
290 }
291
292
293
294 /**
295 * As above, but only return result[0] element.
296 */
297 static INLINE void
298 fetch_vector1( const struct vp_src_register *source,
299 const struct vertex_program_state *state,
300 GLfloat result[4] )
301 {
302 const GLfloat *src = get_register_pointer(source, state);
303
304 if (source->Negate) {
305 result[0] = -src[source->Swizzle[0]];
306 }
307 else {
308 result[0] = src[source->Swizzle[0]];
309 }
310 }
311
312
313 /**
314 * Store 4 floats into a register.
315 */
316 static void
317 store_vector4( const struct vp_dst_register *dest,
318 struct vertex_program_state *state,
319 const GLfloat value[4] )
320 {
321 GLfloat *dst;
322 switch (dest->File) {
323 case PROGRAM_TEMPORARY:
324 dst = state->Temporaries[dest->Index];
325 break;
326 case PROGRAM_OUTPUT:
327 dst = state->Outputs[dest->Index];
328 break;
329 case PROGRAM_ENV_PARAM:
330 {
331 /* a slight hack */
332 GET_CURRENT_CONTEXT(ctx);
333 dst = ctx->VertexProgram.Parameters[dest->Index];
334 }
335 break;
336 default:
337 _mesa_problem(NULL, "Invalid register file in store_vector4(file=%d)",
338 dest->File);
339 return;
340 }
341
342 if (dest->WriteMask[0])
343 dst[0] = value[0];
344 if (dest->WriteMask[1])
345 dst[1] = value[1];
346 if (dest->WriteMask[2])
347 dst[2] = value[2];
348 if (dest->WriteMask[3])
349 dst[3] = value[3];
350 }
351
352
/**
 * Set x to positive or negative infinity.
 *
 * 'x' must be an lvalue of type GLfloat.  On IEEE/Win32 builds the bit
 * pattern of +/-infinity is written directly; on VMS +/-__MAXFLOAT stands
 * in for infinity; otherwise +/-HUGE_VAL is used.
 * Every variant is a fully parenthesized expression and parenthesizes its
 * argument, so the macros behave the same in any expression context.
 */
#if defined(USE_IEEE) || defined(_WIN32)
#define SET_POS_INFINITY(x)  ( *((GLuint *) &(x)) = 0x7F800000 )
#define SET_NEG_INFINITY(x)  ( *((GLuint *) &(x)) = 0xFF800000 )
#elif defined(VMS)
#define SET_POS_INFINITY(x)  ( (x) = __MAXFLOAT )
#define SET_NEG_INFINITY(x)  ( (x) = -__MAXFLOAT )
#else
#define SET_POS_INFINITY(x)  ( (x) = (GLfloat) HUGE_VAL )
#define SET_NEG_INFINITY(x)  ( (x) = (GLfloat) -HUGE_VAL )
#endif

/** Overwrite the bits of float lvalue 'x' with the integer 'bits'. */
#define SET_FLOAT_BITS(x, bits)  ( ((fi_type *) &(x))->i = (bits) )
368
369
370 /**
371 * Execute the given vertex program
372 */
373 void
374 _mesa_exec_vertex_program(GLcontext *ctx, const struct vertex_program *program)
375 {
376 struct vertex_program_state *state = &ctx->VertexProgram;
377 const struct vp_instruction *inst;
378
379 ctx->_CurrentProgram = GL_VERTEX_PROGRAM_ARB; /* or NV, doesn't matter */
380
381 /* If the program is position invariant, multiply the input
382 * position and the MVP matrix and stick it into the output pos slot
383 */
384 if (ctx->VertexProgram.Current->IsPositionInvariant) {
385 TRANSFORM_POINT( ctx->VertexProgram.Outputs[0],
386 ctx->_ModelProjectMatrix.m,
387 ctx->VertexProgram.Inputs[0]);
388
389 /* XXX: This could go elsewhere */
390 ctx->VertexProgram.Current->OutputsWritten |= 0x1;
391 }
392
393 for (inst = program->Instructions; /*inst->Opcode != VP_OPCODE_END*/; inst++) {
394
395 if (ctx->VertexProgram.CallbackEnabled &&
396 ctx->VertexProgram.Callback) {
397 ctx->VertexProgram.CurrentPosition = inst->StringPos;
398 ctx->VertexProgram.Callback(program->Base.Target,
399 ctx->VertexProgram.CallbackData);
400 }
401
402 switch (inst->Opcode) {
403 case VP_OPCODE_MOV:
404 {
405 GLfloat t[4];
406 fetch_vector4( &inst->SrcReg[0], state, t );
407 store_vector4( &inst->DstReg, state, t );
408 }
409 break;
410 case VP_OPCODE_LIT:
411 {
412 const GLfloat epsilon = 1.0e-5F; /* XXX fix? */
413 GLfloat t[4], lit[4];
414 fetch_vector4( &inst->SrcReg[0], state, t );
415 if (t[3] < -(128.0F - epsilon))
416 t[3] = - (128.0F - epsilon);
417 else if (t[3] > 128.0F - epsilon)
418 t[3] = 128.0F - epsilon;
419 if (t[0] < 0.0)
420 t[0] = 0.0;
421 if (t[1] < 0.0)
422 t[1] = 0.0;
423 lit[0] = 1.0;
424 lit[1] = t[0];
425 lit[2] = (t[0] > 0.0) ? (GLfloat) exp(t[3] * log(t[1])) : 0.0F;
426 lit[3] = 1.0;
427 store_vector4( &inst->DstReg, state, lit );
428 }
429 break;
430 case VP_OPCODE_RCP:
431 {
432 GLfloat t[4];
433 fetch_vector1( &inst->SrcReg[0], state, t );
434 if (t[0] != 1.0F)
435 t[0] = 1.0F / t[0]; /* div by zero is infinity! */
436 t[1] = t[2] = t[3] = t[0];
437 store_vector4( &inst->DstReg, state, t );
438 }
439 break;
440 case VP_OPCODE_RSQ:
441 {
442 GLfloat t[4];
443 fetch_vector1( &inst->SrcReg[0], state, t );
444 t[0] = INV_SQRTF(FABSF(t[0]));
445 t[1] = t[2] = t[3] = t[0];
446 store_vector4( &inst->DstReg, state, t );
447 }
448 break;
449 case VP_OPCODE_EXP:
450 {
451 GLfloat t[4], q[4], floor_t0;
452 fetch_vector1( &inst->SrcReg[0], state, t );
453 floor_t0 = (float) floor(t[0]);
454 if (floor_t0 > FLT_MAX_EXP) {
455 SET_POS_INFINITY(q[0]);
456 SET_POS_INFINITY(q[2]);
457 }
458 else if (floor_t0 < FLT_MIN_EXP) {
459 q[0] = 0.0F;
460 q[2] = 0.0F;
461 }
462 else {
463 #ifdef USE_IEEE
464 GLint ii = (GLint) floor_t0;
465 ii = (ii < 23) + 0x3f800000;
466 SET_FLOAT_BITS(q[0], ii);
467 q[0] = *((GLfloat *) &ii);
468 #else
469 q[0] = (GLfloat) pow(2.0, floor_t0);
470 #endif
471 q[2] = (GLfloat) (q[0] * LOG2(q[1]));
472 }
473 q[1] = t[0] - floor_t0;
474 q[3] = 1.0F;
475 store_vector4( &inst->DstReg, state, q );
476 }
477 break;
478 case VP_OPCODE_LOG:
479 {
480 GLfloat t[4], q[4], abs_t0;
481 fetch_vector1( &inst->SrcReg[0], state, t );
482 abs_t0 = (GLfloat) fabs(t[0]);
483 if (abs_t0 != 0.0F) {
484 /* Since we really can't handle infinite values on VMS
485 * like other OSes we'll use __MAXFLOAT to represent
486 * infinity. This may need some tweaking.
487 */
488 #ifdef VMS
489 if (abs_t0 == __MAXFLOAT)
490 #else
491 if (IS_INF_OR_NAN(abs_t0))
492 #endif
493 {
494 SET_POS_INFINITY(q[0]);
495 q[1] = 1.0F;
496 SET_POS_INFINITY(q[2]);
497 }
498 else {
499 int exponent;
500 double mantissa = frexp(t[0], &exponent);
501 q[0] = (GLfloat) (exponent - 1);
502 q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
503 q[2] = (GLfloat) (q[0] + LOG2(q[1]));
504 }
505 }
506 else {
507 SET_NEG_INFINITY(q[0]);
508 q[1] = 1.0F;
509 SET_NEG_INFINITY(q[2]);
510 }
511 q[3] = 1.0;
512 store_vector4( &inst->DstReg, state, q );
513 }
514 break;
515 case VP_OPCODE_MUL:
516 {
517 GLfloat t[4], u[4], prod[4];
518 fetch_vector4( &inst->SrcReg[0], state, t );
519 fetch_vector4( &inst->SrcReg[1], state, u );
520 prod[0] = t[0] * u[0];
521 prod[1] = t[1] * u[1];
522 prod[2] = t[2] * u[2];
523 prod[3] = t[3] * u[3];
524 store_vector4( &inst->DstReg, state, prod );
525 }
526 break;
527 case VP_OPCODE_ADD:
528 {
529 GLfloat t[4], u[4], sum[4];
530 fetch_vector4( &inst->SrcReg[0], state, t );
531 fetch_vector4( &inst->SrcReg[1], state, u );
532 sum[0] = t[0] + u[0];
533 sum[1] = t[1] + u[1];
534 sum[2] = t[2] + u[2];
535 sum[3] = t[3] + u[3];
536 store_vector4( &inst->DstReg, state, sum );
537 }
538 break;
539 case VP_OPCODE_DP3:
540 {
541 GLfloat t[4], u[4], dot[4];
542 fetch_vector4( &inst->SrcReg[0], state, t );
543 fetch_vector4( &inst->SrcReg[1], state, u );
544 dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
545 dot[1] = dot[2] = dot[3] = dot[0];
546 store_vector4( &inst->DstReg, state, dot );
547 }
548 break;
549 case VP_OPCODE_DP4:
550 {
551 GLfloat t[4], u[4], dot[4];
552 fetch_vector4( &inst->SrcReg[0], state, t );
553 fetch_vector4( &inst->SrcReg[1], state, u );
554 dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + t[3] * u[3];
555 dot[1] = dot[2] = dot[3] = dot[0];
556 store_vector4( &inst->DstReg, state, dot );
557 }
558 break;
559 case VP_OPCODE_DST:
560 {
561 GLfloat t[4], u[4], dst[4];
562 fetch_vector4( &inst->SrcReg[0], state, t );
563 fetch_vector4( &inst->SrcReg[1], state, u );
564 dst[0] = 1.0F;
565 dst[1] = t[1] * u[1];
566 dst[2] = t[2];
567 dst[3] = u[3];
568 store_vector4( &inst->DstReg, state, dst );
569 }
570 break;
571 case VP_OPCODE_MIN:
572 {
573 GLfloat t[4], u[4], min[4];
574 fetch_vector4( &inst->SrcReg[0], state, t );
575 fetch_vector4( &inst->SrcReg[1], state, u );
576 min[0] = (t[0] < u[0]) ? t[0] : u[0];
577 min[1] = (t[1] < u[1]) ? t[1] : u[1];
578 min[2] = (t[2] < u[2]) ? t[2] : u[2];
579 min[3] = (t[3] < u[3]) ? t[3] : u[3];
580 store_vector4( &inst->DstReg, state, min );
581 }
582 break;
583 case VP_OPCODE_MAX:
584 {
585 GLfloat t[4], u[4], max[4];
586 fetch_vector4( &inst->SrcReg[0], state, t );
587 fetch_vector4( &inst->SrcReg[1], state, u );
588 max[0] = (t[0] > u[0]) ? t[0] : u[0];
589 max[1] = (t[1] > u[1]) ? t[1] : u[1];
590 max[2] = (t[2] > u[2]) ? t[2] : u[2];
591 max[3] = (t[3] > u[3]) ? t[3] : u[3];
592 store_vector4( &inst->DstReg, state, max );
593 }
594 break;
595 case VP_OPCODE_SLT:
596 {
597 GLfloat t[4], u[4], slt[4];
598 fetch_vector4( &inst->SrcReg[0], state, t );
599 fetch_vector4( &inst->SrcReg[1], state, u );
600 slt[0] = (t[0] < u[0]) ? 1.0F : 0.0F;
601 slt[1] = (t[1] < u[1]) ? 1.0F : 0.0F;
602 slt[2] = (t[2] < u[2]) ? 1.0F : 0.0F;
603 slt[3] = (t[3] < u[3]) ? 1.0F : 0.0F;
604 store_vector4( &inst->DstReg, state, slt );
605 }
606 break;
607 case VP_OPCODE_SGE:
608 {
609 GLfloat t[4], u[4], sge[4];
610 fetch_vector4( &inst->SrcReg[0], state, t );
611 fetch_vector4( &inst->SrcReg[1], state, u );
612 sge[0] = (t[0] >= u[0]) ? 1.0F : 0.0F;
613 sge[1] = (t[1] >= u[1]) ? 1.0F : 0.0F;
614 sge[2] = (t[2] >= u[2]) ? 1.0F : 0.0F;
615 sge[3] = (t[3] >= u[3]) ? 1.0F : 0.0F;
616 store_vector4( &inst->DstReg, state, sge );
617 }
618 break;
619 case VP_OPCODE_MAD:
620 {
621 GLfloat t[4], u[4], v[4], sum[4];
622 fetch_vector4( &inst->SrcReg[0], state, t );
623 fetch_vector4( &inst->SrcReg[1], state, u );
624 fetch_vector4( &inst->SrcReg[2], state, v );
625 sum[0] = t[0] * u[0] + v[0];
626 sum[1] = t[1] * u[1] + v[1];
627 sum[2] = t[2] * u[2] + v[2];
628 sum[3] = t[3] * u[3] + v[3];
629 store_vector4( &inst->DstReg, state, sum );
630 }
631 break;
632 case VP_OPCODE_ARL:
633 {
634 GLfloat t[4];
635 fetch_vector4( &inst->SrcReg[0], state, t );
636 state->AddressReg[0] = (GLint) floor(t[0]);
637 }
638 break;
639 case VP_OPCODE_DPH:
640 {
641 GLfloat t[4], u[4], dot[4];
642 fetch_vector4( &inst->SrcReg[0], state, t );
643 fetch_vector4( &inst->SrcReg[1], state, u );
644 dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + u[3];
645 dot[1] = dot[2] = dot[3] = dot[0];
646 store_vector4( &inst->DstReg, state, dot );
647 }
648 break;
649 case VP_OPCODE_RCC:
650 {
651 GLfloat t[4], u;
652 fetch_vector1( &inst->SrcReg[0], state, t );
653 if (t[0] == 1.0F)
654 u = 1.0F;
655 else
656 u = 1.0F / t[0];
657 if (u > 0.0F) {
658 if (u > 1.884467e+019F) {
659 u = 1.884467e+019F; /* IEEE 32-bit binary value 0x5F800000 */
660 }
661 else if (u < 5.42101e-020F) {
662 u = 5.42101e-020F; /* IEEE 32-bit binary value 0x1F800000 */
663 }
664 }
665 else {
666 if (u < -1.884467e+019F) {
667 u = -1.884467e+019F; /* IEEE 32-bit binary value 0xDF800000 */
668 }
669 else if (u > -5.42101e-020F) {
670 u = -5.42101e-020F; /* IEEE 32-bit binary value 0x9F800000 */
671 }
672 }
673 t[0] = t[1] = t[2] = t[3] = u;
674 store_vector4( &inst->DstReg, state, t );
675 }
676 break;
677 case VP_OPCODE_SUB: /* GL_NV_vertex_program1_1 */
678 {
679 GLfloat t[4], u[4], sum[4];
680 fetch_vector4( &inst->SrcReg[0], state, t );
681 fetch_vector4( &inst->SrcReg[1], state, u );
682 sum[0] = t[0] - u[0];
683 sum[1] = t[1] - u[1];
684 sum[2] = t[2] - u[2];
685 sum[3] = t[3] - u[3];
686 store_vector4( &inst->DstReg, state, sum );
687 }
688 break;
689 case VP_OPCODE_ABS: /* GL_NV_vertex_program1_1 */
690 {
691 GLfloat t[4];
692 fetch_vector4( &inst->SrcReg[0], state, t );
693 if (t[0] < 0.0) t[0] = -t[0];
694 if (t[1] < 0.0) t[1] = -t[1];
695 if (t[2] < 0.0) t[2] = -t[2];
696 if (t[3] < 0.0) t[3] = -t[3];
697 store_vector4( &inst->DstReg, state, t );
698 }
699 break;
700 case VP_OPCODE_FLR: /* GL_ARB_vertex_program */
701 {
702 GLfloat t[4];
703 fetch_vector4( &inst->SrcReg[0], state, t );
704 t[0] = FLOORF(t[0]);
705 t[1] = FLOORF(t[1]);
706 t[2] = FLOORF(t[2]);
707 t[3] = FLOORF(t[3]);
708 store_vector4( &inst->DstReg, state, t );
709 }
710 break;
711 case VP_OPCODE_FRC: /* GL_ARB_vertex_program */
712 {
713 GLfloat t[4];
714 fetch_vector4( &inst->SrcReg[0], state, t );
715 t[0] = t[0] - FLOORF(t[0]);
716 t[1] = t[1] - FLOORF(t[1]);
717 t[2] = t[2] - FLOORF(t[2]);
718 t[3] = t[3] - FLOORF(t[3]);
719 store_vector4( &inst->DstReg, state, t );
720 }
721 break;
722 case VP_OPCODE_EX2: /* GL_ARB_vertex_program */
723 {
724 GLfloat t[4];
725 fetch_vector1( &inst->SrcReg[0], state, t );
726 t[0] = t[1] = t[2] = t[3] = (GLfloat)_mesa_pow(2.0, t[0]);
727 store_vector4( &inst->DstReg, state, t );
728 }
729 break;
730 case VP_OPCODE_LG2: /* GL_ARB_vertex_program */
731 {
732 GLfloat t[4];
733 fetch_vector1( &inst->SrcReg[0], state, t );
734 t[0] = t[1] = t[2] = t[3] = LOG2(t[0]);
735 store_vector4( &inst->DstReg, state, t );
736 }
737 break;
738 case VP_OPCODE_POW: /* GL_ARB_vertex_program */
739 {
740 GLfloat t[4], u[4];
741 fetch_vector1( &inst->SrcReg[0], state, t );
742 fetch_vector1( &inst->SrcReg[1], state, u );
743 t[0] = t[1] = t[2] = t[3] = (GLfloat)_mesa_pow(t[0], u[0]);
744 store_vector4( &inst->DstReg, state, t );
745 }
746 break;
747 case VP_OPCODE_XPD: /* GL_ARB_vertex_program */
748 {
749 GLfloat t[4], u[4], cross[4];
750 fetch_vector4( &inst->SrcReg[0], state, t );
751 fetch_vector4( &inst->SrcReg[1], state, u );
752 cross[0] = t[1] * u[2] - t[2] * u[1];
753 cross[1] = t[2] * u[0] - t[0] * u[2];
754 cross[2] = t[0] * u[1] - t[1] * u[0];
755 store_vector4( &inst->DstReg, state, cross );
756 }
757 break;
758 case VP_OPCODE_SWZ: /* GL_ARB_vertex_program */
759 {
760 const struct vp_src_register *source = &inst->SrcReg[0];
761 const GLfloat *src = get_register_pointer(source, state);
762 GLfloat result[4];
763 GLuint i;
764
765 /* do extended swizzling here */
766 for (i = 0; i < 3; i++) {
767 if (source->Swizzle[i] == SWIZZLE_ZERO)
768 result[i] = 0.0;
769 else if (source->Swizzle[i] == SWIZZLE_ONE)
770 result[i] = -1.0;
771 else
772 result[i] = -src[source->Swizzle[i]];
773 if (source->Negate)
774 result[i] = -result[i];
775 }
776 store_vector4( &inst->DstReg, state, result );
777 }
778 break;
779
780 case VP_OPCODE_END:
781 ctx->_CurrentProgram = 0;
782 return;
783 default:
784 /* bad instruction opcode */
785 _mesa_problem(ctx, "Bad VP Opcode in _mesa_exec_vertex_program");
786 ctx->_CurrentProgram = 0;
787 return;
788 } /* switch */
789 } /* for */
790
791 ctx->_CurrentProgram = 0;
792 }
793
794
795
796 /**
797 Thoughts on vertex program optimization:
798
799 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
800 assembly code. That will probably be a lot of work.
801
802 Another approach might be to replace the vp_instruction->Opcode field with
803 a pointer to a specialized C function which executes the instruction.
804 In particular we can write functions which skip swizzling, negating,
805 masking, relative addressing, etc. when they're not needed.
806
807 For example:
808
809 void simple_add( struct vp_instruction *inst )
810 {
811 GLfloat *sum = machine->Registers[inst->DstReg.Register];
812 GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
813 GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
814 sum[0] = a[0] + b[0];
815 sum[1] = a[1] + b[1];
816 sum[2] = a[2] + b[2];
817 sum[3] = a[3] + b[3];
818 }
819
820 */
821
822 /*
823
824 KW:
825
826 A first step would be to 'vectorize' the programs in the same way as
827 the normal transformation code in the tnl module. Thus each opcode
828 takes zero or more input vectors (registers) and produces one or more
829 output vectors.
830
831 These operations would initially be coded in C, with machine-specific
832 assembly following, as is currently the case for matrix
833 transformations in the math/ directory. The preprocessing scheme for
834 selecting simpler operations Brian describes above would also work
835 here.
836
837 This should give reasonable performance without excessive effort.
838
839 */