new X86 CPU detection code (Petr Sebor)
[mesa.git] / src / mesa / main / nvvertexec.c
1 /* $Id: nvvertexec.c,v 1.1 2003/01/14 04:55:46 brianp Exp $ */
2
3 /*
4 * Mesa 3-D graphics library
5 * Version: 5.1
6 *
7 * Copyright (C) 1999-2003 Brian Paul All Rights Reserved.
8 *
9 * Permission is hereby granted, free of charge, to any person obtaining a
10 * copy of this software and associated documentation files (the "Software"),
11 * to deal in the Software without restriction, including without limitation
12 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 * and/or sell copies of the Software, and to permit persons to whom the
14 * Software is furnished to do so, subject to the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included
17 * in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
23 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 /**
28 * \file nvvertexec.c
29 * \brief Code to execute vertex programs.
30 * \author Brian Paul
31 */
32
33 #include "glheader.h"
34 #include "context.h"
35 #include "imports.h"
36 #include "macros.h"
37 #include "mtypes.h"
38 #include "nvvertexec.h"
39 #include "nvvertprog.h"
40 #include "mmath.h"
41 #include "math/m_matrix.h"
42
43
44 /**
45 * Load/initialize the vertex program registers.
46 * This needs to be done per vertex.
47 */
48 void
49 _mesa_init_vp_registers(GLcontext *ctx)
50 {
51 struct vp_machine *machine = &(ctx->VertexProgram.Machine);
52 GLuint i;
53
54 /* Input registers get initialized from the current vertex attribs */
55 MEMCPY(machine->Registers[VP_INPUT_REG_START],
56 ctx->Current.Attrib,
57 16 * 4 * sizeof(GLfloat));
58
59 /* Output and temp regs are initialized to [0,0,0,1] */
60 for (i = VP_OUTPUT_REG_START; i <= VP_OUTPUT_REG_END; i++) {
61 machine->Registers[i][0] = 0.0F;
62 machine->Registers[i][1] = 0.0F;
63 machine->Registers[i][2] = 0.0F;
64 machine->Registers[i][3] = 1.0F;
65 }
66 for (i = VP_TEMP_REG_START; i <= VP_TEMP_REG_END; i++) {
67 machine->Registers[i][0] = 0.0F;
68 machine->Registers[i][1] = 0.0F;
69 machine->Registers[i][2] = 0.0F;
70 machine->Registers[i][3] = 1.0F;
71 }
72
73 /* The program regs aren't touched */
74 }
75
76
77
78 /**
79 * Copy the 16 elements of a matrix into four consecutive program
80 * registers starting at 'pos'.
81 */
82 static void
83 load_matrix(GLfloat registers[][4], GLuint pos, const GLfloat mat[16])
84 {
85 GLuint i;
86 pos += VP_PROG_REG_START;
87 for (i = 0; i < 4; i++) {
88 registers[pos + i][0] = mat[0 + i];
89 registers[pos + i][1] = mat[4 + i];
90 registers[pos + i][2] = mat[8 + i];
91 registers[pos + i][3] = mat[12 + i];
92 }
93 }
94
95
96 /**
97 * As above, but transpose the matrix.
98 */
99 static void
100 load_transpose_matrix(GLfloat registers[][4], GLuint pos,
101 const GLfloat mat[16])
102 {
103 pos += VP_PROG_REG_START;
104 MEMCPY(registers[pos], mat, 16 * sizeof(GLfloat));
105 }
106
107
108 /**
109 * Load all currently tracked matrices into the program registers.
110 * This needs to be done per glBegin/glEnd.
111 */
112 void
113 _mesa_init_tracked_matrices(GLcontext *ctx)
114 {
115 GLuint i;
116
117 for (i = 0; i < VP_NUM_PROG_REGS / 4; i++) {
118 /* point 'mat' at source matrix */
119 GLmatrix *mat;
120 if (ctx->VertexProgram.TrackMatrix[i] == GL_MODELVIEW) {
121 mat = ctx->ModelviewMatrixStack.Top;
122 }
123 else if (ctx->VertexProgram.TrackMatrix[i] == GL_PROJECTION) {
124 mat = ctx->ProjectionMatrixStack.Top;
125 }
126 else if (ctx->VertexProgram.TrackMatrix[i] == GL_TEXTURE) {
127 mat = ctx->TextureMatrixStack[ctx->Texture.CurrentUnit].Top;
128 }
129 else if (ctx->VertexProgram.TrackMatrix[i] == GL_COLOR) {
130 mat = ctx->ColorMatrixStack.Top;
131 }
132 else if (ctx->VertexProgram.TrackMatrix[i]==GL_MODELVIEW_PROJECTION_NV) {
133 /* XXX verify the combined matrix is up to date */
134 mat = &ctx->_ModelProjectMatrix;
135 }
136 else if (ctx->VertexProgram.TrackMatrix[i] >= GL_MATRIX0_NV &&
137 ctx->VertexProgram.TrackMatrix[i] <= GL_MATRIX7_NV) {
138 GLuint n = ctx->VertexProgram.TrackMatrix[i] - GL_MATRIX0_NV;
139 ASSERT(n < MAX_PROGRAM_MATRICES);
140 mat = ctx->ProgramMatrixStack[n].Top;
141 }
142 else {
143 /* no matrix is tracked, but we leave the register values as-is */
144 assert(ctx->VertexProgram.TrackMatrix[i] == GL_NONE);
145 continue;
146 }
147
148 /* load the matrix */
149 if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
150 load_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->m);
151 }
152 else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_INVERSE_NV) {
153 _math_matrix_analyse(mat); /* update the inverse */
154 assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
155 load_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->inv);
156 }
157 else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_TRANSPOSE_NV) {
158 load_transpose_matrix(ctx->VertexProgram.Machine.Registers, i*4, mat->m);
159 }
160 else {
161 assert(ctx->VertexProgram.TrackMatrixTransform[i]
162 == GL_INVERSE_TRANSPOSE_NV);
163 _math_matrix_analyse(mat); /* update the inverse */
164 assert((mat->flags & MAT_DIRTY_INVERSE) == 0);
165 load_transpose_matrix(ctx->VertexProgram.Machine.Registers,
166 i*4, mat->inv);
167 }
168 }
169 }
170
171
172
173 /**
174 * For debugging. Dump the current vertex program machine registers.
175 */
176 void
177 _mesa_dump_vp_machine( const struct vp_machine *machine )
178 {
179 int i;
180 _mesa_printf("VertexIn:\n");
181 for (i = 0; i < VP_NUM_INPUT_REGS; i++) {
182 _mesa_printf("%d: %f %f %f %f ", i,
183 machine->Registers[i + VP_INPUT_REG_START][0],
184 machine->Registers[i + VP_INPUT_REG_START][1],
185 machine->Registers[i + VP_INPUT_REG_START][2],
186 machine->Registers[i + VP_INPUT_REG_START][3]);
187 }
188 _mesa_printf("\n");
189
190 _mesa_printf("VertexOut:\n");
191 for (i = 0; i < VP_NUM_OUTPUT_REGS; i++) {
192 _mesa_printf("%d: %f %f %f %f ", i,
193 machine->Registers[i + VP_OUTPUT_REG_START][0],
194 machine->Registers[i + VP_OUTPUT_REG_START][1],
195 machine->Registers[i + VP_OUTPUT_REG_START][2],
196 machine->Registers[i + VP_OUTPUT_REG_START][3]);
197 }
198 _mesa_printf("\n");
199
200 _mesa_printf("Registers:\n");
201 for (i = 0; i < VP_NUM_TEMP_REGS; i++) {
202 _mesa_printf("%d: %f %f %f %f ", i,
203 machine->Registers[i + VP_TEMP_REG_START][0],
204 machine->Registers[i + VP_TEMP_REG_START][1],
205 machine->Registers[i + VP_TEMP_REG_START][2],
206 machine->Registers[i + VP_TEMP_REG_START][3]);
207 }
208 _mesa_printf("\n");
209
210 _mesa_printf("Parameters:\n");
211 for (i = 0; i < VP_NUM_PROG_REGS; i++) {
212 _mesa_printf("%d: %f %f %f %f ", i,
213 machine->Registers[i + VP_PROG_REG_START][0],
214 machine->Registers[i + VP_PROG_REG_START][1],
215 machine->Registers[i + VP_PROG_REG_START][2],
216 machine->Registers[i + VP_PROG_REG_START][3]);
217 }
218 _mesa_printf("\n");
219 }
220
221
222 /**
223 * Fetch a 4-element float vector from the given source register.
224 * Apply swizzling and negating as needed.
225 */
226 static void
227 fetch_vector4( const struct vp_src_register *source,
228 const struct vp_machine *machine,
229 GLfloat result[4] )
230 {
231 static const GLfloat zero[4] = { 0, 0, 0, 0 };
232 const GLfloat *src;
233
234 if (source->RelAddr) {
235 GLint reg = source->Register + machine->AddressReg;
236 if (reg < VP_PROG_REG_START || reg > VP_PROG_REG_END)
237 src = zero;
238 else
239 src = machine->Registers[reg];
240 }
241 else {
242 src = machine->Registers[source->Register];
243 }
244
245 if (source->Negate) {
246 result[0] = -src[source->Swizzle[0]];
247 result[1] = -src[source->Swizzle[1]];
248 result[2] = -src[source->Swizzle[2]];
249 result[3] = -src[source->Swizzle[3]];
250 }
251 else {
252 result[0] = src[source->Swizzle[0]];
253 result[1] = src[source->Swizzle[1]];
254 result[2] = src[source->Swizzle[2]];
255 result[3] = src[source->Swizzle[3]];
256 }
257 }
258
259
260 /**
261 * As above, but only return result[0] element.
262 */
263 static void
264 fetch_vector1( const struct vp_src_register *source,
265 const struct vp_machine *machine,
266 GLfloat result[4] )
267 {
268 static const GLfloat zero[4] = { 0, 0, 0, 0 };
269 const GLfloat *src;
270
271 if (source->RelAddr) {
272 GLint reg = source->Register + machine->AddressReg;
273 if (reg < VP_PROG_REG_START || reg > VP_PROG_REG_END)
274 src = zero;
275 else
276 src = machine->Registers[reg];
277 }
278 else {
279 src = machine->Registers[source->Register];
280 }
281
282 if (source->Negate) {
283 result[0] = -src[source->Swizzle[0]];
284 }
285 else {
286 result[0] = src[source->Swizzle[0]];
287 }
288 }
289
290
291 /**
292 * Store 4 floats into a register.
293 */
294 static void
295 store_vector4( const struct vp_dst_register *dest, struct vp_machine *machine,
296 const GLfloat value[4] )
297 {
298 GLfloat *dst = machine->Registers[dest->Register];
299
300 if (dest->WriteMask[0])
301 dst[0] = value[0];
302 if (dest->WriteMask[1])
303 dst[1] = value[1];
304 if (dest->WriteMask[2])
305 dst[2] = value[2];
306 if (dest->WriteMask[3])
307 dst[3] = value[3];
308 }
309
310
/**
 * Set x to positive or negative infinity.
 *
 * 'x' must be a GLfloat lvalue.  With USE_IEEE the IEEE single-precision
 * bit patterns for +/-infinity are written directly; on VMS +/-__MAXFLOAT
 * is used as a stand-in; otherwise +/-HUGE_VAL is converted to GLfloat.
 *
 * The argument and the whole expansion are parenthesized so the macros
 * behave as ordinary expressions whatever lvalue expression is passed.
 */
#ifdef USE_IEEE
#define SET_POS_INFINITY(x)  ( *((GLuint *) &(x)) = 0x7F800000 )
#define SET_NEG_INFINITY(x)  ( *((GLuint *) &(x)) = 0xFF800000 )
#elif defined(VMS)
#define SET_POS_INFINITY(x)  ( (x) = __MAXFLOAT )
#define SET_NEG_INFINITY(x)  ( (x) = -__MAXFLOAT )
#else
#define SET_POS_INFINITY(x)  ( (x) = (GLfloat) HUGE_VAL )
#define SET_NEG_INFINITY(x)  ( (x) = (GLfloat) -HUGE_VAL )
#endif

/** Overwrite the storage of x with the integer 'bits', via fi_type's 'i' member. */
#define SET_FLOAT_BITS(x, bits)  ( ((fi_type *) &(x))->i = (bits) )
326
327
328 /**
329 * Execute the given vertex program
330 */
331 void
332 _mesa_exec_vertex_program(GLcontext *ctx, const struct vertex_program *program)
333 {
334 struct vp_machine *machine = &ctx->VertexProgram.Machine;
335 const struct vp_instruction *inst;
336
337 /* XXX load vertex fields into input registers */
338 /* and do other initialization */
339
340
341 for (inst = program->Instructions; inst->Opcode != VP_OPCODE_END; inst++) {
342 switch (inst->Opcode) {
343 case VP_OPCODE_MOV:
344 {
345 GLfloat t[4];
346 fetch_vector4( &inst->SrcReg[0], machine, t );
347 store_vector4( &inst->DstReg, machine, t );
348 }
349 break;
350 case VP_OPCODE_LIT:
351 {
352 const GLfloat epsilon = 1.0e-5F; /* XXX fix? */
353 GLfloat t[4], lit[4];
354 fetch_vector4( &inst->SrcReg[0], machine, t );
355 if (t[3] < -(128.0F - epsilon))
356 t[3] = - (128.0F - epsilon);
357 else if (t[3] > 128.0F - epsilon)
358 t[3] = 128.0F - epsilon;
359 if (t[0] < 0.0)
360 t[0] = 0.0;
361 if (t[1] < 0.0)
362 t[1] = 0.0;
363 lit[0] = 1.0;
364 lit[1] = t[0];
365 lit[2] = (t[0] > 0.0) ? (GLfloat) exp(t[3] * log(t[1])) : 0.0F;
366 lit[3] = 1.0;
367 store_vector4( &inst->DstReg, machine, lit );
368 }
369 break;
370 case VP_OPCODE_RCP:
371 {
372 GLfloat t[4];
373 fetch_vector1( &inst->SrcReg[0], machine, t );
374 if (t[0] != 1.0F)
375 t[0] = 1.0F / t[0]; /* div by zero is infinity! */
376 t[1] = t[2] = t[3] = t[0];
377 store_vector4( &inst->DstReg, machine, t );
378 }
379 break;
380 case VP_OPCODE_RSQ:
381 {
382 GLfloat t[4];
383 fetch_vector1( &inst->SrcReg[0], machine, t );
384 t[0] = (float) (1.0 / sqrt(fabs(t[0])));
385 t[1] = t[2] = t[3] = t[0];
386 store_vector4( &inst->DstReg, machine, t );
387 }
388 break;
389 case VP_OPCODE_EXP:
390 {
391 GLfloat t[4], q[4], floor_t0;
392 fetch_vector1( &inst->SrcReg[0], machine, t );
393 floor_t0 = (float) floor(t[0]);
394 if (floor_t0 > FLT_MAX_EXP) {
395 SET_POS_INFINITY(q[0]);
396 q[1] = 0.0F;
397 SET_POS_INFINITY(q[2]);
398 q[3] = 1.0F;
399 }
400 else if (floor_t0 < FLT_MIN_EXP) {
401 q[0] = 0.0F;
402 q[1] = 0.0F;
403 q[2] = 0.0F;
404 q[3] = 0.0F;
405 }
406 else {
407 #ifdef USE_IEEE
408 GLint ii = (GLint) floor_t0;
409 ii = (ii < 23) + 0x3f800000;
410 SET_FLOAT_BITS(q[0], ii);
411 q[0] = *((GLfloat *) &ii);
412 #else
413 q[0] = (GLfloat) pow(2.0, floor_t0);
414 #endif
415 q[1] = t[0] - floor_t0;
416 q[2] = (GLfloat) (q[0] * LOG2(q[1]));
417 q[3] = 1.0F;
418 }
419 store_vector4( &inst->DstReg, machine, t );
420 }
421 break;
422 case VP_OPCODE_LOG:
423 {
424 GLfloat t[4], q[4], abs_t0;
425 fetch_vector1( &inst->SrcReg[0], machine, t );
426 abs_t0 = (GLfloat) fabs(t[0]);
427 if (abs_t0 != 0.0F) {
428 /* Since we really can't handle infinite values on VMS
429 * like other OSes we'll use __MAXFLOAT to represent
430 * infinity. This may need some tweaking.
431 */
432 #ifdef VMS
433 if (abs_t0 == __MAXFLOAT) {
434 #else
435 if (IS_INF_OR_NAN(abs_t0)) {
436 #endif
437 SET_POS_INFINITY(q[0]);
438 q[1] = 1.0F;
439 SET_POS_INFINITY(q[2]);
440 }
441 else {
442 int exponent;
443 double mantissa = frexp(t[0], &exponent);
444 q[0] = (GLfloat) (exponent - 1);
445 q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
446 q[2] = (GLfloat) (q[0] + LOG2(q[1]));
447 }
448 }
449 else {
450 SET_NEG_INFINITY(q[0]);
451 q[1] = 1.0F;
452 SET_NEG_INFINITY(q[2]);
453 }
454 q[3] = 1.0;
455 store_vector4( &inst->DstReg, machine, q );
456 }
457 break;
458 case VP_OPCODE_MUL:
459 {
460 GLfloat t[4], u[4], prod[4];
461 fetch_vector4( &inst->SrcReg[0], machine, t );
462 fetch_vector4( &inst->SrcReg[1], machine, u );
463 prod[0] = t[0] * u[0];
464 prod[1] = t[1] * u[1];
465 prod[2] = t[2] * u[2];
466 prod[3] = t[3] * u[3];
467 store_vector4( &inst->DstReg, machine, prod );
468 }
469 break;
470 case VP_OPCODE_ADD:
471 {
472 GLfloat t[4], u[4], sum[4];
473 fetch_vector4( &inst->SrcReg[0], machine, t );
474 fetch_vector4( &inst->SrcReg[1], machine, u );
475 sum[0] = t[0] + u[0];
476 sum[1] = t[1] + u[1];
477 sum[2] = t[2] + u[2];
478 sum[3] = t[3] + u[3];
479 store_vector4( &inst->DstReg, machine, sum );
480 }
481 break;
482 case VP_OPCODE_DP3:
483 {
484 GLfloat t[4], u[4], dot[4];
485 fetch_vector4( &inst->SrcReg[0], machine, t );
486 fetch_vector4( &inst->SrcReg[1], machine, u );
487 dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
488 dot[1] = dot[2] = dot[3] = dot[0];
489 store_vector4( &inst->DstReg, machine, dot );
490 }
491 break;
492 case VP_OPCODE_DP4:
493 {
494 GLfloat t[4], u[4], dot[4];
495 fetch_vector4( &inst->SrcReg[0], machine, t );
496 fetch_vector4( &inst->SrcReg[1], machine, u );
497 dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + t[3] * u[3];
498 dot[1] = dot[2] = dot[3] = dot[0];
499 store_vector4( &inst->DstReg, machine, dot );
500 }
501 break;
502 case VP_OPCODE_DST:
503 {
504 GLfloat t[4], u[4], dst[4];
505 fetch_vector4( &inst->SrcReg[0], machine, t );
506 fetch_vector4( &inst->SrcReg[1], machine, u );
507 dst[0] = 1.0F;
508 dst[1] = t[1] * u[1];
509 dst[2] = t[2];
510 dst[3] = u[3];
511 store_vector4( &inst->DstReg, machine, dst );
512 }
513 break;
514 case VP_OPCODE_MIN:
515 {
516 GLfloat t[4], u[4], min[4];
517 fetch_vector4( &inst->SrcReg[0], machine, t );
518 fetch_vector4( &inst->SrcReg[1], machine, u );
519 min[0] = (t[0] < u[0]) ? t[0] : u[0];
520 min[1] = (t[1] < u[1]) ? t[1] : u[1];
521 min[2] = (t[2] < u[2]) ? t[2] : u[2];
522 min[3] = (t[3] < u[3]) ? t[3] : u[3];
523 store_vector4( &inst->DstReg, machine, min );
524 }
525 break;
526 case VP_OPCODE_MAX:
527 {
528 GLfloat t[4], u[4], max[4];
529 fetch_vector4( &inst->SrcReg[0], machine, t );
530 fetch_vector4( &inst->SrcReg[1], machine, u );
531 max[0] = (t[0] > u[0]) ? t[0] : u[0];
532 max[1] = (t[1] > u[1]) ? t[1] : u[1];
533 max[2] = (t[2] > u[2]) ? t[2] : u[2];
534 max[3] = (t[3] > u[3]) ? t[3] : u[3];
535 store_vector4( &inst->DstReg, machine, max );
536 }
537 break;
538 case VP_OPCODE_SLT:
539 {
540 GLfloat t[4], u[4], slt[4];
541 fetch_vector4( &inst->SrcReg[0], machine, t );
542 fetch_vector4( &inst->SrcReg[1], machine, u );
543 slt[0] = (t[0] < u[0]) ? 1.0F : 0.0F;
544 slt[1] = (t[1] < u[1]) ? 1.0F : 0.0F;
545 slt[2] = (t[2] < u[2]) ? 1.0F : 0.0F;
546 slt[3] = (t[3] < u[3]) ? 1.0F : 0.0F;
547 store_vector4( &inst->DstReg, machine, slt );
548 }
549 break;
550 case VP_OPCODE_SGE:
551 {
552 GLfloat t[4], u[4], sge[4];
553 fetch_vector4( &inst->SrcReg[0], machine, t );
554 fetch_vector4( &inst->SrcReg[1], machine, u );
555 sge[0] = (t[0] >= u[0]) ? 1.0F : 0.0F;
556 sge[1] = (t[1] >= u[1]) ? 1.0F : 0.0F;
557 sge[2] = (t[2] >= u[2]) ? 1.0F : 0.0F;
558 sge[3] = (t[3] >= u[3]) ? 1.0F : 0.0F;
559 store_vector4( &inst->DstReg, machine, sge );
560 }
561 break;
562 case VP_OPCODE_MAD:
563 {
564 GLfloat t[4], u[4], v[4], sum[4];
565 fetch_vector4( &inst->SrcReg[0], machine, t );
566 fetch_vector4( &inst->SrcReg[1], machine, u );
567 fetch_vector4( &inst->SrcReg[2], machine, v );
568 sum[0] = t[0] * u[0] + v[0];
569 sum[1] = t[1] * u[1] + v[1];
570 sum[2] = t[2] * u[2] + v[2];
571 sum[3] = t[3] * u[3] + v[3];
572 store_vector4( &inst->DstReg, machine, sum );
573 }
574 break;
575 case VP_OPCODE_ARL:
576 {
577 GLfloat t[4];
578 fetch_vector4( &inst->SrcReg[0], machine, t );
579 machine->AddressReg = (GLint) floor(t[0]);
580 }
581 break;
582 case VP_OPCODE_DPH:
583 {
584 GLfloat t[4], u[4], dot[4];
585 fetch_vector4( &inst->SrcReg[0], machine, t );
586 fetch_vector4( &inst->SrcReg[1], machine, u );
587 dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + u[3];
588 dot[1] = dot[2] = dot[3] = dot[0];
589 store_vector4( &inst->DstReg, machine, dot );
590 }
591 break;
592 case VP_OPCODE_RCC:
593 {
594 GLfloat t[4], u;
595 fetch_vector1( &inst->SrcReg[0], machine, t );
596 if (t[0] == 1.0F)
597 u = 1.0F;
598 else
599 u = 1.0F / t[0];
600 if (u > 0.0F) {
601 if (u > 1.884467e+019F) {
602 u = 1.884467e+019F; /* IEEE 32-bit binary value 0x5F800000 */
603 }
604 else if (u < 5.42101e-020F) {
605 u = 5.42101e-020F; /* IEEE 32-bit binary value 0x1F800000 */
606 }
607 }
608 else {
609 if (u < -1.884467e+019F) {
610 u = -1.884467e+019F; /* IEEE 32-bit binary value 0xDF800000 */
611 }
612 else if (u > -5.42101e-020F) {
613 u = -5.42101e-020F; /* IEEE 32-bit binary value 0x9F800000 */
614 }
615 }
616 t[0] = t[1] = t[2] = t[3] = u;
617 store_vector4( &inst->DstReg, machine, t );
618 }
619 break;
620 case VP_OPCODE_SUB:
621 {
622 GLfloat t[4], u[4], sum[4];
623 fetch_vector4( &inst->SrcReg[0], machine, t );
624 fetch_vector4( &inst->SrcReg[1], machine, u );
625 sum[0] = t[0] - u[0];
626 sum[1] = t[1] - u[1];
627 sum[2] = t[2] - u[2];
628 sum[3] = t[3] - u[3];
629 store_vector4( &inst->DstReg, machine, sum );
630 }
631 break;
632 case VP_OPCODE_ABS:
633 {
634 GLfloat t[4];
635 fetch_vector4( &inst->SrcReg[0], machine, t );
636 if (t[0] < 0.0) t[0] = -t[0];
637 if (t[1] < 0.0) t[1] = -t[1];
638 if (t[2] < 0.0) t[2] = -t[2];
639 if (t[3] < 0.0) t[3] = -t[3];
640 store_vector4( &inst->DstReg, machine, t );
641 }
642 break;
643
644 case VP_OPCODE_END:
645 return;
646 default:
647 /* bad instruction opcode */
648 _mesa_problem(ctx, "Bad VP Opcode in _mesa_exec_vertex_program");
649 return;
650 }
651 }
652 }
653
654
655
656 /**
657 Thoughts on vertex program optimization:
658
659 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
660 assembly code. That will probably be a lot of work.
661
662 Another approach might be to replace the vp_instruction->Opcode field with
663 a pointer to a specialized C function which executes the instruction.
664 In particular we can write functions which skip swizzling, negating,
665 masking, relative addressing, etc. when they're not needed.
666
667 For example:
668
669 void simple_add( struct vp_instruction *inst )
670 {
671 GLfloat *sum = machine->Registers[inst->DstReg.Register];
672 GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
673 GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
674 sum[0] = a[0] + b[0];
675 sum[1] = a[1] + b[1];
676 sum[2] = a[2] + b[2];
677 sum[3] = a[3] + b[3];
678 }
679
680 */
681
682 /*
683
684 KW:
685
686 A first step would be to 'vectorize' the programs in the same way as
687 the normal transformation code in the tnl module. Thus each opcode
688 takes zero or more input vectors (registers) and produces one or more
689 output vectors.
690
691 These operations would initially be coded in C, with machine-specific
692 assembly following, as is currently the case for matrix
693 transformations in the math/ directory. The preprocessing scheme for
694 selecting simpler operations Brian describes above would also work
695 here.
696
697 This should give reasonable performance without excessive effort.
698
699 */