2 * Mesa 3-D graphics library
5 * Copyright (C) 1999-2003 Brian Paul All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 * \brief Code to execute vertex programs.
36 #include "nvvertexec.h"
37 #include "nvvertprog.h"
38 #include "math/m_matrix.h"
42 * Load/initialize the vertex program registers.
43 * This needs to be done per vertex.
46 _mesa_init_vp_registers(GLcontext
*ctx
)
48 struct vp_machine
*machine
= &(ctx
->VertexProgram
.Machine
);
51 /* Input registers get initialized from the current vertex attribs */
52 MEMCPY(machine
->Registers
[VP_INPUT_REG_START
],
54 16 * 4 * sizeof(GLfloat
));
56 /* Output and temp regs are initialized to [0,0,0,1] */
57 for (i
= VP_OUTPUT_REG_START
; i
<= VP_OUTPUT_REG_END
; i
++) {
58 machine
->Registers
[i
][0] = 0.0F
;
59 machine
->Registers
[i
][1] = 0.0F
;
60 machine
->Registers
[i
][2] = 0.0F
;
61 machine
->Registers
[i
][3] = 1.0F
;
63 for (i
= VP_TEMP_REG_START
; i
<= VP_TEMP_REG_END
; i
++) {
64 machine
->Registers
[i
][0] = 0.0F
;
65 machine
->Registers
[i
][1] = 0.0F
;
66 machine
->Registers
[i
][2] = 0.0F
;
67 machine
->Registers
[i
][3] = 1.0F
;
70 /* The program regs aren't touched */
76 * Copy the 16 elements of a matrix into four consecutive program
77 * registers starting at 'pos'.
80 load_matrix(GLfloat registers
[][4], GLuint pos
, const GLfloat mat
[16])
83 pos
+= VP_PROG_REG_START
;
84 for (i
= 0; i
< 4; i
++) {
85 registers
[pos
+ i
][0] = mat
[0 + i
];
86 registers
[pos
+ i
][1] = mat
[4 + i
];
87 registers
[pos
+ i
][2] = mat
[8 + i
];
88 registers
[pos
+ i
][3] = mat
[12 + i
];
94 * As above, but transpose the matrix.
97 load_transpose_matrix(GLfloat registers
[][4], GLuint pos
,
98 const GLfloat mat
[16])
100 pos
+= VP_PROG_REG_START
;
101 MEMCPY(registers
[pos
], mat
, 16 * sizeof(GLfloat
));
106 * Load all currently tracked matrices into the program registers.
107 * This needs to be done per glBegin/glEnd.
110 _mesa_init_tracked_matrices(GLcontext
*ctx
)
114 for (i
= 0; i
< VP_NUM_PROG_REGS
/ 4; i
++) {
115 /* point 'mat' at source matrix */
117 if (ctx
->VertexProgram
.TrackMatrix
[i
] == GL_MODELVIEW
) {
118 mat
= ctx
->ModelviewMatrixStack
.Top
;
120 else if (ctx
->VertexProgram
.TrackMatrix
[i
] == GL_PROJECTION
) {
121 mat
= ctx
->ProjectionMatrixStack
.Top
;
123 else if (ctx
->VertexProgram
.TrackMatrix
[i
] == GL_TEXTURE
) {
124 mat
= ctx
->TextureMatrixStack
[ctx
->Texture
.CurrentUnit
].Top
;
126 else if (ctx
->VertexProgram
.TrackMatrix
[i
] == GL_COLOR
) {
127 mat
= ctx
->ColorMatrixStack
.Top
;
129 else if (ctx
->VertexProgram
.TrackMatrix
[i
]==GL_MODELVIEW_PROJECTION_NV
) {
130 /* XXX verify the combined matrix is up to date */
131 mat
= &ctx
->_ModelProjectMatrix
;
133 else if (ctx
->VertexProgram
.TrackMatrix
[i
] >= GL_MATRIX0_NV
&&
134 ctx
->VertexProgram
.TrackMatrix
[i
] <= GL_MATRIX7_NV
) {
135 GLuint n
= ctx
->VertexProgram
.TrackMatrix
[i
] - GL_MATRIX0_NV
;
136 ASSERT(n
< MAX_PROGRAM_MATRICES
);
137 mat
= ctx
->ProgramMatrixStack
[n
].Top
;
140 /* no matrix is tracked, but we leave the register values as-is */
141 assert(ctx
->VertexProgram
.TrackMatrix
[i
] == GL_NONE
);
145 /* load the matrix */
146 if (ctx
->VertexProgram
.TrackMatrixTransform
[i
] == GL_IDENTITY_NV
) {
147 load_matrix(ctx
->VertexProgram
.Machine
.Registers
, i
*4, mat
->m
);
149 else if (ctx
->VertexProgram
.TrackMatrixTransform
[i
] == GL_INVERSE_NV
) {
150 _math_matrix_analyse(mat
); /* update the inverse */
151 assert((mat
->flags
& MAT_DIRTY_INVERSE
) == 0);
152 load_matrix(ctx
->VertexProgram
.Machine
.Registers
, i
*4, mat
->inv
);
154 else if (ctx
->VertexProgram
.TrackMatrixTransform
[i
] == GL_TRANSPOSE_NV
) {
155 load_transpose_matrix(ctx
->VertexProgram
.Machine
.Registers
, i
*4, mat
->m
);
158 assert(ctx
->VertexProgram
.TrackMatrixTransform
[i
]
159 == GL_INVERSE_TRANSPOSE_NV
);
160 _math_matrix_analyse(mat
); /* update the inverse */
161 assert((mat
->flags
& MAT_DIRTY_INVERSE
) == 0);
162 load_transpose_matrix(ctx
->VertexProgram
.Machine
.Registers
,
171 * For debugging. Dump the current vertex program machine registers.
174 _mesa_dump_vp_machine( const struct vp_machine
*machine
)
177 _mesa_printf("VertexIn:\n");
178 for (i
= 0; i
< VP_NUM_INPUT_REGS
; i
++) {
179 _mesa_printf("%d: %f %f %f %f ", i
,
180 machine
->Registers
[i
+ VP_INPUT_REG_START
][0],
181 machine
->Registers
[i
+ VP_INPUT_REG_START
][1],
182 machine
->Registers
[i
+ VP_INPUT_REG_START
][2],
183 machine
->Registers
[i
+ VP_INPUT_REG_START
][3]);
187 _mesa_printf("VertexOut:\n");
188 for (i
= 0; i
< VP_NUM_OUTPUT_REGS
; i
++) {
189 _mesa_printf("%d: %f %f %f %f ", i
,
190 machine
->Registers
[i
+ VP_OUTPUT_REG_START
][0],
191 machine
->Registers
[i
+ VP_OUTPUT_REG_START
][1],
192 machine
->Registers
[i
+ VP_OUTPUT_REG_START
][2],
193 machine
->Registers
[i
+ VP_OUTPUT_REG_START
][3]);
197 _mesa_printf("Registers:\n");
198 for (i
= 0; i
< VP_NUM_TEMP_REGS
; i
++) {
199 _mesa_printf("%d: %f %f %f %f ", i
,
200 machine
->Registers
[i
+ VP_TEMP_REG_START
][0],
201 machine
->Registers
[i
+ VP_TEMP_REG_START
][1],
202 machine
->Registers
[i
+ VP_TEMP_REG_START
][2],
203 machine
->Registers
[i
+ VP_TEMP_REG_START
][3]);
207 _mesa_printf("Parameters:\n");
208 for (i
= 0; i
< VP_NUM_PROG_REGS
; i
++) {
209 _mesa_printf("%d: %f %f %f %f ", i
,
210 machine
->Registers
[i
+ VP_PROG_REG_START
][0],
211 machine
->Registers
[i
+ VP_PROG_REG_START
][1],
212 machine
->Registers
[i
+ VP_PROG_REG_START
][2],
213 machine
->Registers
[i
+ VP_PROG_REG_START
][3]);
220 * Fetch a 4-element float vector from the given source register.
221 * Apply swizzling and negating as needed.
224 fetch_vector4( const struct vp_src_register
*source
,
225 const struct vp_machine
*machine
,
228 static const GLfloat zero
[4] = { 0, 0, 0, 0 };
231 if (source
->RelAddr
) {
232 const GLint reg
= source
->Register
+ machine
->AddressReg
;
233 if (reg
< 0 || reg
> MAX_NV_VERTEX_PROGRAM_PARAMS
)
236 src
= machine
->Registers
[VP_PROG_REG_START
+ reg
];
239 src
= machine
->Registers
[source
->Register
];
242 if (source
->Negate
) {
243 result
[0] = -src
[source
->Swizzle
[0]];
244 result
[1] = -src
[source
->Swizzle
[1]];
245 result
[2] = -src
[source
->Swizzle
[2]];
246 result
[3] = -src
[source
->Swizzle
[3]];
249 result
[0] = src
[source
->Swizzle
[0]];
250 result
[1] = src
[source
->Swizzle
[1]];
251 result
[2] = src
[source
->Swizzle
[2]];
252 result
[3] = src
[source
->Swizzle
[3]];
258 * As above, but only return result[0] element.
261 fetch_vector1( const struct vp_src_register
*source
,
262 const struct vp_machine
*machine
,
265 static const GLfloat zero
[4] = { 0, 0, 0, 0 };
268 if (source
->RelAddr
) {
269 const GLint reg
= source
->Register
+ machine
->AddressReg
;
270 if (reg
< 0 || reg
> MAX_NV_VERTEX_PROGRAM_PARAMS
)
273 src
= machine
->Registers
[VP_PROG_REG_START
+ reg
];
276 src
= machine
->Registers
[source
->Register
];
279 if (source
->Negate
) {
280 result
[0] = -src
[source
->Swizzle
[0]];
283 result
[0] = src
[source
->Swizzle
[0]];
289 * Store 4 floats into a register.
292 store_vector4( const struct vp_dst_register
*dest
, struct vp_machine
*machine
,
293 const GLfloat value
[4] )
295 GLfloat
*dst
= machine
->Registers
[dest
->Register
];
297 if (dest
->WriteMask
[0])
299 if (dest
->WriteMask
[1])
301 if (dest
->WriteMask
[2])
303 if (dest
->WriteMask
[3])
309 * Set x to positive or negative infinity.
312 #define SET_POS_INFINITY(x) ( *((GLuint *) &x) = 0x7F800000 )
313 #define SET_NEG_INFINITY(x) ( *((GLuint *) &x) = 0xFF800000 )
315 #define SET_POS_INFINITY(x) x = __MAXFLOAT
316 #define SET_NEG_INFINITY(x) x = -__MAXFLOAT
318 #define SET_POS_INFINITY(x) x = (GLfloat) HUGE_VAL
319 #define SET_NEG_INFINITY(x) x = (GLfloat) -HUGE_VAL
322 #define SET_FLOAT_BITS(x, bits) ((fi_type *) &(x))->i = bits
326 * Execute the given vertex program
329 _mesa_exec_vertex_program(GLcontext
*ctx
, const struct vertex_program
*program
)
331 struct vp_machine
*machine
= &ctx
->VertexProgram
.Machine
;
332 const struct vp_instruction
*inst
;
334 for (inst
= program
->Instructions
; inst
->Opcode
!= VP_OPCODE_END
; inst
++) {
335 switch (inst
->Opcode
) {
339 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
340 store_vector4( &inst
->DstReg
, machine
, t
);
345 const GLfloat epsilon
= 1.0e-5F
; /* XXX fix? */
346 GLfloat t
[4], lit
[4];
347 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
348 if (t
[3] < -(128.0F
- epsilon
))
349 t
[3] = - (128.0F
- epsilon
);
350 else if (t
[3] > 128.0F
- epsilon
)
351 t
[3] = 128.0F
- epsilon
;
358 lit
[2] = (t
[0] > 0.0) ? (GLfloat
) exp(t
[3] * log(t
[1])) : 0.0F
;
360 store_vector4( &inst
->DstReg
, machine
, lit
);
366 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
368 t
[0] = 1.0F
/ t
[0]; /* div by zero is infinity! */
369 t
[1] = t
[2] = t
[3] = t
[0];
370 store_vector4( &inst
->DstReg
, machine
, t
);
376 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
377 t
[0] = INV_SQRTF(FABSF(t
[0]));
378 t
[1] = t
[2] = t
[3] = t
[0];
379 store_vector4( &inst
->DstReg
, machine
, t
);
384 GLfloat t
[4], q
[4], floor_t0
;
385 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
386 floor_t0
= (float) floor(t
[0]);
387 if (floor_t0
> FLT_MAX_EXP
) {
388 SET_POS_INFINITY(q
[0]);
389 SET_POS_INFINITY(q
[2]);
391 else if (floor_t0
< FLT_MIN_EXP
) {
397 GLint ii
= (GLint
) floor_t0
;
398 ii
= (ii
< 23) + 0x3f800000;
399 SET_FLOAT_BITS(q
[0], ii
);
400 q
[0] = *((GLfloat
*) &ii
);
402 q
[0] = (GLfloat
) pow(2.0, floor_t0
);
404 q
[2] = (GLfloat
) (q
[0] * LOG2(q
[1]));
406 q
[1] = t
[0] - floor_t0
;
408 store_vector4( &inst
->DstReg
, machine
, q
);
413 GLfloat t
[4], q
[4], abs_t0
;
414 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
415 abs_t0
= (GLfloat
) fabs(t
[0]);
416 if (abs_t0
!= 0.0F
) {
417 /* Since we really can't handle infinite values on VMS
418 * like other OSes we'll use __MAXFLOAT to represent
419 * infinity. This may need some tweaking.
422 if (abs_t0
== __MAXFLOAT
) {
424 if (IS_INF_OR_NAN(abs_t0
)) {
426 SET_POS_INFINITY(q
[0]);
428 SET_POS_INFINITY(q
[2]);
432 double mantissa
= frexp(t
[0], &exponent
);
433 q
[0] = (GLfloat
) (exponent
- 1);
434 q
[1] = (GLfloat
) (2.0 * mantissa
); /* map [.5, 1) -> [1, 2) */
435 q
[2] = (GLfloat
) (q
[0] + LOG2(q
[1]));
439 SET_NEG_INFINITY(q
[0]);
441 SET_NEG_INFINITY(q
[2]);
444 store_vector4( &inst
->DstReg
, machine
, q
);
449 GLfloat t
[4], u
[4], prod
[4];
450 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
451 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
452 prod
[0] = t
[0] * u
[0];
453 prod
[1] = t
[1] * u
[1];
454 prod
[2] = t
[2] * u
[2];
455 prod
[3] = t
[3] * u
[3];
456 store_vector4( &inst
->DstReg
, machine
, prod
);
461 GLfloat t
[4], u
[4], sum
[4];
462 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
463 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
464 sum
[0] = t
[0] + u
[0];
465 sum
[1] = t
[1] + u
[1];
466 sum
[2] = t
[2] + u
[2];
467 sum
[3] = t
[3] + u
[3];
468 store_vector4( &inst
->DstReg
, machine
, sum
);
473 GLfloat t
[4], u
[4], dot
[4];
474 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
475 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
476 dot
[0] = t
[0] * u
[0] + t
[1] * u
[1] + t
[2] * u
[2];
477 dot
[1] = dot
[2] = dot
[3] = dot
[0];
478 store_vector4( &inst
->DstReg
, machine
, dot
);
483 GLfloat t
[4], u
[4], dot
[4];
484 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
485 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
486 dot
[0] = t
[0] * u
[0] + t
[1] * u
[1] + t
[2] * u
[2] + t
[3] * u
[3];
487 dot
[1] = dot
[2] = dot
[3] = dot
[0];
488 store_vector4( &inst
->DstReg
, machine
, dot
);
493 GLfloat t
[4], u
[4], dst
[4];
494 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
495 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
497 dst
[1] = t
[1] * u
[1];
500 store_vector4( &inst
->DstReg
, machine
, dst
);
505 GLfloat t
[4], u
[4], min
[4];
506 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
507 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
508 min
[0] = (t
[0] < u
[0]) ? t
[0] : u
[0];
509 min
[1] = (t
[1] < u
[1]) ? t
[1] : u
[1];
510 min
[2] = (t
[2] < u
[2]) ? t
[2] : u
[2];
511 min
[3] = (t
[3] < u
[3]) ? t
[3] : u
[3];
512 store_vector4( &inst
->DstReg
, machine
, min
);
517 GLfloat t
[4], u
[4], max
[4];
518 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
519 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
520 max
[0] = (t
[0] > u
[0]) ? t
[0] : u
[0];
521 max
[1] = (t
[1] > u
[1]) ? t
[1] : u
[1];
522 max
[2] = (t
[2] > u
[2]) ? t
[2] : u
[2];
523 max
[3] = (t
[3] > u
[3]) ? t
[3] : u
[3];
524 store_vector4( &inst
->DstReg
, machine
, max
);
529 GLfloat t
[4], u
[4], slt
[4];
530 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
531 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
532 slt
[0] = (t
[0] < u
[0]) ? 1.0F
: 0.0F
;
533 slt
[1] = (t
[1] < u
[1]) ? 1.0F
: 0.0F
;
534 slt
[2] = (t
[2] < u
[2]) ? 1.0F
: 0.0F
;
535 slt
[3] = (t
[3] < u
[3]) ? 1.0F
: 0.0F
;
536 store_vector4( &inst
->DstReg
, machine
, slt
);
541 GLfloat t
[4], u
[4], sge
[4];
542 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
543 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
544 sge
[0] = (t
[0] >= u
[0]) ? 1.0F
: 0.0F
;
545 sge
[1] = (t
[1] >= u
[1]) ? 1.0F
: 0.0F
;
546 sge
[2] = (t
[2] >= u
[2]) ? 1.0F
: 0.0F
;
547 sge
[3] = (t
[3] >= u
[3]) ? 1.0F
: 0.0F
;
548 store_vector4( &inst
->DstReg
, machine
, sge
);
553 GLfloat t
[4], u
[4], v
[4], sum
[4];
554 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
555 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
556 fetch_vector4( &inst
->SrcReg
[2], machine
, v
);
557 sum
[0] = t
[0] * u
[0] + v
[0];
558 sum
[1] = t
[1] * u
[1] + v
[1];
559 sum
[2] = t
[2] * u
[2] + v
[2];
560 sum
[3] = t
[3] * u
[3] + v
[3];
561 store_vector4( &inst
->DstReg
, machine
, sum
);
567 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
568 machine
->AddressReg
= (GLint
) floor(t
[0]);
573 GLfloat t
[4], u
[4], dot
[4];
574 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
575 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
576 dot
[0] = t
[0] * u
[0] + t
[1] * u
[1] + t
[2] * u
[2] + u
[3];
577 dot
[1] = dot
[2] = dot
[3] = dot
[0];
578 store_vector4( &inst
->DstReg
, machine
, dot
);
584 fetch_vector1( &inst
->SrcReg
[0], machine
, t
);
590 if (u
> 1.884467e+019F
) {
591 u
= 1.884467e+019F
; /* IEEE 32-bit binary value 0x5F800000 */
593 else if (u
< 5.42101e-020F
) {
594 u
= 5.42101e-020F
; /* IEEE 32-bit binary value 0x1F800000 */
598 if (u
< -1.884467e+019F
) {
599 u
= -1.884467e+019F
; /* IEEE 32-bit binary value 0xDF800000 */
601 else if (u
> -5.42101e-020F
) {
602 u
= -5.42101e-020F
; /* IEEE 32-bit binary value 0x9F800000 */
605 t
[0] = t
[1] = t
[2] = t
[3] = u
;
606 store_vector4( &inst
->DstReg
, machine
, t
);
611 GLfloat t
[4], u
[4], sum
[4];
612 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
613 fetch_vector4( &inst
->SrcReg
[1], machine
, u
);
614 sum
[0] = t
[0] - u
[0];
615 sum
[1] = t
[1] - u
[1];
616 sum
[2] = t
[2] - u
[2];
617 sum
[3] = t
[3] - u
[3];
618 store_vector4( &inst
->DstReg
, machine
, sum
);
624 fetch_vector4( &inst
->SrcReg
[0], machine
, t
);
625 if (t
[0] < 0.0) t
[0] = -t
[0];
626 if (t
[1] < 0.0) t
[1] = -t
[1];
627 if (t
[2] < 0.0) t
[2] = -t
[2];
628 if (t
[3] < 0.0) t
[3] = -t
[3];
629 store_vector4( &inst
->DstReg
, machine
, t
);
636 /* bad instruction opcode */
637 _mesa_problem(ctx
, "Bad VP Opcode in _mesa_exec_vertex_program");
646 Thoughts on vertex program optimization:
648 The obvious thing to do is to compile the vertex program into X86/SSE/3DNow!
649 assembly code. That will probably be a lot of work.
651 Another approach might be to replace the vp_instruction->Opcode field with
652 a pointer to a specialized C function which executes the instruction.
653 In particular we can write functions which skip swizzling, negating,
654 masking, relative addressing, etc. when they're not needed.
658 void simple_add( struct vp_instruction *inst )
660 GLfloat *sum = machine->Registers[inst->DstReg.Register];
661 GLfloat *a = machine->Registers[inst->SrcReg[0].Register];
662 GLfloat *b = machine->Registers[inst->SrcReg[1].Register];
663 sum[0] = a[0] + b[0];
664 sum[1] = a[1] + b[1];
665 sum[2] = a[2] + b[2];
666 sum[3] = a[3] + b[3];
675 A first step would be to 'vectorize' the programs in the same way as
676 the normal transformation code in the tnl module. Thus each opcode
677 takes zero or more input vectors (registers) and produces one or more
680 These operations would intially be coded in C, with machine-specific
681 assembly following, as is currently the case for matrix
682 transformations in the math/ directory. The preprocessing scheme for
683 selecting simpler operations Brian describes above would also work
686 This should give reasonable performance without excessive effort.