2 * Mesa 3-D graphics library
5 * Copyright (C) 1999-2006 Brian Paul All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 * \file t_arb_program.c
27 * Compile vertex programs to an intermediate representation.
28 * Execute vertex programs over a buffer of vertices.
29 * \author Keith Whitwell, Brian Paul
37 #include "arbprogparse.h"
40 #include "math/m_matrix.h"
41 #include "math/m_translate.h"
42 #include "t_context.h"
43 #include "t_pipeline.h"
44 #include "t_vb_arbprogram.h"
46 #include "program_instruction.h"
54 union instruction
*csr
;
/* Fetch the arb_vp_machine hanging off a pipeline stage's privatePtr.
 * The argument is parenthesized so that any pointer expression
 * (e.g. "stages + 1") can be passed safely.
 */
#define ARB_VP_MACHINE(stage) ((struct arb_vp_machine *)((stage)->privatePtr))
/* Replicate component 0 of a 4-vector into components 1..3.
 * Expands to an expression whose value is the replicated scalar.
 * Note: the argument is evaluated more than once.
 */
#define PUFF(x) ((x)[1] = ((x)[2] = ((x)[3] = (x)[0])))
64 /* Lower precision functions for the EXP, LOG and LIT opcodes. The
65 * LOG2() implementation is probably not accurate enough, and the
66 * attempted optimization for Exp2 is definitely not accurate
67 * enough - it discards all of t's fractional bits!
69 static GLfloat
RoughApproxLog2(GLfloat t
)
74 static GLfloat
RoughApproxExp2(GLfloat t
)
79 fi
.i
= (fi
.i
<< 23) + 0x3f800000;
82 return (GLfloat
) _mesa_pow(2.0, t
);
86 static GLfloat
RoughApproxPower(GLfloat x
, GLfloat y
)
88 if (x
== 0.0 && y
== 0.0)
89 return 1.0; /* spec requires this */
91 return RoughApproxExp2(y
* RoughApproxLog2(x
));
95 /* Higher precision functions for the EX2, LG2 and POW opcodes:
97 static GLfloat
ApproxLog2(GLfloat t
)
99 return (GLfloat
) (LOGF(t
) * 1.442695F
);
102 static GLfloat
ApproxExp2(GLfloat t
)
104 return (GLfloat
) _mesa_pow(2.0, t
);
107 static GLfloat
ApproxPower(GLfloat x
, GLfloat y
)
109 return (GLfloat
) _mesa_pow(x
, y
);
112 static GLfloat
rough_approx_log2_0_1(GLfloat x
)
119 * Perform a reduced swizzle:
121 static void do_RSW( struct arb_vp_machine
*m
, union instruction op
)
123 GLfloat
*result
= m
->File
[0][op
.rsw
.dst
];
124 const GLfloat
*arg0
= m
->File
[op
.rsw
.file0
][op
.rsw
.idx0
];
125 GLuint swz
= op
.rsw
.swz
;
126 GLuint neg
= op
.rsw
.neg
;
129 /* Need a temporary to be correct in the case where result == arg0.
133 result
[0] = tmp
[GET_SWZ(swz
, 0)];
134 result
[1] = tmp
[GET_SWZ(swz
, 1)];
135 result
[2] = tmp
[GET_SWZ(swz
, 2)];
136 result
[3] = tmp
[GET_SWZ(swz
, 3)];
139 if (neg
& 0x1) result
[0] = -result
[0];
140 if (neg
& 0x2) result
[1] = -result
[1];
141 if (neg
& 0x4) result
[2] = -result
[2];
142 if (neg
& 0x8) result
[3] = -result
[3];
147 * Perform a full swizzle
149 static void do_SWZ( struct arb_vp_machine
*m
, union instruction op
)
151 GLfloat
*result
= m
->File
[0][op
.rsw
.dst
];
152 const GLfloat
*arg0
= m
->File
[op
.rsw
.file0
][op
.rsw
.idx0
];
153 GLuint swz
= op
.rsw
.swz
;
154 GLuint neg
= op
.rsw
.neg
;
159 /* Need a temporary to be correct in the case where result == arg0.
163 result
[0] = tmp
[GET_SWZ(swz
, 0)];
164 result
[1] = tmp
[GET_SWZ(swz
, 1)];
165 result
[2] = tmp
[GET_SWZ(swz
, 2)];
166 result
[3] = tmp
[GET_SWZ(swz
, 3)];
169 if (neg
& 0x1) result
[0] = -result
[0];
170 if (neg
& 0x2) result
[1] = -result
[1];
171 if (neg
& 0x4) result
[2] = -result
[2];
172 if (neg
& 0x8) result
[3] = -result
[3];
176 /* Used to implement write masking. To make things easier for the sse
177 * generator I've gone back to a 1 argument version of this function
178 * (dst.msk = arg), rather than the semantically cleaner (dst = SEL
181 * That means this is the only instruction which doesn't write a full
182 * 4 dwords out. This would make such a program harder to analyse,
183 * but it looks like analysis is going to take place on a higher level
186 static void do_MSK( struct arb_vp_machine
*m
, union instruction op
)
188 GLfloat
*dst
= m
->File
[0][op
.msk
.dst
];
189 const GLfloat
*arg
= m
->File
[op
.msk
.file
][op
.msk
.idx
];
191 if (op
.msk
.mask
& 0x1) dst
[0] = arg
[0];
192 if (op
.msk
.mask
& 0x2) dst
[1] = arg
[1];
193 if (op
.msk
.mask
& 0x4) dst
[2] = arg
[2];
194 if (op
.msk
.mask
& 0x8) dst
[3] = arg
[3];
198 static void do_PRT( struct arb_vp_machine
*m
, union instruction op
)
200 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
202 _mesa_printf("%d: %f %f %f %f\n", m
->vtx_nr
,
203 arg0
[0], arg0
[1], arg0
[2], arg0
[3]);
208 * The traditional ALU and texturing instructions. All operate on
209 * internal registers and ignore write masks and swizzling issues.
212 static void do_ABS( struct arb_vp_machine
*m
, union instruction op
)
214 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
215 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
217 result
[0] = (arg0
[0] < 0.0) ? -arg0
[0] : arg0
[0];
218 result
[1] = (arg0
[1] < 0.0) ? -arg0
[1] : arg0
[1];
219 result
[2] = (arg0
[2] < 0.0) ? -arg0
[2] : arg0
[2];
220 result
[3] = (arg0
[3] < 0.0) ? -arg0
[3] : arg0
[3];
223 static void do_ADD( struct arb_vp_machine
*m
, union instruction op
)
225 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
226 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
227 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
229 result
[0] = arg0
[0] + arg1
[0];
230 result
[1] = arg0
[1] + arg1
[1];
231 result
[2] = arg0
[2] + arg1
[2];
232 result
[3] = arg0
[3] + arg1
[3];
236 static void do_DP3( struct arb_vp_machine
*m
, union instruction op
)
238 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
239 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
240 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
242 result
[0] = (arg0
[0] * arg1
[0] +
251 static void do_DP4( struct arb_vp_machine
*m
, union instruction op
)
253 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
254 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
255 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
257 result
[0] = (arg0
[0] * arg1
[0] +
265 static void do_DPH( struct arb_vp_machine
*m
, union instruction op
)
267 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
268 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
269 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
271 result
[0] = (arg0
[0] * arg1
[0] +
279 static void do_DST( struct arb_vp_machine
*m
, union instruction op
)
281 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
282 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
283 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
285 /* This should be ok even if result == arg0 or result == arg1.
288 result
[1] = arg0
[1] * arg1
[1];
294 /* Intended to be high precision:
296 static void do_EX2( struct arb_vp_machine
*m
, union instruction op
)
298 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
299 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
301 result
[0] = (GLfloat
)ApproxExp2(arg0
[0]);
306 /* Allowed to be lower precision:
308 static void do_EXP( struct arb_vp_machine
*m
, union instruction op
)
310 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
311 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
312 GLfloat tmp
= arg0
[0];
313 GLfloat flr_tmp
= FLOORF(tmp
);
314 GLfloat frac_tmp
= tmp
- flr_tmp
;
316 result
[0] = LDEXPF(1.0, (int)flr_tmp
);
317 result
[1] = frac_tmp
;
318 result
[2] = LDEXPF(rough_approx_log2_0_1(frac_tmp
), (int)flr_tmp
);
322 static void do_FLR( struct arb_vp_machine
*m
, union instruction op
)
324 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
325 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
327 result
[0] = FLOORF(arg0
[0]);
328 result
[1] = FLOORF(arg0
[1]);
329 result
[2] = FLOORF(arg0
[2]);
330 result
[3] = FLOORF(arg0
[3]);
333 static void do_FRC( struct arb_vp_machine
*m
, union instruction op
)
335 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
336 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
338 result
[0] = arg0
[0] - FLOORF(arg0
[0]);
339 result
[1] = arg0
[1] - FLOORF(arg0
[1]);
340 result
[2] = arg0
[2] - FLOORF(arg0
[2]);
341 result
[3] = arg0
[3] - FLOORF(arg0
[3]);
344 /* High precision log base 2:
346 static void do_LG2( struct arb_vp_machine
*m
, union instruction op
)
348 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
349 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
351 result
[0] = ApproxLog2(arg0
[0]);
357 static void do_LIT( struct arb_vp_machine
*m
, union instruction op
)
359 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
360 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
366 tmp
[2] = RoughApproxPower(arg0
[1], arg0
[3]);
374 COPY_4V(result
, tmp
);
378 /* Intended to allow a lower precision than required for LG2 above.
380 static void do_LOG( struct arb_vp_machine
*m
, union instruction op
)
382 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
383 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
384 GLfloat tmp
= FABSF(arg0
[0]);
386 GLfloat mantissa
= FREXPF(tmp
, &exponent
);
388 result
[0] = (GLfloat
) (exponent
- 1);
389 result
[1] = 2.0 * mantissa
; /* map [.5, 1) -> [1, 2) */
390 result
[2] = exponent
+ LOG2(mantissa
);
394 static void do_MAX( struct arb_vp_machine
*m
, union instruction op
)
396 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
397 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
398 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
400 result
[0] = (arg0
[0] > arg1
[0]) ? arg0
[0] : arg1
[0];
401 result
[1] = (arg0
[1] > arg1
[1]) ? arg0
[1] : arg1
[1];
402 result
[2] = (arg0
[2] > arg1
[2]) ? arg0
[2] : arg1
[2];
403 result
[3] = (arg0
[3] > arg1
[3]) ? arg0
[3] : arg1
[3];
407 static void do_MIN( struct arb_vp_machine
*m
, union instruction op
)
409 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
410 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
411 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
413 result
[0] = (arg0
[0] < arg1
[0]) ? arg0
[0] : arg1
[0];
414 result
[1] = (arg0
[1] < arg1
[1]) ? arg0
[1] : arg1
[1];
415 result
[2] = (arg0
[2] < arg1
[2]) ? arg0
[2] : arg1
[2];
416 result
[3] = (arg0
[3] < arg1
[3]) ? arg0
[3] : arg1
[3];
419 static void do_MOV( struct arb_vp_machine
*m
, union instruction op
)
421 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
422 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
430 static void do_MUL( struct arb_vp_machine
*m
, union instruction op
)
432 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
433 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
434 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
436 result
[0] = arg0
[0] * arg1
[0];
437 result
[1] = arg0
[1] * arg1
[1];
438 result
[2] = arg0
[2] * arg1
[2];
439 result
[3] = arg0
[3] * arg1
[3];
443 /* Intended to be "high" precision
445 static void do_POW( struct arb_vp_machine
*m
, union instruction op
)
447 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
448 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
449 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
451 result
[0] = (GLfloat
)ApproxPower(arg0
[0], arg1
[0]);
455 static void do_REL( struct arb_vp_machine
*m
, union instruction op
)
457 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
458 GLuint idx
= (op
.alu
.idx0
+ (GLint
)m
->File
[0][REG_ADDR
][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS
-1);
459 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][idx
];
467 static void do_RCP( struct arb_vp_machine
*m
, union instruction op
)
469 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
470 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
472 result
[0] = 1.0F
/ arg0
[0];
476 static void do_RSQ( struct arb_vp_machine
*m
, union instruction op
)
478 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
479 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
481 result
[0] = INV_SQRTF(FABSF(arg0
[0]));
486 static void do_SGE( struct arb_vp_machine
*m
, union instruction op
)
488 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
489 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
490 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
492 result
[0] = (arg0
[0] >= arg1
[0]) ? 1.0F
: 0.0F
;
493 result
[1] = (arg0
[1] >= arg1
[1]) ? 1.0F
: 0.0F
;
494 result
[2] = (arg0
[2] >= arg1
[2]) ? 1.0F
: 0.0F
;
495 result
[3] = (arg0
[3] >= arg1
[3]) ? 1.0F
: 0.0F
;
499 static void do_SLT( struct arb_vp_machine
*m
, union instruction op
)
501 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
502 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
503 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
505 result
[0] = (arg0
[0] < arg1
[0]) ? 1.0F
: 0.0F
;
506 result
[1] = (arg0
[1] < arg1
[1]) ? 1.0F
: 0.0F
;
507 result
[2] = (arg0
[2] < arg1
[2]) ? 1.0F
: 0.0F
;
508 result
[3] = (arg0
[3] < arg1
[3]) ? 1.0F
: 0.0F
;
511 static void do_SUB( struct arb_vp_machine
*m
, union instruction op
)
513 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
514 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
515 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
517 result
[0] = arg0
[0] - arg1
[0];
518 result
[1] = arg0
[1] - arg1
[1];
519 result
[2] = arg0
[2] - arg1
[2];
520 result
[3] = arg0
[3] - arg1
[3];
524 static void do_XPD( struct arb_vp_machine
*m
, union instruction op
)
526 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
527 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
528 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
531 tmp
[0] = arg0
[1] * arg1
[2] - arg0
[2] * arg1
[1];
532 tmp
[1] = arg0
[2] * arg1
[0] - arg0
[0] * arg1
[2];
533 tmp
[2] = arg0
[0] * arg1
[1] - arg0
[1] * arg1
[0];
535 /* Need a temporary to be correct in the case where result == arg0
543 static void do_NOP( struct arb_vp_machine
*m
, union instruction op
)
547 /* Some useful debugging functions:
549 static void print_mask( GLuint mask
)
552 if (mask
&0x1) _mesa_printf("x");
553 if (mask
&0x2) _mesa_printf("y");
554 if (mask
&0x4) _mesa_printf("z");
555 if (mask
&0x8) _mesa_printf("w");
558 static void print_reg( GLuint file
, GLuint reg
)
560 static const char *reg_file
[] = {
570 else if (reg
>= REG_ARG0
&& reg
<= REG_ARG1
)
571 _mesa_printf("ARG%d", reg
- REG_ARG0
);
572 else if (reg
>= REG_TMP0
&& reg
<= REG_TMP11
)
573 _mesa_printf("TMP%d", reg
- REG_TMP0
);
574 else if (reg
>= REG_IN0
&& reg
<= REG_IN31
)
575 _mesa_printf("IN%d", reg
- REG_IN0
);
576 else if (reg
>= REG_OUT0
&& reg
<= REG_OUT14
)
577 _mesa_printf("OUT%d", reg
- REG_OUT0
);
578 else if (reg
== REG_ADDR
)
579 _mesa_printf("ADDR");
580 else if (reg
== REG_ID
)
583 _mesa_printf("REG%d", reg
);
586 _mesa_printf("%s:%d", reg_file
[file
], reg
);
590 static void print_RSW( union instruction op
)
592 GLuint swz
= op
.rsw
.swz
;
593 GLuint neg
= op
.rsw
.neg
;
596 _mesa_printf("RSW ");
597 print_reg(0, op
.rsw
.dst
);
599 print_reg(op
.rsw
.file0
, op
.rsw
.idx0
);
601 for (i
= 0; i
< 4; i
++, swz
>>= 3) {
602 const char *cswz
= "xyzw01";
605 _mesa_printf("%c", cswz
[swz
&0x7]);
610 static void print_SWZ( union instruction op
)
612 GLuint swz
= op
.rsw
.swz
;
613 GLuint neg
= op
.rsw
.neg
;
616 _mesa_printf("SWZ ");
617 print_reg(0, op
.rsw
.dst
);
619 print_reg(op
.rsw
.file0
, op
.rsw
.idx0
);
621 for (i
= 0; i
< 4; i
++, swz
>>= 3) {
622 const char *cswz
= "xyzw01";
625 _mesa_printf("%c", cswz
[swz
&0x7]);
631 static void print_ALU( union instruction op
)
633 _mesa_printf("%s ", _mesa_opcode_string((enum prog_opcode
) op
.alu
.opcode
));
634 print_reg(0, op
.alu
.dst
);
636 print_reg(op
.alu
.file0
, op
.alu
.idx0
);
637 if (_mesa_num_inst_src_regs((enum prog_opcode
) op
.alu
.opcode
) > 1) {
639 print_reg(op
.alu
.file1
, op
.alu
.idx1
);
644 static void print_MSK( union instruction op
)
646 _mesa_printf("MSK ");
647 print_reg(0, op
.msk
.dst
);
648 print_mask(op
.msk
.mask
);
650 print_reg(op
.msk
.file
, op
.msk
.idx
);
654 static void print_NOP( union instruction op
)
659 _tnl_disassem_vba_insn( union instruction op
)
661 switch (op
.alu
.opcode
) {
717 _mesa_problem(NULL
, "Bad opcode in _tnl_disassem_vba_insn()");
722 static void (* const opcode_func
[MAX_OPCODE
+3])(struct arb_vp_machine
*, union instruction
) =
799 static union instruction
*cvp_next_instruction( struct compilation
*cp
)
801 union instruction
*op
= cp
->csr
++;
802 _mesa_bzero(op
, sizeof(*op
));
806 static struct reg
cvp_make_reg( GLuint file
, GLuint idx
)
814 static struct reg
cvp_emit_rel( struct compilation
*cp
,
818 union instruction
*op
= cvp_next_instruction(cp
);
819 op
->alu
.opcode
= REL
;
820 op
->alu
.file0
= reg
.file
;
821 op
->alu
.idx0
= reg
.idx
;
822 op
->alu
.dst
= tmpreg
.idx
;
/*
 * cvp_load_reg: turn a parsed program register reference into a
 * register descriptor (struct reg) for the compiled program.
 *
 * From what is visible here:
 *  - three paths return a FILE_REG register at REG_TMP0 + index,
 *    REG_IN0 + index and REG_OUT0 + index respectively (the first is
 *    the PROGRAM_TEMPORARY case);
 *  - PROGRAM_LOCAL_PARAM, PROGRAM_ENV_PARAM and PROGRAM_STATE_VAR build
 *    a register in FILE_LOCAL_PARAM / FILE_ENV_PARAM / FILE_STATE_PARAM
 *    and can route it through cvp_emit_rel(cp, reg, tmpreg), which
 *    targets the FILE_REG temporary made from tmpidx (presumably only
 *    for relative addressing -- the condition is not visible here,
 *    TODO confirm);
 *  - PROGRAM_WRITE_ONLY and PROGRAM_ADDRESS lead to the _mesa_problem()
 *    report, after which tmpreg is returned.
 *
 * NOTE(review): the _mesa_problem() format string contains "%d" but no
 * matching argument is passed.  The register file value should be
 * supplied, otherwise the formatted output is undefined.
 */
827 static struct reg
cvp_load_reg( struct compilation
*cp
,
833 struct reg tmpreg
= cvp_make_reg(FILE_REG
, tmpidx
);
837 case PROGRAM_TEMPORARY
:
838 return cvp_make_reg(FILE_REG
, REG_TMP0
+ index
);
841 return cvp_make_reg(FILE_REG
, REG_IN0
+ index
);
844 return cvp_make_reg(FILE_REG
, REG_OUT0
+ index
);
846 /* These two aren't populated by the parser?
848 case PROGRAM_LOCAL_PARAM
:
849 reg
= cvp_make_reg(FILE_LOCAL_PARAM
, index
);
851 return cvp_emit_rel(cp
, reg
, tmpreg
);
855 case PROGRAM_ENV_PARAM
:
856 reg
= cvp_make_reg(FILE_ENV_PARAM
, index
);
858 return cvp_emit_rel(cp
, reg
, tmpreg
);
862 case PROGRAM_STATE_VAR
:
863 reg
= cvp_make_reg(FILE_STATE_PARAM
, index
);
865 return cvp_emit_rel(cp
, reg
, tmpreg
);
871 case PROGRAM_WRITE_ONLY
:
872 case PROGRAM_ADDRESS
:
874 _mesa_problem(NULL
, "Invalid register file %d in cvp_load_reg()");
876 return tmpreg
; /* can't happen */
880 static struct reg
cvp_emit_arg( struct compilation
*cp
,
881 const struct prog_src_register
*src
,
884 struct reg reg
= cvp_load_reg( cp
, src
->File
, src
->Index
, src
->RelAddr
, arg
);
885 union instruction rsw
, noop
;
887 /* Emit any necessary swizzling.
889 _mesa_bzero(&rsw
, sizeof(rsw
));
890 rsw
.rsw
.neg
= src
->NegateBase
? WRITEMASK_XYZW
: 0;
892 /* we're expecting 2-bit swizzles below... */
893 #if 1 /* XXX THESE ASSERTIONS CURRENTLY FAIL DURING GLEAN TESTS! */
894 /* hopefully no longer happens? */
895 ASSERT(GET_SWZ(src
->Swizzle
, 0) < 4);
896 ASSERT(GET_SWZ(src
->Swizzle
, 1) < 4);
897 ASSERT(GET_SWZ(src
->Swizzle
, 2) < 4);
898 ASSERT(GET_SWZ(src
->Swizzle
, 3) < 4);
900 rsw
.rsw
.swz
= src
->Swizzle
;
902 _mesa_bzero(&noop
, sizeof(noop
));
904 noop
.rsw
.swz
= SWIZZLE_NOOP
;
906 if (_mesa_memcmp(&rsw
, &noop
, sizeof(rsw
)) !=0) {
907 union instruction
*op
= cvp_next_instruction(cp
);
908 struct reg rsw_reg
= cvp_make_reg(FILE_REG
, REG_ARG0
+ arg
);
910 op
->rsw
.opcode
= RSW
;
911 op
->rsw
.file0
= reg
.file
;
912 op
->rsw
.idx0
= reg
.idx
;
913 op
->rsw
.dst
= rsw_reg
.idx
;
920 static GLuint
cvp_choose_result( struct compilation
*cp
,
921 const struct prog_dst_register
*dst
,
922 union instruction
*fixup
)
924 GLuint mask
= dst
->WriteMask
;
928 case PROGRAM_TEMPORARY
:
929 idx
= REG_TMP0
+ dst
->Index
;
932 idx
= REG_OUT0
+ dst
->Index
;
936 return REG_RES
; /* can't happen */
939 /* Optimization: When writing (with a writemask) to an undefined
940 * value for the first time, the writemask may be ignored.
942 if (mask
!= WRITEMASK_XYZW
&& (cp
->reg_active
& (1 << idx
))) {
943 fixup
->msk
.opcode
= MSK
;
944 fixup
->msk
.dst
= idx
;
945 fixup
->msk
.file
= FILE_REG
;
946 fixup
->msk
.idx
= REG_RES
;
947 fixup
->msk
.mask
= mask
;
948 cp
->reg_active
|= 1 << idx
;
952 _mesa_bzero(fixup
, sizeof(*fixup
));
953 cp
->reg_active
|= 1 << idx
;
959 static void cvp_emit_inst( struct compilation
*cp
,
960 const struct prog_instruction
*inst
)
962 union instruction
*op
;
963 union instruction fixup
;
965 GLuint result
, nr_args
, i
;
967 /* Need to handle SWZ, ARL specially.
969 switch (inst
->Opcode
) {
970 /* Split into mul and add:
973 result
= cvp_choose_result( cp
, &inst
->DstReg
, &fixup
);
974 for (i
= 0; i
< 3; i
++)
975 reg
[i
] = cvp_emit_arg( cp
, &inst
->SrcReg
[i
], REG_ARG0
+i
);
977 op
= cvp_next_instruction(cp
);
978 op
->alu
.opcode
= OPCODE_MUL
;
979 op
->alu
.file0
= reg
[0].file
;
980 op
->alu
.idx0
= reg
[0].idx
;
981 op
->alu
.file1
= reg
[1].file
;
982 op
->alu
.idx1
= reg
[1].idx
;
983 op
->alu
.dst
= REG_ARG0
;
985 op
= cvp_next_instruction(cp
);
986 op
->alu
.opcode
= OPCODE_ADD
;
987 op
->alu
.file0
= FILE_REG
;
988 op
->alu
.idx0
= REG_ARG0
;
989 op
->alu
.file1
= reg
[2].file
;
990 op
->alu
.idx1
= reg
[2].idx
;
991 op
->alu
.dst
= result
;
993 if (result
== REG_RES
) {
994 op
= cvp_next_instruction(cp
);
1000 reg
[0] = cvp_emit_arg( cp
, &inst
->SrcReg
[0], REG_ARG0
);
1002 op
= cvp_next_instruction(cp
);
1003 op
->alu
.opcode
= OPCODE_FLR
;
1004 op
->alu
.dst
= REG_ADDR
;
1005 op
->alu
.file0
= reg
[0].file
;
1006 op
->alu
.idx0
= reg
[0].idx
;
1013 result
= cvp_choose_result( cp
, &inst
->DstReg
, &fixup
);
1014 reg
[0] = cvp_load_reg( cp
, inst
->SrcReg
[0].File
,
1015 inst
->SrcReg
[0].Index
, inst
->SrcReg
[0].RelAddr
, REG_ARG0
);
1016 op
= cvp_next_instruction(cp
);
1017 op
->rsw
.opcode
= inst
->Opcode
;
1018 op
->rsw
.file0
= reg
[0].file
;
1019 op
->rsw
.idx0
= reg
[0].idx
;
1020 op
->rsw
.dst
= result
;
1021 op
->rsw
.swz
= inst
->SrcReg
[0].Swizzle
;
1022 op
->rsw
.neg
= inst
->SrcReg
[0].NegateBase
;
1024 if (result
== REG_RES
) {
1025 op
= cvp_next_instruction(cp
);
1031 result
= cvp_choose_result( cp
, &inst
->DstReg
, &fixup
);
1032 nr_args
= _mesa_num_inst_src_regs(inst
->Opcode
);
1033 for (i
= 0; i
< nr_args
; i
++)
1034 reg
[i
] = cvp_emit_arg( cp
, &inst
->SrcReg
[i
], REG_ARG0
+ i
);
1036 op
= cvp_next_instruction(cp
);
1037 op
->alu
.opcode
= inst
->Opcode
;
1038 op
->alu
.file0
= reg
[0].file
;
1039 op
->alu
.idx0
= reg
[0].idx
;
1040 op
->alu
.file1
= reg
[1].file
;
1041 op
->alu
.idx1
= reg
[1].idx
;
1042 op
->alu
.dst
= result
;
1044 if (result
== REG_RES
) {
1045 op
= cvp_next_instruction(cp
);
1052 static void free_tnl_data( struct vertex_program
*program
)
1054 struct tnl_compiled_program
*p
= (struct tnl_compiled_program
*) program
->TnlData
;
1055 if (p
->compiled_func
)
1056 _mesa_free((void *)p
->compiled_func
);
1058 program
->TnlData
= NULL
;
1061 static void compile_vertex_program( struct vertex_program
*program
,
1062 GLboolean try_codegen
)
1064 struct compilation cp
;
1065 struct tnl_compiled_program
*p
= CALLOC_STRUCT(tnl_compiled_program
);
1068 if (program
->TnlData
)
1069 free_tnl_data( program
);
1071 program
->TnlData
= p
;
1073 /* Initialize cp. Note that ctx and VB aren't used in compilation
1074 * so we don't have to worry about statechanges:
1076 _mesa_memset(&cp
, 0, sizeof(cp
));
1077 cp
.csr
= p
->instructions
;
1079 /* Compile instructions:
1081 for (i
= 0; i
< program
->Base
.NumInstructions
; i
++) {
1082 cvp_emit_inst(&cp
, &program
->Base
.Instructions
[i
]);
1087 p
->nr_instructions
= cp
.csr
- p
->instructions
;
1089 /* Print/disassemble:
1092 for (i
= 0; i
< p
->nr_instructions
; i
++) {
1093 _tnl_disassem_vba_insn(p
->instructions
[i
]);
1095 _mesa_printf("\n\n");
1100 _tnl_sse_codegen_vertex_program(p
);
1108 /* ----------------------------------------------------------------------
1111 static void userclip( GLcontext
*ctx
,
1114 GLubyte
*clipormask
,
1115 GLubyte
*clipandmask
)
1119 for (p
= 0; p
< ctx
->Const
.MaxClipPlanes
; p
++) {
1120 if (ctx
->Transform
.ClipPlanesEnabled
& (1 << p
)) {
1122 const GLfloat a
= ctx
->Transform
._ClipUserPlane
[p
][0];
1123 const GLfloat b
= ctx
->Transform
._ClipUserPlane
[p
][1];
1124 const GLfloat c
= ctx
->Transform
._ClipUserPlane
[p
][2];
1125 const GLfloat d
= ctx
->Transform
._ClipUserPlane
[p
][3];
1126 GLfloat
*coord
= (GLfloat
*)clip
->data
;
1127 GLuint stride
= clip
->stride
;
1128 GLuint count
= clip
->count
;
1130 for (nr
= 0, i
= 0 ; i
< count
; i
++) {
1131 GLfloat dp
= (coord
[0] * a
+
1138 clipmask
[i
] |= CLIP_USER_BIT
;
1141 STRIDE_F(coord
, stride
);
1145 *clipormask
|= CLIP_USER_BIT
;
1147 *clipandmask
|= CLIP_USER_BIT
;
1157 do_ndc_cliptest(GLcontext
*ctx
, struct arb_vp_machine
*m
)
1159 TNLcontext
*tnl
= TNL_CONTEXT(ctx
);
1160 struct vertex_buffer
*VB
= m
->VB
;
1162 /* Cliptest and perspective divide. Clip functions must clear
1166 m
->andmask
= CLIP_FRUSTUM_BITS
;
1168 if (tnl
->NeedNdcCoords
) {
1170 _mesa_clip_tab
[VB
->ClipPtr
->size
]( VB
->ClipPtr
,
1178 _mesa_clip_np_tab
[VB
->ClipPtr
->size
]( VB
->ClipPtr
,
1186 /* All vertices are outside the frustum */
1190 /* Test userclip planes. This contributes to VB->ClipMask.
1192 if (ctx
->Transform
.ClipPlanesEnabled
&& !ctx
->VertexProgram
._Enabled
) {
1204 VB
->ClipAndMask
= m
->andmask
;
1205 VB
->ClipOrMask
= m
->ormask
;
1206 VB
->ClipMask
= m
->clipmask
;
1212 static INLINE
void call_func( struct tnl_compiled_program
*p
,
1213 struct arb_vp_machine
*m
)
1215 p
->compiled_func(m
);
1219 * Execute the given vertex program.
1221 * TODO: Integrate the t_vertex.c code here, to build machine vertices
1222 * directly at this point.
1224 * TODO: Eliminate the VB struct entirely and just use
1225 * struct arb_vertex_machine.
1228 run_arb_vertex_program(GLcontext
*ctx
, struct tnl_pipeline_stage
*stage
)
1230 const struct vertex_program
*program
;
1231 struct vertex_buffer
*VB
= &TNL_CONTEXT(ctx
)->vb
;
1232 struct arb_vp_machine
*m
= ARB_VP_MACHINE(stage
);
1233 struct tnl_compiled_program
*p
;
1237 if (ctx
->ShaderObjects
._VertexShaderPresent
)
1240 program
= (ctx
->VertexProgram
._Enabled
? ctx
->VertexProgram
.Current
: ctx
->_TnlProgram
);
1241 if (!program
|| program
->IsNVProgram
)
1244 if (program
->Base
.Parameters
) {
1245 _mesa_load_state_parameters(ctx
, program
->Base
.Parameters
);
1248 p
= (struct tnl_compiled_program
*)program
->TnlData
;
1252 m
->nr_inputs
= m
->nr_outputs
= 0;
1254 for (i
= 0; i
< _TNL_ATTRIB_MAX
; i
++) {
1255 if (program
->Base
.InputsRead
& (1<<i
) ||
1256 (i
== VERT_ATTRIB_POS
&& program
->IsPositionInvariant
)) {
1257 GLuint j
= m
->nr_inputs
++;
1258 m
->input
[j
].idx
= i
;
1259 m
->input
[j
].data
= (GLfloat
*)m
->VB
->AttribPtr
[i
]->data
;
1260 m
->input
[j
].stride
= m
->VB
->AttribPtr
[i
]->stride
;
1261 m
->input
[j
].size
= m
->VB
->AttribPtr
[i
]->size
;
1262 ASSIGN_4V(m
->File
[0][REG_IN0
+ i
], 0, 0, 0, 1);
1266 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
1267 if (program
->Base
.OutputsWritten
& (1 << i
) ||
1268 (i
== VERT_RESULT_HPOS
&& program
->IsPositionInvariant
)) {
1269 GLuint j
= m
->nr_outputs
++;
1270 m
->output
[j
].idx
= i
;
1271 m
->output
[j
].data
= (GLfloat
*)m
->attribs
[i
].data
;
1276 /* Run the actual program:
1278 for (m
->vtx_nr
= 0; m
->vtx_nr
< VB
->Count
; m
->vtx_nr
++) {
1279 for (j
= 0; j
< m
->nr_inputs
; j
++) {
1280 GLuint idx
= REG_IN0
+ m
->input
[j
].idx
;
1281 switch (m
->input
[j
].size
) {
1282 case 4: m
->File
[0][idx
][3] = m
->input
[j
].data
[3];
1283 case 3: m
->File
[0][idx
][2] = m
->input
[j
].data
[2];
1284 case 2: m
->File
[0][idx
][1] = m
->input
[j
].data
[1];
1285 case 1: m
->File
[0][idx
][0] = m
->input
[j
].data
[0];
1288 STRIDE_F(m
->input
[j
].data
, m
->input
[j
].stride
);
1292 if (p
->compiled_func
) {
1296 for (j
= 0; j
< p
->nr_instructions
; j
++) {
1297 union instruction inst
= p
->instructions
[j
];
1298 opcode_func
[inst
.alu
.opcode
]( m
, inst
);
1302 /* If the program is position invariant, multiply the input position
1303 * by the MVP matrix and store in the vertex position result register.
1305 if (program
->IsPositionInvariant
) {
1306 TRANSFORM_POINT( m
->File
[0][REG_OUT0
+0],
1307 ctx
->_ModelProjectMatrix
.m
,
1308 m
->File
[0][REG_IN0
+0]);
1311 for (j
= 0; j
< m
->nr_outputs
; j
++) {
1312 GLuint idx
= REG_OUT0
+ m
->output
[j
].idx
;
1313 m
->output
[j
].data
[0] = m
->File
[0][idx
][0];
1314 m
->output
[j
].data
[1] = m
->File
[0][idx
][1];
1315 m
->output
[j
].data
[2] = m
->File
[0][idx
][2];
1316 m
->output
[j
].data
[3] = m
->File
[0][idx
][3];
1317 m
->output
[j
].data
+= 4;
1322 /* Setup the VB pointers so that the next pipeline stages get
1323 * their data from the right place (the program output arrays).
1325 * TODO: 1) Have tnl use these RESULT values for outputs rather
1326 * than trying to shoe-horn inputs and outputs into one set of
1329 * TODO: 2) Integrate t_vertex.c so that we just go straight ahead
1330 * and build machine vertices here.
1332 VB
->ClipPtr
= &m
->attribs
[VERT_RESULT_HPOS
];
1333 VB
->ClipPtr
->count
= VB
->Count
;
1335 outputs
= program
->Base
.OutputsWritten
;
1336 if (program
->IsPositionInvariant
)
1337 outputs
|= (1<<VERT_RESULT_HPOS
);
1339 if (outputs
& (1<<VERT_RESULT_COL0
)) {
1340 VB
->ColorPtr
[0] = &m
->attribs
[VERT_RESULT_COL0
];
1341 VB
->AttribPtr
[VERT_ATTRIB_COLOR0
] = VB
->ColorPtr
[0];
1344 if (outputs
& (1<<VERT_RESULT_BFC0
)) {
1345 VB
->ColorPtr
[1] = &m
->attribs
[VERT_RESULT_BFC0
];
1348 if (outputs
& (1<<VERT_RESULT_COL1
)) {
1349 VB
->SecondaryColorPtr
[0] = &m
->attribs
[VERT_RESULT_COL1
];
1350 VB
->AttribPtr
[VERT_ATTRIB_COLOR1
] = VB
->SecondaryColorPtr
[0];
1353 if (outputs
& (1<<VERT_RESULT_BFC1
)) {
1354 VB
->SecondaryColorPtr
[1] = &m
->attribs
[VERT_RESULT_BFC1
];
1357 if (outputs
& (1<<VERT_RESULT_FOGC
)) {
1358 VB
->FogCoordPtr
= &m
->attribs
[VERT_RESULT_FOGC
];
1359 VB
->AttribPtr
[VERT_ATTRIB_FOG
] = VB
->FogCoordPtr
;
1362 if (outputs
& (1<<VERT_RESULT_PSIZ
)) {
1363 VB
->PointSizePtr
= &m
->attribs
[VERT_RESULT_PSIZ
];
1364 VB
->AttribPtr
[_TNL_ATTRIB_POINTSIZE
] = &m
->attribs
[VERT_RESULT_PSIZ
];
1367 for (i
= 0; i
< ctx
->Const
.MaxTextureCoordUnits
; i
++) {
1368 if (outputs
& (1<<(VERT_RESULT_TEX0
+i
))) {
1369 VB
->TexCoordPtr
[i
] = &m
->attribs
[VERT_RESULT_TEX0
+ i
];
1370 VB
->AttribPtr
[VERT_ATTRIB_TEX0
+i
] = VB
->TexCoordPtr
[i
];
1375 for (i
= 0; i
< VB
->Count
; i
++) {
1376 printf("Out %d: %f %f %f %f %f %f %f %f\n", i
,
1377 VEC_ELT(VB
->ClipPtr
, GLfloat
, i
)[0],
1378 VEC_ELT(VB
->ClipPtr
, GLfloat
, i
)[1],
1379 VEC_ELT(VB
->ClipPtr
, GLfloat
, i
)[2],
1380 VEC_ELT(VB
->ClipPtr
, GLfloat
, i
)[3],
1381 VEC_ELT(VB
->TexCoordPtr
[0], GLfloat
, i
)[0],
1382 VEC_ELT(VB
->TexCoordPtr
[0], GLfloat
, i
)[1],
1383 VEC_ELT(VB
->TexCoordPtr
[0], GLfloat
, i
)[2],
1384 VEC_ELT(VB
->TexCoordPtr
[0], GLfloat
, i
)[3]);
1388 /* Perform NDC and cliptest operations:
1390 return do_ndc_cliptest(ctx
, m
);
1395 validate_vertex_program( GLcontext
*ctx
, struct tnl_pipeline_stage
*stage
)
1397 struct arb_vp_machine
*m
= ARB_VP_MACHINE(stage
);
1398 struct vertex_program
*program
;
1400 if (ctx
->ShaderObjects
._VertexShaderPresent
)
1403 program
= (ctx
->VertexProgram
._Enabled
? ctx
->VertexProgram
.Current
: 0);
1404 if (!program
&& ctx
->_MaintainTnlProgram
) {
1405 program
= ctx
->_TnlProgram
;
1409 if (!program
->TnlData
)
1410 compile_vertex_program( program
, m
->try_codegen
);
1412 /* Grab the state GL state and put into registers:
1414 m
->File
[FILE_LOCAL_PARAM
] = program
->Base
.LocalParams
;
1415 m
->File
[FILE_ENV_PARAM
] = ctx
->VertexProgram
.Parameters
;
1416 /* GL_NV_vertex_programs can't reference GL state */
1417 if (program
->Base
.Parameters
)
1418 m
->File
[FILE_STATE_PARAM
] = program
->Base
.Parameters
->ParameterValues
;
1420 m
->File
[FILE_STATE_PARAM
] = NULL
;
1431 * Called the first time stage->run is called. In effect, don't
1432 * allocate data until the first time the stage is run.
1434 static GLboolean
init_vertex_program( GLcontext
*ctx
,
1435 struct tnl_pipeline_stage
*stage
)
1437 TNLcontext
*tnl
= TNL_CONTEXT(ctx
);
1438 struct vertex_buffer
*VB
= &(tnl
->vb
);
1439 struct arb_vp_machine
*m
;
1440 const GLuint size
= VB
->Size
;
1443 stage
->privatePtr
= _mesa_calloc(sizeof(*m
));
1444 m
= ARB_VP_MACHINE(stage
);
1448 /* arb_vertex_machine struct should subsume the VB:
1452 m
->File
[0] = (GLfloat(*)[4])ALIGN_MALLOC(REG_MAX
* sizeof(GLfloat
) * 4, 16);
1454 /* Initialize regs where necessary:
1456 ASSIGN_4V(m
->File
[0][REG_ID
], 0, 0, 0, 1);
1457 ASSIGN_4V(m
->File
[0][REG_ONES
], 1, 1, 1, 1);
1458 ASSIGN_4V(m
->File
[0][REG_SWZ
], 1, -1, 0, 0);
1459 ASSIGN_4V(m
->File
[0][REG_NEG
], -1, -1, -1, -1);
1460 ASSIGN_4V(m
->File
[0][REG_LIT
], 1, 0, 0, 1);
1461 ASSIGN_4V(m
->File
[0][REG_LIT2
], 1, .5, .2, 1); /* debug value */
1463 if (_mesa_getenv("MESA_EXPERIMENTAL"))
1464 m
->try_codegen
= GL_TRUE
;
1466 /* Allocate arrays of vertex output values */
1467 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
1468 _mesa_vector4f_alloc( &m
->attribs
[i
], 0, size
, 32 );
1469 m
->attribs
[i
].size
= 4;
1472 /* a few other misc allocations */
1473 _mesa_vector4f_alloc( &m
->ndcCoords
, 0, size
, 32 );
1474 m
->clipmask
= (GLubyte
*) ALIGN_MALLOC(sizeof(GLubyte
)*size
, 32 );
1476 if (ctx
->_MaintainTnlProgram
)
1477 _mesa_allow_light_in_model( ctx
, GL_FALSE
);
1479 m
->fpucntl_rnd_neg
= RND_NEG_FPU
; /* const value */
1480 m
->fpucntl_restore
= RESTORE_FPU
; /* const value */
1489 * Destructor for this pipeline stage.
1491 static void dtr( struct tnl_pipeline_stage
*stage
)
1493 struct arb_vp_machine
*m
= ARB_VP_MACHINE(stage
);
1498 /* free the vertex program result arrays */
1499 for (i
= 0; i
< VERT_RESULT_MAX
; i
++)
1500 _mesa_vector4f_free( &m
->attribs
[i
] );
1502 /* free misc arrays */
1503 _mesa_vector4f_free( &m
->ndcCoords
);
1504 ALIGN_FREE( m
->clipmask
);
1505 ALIGN_FREE( m
->File
[0] );
1508 stage
->privatePtr
= NULL
;
1513 * Public description of this pipeline stage.
1515 const struct tnl_pipeline_stage _tnl_arb_vertex_program_stage
=
1518 NULL
, /* private_data */
1519 init_vertex_program
, /* create */
1521 validate_vertex_program
, /* validate */
1522 run_arb_vertex_program
/* run */
1527 * Called via ctx->Driver.ProgramStringNotify() after a new vertex program
1528 * string has been parsed.
1531 _tnl_program_string(GLcontext
*ctx
, GLenum target
, struct program
*program
)
1533 if (program
->Target
== GL_VERTEX_PROGRAM_ARB
) {
1534 /* free any existing tnl data hanging off the program */
1535 struct vertex_program
*vprog
= (struct vertex_program
*) program
;
1536 if (vprog
->TnlData
) {
1537 free_tnl_data(vprog
);