reindent, doxygen-style comments
[mesa.git] / src / mesa / tnl / t_vb_arbprogram_sse.c
1 /*
2 * Mesa 3-D graphics library
3 * Version: 6.3
4 *
5 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /**
26 * \file t_vb_arbprogram_sse.c
27 *
28 * Translate simplified vertex_program representation to
29 * x86/x87/SSE/SSE2 machine code using mesa's rtasm runtime assembler.
30 *
31 * This is very much a first attempt - build something that works.
32 * There are probably better approaches for applying SSE to vertex
33 * programs, and the whole thing is crying out for static analysis of
34 * the programs to avoid redundant operations.
35 *
36 * \author Keith Whitwell
37 */
38
39 #include "glheader.h"
40 #include "context.h"
41 #include "imports.h"
42 #include "macros.h"
43 #include "mtypes.h"
44 #include "arbprogparse.h"
45 #include "program.h"
46 #include "program_instruction.h"
47 #include "math/m_matrix.h"
48 #include "math/m_translate.h"
49 #include "t_context.h"
50 #include "t_vb_arbprogram.h"
51
52 #if defined(USE_SSE_ASM)
53
54 #include "x86/rtasm/x86sse.h"
55 #include "x86/common_x86_asm.h"
56
57 #define X 0
58 #define Y 1
59 #define Z 2
60 #define W 3
61
62 /* Reg usage:
63 *
64 * EAX - temp
65 * EBX - point to 'm->File[0]'
66 * ECX - point to 'm->File[3]'
67 * EDX - holds 'm'
68 * EBP,
69 * ESI,
70 * EDI
71 */
72
73 #define DISASSEM 0
74
75 #define FAIL \
76 do { \
77 _mesa_printf("x86 translation failed in %s\n", __FUNCTION__); \
78 return GL_FALSE; \
79 } while (0)
80
81 struct compilation {
82 struct x86_function func;
83 struct tnl_compiled_program *p;
84 GLuint insn_counter;
85
86 struct {
87 GLuint file:2;
88 GLuint idx:7;
89 GLuint dirty:1;
90 GLuint last_used:10;
91 } xmm[8];
92
93 struct {
94 struct x86_reg base;
95 } file[4];
96
97 GLboolean have_sse2;
98 GLshort fpucntl;
99 };
100
101 static INLINE GLboolean eq( struct x86_reg a,
102 struct x86_reg b )
103 {
104 return (a.file == b.file &&
105 a.idx == b.idx &&
106 a.mod == b.mod &&
107 a.disp == b.disp);
108 }
109
110 static GLint get_offset( const void *a, const void *b )
111 {
112 return (const char *)b - (const char *)a;
113 }
114
115
116 static struct x86_reg get_reg_ptr(GLuint file,
117 GLuint idx )
118 {
119 struct x86_reg reg;
120
121 switch (file) {
122 case FILE_REG:
123 reg = x86_make_reg(file_REG32, reg_BX);
124 assert(idx != REG_UNDEF);
125 break;
126 case FILE_STATE_PARAM:
127 reg = x86_make_reg(file_REG32, reg_CX);
128 break;
129 default:
130 assert(0);
131 }
132
133 return x86_make_disp(reg, 16 * idx);
134 }
135
136
137 static void spill( struct compilation *cp, GLuint idx )
138 {
139 struct x86_reg oldval = get_reg_ptr(cp->xmm[idx].file,
140 cp->xmm[idx].idx);
141
142 assert(cp->xmm[idx].dirty);
143 sse_movups(&cp->func, oldval, x86_make_reg(file_XMM, idx));
144 cp->xmm[idx].dirty = 0;
145 }
146
147 static struct x86_reg get_xmm_reg( struct compilation *cp )
148 {
149 GLuint i;
150 GLuint oldest = 0;
151
152 for (i = 0; i < 8; i++)
153 if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
154 oldest = i;
155
156 /* Need to write out the old value?
157 */
158 if (cp->xmm[oldest].dirty)
159 spill(cp, oldest);
160
161 assert(cp->xmm[oldest].last_used != cp->insn_counter);
162
163 cp->xmm[oldest].file = FILE_REG;
164 cp->xmm[oldest].idx = REG_UNDEF;
165 cp->xmm[oldest].last_used = cp->insn_counter;
166 return x86_make_reg(file_XMM, oldest);
167 }
168
169 static void invalidate_xmm( struct compilation *cp,
170 GLuint file, GLuint idx )
171 {
172 GLuint i;
173
174 /* Invalidate any old copy of this register in XMM0-7.
175 */
176 for (i = 0; i < 8; i++) {
177 if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) {
178 cp->xmm[i].file = FILE_REG;
179 cp->xmm[i].idx = REG_UNDEF;
180 cp->xmm[i].dirty = 0;
181 break;
182 }
183 }
184 }
185
186
187 /* Return an XMM reg to receive the results of an operation.
188 */
189 static struct x86_reg get_dst_xmm_reg( struct compilation *cp,
190 GLuint file, GLuint idx )
191 {
192 struct x86_reg reg;
193
194 /* Invalidate any old copy of this register in XMM0-7. Don't reuse
195 * as this may be one of the arguments.
196 */
197 invalidate_xmm( cp, file, idx );
198
199 reg = get_xmm_reg( cp );
200 cp->xmm[reg.idx].file = file;
201 cp->xmm[reg.idx].idx = idx;
202 cp->xmm[reg.idx].dirty = 1;
203 return reg;
204 }
205
206 /* As above, but return a pointer. Note - this pointer may alias
207 * those returned by get_arg_ptr().
208 */
209 static struct x86_reg get_dst_ptr( struct compilation *cp,
210 GLuint file, GLuint idx )
211 {
212 /* Invalidate any old copy of this register in XMM0-7. Don't reuse
213 * as this may be one of the arguments.
214 */
215 invalidate_xmm( cp, file, idx );
216
217 return get_reg_ptr(file, idx);
218 }
219
220
221
222 /* Return an XMM reg if the argument is resident, otherwise return a
223 * base+offset pointer to the saved value.
224 */
225 static struct x86_reg get_arg( struct compilation *cp, GLuint file, GLuint idx )
226 {
227 GLuint i;
228
229 for (i = 0; i < 8; i++) {
230 if (cp->xmm[i].file == file &&
231 cp->xmm[i].idx == idx) {
232 cp->xmm[i].last_used = cp->insn_counter;
233 return x86_make_reg(file_XMM, i);
234 }
235 }
236
237 return get_reg_ptr(file, idx);
238 }
239
240 /* As above, but always return a pointer:
241 */
242 static struct x86_reg get_arg_ptr( struct compilation *cp, GLuint file, GLuint idx )
243 {
244 GLuint i;
245
246 /* If there is a modified version of this register in one of the
247 * XMM regs, write it out to memory.
248 */
249 for (i = 0; i < 8; i++) {
250 if (cp->xmm[i].file == file &&
251 cp->xmm[i].idx == idx &&
252 cp->xmm[i].dirty)
253 spill(cp, i);
254 }
255
256 return get_reg_ptr(file, idx);
257 }
258
259 /* Emulate pshufd insn in regular SSE, if necessary:
260 */
261 static void emit_pshufd( struct compilation *cp,
262 struct x86_reg dst,
263 struct x86_reg arg0,
264 GLubyte shuf )
265 {
266 if (cp->have_sse2) {
267 sse2_pshufd(&cp->func, dst, arg0, shuf);
268 cp->func.fn = 0;
269 }
270 else {
271 if (!eq(dst, arg0))
272 sse_movups(&cp->func, dst, arg0);
273
274 sse_shufps(&cp->func, dst, dst, shuf);
275 }
276 }
277
278 static void set_fpu_round_neg_inf( struct compilation *cp )
279 {
280 if (cp->fpucntl != RND_NEG_FPU) {
281 struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX);
282 struct arb_vp_machine *m = NULL;
283
284 cp->fpucntl = RND_NEG_FPU;
285 x87_fnclex(&cp->func);
286 x87_fldcw(&cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_rnd_neg)));
287 }
288 }
289
290
291 /* Perform a reduced swizzle.
292 */
293 static GLboolean emit_RSW( struct compilation *cp, union instruction op )
294 {
295 struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
296 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
297 GLuint swz = GET_SWZ(op.rsw.swz, 0) | (GET_SWZ(op.rsw.swz, 1) << 2) |
298 (GET_SWZ(op.rsw.swz, 2) << 4| (GET_SWZ(op.rsw.swz, 3) << 6));
299 GLuint neg = op.rsw.neg;
300
301 emit_pshufd(cp, dst, arg0, swz);
302
303 if (neg) {
304 struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
305 struct x86_reg tmp = get_xmm_reg(cp);
306 /* Load 1,-1,0,0
307 * Use neg as arg to pshufd
308 * Multiply
309 */
310 /* is the emit_pshufd necessary? only SWZ can negate individual components */
311 emit_pshufd(cp, tmp, negs,
312 SHUF((neg & 1) ? 1 : 0,
313 (neg & 2) ? 1 : 0,
314 (neg & 4) ? 1 : 0,
315 (neg & 8) ? 1 : 0));
316 sse_mulps(&cp->func, dst, tmp);
317 }
318
319 return GL_TRUE;
320 }
321
322 /* Perform a full swizzle
323 */
324 static GLboolean emit_SWZ( struct compilation *cp, union instruction op )
325 {
326 struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
327 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
328 struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
329 struct x86_reg tmp = get_xmm_reg(cp);
330 GLubyte neg = op.rsw.neg;
331 GLubyte shuf2, swz, savepos, savemask, swizzle[4];
332
333 swizzle[0] = GET_SWZ(op.rsw.swz, 0);
334 swizzle[1] = GET_SWZ(op.rsw.swz, 1);
335 swizzle[2] = GET_SWZ(op.rsw.swz, 2);
336 swizzle[3] = GET_SWZ(op.rsw.swz, 3);
337
338 swz = SHUF((swizzle[0] & 3), (swizzle[1] & 3),
339 (swizzle[2] & 3), (swizzle[3] & 3));
340
341 emit_pshufd(cp, dst, arg0, swz);
342
343 /* can handle negation and replace with zero with the same shuffle/mul */
344 shuf2 = SHUF(swizzle[0] == 4 ? 2 : (neg & 1),
345 swizzle[1] == 4 ? 2 : ((neg & 2) >> 1),
346 swizzle[2] == 4 ? 2 : ((neg & 4) >> 2),
347 swizzle[3] == 4 ? 2 : ((neg & 8) >> 3));
348
349 /* now the hard part is getting those 1's in there... */
350 savepos = 0;
351 savemask = 0;
352 if (swizzle[0] == 5) savepos = 1;
353 if (swizzle[1] == 5) savepos = 2;
354 else savemask |= 1 << 2;
355 if (swizzle[2] == 5) savepos = 3;
356 else savemask |= 2 << 4;
357 if (swizzle[3] == 5) savepos = 4;
358 else savemask |= 3 << 6;
359 if (savepos) {
360 /* need a mov first as movss from memory will overwrite high bits of xmm reg */
361 sse_movups(&cp->func, tmp, negs);
362 /* can only replace lowest 32bits, thus move away that part first */
363 emit_pshufd(cp, dst, dst, savemask);
364 sse_movss(&cp->func, dst, tmp);
365 emit_pshufd(cp, dst, dst, (savepos - 1) | (savemask & 0xfc));
366 }
367
368 if (shuf2) {
369 /* Load 1,-1,0,0
370 * Use neg as arg to pshufd
371 * Multiply
372 */
373 emit_pshufd(cp, tmp, negs, shuf2);
374 sse_mulps(&cp->func, dst, tmp);
375 }
376
377 return GL_TRUE;
378 }
379
380 /* Helper for writemask:
381 */
382 static GLboolean emit_shuf_copy1( struct compilation *cp,
383 struct x86_reg dst,
384 struct x86_reg arg0,
385 struct x86_reg arg1,
386 GLubyte shuf )
387 {
388 struct x86_reg tmp = get_xmm_reg(cp);
389 sse_movups(&cp->func, dst, arg1);
390 emit_pshufd(cp, dst, dst, shuf);
391 emit_pshufd(cp, tmp, arg0, shuf);
392
393 sse_movss(&cp->func, dst, tmp);
394
395 emit_pshufd(cp, dst, dst, shuf);
396 return GL_TRUE;
397 }
398
399
400 /* Helper for writemask:
401 */
402 static GLboolean emit_shuf_copy2( struct compilation *cp,
403 struct x86_reg dst,
404 struct x86_reg arg0,
405 struct x86_reg arg1,
406 GLubyte shuf )
407 {
408 struct x86_reg tmp = get_xmm_reg(cp);
409 emit_pshufd(cp, dst, arg1, shuf);
410 emit_pshufd(cp, tmp, arg0, shuf);
411
412 sse_shufps(&cp->func, dst, tmp, SHUF(X, Y, Z, W));
413
414 emit_pshufd(cp, dst, dst, shuf);
415 return GL_TRUE;
416 }
417
418
419 static void emit_x87_ex2( struct compilation *cp )
420 {
421 struct x86_reg st0 = x86_make_reg(file_x87, 0);
422 struct x86_reg st1 = x86_make_reg(file_x87, 1);
423 struct x86_reg st3 = x86_make_reg(file_x87, 3);
424
425 set_fpu_round_neg_inf( cp );
426
427 x87_fld(&cp->func, st0); /* a a */
428 x87_fprndint( &cp->func ); /* int(a) a */
429 x87_fld(&cp->func, st0); /* int(a) int(a) a */
430 x87_fstp(&cp->func, st3); /* int(a) a int(a)*/
431 x87_fsubp(&cp->func, st1); /* frac(a) int(a) */
432 x87_f2xm1(&cp->func); /* (2^frac(a))-1 int(a)*/
433 x87_fld1(&cp->func); /* 1 (2^frac(a))-1 int(a)*/
434 x87_faddp(&cp->func, st1); /* 2^frac(a) int(a) */
435 x87_fscale(&cp->func); /* 2^a */
436 }
437
#if 0
/* Disabled alternative write-mask implementation based on bitwise
 * select rather than shuffles.
 *
 * NOTE(review): as written this would not compile if enabled -- 'tmp'
 * is used but never declared in this function.
 */
static GLboolean emit_MSK2( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.msk.file, op.msk.arg);
   struct x86_reg arg1 = get_arg(cp, FILE_REG, op.msk.dst); /* NOTE! */
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.msk.dst);

   /* Plan:
    *    tmp  = full width bitmask
    *    dst  = ~tmp
    *    tmp &= arg0
    *    dst &= arg1
    *    dst |= tmp
    */
   emit_pshufd(cp, tmp, get_arg(cp, FILE_REG, REG_NEGS),
               SHUF((op.msk.mask & 1) ? 2 : 0,
                    (op.msk.mask & 2) ? 2 : 0,
                    (op.msk.mask & 4) ? 2 : 0,
                    (op.msk.mask & 8) ? 2 : 0));
   sse2_pnot(&cp->func, dst, tmp);
   sse2_pand(&cp->func, arg0, tmp);
   sse2_pand(&cp->func, arg1, dst);
   sse2_por(&cp->func, tmp, dst);
   return GL_TRUE;
}
#endif
463
464
465 /* Used to implement write masking. This and most of the other instructions
466 * here would be easier to implement if there had been a translation
467 * to a 2 argument format (dst/arg0, arg1) at the shader level before
468 * attempting to translate to x86/sse code.
469 */
470 static GLboolean emit_MSK( struct compilation *cp, union instruction op )
471 {
472 struct x86_reg arg = get_arg(cp, op.msk.file, op.msk.idx);
473 struct x86_reg dst0 = get_arg(cp, FILE_REG, op.msk.dst); /* NOTE! */
474 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.msk.dst);
475
476 /* Note that dst and dst0 refer to the same program variable, but
477 * will definitely be different XMM registers. We're effectively
478 * treating this as a 2 argument SEL now, just one of which happens
479 * always to be the same register as the destination.
480 */
481
482 switch (op.msk.mask) {
483 case 0:
484 sse_movups(&cp->func, dst, dst0);
485 return GL_TRUE;
486
487 case WRITEMASK_X:
488 if (arg.file == file_XMM) {
489 sse_movups(&cp->func, dst, dst0);
490 sse_movss(&cp->func, dst, arg);
491 }
492 else {
493 struct x86_reg tmp = get_xmm_reg(cp);
494 sse_movups(&cp->func, dst, dst0);
495 sse_movss(&cp->func, tmp, arg);
496 sse_movss(&cp->func, dst, tmp);
497 }
498 return GL_TRUE;
499
500 case WRITEMASK_XY:
501 sse_movups(&cp->func, dst, dst0);
502 sse_shufps(&cp->func, dst, arg, SHUF(X, Y, Z, W));
503 return GL_TRUE;
504
505 case WRITEMASK_ZW:
506 sse_movups(&cp->func, dst, arg);
507 sse_shufps(&cp->func, dst, dst0, SHUF(X, Y, Z, W));
508 return GL_TRUE;
509
510 case WRITEMASK_YZW:
511 if (dst0.file == file_XMM) {
512 sse_movups(&cp->func, dst, arg);
513 sse_movss(&cp->func, dst, dst0);
514 }
515 else {
516 struct x86_reg tmp = get_xmm_reg(cp);
517 sse_movups(&cp->func, dst, arg);
518 sse_movss(&cp->func, tmp, dst0);
519 sse_movss(&cp->func, dst, tmp);
520 }
521 return GL_TRUE;
522
523 case WRITEMASK_Y:
524 emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Y,X,Z,W));
525 return GL_TRUE;
526
527 case WRITEMASK_Z:
528 emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Z,Y,X,W));
529 return GL_TRUE;
530
531 case WRITEMASK_W:
532 emit_shuf_copy1(cp, dst, arg, dst0, SHUF(W,Y,Z,X));
533 return GL_TRUE;
534
535 case WRITEMASK_XZ:
536 emit_shuf_copy2(cp, dst, arg, dst0, SHUF(X,Z,Y,W));
537 return GL_TRUE;
538
539 case WRITEMASK_XW:
540 emit_shuf_copy2(cp, dst, arg, dst0, SHUF(X,W,Z,Y));
541
542 case WRITEMASK_YZ:
543 emit_shuf_copy2(cp, dst, arg, dst0, SHUF(Z,Y,X,W));
544 return GL_TRUE;
545
546 case WRITEMASK_YW:
547 emit_shuf_copy2(cp, dst, arg, dst0, SHUF(W,Y,Z,X));
548 return GL_TRUE;
549
550 case WRITEMASK_XZW:
551 emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Y,X,Z,W));
552 return GL_TRUE;
553
554 case WRITEMASK_XYW:
555 emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Z,Y,X,W));
556 return GL_TRUE;
557
558 case WRITEMASK_XYZ:
559 emit_shuf_copy1(cp, dst, dst0, arg, SHUF(W,Y,Z,X));
560 return GL_TRUE;
561
562 case WRITEMASK_XYZW:
563 sse_movups(&cp->func, dst, arg);
564 return GL_TRUE;
565
566 default:
567 assert(0);
568 break;
569 }
570 }
571
572
573
574 static GLboolean emit_PRT( struct compilation *cp, union instruction op )
575 {
576 FAIL;
577 }
578
579
580 /**
581 * The traditional instructions. All operate on internal registers
582 * and ignore write masks and swizzling issues.
583 */
584
585 static GLboolean emit_ABS( struct compilation *cp, union instruction op )
586 {
587 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
588 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
589 struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);
590
591 sse_movups(&cp->func, dst, arg0);
592 sse_mulps(&cp->func, dst, neg);
593 sse_maxps(&cp->func, dst, arg0);
594 return GL_TRUE;
595 }
596
597 static GLboolean emit_ADD( struct compilation *cp, union instruction op )
598 {
599 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
600 struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
601 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
602
603 sse_movups(&cp->func, dst, arg0);
604 sse_addps(&cp->func, dst, arg1);
605 return GL_TRUE;
606 }
607
608
609 /* The dotproduct instructions don't really do that well in sse:
610 */
611 static GLboolean emit_DP3( struct compilation *cp, union instruction op )
612 {
613 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
614 struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
615 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
616 struct x86_reg tmp = get_xmm_reg(cp);
617
618 sse_movups(&cp->func, dst, arg0);
619 sse_mulps(&cp->func, dst, arg1);
620
621 /* Now the hard bit: sum the first 3 values:
622 */
623 sse_movhlps(&cp->func, tmp, dst);
624 sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
625 emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
626 sse_addss(&cp->func, dst, tmp);
627 sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
628 return GL_TRUE;
629 }
630
631
632
633 static GLboolean emit_DP4( struct compilation *cp, union instruction op )
634 {
635 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
636 struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
637 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
638 struct x86_reg tmp = get_xmm_reg(cp);
639
640 sse_movups(&cp->func, dst, arg0);
641 sse_mulps(&cp->func, dst, arg1);
642
643 /* Now the hard bit: sum the values:
644 */
645 sse_movhlps(&cp->func, tmp, dst);
646 sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
647 emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
648 sse_addss(&cp->func, dst, tmp);
649 sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
650 return GL_TRUE;
651 }
652
653 static GLboolean emit_DPH( struct compilation *cp, union instruction op )
654 {
655 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
656 struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
657 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
658 struct x86_reg tmp = get_xmm_reg(cp);
659
660 sse_movups(&cp->func, dst, arg0);
661 sse_mulps(&cp->func, dst, arg1);
662
663 /* Now the hard bit: sum the values (from DP3):
664 */
665 sse_movhlps(&cp->func, tmp, dst);
666 sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
667 emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
668 sse_addss(&cp->func, dst, tmp);
669 emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
670 sse_addss(&cp->func, dst, tmp);
671 sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
672 return GL_TRUE;
673 }
674
675 #if 0
676 static GLboolean emit_DST( struct compilation *cp, union instruction op )
677 {
678 struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
679 struct x86_reg arg1 = get_arg_ptr(cp, op.alu.file1, op.alu.idx1);
680 struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
681
682 /* dst[0] = 1.0 * 1.0F; */
683 /* dst[1] = arg0[1] * arg1[1]; */
684 /* dst[2] = arg0[2] * 1.0; */
685 /* dst[3] = 1.0 * arg1[3]; */
686
687 /* Would rather do some of this with integer regs, but:
688 * 1) No proper support for immediate values yet
689 * 2) I'd need to push/pop somewhere to get a free reg.
690 */
691 x87_fld1(&cp->func);
692 x87_fstp(&cp->func, dst); /* would rather do an immediate store... */
693 x87_fld(&cp->func, x86_make_disp(arg0, 4));
694 x87_fmul(&cp->func, x86_make_disp(arg1, 4));
695 x87_fstp(&cp->func, x86_make_disp(dst, 4));
696
697 if (!eq(arg0, dst)) {
698 x86_fld(&cp->func, x86_make_disp(arg0, 8));
699 x86_stp(&cp->func, x86_make_disp(dst, 8));
700 }
701
702 if (!eq(arg1, dst)) {
703 x86_fld(&cp->func, x86_make_disp(arg0, 12));
704 x86_stp(&cp->func, x86_make_disp(dst, 12));
705 }
706
707 return GL_TRUE;
708 }
709 #else
710 static GLboolean emit_DST( struct compilation *cp, union instruction op )
711 {
712 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
713 struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
714 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
715 struct x86_reg tmp = get_xmm_reg(cp);
716 struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
717
718 emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
719 emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
720 sse_mulps(&cp->func, dst, tmp);
721
722 /* dst[0] = 1.0 * 1.0F; */
723 /* dst[1] = arg0[1] * arg1[1]; */
724 /* dst[2] = arg0[2] * 1.0; */
725 /* dst[3] = 1.0 * arg1[3]; */
726
727 return GL_TRUE;
728 }
729 #endif
730
731 static GLboolean emit_LG2( struct compilation *cp, union instruction op )
732 {
733 struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
734 struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
735
736 x87_fld1(&cp->func); /* 1 */
737 x87_fld(&cp->func, arg0); /* a0 1 */
738 x87_fyl2x(&cp->func); /* log2(a0) */
739 x87_fst(&cp->func, x86_make_disp(dst, 0));
740 x87_fst(&cp->func, x86_make_disp(dst, 4));
741 x87_fst(&cp->func, x86_make_disp(dst, 8));
742 x87_fstp(&cp->func, x86_make_disp(dst, 12));
743
744 return GL_TRUE;
745 }
746
747
748 static GLboolean emit_EX2( struct compilation *cp, union instruction op )
749 {
750 struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
751 struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
752
753 /* CAUTION: dst may alias arg0!
754 */
755 x87_fld(&cp->func, arg0);
756
757 emit_x87_ex2(cp);
758
759 x87_fst(&cp->func, x86_make_disp(dst, 0));
760 x87_fst(&cp->func, x86_make_disp(dst, 4));
761 x87_fst(&cp->func, x86_make_disp(dst, 8));
762 x87_fst(&cp->func, x86_make_disp(dst, 12));
763 return GL_TRUE;
764 }
765
766 static GLboolean emit_EXP( struct compilation *cp, union instruction op )
767 {
768 struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
769 struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
770 struct x86_reg st0 = x86_make_reg(file_x87, 0);
771 struct x86_reg st1 = x86_make_reg(file_x87, 1);
772 struct x86_reg st3 = x86_make_reg(file_x87, 3);
773
774 /* CAUTION: dst may alias arg0!
775 */
776 x87_fld(&cp->func, arg0); /* arg0.x */
777 x87_fld(&cp->func, st0); /* arg arg */
778
779 /* by default, fpu is setup to round-to-nearest. We want to
780 * change this now, and track the state through to the end of the
781 * generated function so that it isn't repeated unnecessarily.
782 * Alternately, could subtract .5 to get round to -inf behaviour.
783 */
784 set_fpu_round_neg_inf( cp );
785 x87_fprndint( &cp->func ); /* flr(a) a */
786 x87_fld(&cp->func, st0); /* flr(a) flr(a) a */
787 x87_fld1(&cp->func); /* 1 floor(a) floor(a) a */
788 x87_fst(&cp->func, x86_make_disp(dst, 12)); /* stack unchanged */
789 x87_fscale(&cp->func); /* 2^floor(a) floor(a) a */
790 x87_fst(&cp->func, st3); /* 2^floor(a) floor(a) a 2^floor(a)*/
791 x87_fstp(&cp->func, x86_make_disp(dst, 0)); /* flr(a) a 2^flr(a) */
792 x87_fsubrp(&cp->func, st1); /* frac(a) 2^flr(a) */
793 x87_fst(&cp->func, x86_make_disp(dst, 4)); /* frac(a) 2^flr(a) */
794 x87_f2xm1(&cp->func); /* (2^frac(a))-1 2^flr(a)*/
795 x87_fld1(&cp->func); /* 1 (2^frac(a))-1 2^flr(a)*/
796 x87_faddp(&cp->func, st1); /* 2^frac(a) 2^flr(a) */
797 x87_fmulp(&cp->func, st1); /* 2^a */
798 x87_fst(&cp->func, x86_make_disp(dst, 8));
799
800
801
802 /* dst[0] = 2^floor(tmp); */
803 /* dst[1] = frac(tmp); */
804 /* dst[2] = 2^floor(tmp) * 2^frac(tmp); */
805 /* dst[3] = 1.0F; */
806 return GL_TRUE;
807 }
808
809 static GLboolean emit_LOG( struct compilation *cp, union instruction op )
810 {
811 struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
812 struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
813 struct x86_reg st0 = x86_make_reg(file_x87, 0);
814 struct x86_reg st1 = x86_make_reg(file_x87, 1);
815 struct x86_reg st2 = x86_make_reg(file_x87, 2);
816
817 /* CAUTION: dst may alias arg0!
818 */
819 x87_fld(&cp->func, arg0); /* arg0.x */
820 x87_fabs(&cp->func); /* |arg0.x| */
821 x87_fxtract(&cp->func); /* mantissa(arg0.x), exponent(arg0.x) */
822 x87_fst(&cp->func, st2); /* mantissa, exponent, mantissa */
823 x87_fld1(&cp->func); /* 1, mantissa, exponent, mantissa */
824 x87_fyl2x(&cp->func); /* log2(mantissa), exponent, mantissa */
825 x87_fadd(&cp->func, st0, st1); /* e+l2(m), e, m */
826 x87_fstp(&cp->func, x86_make_disp(dst, 8)); /* e, m */
827
828 x87_fld1(&cp->func); /* 1, e, m */
829 x87_fsub(&cp->func, st1, st0); /* 1, e-1, m */
830 x87_fstp(&cp->func, x86_make_disp(dst, 12)); /* e-1,m */
831 x87_fstp(&cp->func, dst); /* m */
832
833 x87_fadd(&cp->func, st0, st0); /* 2m */
834 x87_fstp(&cp->func, x86_make_disp(dst, 4));
835
836 return GL_TRUE;
837 }
838
839 static GLboolean emit_FLR( struct compilation *cp, union instruction op )
840 {
841 struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
842 struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
843 int i;
844
845 set_fpu_round_neg_inf( cp );
846
847 for (i = 0; i < 4; i++) {
848 x87_fld(&cp->func, x86_make_disp(arg0, i*4));
849 x87_fprndint( &cp->func );
850 x87_fstp(&cp->func, x86_make_disp(dst, i*4));
851 }
852
853
854 return GL_TRUE;
855 }
856
857 static GLboolean emit_FRC( struct compilation *cp, union instruction op )
858 {
859 struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
860 struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
861 struct x86_reg st0 = x86_make_reg(file_x87, 0);
862 struct x86_reg st1 = x86_make_reg(file_x87, 1);
863 int i;
864
865 set_fpu_round_neg_inf( cp );
866
867 /* Knowing liveness info or even just writemask would be useful
868 * here:
869 */
870 for (i = 0; i < 4; i++) {
871 x87_fld(&cp->func, x86_make_disp(arg0, i*4));
872 x87_fld(&cp->func, st0); /* a a */
873 x87_fprndint( &cp->func ); /* flr(a) a */
874 x87_fsubrp(&cp->func, st1); /* frc(a) */
875 x87_fstp(&cp->func, x86_make_disp(dst, i*4));
876 }
877
878 return GL_TRUE;
879 }
880
881
882
883 static GLboolean emit_LIT( struct compilation *cp, union instruction op )
884 {
885 #if 1
886 struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
887 struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
888 struct x86_reg lit = get_arg(cp, FILE_REG, REG_LIT);
889 struct x86_reg tmp = get_xmm_reg(cp);
890 struct x86_reg st1 = x86_make_reg(file_x87, 1);
891 struct x86_reg regEAX = x86_make_reg(file_REG32, reg_AX);
892 GLubyte *fixup1, *fixup2;
893
894
895 /* Load the interesting parts of arg0:
896 */
897 x87_fld(&cp->func, x86_make_disp(arg0, 12)); /* a3 */
898 x87_fld(&cp->func, x86_make_disp(arg0, 4)); /* a1 a3 */
899 x87_fld(&cp->func, x86_make_disp(arg0, 0)); /* a0 a1 a3 */
900
901 /* Intialize dst:
902 */
903 sse_movaps(&cp->func, tmp, lit);
904 sse_movaps(&cp->func, dst, tmp);
905
906 /* Check arg0[0]:
907 */
908 x87_fldz(&cp->func); /* 0 a0 a1 a3 */
909 x87_fucomp(&cp->func, st1); /* a0 a1 a3 */
910 x87_fnstsw(&cp->func, regEAX);
911 x86_sahf(&cp->func);
912 fixup1 = x86_jcc_forward(&cp->func, cc_AE);
913
914 x87_fstp(&cp->func, x86_make_disp(dst, 4)); /* a1 a3 */
915
916 /* Check arg0[1]:
917 */
918 x87_fldz(&cp->func); /* 0 a1 a3 */
919 x87_fucomp(&cp->func, st1); /* a1 a3 */
920 x87_fnstsw(&cp->func, regEAX);
921 x86_sahf(&cp->func);
922 fixup2 = x86_jcc_forward(&cp->func, cc_AE);
923
924 /* Compute pow(a1, a3)
925 */
926 x87_fyl2x(&cp->func); /* a3*log2(a1) */
927
928 emit_x87_ex2( cp ); /* 2^(a3*log2(a1)) */
929
930 x87_fstp(&cp->func, x86_make_disp(dst, 8));
931
932 /* Land jumps:
933 */
934 x86_fixup_fwd_jump(&cp->func, fixup1);
935 x86_fixup_fwd_jump(&cp->func, fixup2);
936 #else
937 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
938 struct x86_reg ones = get_reg_ptr(FILE_REG, REG_LIT);
939 sse_movups(&cp->func, dst, ones);
940 #endif
941 return GL_TRUE;
942 }
943
944
945
946 static GLboolean emit_MAX( struct compilation *cp, union instruction op )
947 {
948 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
949 struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
950 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
951
952 sse_movups(&cp->func, dst, arg0);
953 sse_maxps(&cp->func, dst, arg1);
954 return GL_TRUE;
955 }
956
957
958 static GLboolean emit_MIN( struct compilation *cp, union instruction op )
959 {
960 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
961 struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
962 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
963
964 sse_movups(&cp->func, dst, arg0);
965 sse_minps(&cp->func, dst, arg1);
966 return GL_TRUE;
967 }
968
969 static GLboolean emit_MOV( struct compilation *cp, union instruction op )
970 {
971 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
972 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
973
974 sse_movups(&cp->func, dst, arg0);
975 return GL_TRUE;
976 }
977
978 static GLboolean emit_MUL( struct compilation *cp, union instruction op )
979 {
980 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
981 struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
982 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
983
984 sse_movups(&cp->func, dst, arg0);
985 sse_mulps(&cp->func, dst, arg1);
986 return GL_TRUE;
987 }
988
989
990 static GLboolean emit_POW( struct compilation *cp, union instruction op )
991 {
992 struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
993 struct x86_reg arg1 = get_arg_ptr(cp, op.alu.file1, op.alu.idx1);
994 struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
995
996 x87_fld(&cp->func, arg1); /* a1 */
997 x87_fld(&cp->func, arg0); /* a0 a1 */
998 x87_fyl2x(&cp->func); /* a1*log2(a0) */
999
1000 emit_x87_ex2( cp ); /* 2^(a1*log2(a0)) */
1001
1002 x87_fst(&cp->func, x86_make_disp(dst, 0));
1003 x87_fst(&cp->func, x86_make_disp(dst, 4));
1004 x87_fst(&cp->func, x86_make_disp(dst, 8));
1005 x87_fstp(&cp->func, x86_make_disp(dst, 12));
1006
1007 return GL_TRUE;
1008 }
1009
1010 static GLboolean emit_REL( struct compilation *cp, union instruction op )
1011 {
1012 /* GLuint idx = (op.alu.idx0 + (GLint)cp->File[0][REG_ADDR][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS-1); */
1013 /* GLuint idx = 0; */
1014 /* struct x86_reg arg0 = get_arg(cp, op.alu.file0, idx); */
1015 /* struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); */
1016
1017 /* dst[0] = arg0[0]; */
1018 /* dst[1] = arg0[1]; */
1019 /* dst[2] = arg0[2]; */
1020 /* dst[3] = arg0[3]; */
1021
1022 FAIL;
1023 }
1024
1025 static GLboolean emit_RCP( struct compilation *cp, union instruction op )
1026 {
1027 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1028 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1029
1030 if (cp->have_sse2) {
1031 sse2_rcpss(&cp->func, dst, arg0);
1032 }
1033 else {
1034 struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
1035 sse_movss(&cp->func, dst, ones);
1036 sse_divss(&cp->func, dst, arg0);
1037 }
1038
1039 sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
1040 return GL_TRUE;
1041 }
1042
/**
 * Emit code for RSQ: a scalar reciprocal square root (rsqrtss) of the
 * first (X) component of arg0, replicated to all four components of dst.
 *
 * The disabled (#if 0) block is an earlier attempt to take the absolute
 * value of the argument before the rsqrtss; according to its own comment
 * it does not work.  The live code therefore feeds arg0 to rsqrtss
 * unmodified, with no abs() step.
 *
 * \param cp  compilation state; instructions are appended to cp->func.
 * \param op  instruction supplying the source (file0/idx0) and dst.
 * \return GL_TRUE always.
 */
1043 static GLboolean emit_RSQ( struct compilation *cp, union instruction op )
1044 {
1045 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1046 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1047 #if 0
1048 struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);
1049
1050 /* get abs value first. This STILL doesn't work.
1051 Looks like we get bogus neg values ?
1052 */
1053 sse_movss(&cp->func, dst, arg0);
1054 sse_mulss(&cp->func, dst, neg);
1055 sse_maxss(&cp->func, dst, arg0);
1056
1057 sse_rsqrtss(&cp->func, dst, dst);
1058 #endif
1059 sse_rsqrtss(&cp->func, dst, arg0);
1060 sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X)); /* broadcast X to all components */
1061 return GL_TRUE;
1062 }
1063
1064
/**
 * Emit code for SGE (set on greater-or-equal), on all four components.
 *
 * dst is loaded with arg0, compared against arg1 with the NotLessThan
 * predicate, and the comparison result is then ANDed with the REG_ONES
 * register -- presumably leaving 1.0 where the test holds and 0.0
 * elsewhere (assumes REG_ONES holds 1.0 in every component; confirm).
 *
 * \param cp  compilation state; instructions are appended to cp->func.
 * \param op  instruction supplying both sources and dst.
 * \return GL_TRUE always.
 */
1065 static GLboolean emit_SGE( struct compilation *cp, union instruction op )
1066 {
1067 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1068 struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
1069 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1070 struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
1071
1072 sse_movups(&cp->func, dst, arg0);
1073 sse_cmpps(&cp->func, dst, arg1, cc_NotLessThan); /* dst = compare(arg0, arg1) */
1074 sse_andps(&cp->func, dst, ones); /* mask the compare result with REG_ONES */
1075 return GL_TRUE;
1076 }
1077
1078
/**
 * Emit code for SLT (set on less-than), on all four components.
 *
 * Same sequence as emit_SGE but with the LessThan predicate: dst is
 * loaded with arg0, compared against arg1, and the comparison result is
 * ANDed with the REG_ONES register -- presumably leaving 1.0 where
 * arg0 < arg1 and 0.0 elsewhere (assumes REG_ONES holds 1.0; confirm).
 *
 * \param cp  compilation state; instructions are appended to cp->func.
 * \param op  instruction supplying both sources and dst.
 * \return GL_TRUE always.
 */
1079 static GLboolean emit_SLT( struct compilation *cp, union instruction op )
1080 {
1081 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1082 struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
1083 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1084 struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
1085
1086 sse_movups(&cp->func, dst, arg0);
1087 sse_cmpps(&cp->func, dst, arg1, cc_LessThan); /* dst = compare(arg0, arg1) */
1088 sse_andps(&cp->func, dst, ones); /* mask the compare result with REG_ONES */
1089 return GL_TRUE;
1090 }
1091
/**
 * Emit code for SUB: dst = arg0 - arg1 on all four components.
 *
 * arg0 is first copied into dst, then arg1 is subtracted in place.
 *
 * \param cp  compilation state; instructions are appended to cp->func.
 * \param op  instruction supplying both sources and dst.
 * \return GL_TRUE always.
 */
1092 static GLboolean emit_SUB( struct compilation *cp, union instruction op )
1093 {
1094 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1095 struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
1096 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1097
1098 sse_movups(&cp->func, dst, arg0);
1099 sse_subps(&cp->func, dst, arg1);
1100 return GL_TRUE;
1101 }
1102
1103
/**
 * Emit code for XPD: three-component cross product of arg0 and arg1.
 *
 * The result is built as (arg0.yzx * arg1.zxy) - (arg0.zxy * arg1.yzx)
 * using shuffled copies of the sources; the per-component formula is
 * spelled out in the comments at the end of the body.  The fourth
 * component of dst is left undefined.
 *
 * Two scratch xmm registers are requested via get_xmm_reg() so that the
 * source registers are not overwritten.
 *
 * \param cp  compilation state; instructions are appended to cp->func.
 * \param op  instruction supplying both sources and dst.
 * \return GL_TRUE always.
 */
1104 static GLboolean emit_XPD( struct compilation *cp, union instruction op )
1105 {
1106 struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1107 struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
1108 struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1109 struct x86_reg tmp0 = get_xmm_reg(cp);
1110 struct x86_reg tmp1 = get_xmm_reg(cp);
1111
1112 /* Could avoid tmp0, tmp1 if we overwrote arg0, arg1. Need a way
1113 * to invalidate registers. This will come with better analysis
1114 * (liveness analysis) of the incoming program.
1115 */
1116 emit_pshufd(cp, dst, arg0, SHUF(Y, Z, X, W)); /* dst = arg0.yzxw */
1117 emit_pshufd(cp, tmp1, arg1, SHUF(Z, X, Y, W)); /* tmp1 = arg1.zxyw */
1118 sse_mulps(&cp->func, dst, tmp1); /* first product term */
1119 emit_pshufd(cp, tmp0, arg0, SHUF(Z, X, Y, W)); /* tmp0 = arg0.zxyw */
1120 emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W)); /* tmp1 = arg1.yzxw */
1121 sse_mulps(&cp->func, tmp0, tmp1); /* second product term */
1122 sse_subps(&cp->func, dst, tmp0); /* dst = first - second */
1123
1124 /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
1125 /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
1126 /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
1127 /* dst[3] is undef */
1128
1129 return GL_TRUE;
1130 }
1131
/**
 * Emit no code for the instruction and report success.
 *
 * Used as the emit_func entry for every opcode that has no dedicated
 * code generator in this file.  Because it returns GL_TRUE, such an
 * instruction is silently skipped rather than causing code generation
 * to fail.
 *
 * \param cp  compilation state (unused).
 * \param op  instruction (unused).
 * \return GL_TRUE always.
 */
1132 static GLboolean emit_NOP( struct compilation *cp, union instruction op )
1133 {
1134 return GL_TRUE;
1135 }
1136
1137
/**
 * Dispatch table of code generators, indexed by instruction opcode
 * (build_vertex_program looks up emit_func[inst.alu.opcode]).
 *
 * The table is positional: each entry must sit at the index of its
 * opcode value.  _tnl_sse_codegen_vertex_program spot-checks this with
 * asserts on the ABS, MUL and XPD slots.  Opcodes without a generator
 * are mapped to emit_NOP, with the opcode name in a trailing comment.
 * The final three entries (RSW, MSK, REL) follow XPD.
 */
1138 static GLboolean (* const emit_func[])(struct compilation *, union instruction) =
1139 {
1140 emit_ABS,
1141 emit_ADD,
1142 emit_NOP, /* ARA */
1143 emit_NOP, /* ARL */
1144 emit_NOP, /* ARL_NV */
1145 emit_NOP, /* ARR */
1146 emit_NOP, /* BRA */
1147 emit_NOP, /* CAL */
1148 emit_NOP, /* CMP */
1149 emit_NOP, /* COS */
1150 emit_NOP, /* DDX */
1151 emit_NOP, /* DDY */
1152 emit_DP3,
1153 emit_DP4,
1154 emit_DPH,
1155 emit_DST,
1156 emit_NOP, /* END */
1157 emit_EX2,
1158 emit_EXP,
1159 emit_FLR,
1160 emit_FRC,
1161 emit_NOP, /* KIL */
1162 emit_NOP, /* KIL_NV */
1163 emit_LG2,
1164 emit_LIT,
1165 emit_LOG,
1166 emit_NOP, /* LRP */
1167 emit_NOP, /* MAD */
1168 emit_MAX,
1169 emit_MIN,
1170 emit_MOV,
1171 emit_MUL,
1172 emit_NOP, /* PK2H */
1173 emit_NOP, /* PK2US */
1174 emit_NOP, /* PK4B */
1175 emit_NOP, /* PK4UB */
1176 emit_POW,
1177 emit_NOP, /* POPA */
1178 emit_PRT,
1179 emit_NOP, /* PUSHA */
1180 emit_NOP, /* RCC */
1181 emit_RCP,
1182 emit_NOP, /* RET */
1183 emit_NOP, /* RFL */
1184 emit_RSQ,
1185 emit_NOP, /* SCS */
1186 emit_NOP, /* SEQ */
1187 emit_NOP, /* SFL */
1188 emit_SGE,
1189 emit_NOP, /* SGT */
1190 emit_NOP, /* SIN */
1191 emit_NOP, /* SLE */
1192 emit_SLT,
1193 emit_NOP, /* SNE */
1194 emit_NOP, /* SSG */
1195 emit_NOP, /* STR */
1196 emit_SUB,
1197 emit_SWZ, /* SWZ */
1198 emit_NOP, /* TEX */
1199 emit_NOP, /* TXB */
1200 emit_NOP, /* TXD */
1201 emit_NOP, /* TXL */
1202 emit_NOP, /* TXP */
1203 emit_NOP, /* TXP_NV */
1204 emit_NOP, /* UP2H */
1205 emit_NOP, /* UP2US */
1206 emit_NOP, /* UP4B */
1207 emit_NOP, /* UP4UB */
1208 emit_NOP, /* X2D */
1209 emit_XPD,
1210 emit_RSW,
1211 emit_MSK,
1212 emit_REL,
1213 };
1214
1215
1216
/**
 * Emit the complete native function for the program in cp->p.
 *
 * The generated code consists of, in order:
 *  - a prologue that saves EBX, loads function argument 1 into EDX
 *    (presumably the struct arb_vp_machine pointer -- confirm against
 *    the caller) and loads the machine's File[FILE_REG] and
 *    File[FILE_STATE_PARAM] entries into EBX and ECX;
 *  - one instruction sequence per program instruction, produced by the
 *    matching emit_func entry;
 *  - a write-back (spill) of every xmm register still marked dirty;
 *  - an emms if cp->func.need_emms is set;
 *  - an FPU control word restore if cp->fpucntl is no longer
 *    RESTORE_FPU;
 *  - an epilogue that restores EBX and returns.
 *
 * The local 'm' is a NULL pointer used only as an argument to
 * get_offset() in order to name members of struct arb_vp_machine.
 *
 * \param cp  compilation state; code is appended to cp->func.
 * \return GL_TRUE on success, GL_FALSE as soon as any instruction's
 *         emitter reports failure (the partially built function is left
 *         in cp->func for the caller to release).
 */
1217 static GLboolean build_vertex_program( struct compilation *cp )
1218 {
1219 struct arb_vp_machine *m = NULL;
1220 GLuint j;
1221
1222 struct x86_reg regEBX = x86_make_reg(file_REG32, reg_BX);
1223 struct x86_reg regECX = x86_make_reg(file_REG32, reg_CX);
1224 struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX);
1225
1226 x86_push(&cp->func, regEBX);
1227
1228 x86_mov(&cp->func, regEDX, x86_fn_arg(&cp->func, 1));
1229 x86_mov(&cp->func, regEBX, x86_make_disp(regEDX, get_offset(m, m->File + FILE_REG)));
1230 x86_mov(&cp->func, regECX, x86_make_disp(regEDX, get_offset(m, m->File + FILE_STATE_PARAM)));
1231
1232 for (j = 0; j < cp->p->nr_instructions; j++) {
1233 union instruction inst = cp->p->instructions[j];
1234 cp->insn_counter = j+1; /* avoid zero */
1235
1236 if (DISASSEM) {
1237 _mesa_printf("%p: ", cp->func.csr);
1238 _tnl_disassem_vba_insn( inst );
1239 }
1240 cp->func.fn = NULL;
1241
1242 if (!emit_func[inst.alu.opcode]( cp, inst )) { /* dispatch on opcode; abort on first failure */
1243 return GL_FALSE;
1244 }
1245 }
1246
1247 /* TODO: only for outputs:
1248 */
1249 for (j = 0; j < 8; j++) {
1250 if (cp->xmm[j].dirty)
1251 spill(cp, j);
1252 }
1253
1254
1255 /* Exit mmx state?
1256 */
1257 if (cp->func.need_emms)
1258 mmx_emms(&cp->func);
1259
1260 /* Restore FPU control word?
1261 */
1262 if (cp->fpucntl != RESTORE_FPU) {
1263 x87_fnclex(&cp->func);
1264 x87_fldcw(&cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_restore)));
1265 }
1266
1267 x86_pop(&cp->func, regEBX);
1268 x86_ret(&cp->func);
1269
1270 return GL_TRUE;
1271 }
1272
1273 /**
1274 * Generate native x86/SSE code for the given vertex program.
 *
 * This function only builds the code; it does not run it.  Any function
 * previously stored in p->compiled_func is freed first.  On success the
 * newly generated function is stored in p->compiled_func; on failure the
 * partially built function is released and p->compiled_func stays NULL.
 *
 * \param p  compiled program whose instruction list is translated.
 * \return GL_TRUE if code was generated, GL_FALSE otherwise.
1275 *
1276 * TODO: Integrate the t_vertex.c code here, to build machine vertices
1277 * directly at this point.
1278 *
1279 * TODO: Eliminate the VB struct entirely and just use
1280 * struct arb_vp_machine.
1281 */
1282 GLboolean
1283 _tnl_sse_codegen_vertex_program(struct tnl_compiled_program *p)
1284 {
1285 struct compilation cp;
1286
1287 /* sanity checks */
1288 assert(emit_func[OPCODE_ABS] == emit_ABS);
1289 assert(emit_func[OPCODE_MUL] == emit_MUL);
1290 assert(emit_func[OPCODE_XPD] == emit_XPD);
1291
1292 _mesa_memset(&cp, 0, sizeof(cp));
1293 cp.p = p;
1294 cp.have_sse2 = 1; /* NOTE(review): SSE2 is assumed unconditionally; no CPU capability check is visible here */
1295
1296 if (p->compiled_func) {
1297 _mesa_free((void *)p->compiled_func);
1298 p->compiled_func = NULL;
1299 }
1300
1301 x86_init_func(&cp.func);
1302
1303 cp.fpucntl = RESTORE_FPU;
1304
1305
1306 /* Note ctx state is not referenced in building the function, so it
1307 * depends only on the list of instructions:
1308 */
1309 if (!build_vertex_program(&cp)) {
1310 x86_release_func( &cp.func );
1311 return GL_FALSE;
1312 }
1313
1314
1315 p->compiled_func = (void (*)(struct arb_vp_machine *))x86_get_func( &cp.func );
1316 return GL_TRUE;
1317 }
1318
1319
1320
1321 #else
1322
/**
 * Stub used when the SSE code generator is compiled out.
 *
 * \param p  compiled program (ignored).
 * \return GL_FALSE always, i.e. no native code was generated.
 */
1323 GLboolean
1324 _tnl_sse_codegen_vertex_program(struct tnl_compiled_program *p)
1325 {
1326 /* Dummy version for when USE_SSE_ASM not defined */
1327 return GL_FALSE;
1328 }
1329
1330 #endif