/*
 * Mesa 3-D graphics library
 * Version: 6.3
 *
 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file t_vb_arbprogram_sse.c
 *
 * Translate simplified vertex_program representation to
 * x86/x87/SSE/SSE2 machine code using mesa's rtasm runtime assembler.
 *
 * This is very much a first attempt - build something that works.
 * There are probably better approaches for applying SSE to vertex
 * programs, and the whole thing is crying out for static analysis of
 * the programs to avoid redundant operations.
 *
 * \author Keith Whitwell
 */

#include "glheader.h"
#include "context.h"
#include "imports.h"
#include "macros.h"
#include "mtypes.h"
#include "arbprogparse.h"
#include "program.h"
#include "program_instruction.h"
#include "math/m_matrix.h"
#include "math/m_translate.h"
#include "t_context.h"
#include "t_vb_arbprogram.h"

#if defined(USE_SSE_ASM)

#include "x86/rtasm/x86sse.h"
#include "x86/common_x86_asm.h"

#define X 0
#define Y 1
#define Z 2
#define W 3

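/* The SHUF() helper used throughout this file packs four of these lane
 * names into a pshufd/shufps immediate: lane i of the result is taken
 * from the source lane named by the i-th argument, so SHUF(X,X,X,X)
 * broadcasts the x component and SHUF(Y,X,Z,W) swaps the first two
 * components.
 */
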
/* Reg usage:
 *
 * EAX - temp
 * EBX - point to 'm->File[0]'
 * ECX - point to 'm->File[3]'
 * EDX - holds 'm'
 * EBP,
 * ESI,
 * EDI
 */

#define DISASSEM 0

#define FAIL \
do { \
   _mesa_printf("x86 translation failed in %s\n", __FUNCTION__); \
   return GL_FALSE; \
} while (0)

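/* Per-compilation state.  The xmm[] array shadows the eight hardware
 * XMM registers: for each one it records which program register
 * (file/idx) currently lives there, whether that copy has been
 * modified since it was loaded (dirty), and the instruction counter at
 * which it was last referenced (last_used), which drives the
 * least-recently-used eviction below.
 */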
struct compilation {
   struct x86_function func;
   struct tnl_compiled_program *p;
   GLuint insn_counter;

   struct {
      GLuint file:2;
      GLuint idx:7;
      GLuint dirty:1;
      GLuint last_used:10;
   } xmm[8];

   struct {
      struct x86_reg base;
   } file[4];

   GLboolean have_sse2;
   GLshort fpucntl;
};

static INLINE GLboolean eq( struct x86_reg a,
                            struct x86_reg b )
{
   return (a.file == b.file &&
           a.idx == b.idx &&
           a.mod == b.mod &&
           a.disp == b.disp);
}

static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}


static struct x86_reg get_reg_ptr(GLuint file,
                                  GLuint idx )
{
   struct x86_reg reg;

   switch (file) {
   case FILE_REG:
      reg = x86_make_reg(file_REG32, reg_BX);
      assert(idx != REG_UNDEF);
      break;
   case FILE_STATE_PARAM:
      reg = x86_make_reg(file_REG32, reg_CX);
      break;
   default:
      assert(0);
   }

   return x86_make_disp(reg, 16 * idx);
}


static void spill( struct compilation *cp, GLuint idx )
{
   struct x86_reg oldval = get_reg_ptr(cp->xmm[idx].file,
                                       cp->xmm[idx].idx);

   assert(cp->xmm[idx].dirty);
   sse_movups(&cp->func, oldval, x86_make_reg(file_XMM, idx));
   cp->xmm[idx].dirty = 0;
}

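/* Grab an XMM register by evicting the least recently used one.  If
 * the victim still holds a modified value, it is written back to its
 * home location first.
 */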
static struct x86_reg get_xmm_reg( struct compilation *cp )
{
   GLuint i;
   GLuint oldest = 0;

   for (i = 0; i < 8; i++)
      if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
         oldest = i;

   /* Need to write out the old value?
    */
   if (cp->xmm[oldest].dirty)
      spill(cp, oldest);

   assert(cp->xmm[oldest].last_used != cp->insn_counter);

   cp->xmm[oldest].file = FILE_REG;
   cp->xmm[oldest].idx = REG_UNDEF;
   cp->xmm[oldest].last_used = cp->insn_counter;
   return x86_make_reg(file_XMM, oldest);
}

static void invalidate_xmm( struct compilation *cp,
                            GLuint file, GLuint idx )
{
   GLuint i;

   /* Invalidate any old copy of this register in XMM0-7.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) {
         cp->xmm[i].file = FILE_REG;
         cp->xmm[i].idx = REG_UNDEF;
         cp->xmm[i].dirty = 0;
         break;
      }
   }
}


/* Return an XMM reg to receive the results of an operation.
 */
static struct x86_reg get_dst_xmm_reg( struct compilation *cp,
                                       GLuint file, GLuint idx )
{
   struct x86_reg reg;

   /* Invalidate any old copy of this register in XMM0-7. Don't reuse
    * as this may be one of the arguments.
    */
   invalidate_xmm( cp, file, idx );

   reg = get_xmm_reg( cp );
   cp->xmm[reg.idx].file = file;
   cp->xmm[reg.idx].idx = idx;
   cp->xmm[reg.idx].dirty = 1;
   return reg;
}

/* As above, but return a pointer. Note - this pointer may alias
 * those returned by get_arg_ptr().
 */
static struct x86_reg get_dst_ptr( struct compilation *cp,
                                   GLuint file, GLuint idx )
{
   /* Invalidate any old copy of this register in XMM0-7. Don't reuse
    * as this may be one of the arguments.
    */
   invalidate_xmm( cp, file, idx );

   return get_reg_ptr(file, idx);
}


/* Return an XMM reg if the argument is resident, otherwise return a
 * base+offset pointer to the saved value.
 */
static struct x86_reg get_arg( struct compilation *cp, GLuint file, GLuint idx )
{
   GLuint i;

   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file &&
          cp->xmm[i].idx == idx) {
         cp->xmm[i].last_used = cp->insn_counter;
         return x86_make_reg(file_XMM, i);
      }
   }

   return get_reg_ptr(file, idx);
}

/* As above, but always return a pointer:
 */
static struct x86_reg get_arg_ptr( struct compilation *cp, GLuint file, GLuint idx )
{
   GLuint i;

   /* If there is a modified version of this register in one of the
    * XMM regs, write it out to memory.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file &&
          cp->xmm[i].idx == idx &&
          cp->xmm[i].dirty)
         spill(cp, i);
   }

   return get_reg_ptr(file, idx);
}

/* Emulate pshufd insn in regular SSE, if necessary:
 */
static void emit_pshufd( struct compilation *cp,
                         struct x86_reg dst,
                         struct x86_reg arg0,
                         GLubyte shuf )
{
   if (cp->have_sse2) {
      sse2_pshufd(&cp->func, dst, arg0, shuf);
      cp->func.fn = 0;
   }
   else {
      if (!eq(dst, arg0))
         sse_movups(&cp->func, dst, arg0);

      sse_shufps(&cp->func, dst, dst, shuf);
   }
}

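/* The x87 control word is switched to round-towards-negative-infinity
 * so that fprndint behaves like floor().  cp->fpucntl tracks the mode
 * currently in effect so the control word is only reloaded when it
 * actually needs to change, and it is restored once at the end of the
 * generated function.
 */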
static void set_fpu_round_neg_inf( struct compilation *cp )
{
   if (cp->fpucntl != RND_NEG_FPU) {
      struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX);
      struct arb_vp_machine *m = NULL;

      cp->fpucntl = RND_NEG_FPU;
      x87_fnclex(&cp->func);
      x87_fldcw(&cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_rnd_neg)));
   }
}


/* Perform a reduced swizzle.
 */
static GLboolean emit_RSW( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
   GLuint swz = op.rsw.swz;
   GLuint neg = op.rsw.neg;

   emit_pshufd(cp, dst, arg0, swz);

   if (neg) {
      struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
      struct x86_reg tmp = get_xmm_reg(cp);
      /* Load 1,-1,0,0
       * Use neg as arg to pshufd
       * Multiply
       */
      emit_pshufd(cp, tmp, negs,
                  SHUF((neg & 1) ? 1 : 0,
                       (neg & 2) ? 1 : 0,
                       (neg & 4) ? 1 : 0,
                       (neg & 8) ? 1 : 0));
      sse_mulps(&cp->func, dst, tmp);
   }

   return GL_TRUE;
}

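/* The two helpers below merge components of two registers under a
 * writemask.  They rely on 'shuf' being its own inverse (it only ever
 * exchanges lanes): rotate the selected lane(s) down into the low
 * position(s), copy them across with movss/shufps, then apply the same
 * shuffle again to restore the original component order.
 */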
/* Helper for writemask:
 */
static GLboolean emit_shuf_copy1( struct compilation *cp,
                                  struct x86_reg dst,
                                  struct x86_reg arg0,
                                  struct x86_reg arg1,
                                  GLubyte shuf )
{
   struct x86_reg tmp = get_xmm_reg(cp);
   sse_movups(&cp->func, dst, arg1);
   emit_pshufd(cp, dst, dst, shuf);
   emit_pshufd(cp, tmp, arg0, shuf);

   sse_movss(&cp->func, dst, tmp);

   emit_pshufd(cp, dst, dst, shuf);
   return GL_TRUE;
}


/* Helper for writemask:
 */
static GLboolean emit_shuf_copy2( struct compilation *cp,
                                  struct x86_reg dst,
                                  struct x86_reg arg0,
                                  struct x86_reg arg1,
                                  GLubyte shuf )
{
   struct x86_reg tmp = get_xmm_reg(cp);
   emit_pshufd(cp, dst, arg1, shuf);
   emit_pshufd(cp, tmp, arg0, shuf);

   sse_shufps(&cp->func, dst, tmp, SHUF(X, Y, Z, W));

   emit_pshufd(cp, dst, dst, shuf);
   return GL_TRUE;
}

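/* Compute 2^st0 on the x87 stack using the identity
 * 2^a = 2^int(a) * 2^frac(a): f2xm1 only accepts arguments in [-1,1],
 * so the integer part is split off with fprndint (rounding towards
 * -inf, see above) and applied afterwards with fscale.
 */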
static void emit_x87_ex2( struct compilation *cp )
{
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   struct x86_reg st3 = x86_make_reg(file_x87, 3);

   set_fpu_round_neg_inf( cp );

   x87_fld(&cp->func, st0);     /* a a */
   x87_fprndint( &cp->func );   /* int(a) a */
   x87_fld(&cp->func, st0);     /* int(a) int(a) a */
   x87_fstp(&cp->func, st3);    /* int(a) a int(a)*/
   x87_fsubp(&cp->func, st1);   /* frac(a) int(a) */
   x87_f2xm1(&cp->func);        /* (2^frac(a))-1 int(a)*/
   x87_fld1(&cp->func);         /* 1 (2^frac(a))-1 int(a)*/
   x87_faddp(&cp->func, st1);   /* 2^frac(a) int(a) */
   x87_fscale(&cp->func);       /* 2^a */
}

#if 0
static GLboolean emit_MSK2( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.msk.file, op.msk.arg);
   struct x86_reg arg1 = get_arg(cp, FILE_REG, op.msk.dst); /* NOTE! */
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.msk.dst);

   /* make full width bitmask in tmp
    * dst = ~tmp
    * tmp &= arg0
    * dst &= arg1
    * dst |= tmp
    */
   emit_pshufd(cp, tmp, get_arg(cp, FILE_REG, REG_NEGS),
               SHUF((op.msk.mask & 1) ? 2 : 0,
                    (op.msk.mask & 2) ? 2 : 0,
                    (op.msk.mask & 4) ? 2 : 0,
                    (op.msk.mask & 8) ? 2 : 0));
   sse2_pnot(&cp->func, dst, tmp);
   sse2_pand(&cp->func, arg0, tmp);
   sse2_pand(&cp->func, arg1, dst);
   sse2_por(&cp->func, tmp, dst);
   return GL_TRUE;
}
#endif


/* Used to implement write masking. This and most of the other instructions
 * here would be easier to implement if there had been a translation
 * to a 2 argument format (dst/arg0, arg1) at the shader level before
 * attempting to translate to x86/sse code.
 */
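/* Note for the single-component cases below: movss with a memory
 * source clears the upper three lanes of the destination register, so
 * when the scalar operand lives in memory it is staged through a spare
 * XMM register and then copied register-to-register, which leaves the
 * other lanes of dst intact.
 */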
static GLboolean emit_MSK( struct compilation *cp, union instruction op )
{
   struct x86_reg arg = get_arg(cp, op.msk.file, op.msk.idx);
   struct x86_reg dst0 = get_arg(cp, FILE_REG, op.msk.dst); /* NOTE! */
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.msk.dst);

   /* Note that dst and dst0 refer to the same program variable, but
    * will definitely be different XMM registers. We're effectively
    * treating this as a 2 argument SEL now, just one of which happens
    * always to be the same register as the destination.
    */

   switch (op.msk.mask) {
   case 0:
      sse_movups(&cp->func, dst, dst0);
      return GL_TRUE;

   case WRITEMASK_X:
      if (arg.file == file_XMM) {
         sse_movups(&cp->func, dst, dst0);
         sse_movss(&cp->func, dst, arg);
      }
      else {
         struct x86_reg tmp = get_xmm_reg(cp);
         sse_movups(&cp->func, dst, dst0);
         sse_movss(&cp->func, tmp, arg);
         sse_movss(&cp->func, dst, tmp);
      }
      return GL_TRUE;

   case WRITEMASK_XY:
      sse_movups(&cp->func, dst, arg);
      sse_shufps(&cp->func, dst, dst0, SHUF(X, Y, Z, W));
      return GL_TRUE;

   case WRITEMASK_ZW:
      sse_movups(&cp->func, dst, dst0);
      sse_shufps(&cp->func, dst, arg, SHUF(X, Y, Z, W));
      return GL_TRUE;

   case WRITEMASK_YZW:
      if (dst0.file == file_XMM) {
         sse_movups(&cp->func, dst, arg);
         sse_movss(&cp->func, dst, dst0);
      }
      else {
         struct x86_reg tmp = get_xmm_reg(cp);
         sse_movups(&cp->func, dst, arg);
         sse_movss(&cp->func, tmp, dst0);
         sse_movss(&cp->func, dst, tmp);
      }
      return GL_TRUE;

   case WRITEMASK_Y:
      emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Y,X,Z,W));
      return GL_TRUE;

   case WRITEMASK_Z:
      emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Z,Y,X,W));
      return GL_TRUE;

   case WRITEMASK_W:
      emit_shuf_copy1(cp, dst, arg, dst0, SHUF(W,Y,Z,X));
      return GL_TRUE;

   case WRITEMASK_XZ:
      emit_shuf_copy2(cp, dst, dst0, arg, SHUF(X,Z,Y,W));
      return GL_TRUE;

   case WRITEMASK_XW:
      emit_shuf_copy2(cp, dst, dst0, arg, SHUF(X,W,Z,Y));
      return GL_TRUE;

   case WRITEMASK_YZ:
      emit_shuf_copy2(cp, dst, dst0, arg, SHUF(Z,Y,X,W));
      return GL_TRUE;

   case WRITEMASK_YW:
      emit_shuf_copy2(cp, dst, dst0, arg, SHUF(W,Y,Z,X));
      return GL_TRUE;

   case WRITEMASK_XZW:
      emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Y,X,Z,W));
      return GL_TRUE;

   case WRITEMASK_XYW:
      emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Z,Y,X,W));
      return GL_TRUE;

   case WRITEMASK_XYZ:
      emit_shuf_copy1(cp, dst, dst0, arg, SHUF(W,Y,Z,X));
      return GL_TRUE;

   case WRITEMASK_XYZW:
      sse_movups(&cp->func, dst, arg);
      return GL_TRUE;

   default:
      assert(0);
      break;
   }

   return GL_FALSE;
}


static GLboolean emit_PRT( struct compilation *cp, union instruction op )
{
   FAIL;
}


/**
 * The traditional instructions. All operate on internal registers
 * and ignore write masks and swizzling issues.
 */

static GLboolean emit_ABS( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);

   sse_movups(&cp->func, dst, arg0);
   sse_mulps(&cp->func, dst, neg);
   sse_maxps(&cp->func, dst, arg0);
   return GL_TRUE;
}

static GLboolean emit_ADD( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_addps(&cp->func, dst, arg1);
   return GL_TRUE;
}


/* The dotproduct instructions don't really do that well in sse:
 */
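/* The horizontal add used by the dot products below: movhlps folds the
 * upper two products onto the lower two, the adds accumulate
 * everything into the x component, and the final shufps broadcasts
 * that scalar to all four lanes of the result.
 */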
static GLboolean emit_DP3( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg tmp = get_xmm_reg(cp);

   sse_movups(&cp->func, dst, arg0);
   sse_mulps(&cp->func, dst, arg1);

   /* Now the hard bit: sum the first 3 values:
    */
   sse_movhlps(&cp->func, tmp, dst);
   sse_addss(&cp->func, dst, tmp);  /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(&cp->func, dst, tmp);
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}



static GLboolean emit_DP4( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg tmp = get_xmm_reg(cp);

   sse_movups(&cp->func, dst, arg0);
   sse_mulps(&cp->func, dst, arg1);

   /* Now the hard bit: sum the values:
    */
   sse_movhlps(&cp->func, tmp, dst);
   sse_addps(&cp->func, dst, tmp);  /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(&cp->func, dst, tmp);
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}

static GLboolean emit_DPH( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
   struct x86_reg tmp = get_xmm_reg(cp);

   emit_pshufd(cp, dst, arg0, SHUF(W,X,Y,Z));
   sse_movss(&cp->func, dst, ones);
   emit_pshufd(cp, dst, dst, SHUF(W,X,Y,Z));
   sse_mulps(&cp->func, dst, arg1);

   /* Now the hard bit: sum the values (from DP4):
    */
   sse_movhlps(&cp->func, tmp, dst);
   sse_addps(&cp->func, dst, tmp);  /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(&cp->func, dst, tmp);
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}

#if 0
static GLboolean emit_DST( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg_ptr(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);

   /* dst[0] = 1.0 * 1.0F; */
   /* dst[1] = arg0[1] * arg1[1]; */
   /* dst[2] = arg0[2] * 1.0; */
   /* dst[3] = 1.0 * arg1[3]; */

   /* Would rather do some of this with integer regs, but:
    * 1) No proper support for immediate values yet
    * 2) I'd need to push/pop somewhere to get a free reg.
    */
   x87_fld1(&cp->func);
   x87_fstp(&cp->func, dst);    /* would rather do an immediate store... */
   x87_fld(&cp->func, x86_make_disp(arg0, 4));
   x87_fmul(&cp->func, x86_make_disp(arg1, 4));
   x87_fstp(&cp->func, x86_make_disp(dst, 4));

   if (!eq(arg0, dst)) {
      x86_fld(&cp->func, x86_make_disp(arg0, 8));
      x86_stp(&cp->func, x86_make_disp(dst, 8));
   }

   if (!eq(arg1, dst)) {
      x86_fld(&cp->func, x86_make_disp(arg0, 12));
      x86_stp(&cp->func, x86_make_disp(dst, 12));
   }

   return GL_TRUE;
}
#else
static GLboolean emit_DST( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg tmp = get_xmm_reg(cp);
   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);

   emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
   emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
   sse_mulps(&cp->func, dst, tmp);

   /* dst[0] = 1.0 * 1.0F; */
   /* dst[1] = arg0[1] * arg1[1]; */
   /* dst[2] = arg0[2] * 1.0; */
   /* dst[3] = 1.0 * arg1[3]; */

   return GL_TRUE;
}
#endif

static GLboolean emit_LG2( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);

   x87_fld1(&cp->func);         /* 1 */
   x87_fld(&cp->func, arg0);    /* a0 1 */
   x87_fyl2x(&cp->func);        /* log2(a0) */
   x87_fst(&cp->func, x86_make_disp(dst, 0));
   x87_fst(&cp->func, x86_make_disp(dst, 4));
   x87_fst(&cp->func, x86_make_disp(dst, 8));
   x87_fstp(&cp->func, x86_make_disp(dst, 12));

   return GL_TRUE;
}


static GLboolean emit_EX2( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);

   /* CAUTION: dst may alias arg0!
    */
   x87_fld(&cp->func, arg0);

   emit_x87_ex2(cp);

   x87_fst(&cp->func, x86_make_disp(dst, 0));
   x87_fst(&cp->func, x86_make_disp(dst, 4));
   x87_fst(&cp->func, x86_make_disp(dst, 8));
   x87_fst(&cp->func, x86_make_disp(dst, 12));
   return GL_TRUE;
}

static GLboolean emit_EXP( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   struct x86_reg st3 = x86_make_reg(file_x87, 3);

   /* CAUTION: dst may alias arg0!
    */
   x87_fld(&cp->func, arg0);    /* arg0.x */
   x87_fld(&cp->func, st0);     /* arg arg */

   /* by default, fpu is setup to round-to-nearest. We want to
    * change this now, and track the state through to the end of the
    * generated function so that it isn't repeated unnecessarily.
    * Alternately, could subtract .5 to get round to -inf behaviour.
    */
   set_fpu_round_neg_inf( cp );
   x87_fprndint( &cp->func );   /* flr(a) a */
   x87_fld(&cp->func, st0);     /* flr(a) flr(a) a */
   x87_fld1(&cp->func);         /* 1 floor(a) floor(a) a */
   x87_fst(&cp->func, x86_make_disp(dst, 12));   /* stack unchanged */
   x87_fscale(&cp->func);       /* 2^floor(a) floor(a) a */
   x87_fst(&cp->func, st3);     /* 2^floor(a) floor(a) a 2^floor(a)*/
   x87_fstp(&cp->func, x86_make_disp(dst, 0));   /* flr(a) a 2^flr(a) */
   x87_fsubrp(&cp->func, st1);  /* frac(a) 2^flr(a) */
   x87_fst(&cp->func, x86_make_disp(dst, 4));    /* frac(a) 2^flr(a) */
   x87_f2xm1(&cp->func);        /* (2^frac(a))-1 2^flr(a)*/
   x87_fld1(&cp->func);         /* 1 (2^frac(a))-1 2^flr(a)*/
   x87_faddp(&cp->func, st1);   /* 2^frac(a) 2^flr(a) */
   x87_fmulp(&cp->func, st1);   /* 2^a */
   x87_fst(&cp->func, x86_make_disp(dst, 8));



   /* dst[0] = 2^floor(tmp); */
   /* dst[1] = frac(tmp); */
   /* dst[2] = 2^floor(tmp) * 2^frac(tmp); */
   /* dst[3] = 1.0F; */
   return GL_TRUE;
}

static GLboolean emit_LOG( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   struct x86_reg st2 = x86_make_reg(file_x87, 2);

   /* CAUTION: dst may alias arg0!
    */
   x87_fld(&cp->func, arg0);    /* arg0.x */
   x87_fabs(&cp->func);         /* |arg0.x| */
   x87_fxtract(&cp->func);      /* mantissa(arg0.x), exponent(arg0.x) */
   x87_fst(&cp->func, st2);     /* mantissa, exponent, mantissa */
   x87_fld1(&cp->func);         /* 1, mantissa, exponent, mantissa */
   x87_fyl2x(&cp->func);        /* log2(mantissa), exponent, mantissa */
   x87_fadd(&cp->func, st0, st1);  /* e+l2(m), e, m */
   x87_fstp(&cp->func, x86_make_disp(dst, 8));   /* e, m */

   x87_fld1(&cp->func);         /* 1, e, m */
   x87_fsub(&cp->func, st1, st0);  /* 1, e-1, m */
   x87_fstp(&cp->func, x86_make_disp(dst, 12));  /* e-1,m */
   x87_fstp(&cp->func, dst);    /* m */

   x87_fadd(&cp->func, st0, st0);  /* 2m */
   x87_fstp(&cp->func, x86_make_disp(dst, 4));

   return GL_TRUE;
}

static GLboolean emit_FLR( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
   int i;

   set_fpu_round_neg_inf( cp );

   for (i = 0; i < 4; i++) {
      x87_fld(&cp->func, x86_make_disp(arg0, i*4));
      x87_fprndint( &cp->func );
      x87_fstp(&cp->func, x86_make_disp(dst, i*4));
   }


   return GL_TRUE;
}

static GLboolean emit_FRC( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   int i;

   set_fpu_round_neg_inf( cp );

   /* Knowing liveness info or even just writemask would be useful
    * here:
    */
   for (i = 0; i < 4; i++) {
      x87_fld(&cp->func, x86_make_disp(arg0, i*4));
      x87_fld(&cp->func, st0);     /* a a */
      x87_fprndint( &cp->func );   /* flr(a) a */
      x87_fsubrp(&cp->func, st1);  /* frc(a) */
      x87_fstp(&cp->func, x86_make_disp(dst, i*4));
   }

   return GL_TRUE;
}


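/* LIT produces the lighting coefficient vector
 * (1, max(x,0), x > 0 ? y^w : 0, 1).  dst is first seeded with the
 * default values held in REG_LIT; the conditional components are then
 * filled in with x87 code, skipping ahead with forward jumps when x or
 * y is not positive.
 */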
static GLboolean emit_LIT( struct compilation *cp, union instruction op )
{
#if 1
   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
   struct x86_reg lit = get_arg(cp, FILE_REG, REG_LIT);
   struct x86_reg tmp = get_xmm_reg(cp);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   struct x86_reg regEAX = x86_make_reg(file_REG32, reg_AX);
   GLubyte *fixup1, *fixup2;


   /* Load the interesting parts of arg0:
    */
   x87_fld(&cp->func, x86_make_disp(arg0, 12));  /* a3 */
   x87_fld(&cp->func, x86_make_disp(arg0, 4));   /* a1 a3 */
   x87_fld(&cp->func, x86_make_disp(arg0, 0));   /* a0 a1 a3 */

   /* Initialize dst:
    */
   sse_movaps(&cp->func, tmp, lit);
   sse_movaps(&cp->func, dst, tmp);

   /* Check arg0[0]:
    */
   x87_fldz(&cp->func);         /* 0 a0 a1 a3 */
   x87_fucomp(&cp->func, st1);  /* a0 a1 a3 */
   x87_fnstsw(&cp->func, regEAX);
   x86_sahf(&cp->func);
   fixup1 = x86_jcc_forward(&cp->func, cc_AE);

   x87_fstp(&cp->func, x86_make_disp(dst, 4));   /* a1 a3 */

   /* Check arg0[1]:
    */
   x87_fldz(&cp->func);         /* 0 a1 a3 */
   x87_fucomp(&cp->func, st1);  /* a1 a3 */
   x87_fnstsw(&cp->func, regEAX);
   x86_sahf(&cp->func);
   fixup2 = x86_jcc_forward(&cp->func, cc_AE);

   /* Compute pow(a1, a3)
    */
   x87_fyl2x(&cp->func);        /* a3*log2(a1) */

   emit_x87_ex2( cp );          /* 2^(a3*log2(a1)) */

   x87_fstp(&cp->func, x86_make_disp(dst, 8));

   /* Land jumps:
    */
   x86_fixup_fwd_jump(&cp->func, fixup1);
   x86_fixup_fwd_jump(&cp->func, fixup2);
#else
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_LIT);
   sse_movups(&cp->func, dst, ones);
#endif
   return GL_TRUE;
}


static GLboolean emit_MAX( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_maxps(&cp->func, dst, arg1);
   return GL_TRUE;
}


static GLboolean emit_MIN( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_minps(&cp->func, dst, arg1);
   return GL_TRUE;
}

static GLboolean emit_MOV( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   return GL_TRUE;
}

static GLboolean emit_MUL( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_mulps(&cp->func, dst, arg1);
   return GL_TRUE;
}


static GLboolean emit_POW( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg_ptr(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);

   x87_fld(&cp->func, arg1);    /* a1 */
   x87_fld(&cp->func, arg0);    /* a0 a1 */
   x87_fyl2x(&cp->func);        /* a1*log2(a0) */

   emit_x87_ex2( cp );          /* 2^(a1*log2(a0)) */

   x87_fst(&cp->func, x86_make_disp(dst, 0));
   x87_fst(&cp->func, x86_make_disp(dst, 4));
   x87_fst(&cp->func, x86_make_disp(dst, 8));
   x87_fstp(&cp->func, x86_make_disp(dst, 12));

   return GL_TRUE;
}

static GLboolean emit_REL( struct compilation *cp, union instruction op )
{
/*    GLuint idx = (op.alu.idx0 + (GLint)cp->File[0][REG_ADDR][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS-1); */
/*    GLuint idx = 0; */
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, idx); */
/*    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); */

/*    dst[0] = arg0[0]; */
/*    dst[1] = arg0[1]; */
/*    dst[2] = arg0[2]; */
/*    dst[3] = arg0[3]; */

   FAIL;
}

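/* RCP and RSQ only look at the x component of their argument: the
 * scalar result is computed in lane 0 and then broadcast to all four
 * lanes with shufps.
 */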
static GLboolean emit_RCP( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);

   if (cp->have_sse2) {
      sse2_rcpss(&cp->func, dst, arg0);
   }
   else {
      struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
      sse_movss(&cp->func, dst, ones);
      sse_divss(&cp->func, dst, arg0);
   }

   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}

static GLboolean emit_RSQ( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);

   /* TODO: Calculate absolute value
    */
#if 0
   sse_movss(&cp->func, dst, arg0);
   sse_mulss(&cp->func, dst, neg);
   sse_maxss(&cp->func, dst, arg0);
#endif

   sse_rsqrtss(&cp->func, dst, arg0);
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}


static GLboolean emit_SGE( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);

   sse_movups(&cp->func, dst, arg0);
   sse_cmpps(&cp->func, dst, arg1, cc_NotLessThan);
   sse_andps(&cp->func, dst, ones);
   return GL_TRUE;
}


static GLboolean emit_SLT( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);

   sse_movups(&cp->func, dst, arg0);
   sse_cmpps(&cp->func, dst, arg1, cc_LessThan);
   sse_andps(&cp->func, dst, ones);
   return GL_TRUE;
}

static GLboolean emit_SUB( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_subps(&cp->func, dst, arg1);
   return GL_TRUE;
}

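/* Cross product: dst = arg0.yzxw * arg1.zxyw - arg0.zxyw * arg1.yzxw,
 * built from two shuffled multiplies and a subtract.  dst.w is left
 * undefined.
 */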
static GLboolean emit_XPD( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg tmp0 = get_xmm_reg(cp);
   struct x86_reg tmp1 = get_xmm_reg(cp);

   /* Could avoid tmp0, tmp1 if we overwrote arg0, arg1. Need a way
    * to invalidate registers. This will come with better analysis
    * (liveness analysis) of the incoming program.
    */
   emit_pshufd(cp, dst, arg0, SHUF(Y, Z, X, W));
   emit_pshufd(cp, tmp1, arg1, SHUF(Z, X, Y, W));
   sse_mulps(&cp->func, dst, tmp1);
   emit_pshufd(cp, tmp0, arg0, SHUF(Z, X, Y, W));
   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
   sse_mulps(&cp->func, tmp0, tmp1);
   sse_subps(&cp->func, dst, tmp0);

   /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
   /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
   /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
   /* dst[3] is undef */

   return GL_TRUE;
}

static GLboolean emit_NOP( struct compilation *cp, union instruction op )
{
   return GL_TRUE;
}


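/* Indexed by the simplified-program opcode, so the entries must stay
 * in exactly the same order as the opcode enumeration.  The asserts in
 * _tnl_sse_codegen_vertex_program() spot-check this.
 */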
static GLboolean (* const emit_func[])(struct compilation *, union instruction) =
{
   emit_ABS,
   emit_ADD,
   emit_NOP, /* ARA */
   emit_NOP, /* ARL */
   emit_NOP, /* ARL_NV */
   emit_NOP, /* ARR */
   emit_NOP, /* BRA */
   emit_NOP, /* CAL */
   emit_NOP, /* CMP */
   emit_NOP, /* COS */
   emit_NOP, /* DDX */
   emit_NOP, /* DDY */
   emit_DP3,
   emit_DP4,
   emit_DPH,
   emit_DST,
   emit_NOP, /* END */
   emit_EX2,
   emit_EXP,
   emit_FLR,
   emit_FRC,
   emit_NOP, /* KIL */
   emit_NOP, /* KIL_NV */
   emit_LG2,
   emit_LIT,
   emit_LOG,
   emit_NOP, /* LRP */
   emit_NOP, /* MAD */
   emit_MAX,
   emit_MIN,
   emit_MOV,
   emit_MUL,
   emit_NOP, /* PK2H */
   emit_NOP, /* PK2US */
   emit_NOP, /* PK4B */
   emit_NOP, /* PK4UB */
   emit_POW,
   emit_NOP, /* POPA */
   emit_PRT,
   emit_NOP, /* PUSHA */
   emit_NOP, /* RCC */
   emit_RCP,
   emit_NOP, /* RET */
   emit_NOP, /* RFL */
   emit_RSQ,
   emit_NOP, /* SCS */
   emit_NOP, /* SEQ */
   emit_NOP, /* SFL */
   emit_SGE,
   emit_NOP, /* SGT */
   emit_NOP, /* SIN */
   emit_NOP, /* SLE */
   emit_SLT,
   emit_NOP, /* SNE */
   emit_NOP, /* SSG */
   emit_NOP, /* STR */
   emit_SUB,
   emit_RSW, /* SWZ */
   emit_NOP, /* TEX */
   emit_NOP, /* TXB */
   emit_NOP, /* TXD */
   emit_NOP, /* TXL */
   emit_NOP, /* TXP */
   emit_NOP, /* TXP_NV */
   emit_NOP, /* UP2H */
   emit_NOP, /* UP2US */
   emit_NOP, /* UP4B */
   emit_NOP, /* UP4UB */
   emit_NOP, /* X2D */
   emit_XPD,
   emit_RSW,
   emit_MSK,
   emit_REL,
};


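/* Emit the whole program.  The generated function takes a single
 * argument, the struct arb_vp_machine pointer, which is kept in EDX;
 * EBX and ECX are loaded with the base addresses of the register and
 * state-parameter files (see the register usage notes at the top of
 * the file).  EBX is callee-saved, so it is pushed and popped around
 * the body.
 */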
static GLboolean build_vertex_program( struct compilation *cp )
{
   struct arb_vp_machine *m = NULL;
   GLuint j;

   struct x86_reg regEBX = x86_make_reg(file_REG32, reg_BX);
   struct x86_reg regECX = x86_make_reg(file_REG32, reg_CX);
   struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX);

   x86_push(&cp->func, regEBX);

   x86_mov(&cp->func, regEDX, x86_fn_arg(&cp->func, 1));
   x86_mov(&cp->func, regEBX, x86_make_disp(regEDX, get_offset(m, m->File + FILE_REG)));
   x86_mov(&cp->func, regECX, x86_make_disp(regEDX, get_offset(m, m->File + FILE_STATE_PARAM)));

   for (j = 0; j < cp->p->nr_instructions; j++) {
      union instruction inst = cp->p->instructions[j];
      cp->insn_counter = j+1;   /* avoid zero */

      if (DISASSEM) {
         _mesa_printf("%p: ", cp->func.csr);
         _tnl_disassem_vba_insn( inst );
      }
      cp->func.fn = NULL;

      if (!emit_func[inst.alu.opcode]( cp, inst )) {
         return GL_FALSE;
      }
   }

   /* TODO: only for outputs:
    */
   for (j = 0; j < 8; j++) {
      if (cp->xmm[j].dirty)
         spill(cp, j);
   }


   /* Exit mmx state?
    */
   if (cp->func.need_emms)
      mmx_emms(&cp->func);

   /* Restore FPU control word?
    */
   if (cp->fpucntl != RESTORE_FPU) {
      x87_fnclex(&cp->func);
      x87_fldcw(&cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_restore)));
   }

   x86_pop(&cp->func, regEBX);
   x86_ret(&cp->func);

   return GL_TRUE;
}

/**
 * Compile the given vertex program to native SSE/x87 code.
 *
 * TODO: Integrate the t_vertex.c code here, to build machine vertices
 * directly at this point.
 *
 * TODO: Eliminate the VB struct entirely and just use
 * struct arb_vertex_machine.
 */
GLboolean
_tnl_sse_codegen_vertex_program(struct tnl_compiled_program *p)
{
   struct compilation cp;

   /* sanity checks */
   assert(emit_func[OPCODE_ABS] == emit_ABS);
   assert(emit_func[OPCODE_MUL] == emit_MUL);
   assert(emit_func[OPCODE_XPD] == emit_XPD);

   _mesa_memset(&cp, 0, sizeof(cp));
   cp.p = p;
   cp.have_sse2 = 1;

   if (p->compiled_func) {
      _mesa_free((void *)p->compiled_func);
      p->compiled_func = NULL;
   }

   x86_init_func(&cp.func);

   cp.fpucntl = RESTORE_FPU;


   /* Note ctx state is not referenced in building the function, so it
    * depends only on the list of instructions:
    */
   if (!build_vertex_program(&cp)) {
      x86_release_func( &cp.func );
      return GL_FALSE;
   }


   p->compiled_func = (void (*)(struct arb_vp_machine *))x86_get_func( &cp.func );
   return GL_TRUE;
}



#else

GLboolean
_tnl_sse_codegen_vertex_program(struct tnl_compiled_program *p)
{
   /* Dummy version for when USE_SSE_ASM not defined */
   return GL_FALSE;
}

#endif