/*
 * Mesa 3-D graphics library
 * Version:  6.3
 *
 * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file t_vb_arbprogram_sse.c
 *
 * Translate the simplified vertex_program representation to x86/SSE/SSE2
 * machine code using Mesa's rtasm runtime assembler.
 *
 * \author Keith Whitwell
 */

#include "glheader.h"
#include "context.h"
#include "imports.h"
#include "macros.h"
#include "mtypes.h"
#include "arbprogparse.h"
#include "program.h"
#include "math/m_matrix.h"
#include "math/m_translate.h"
#include "t_context.h"
#include "t_vb_arbprogram.h"

#if defined(USE_SSE_ASM)

#include "x86/rtasm/x86sse.h"
#include "x86/common_x86_asm.h"


#define X 0
#define Y 1
#define Z 2
#define W 3

/* Register usage:
 *
 * EAX - points to 'm->File[0]'
 * ECX - points to 'm->File[3]'
 * EDX,
 * EBX,
 * ESP,
 * EBP,
 * ESI,
 * EDI
 */

#define DISASSEM 0

#define FAIL                                                            \
do {                                                                    \
   _mesa_printf("x86 translation failed in %s\n", __FUNCTION__);        \
   return GL_FALSE;                                                     \
} while (0)

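/* Per-XMM-register tracking used by the code below: each hardware XMM
 * register may cache the value of one (file, idx) slot from the machine's
 * register files.  'dirty' means the cached copy must be written back
 * before the register is reused (or at the end of the program), and
 * 'last_used' drives least-recently-used eviction in get_xmm_reg().
 */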
struct compilation {
   struct x86_function func;
   struct arb_vp_machine *m;

   GLuint insn_counter;

   struct {
      GLuint file:2;
      GLuint idx:7;
      GLuint dirty:1;
      GLuint last_used:10;
   } xmm[8];

   struct {
      struct x86_reg base;
   } file[4];

   GLboolean have_sse2;
};

static INLINE GLboolean eq( struct x86_reg a,
                            struct x86_reg b )
{
   return (a.file == b.file &&
           a.idx == b.idx &&
           a.mod == b.mod &&
           a.disp == b.disp);
}



static struct x86_reg get_reg_ptr(GLuint file,
                                  GLuint idx )
{
   struct x86_reg reg;

   switch (file) {
   case FILE_REG:
      reg = x86_make_reg(file_REG32, reg_AX);
      assert(idx != REG_UNDEF);
      break;
   case FILE_STATE_PARAM:
      reg = x86_make_reg(file_REG32, reg_CX);
      break;
   default:
      assert(0);
   }

   return x86_make_disp(reg, 16 * idx);
}


static void spill( struct compilation *cp, GLuint idx )
{
   struct x86_reg oldval = get_reg_ptr(cp->xmm[idx].file,
                                       cp->xmm[idx].idx);

   assert(cp->xmm[idx].dirty);
   sse_movups(&cp->func, oldval, x86_make_reg(file_XMM, idx));
   cp->xmm[idx].dirty = 0;
}

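/* Grab an XMM register for a new value, evicting the least recently used
 * cached value and spilling it back to memory first if it is dirty.
 */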
static struct x86_reg get_xmm_reg( struct compilation *cp )
{
   GLuint i;
   GLuint oldest = 0;

   for (i = 0; i < 8; i++)
      if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
         oldest = i;

   /* Need to write out the old value?
    */
   if (cp->xmm[oldest].dirty)
      spill(cp, oldest);

   assert(cp->xmm[oldest].last_used != cp->insn_counter);

   cp->xmm[oldest].file = FILE_REG;
   cp->xmm[oldest].idx = REG_UNDEF;
   cp->xmm[oldest].last_used = cp->insn_counter;
   return x86_make_reg(file_XMM, oldest);
}




static struct x86_reg get_dst_reg( struct compilation *cp,
                                   GLuint file, GLuint idx )
{
   struct x86_reg reg;
   GLuint i;

   /* Invalidate any old copy of this register in XMM0-7.  Don't reuse
    * it, as it may be one of the arguments.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) {
         cp->xmm[i].file = FILE_REG;
         cp->xmm[i].idx = REG_UNDEF;
         cp->xmm[i].dirty = 0;
         break;
      }
   }

   reg = get_xmm_reg( cp );
   cp->xmm[reg.idx].file = file;
   cp->xmm[reg.idx].idx = idx;
   cp->xmm[reg.idx].dirty = 1;
   return reg;
}


/* Return an XMM reg if the argument is resident, otherwise return a
 * base+offset pointer to the saved value.
 */
static struct x86_reg get_arg( struct compilation *cp, GLuint file, GLuint idx )
{
   GLuint i;

   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file &&
          cp->xmm[i].idx == idx) {
         cp->xmm[i].last_used = cp->insn_counter;
         return x86_make_reg(file_XMM, i);
      }
   }

   return get_reg_ptr(file, idx);
}

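/* Emit a component shuffle of arg0 into dst.  With SSE2 this is a single
 * PSHUFD; without it we fall back to MOVUPS (when dst != arg0) followed by
 * SHUFPS on dst itself.  The 'shuf' immediate comes from the SHUF() macro
 * in the tnl arbprogram headers, assumed here to pack one 2-bit source-lane
 * selector per destination component, as in the raw PSHUFD/SHUFPS immediate.
 */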
static void emit_pshufd( struct compilation *cp,
                         struct x86_reg dst,
                         struct x86_reg arg0,
                         GLubyte shuf )
{
   if (cp->have_sse2) {
      sse2_pshufd(&cp->func, dst, arg0, shuf);
      cp->func.fn = 0;
   }
   else {
      if (!eq(dst, arg0))
         sse_movups(&cp->func, dst, arg0);

      sse_shufps(&cp->func, dst, dst, shuf);
   }
}



/* Perform a reduced swizzle.
 */
static GLboolean emit_RSW( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.rsw.dst);
   GLuint swz = op.rsw.swz;
   GLuint neg = op.rsw.neg;

   emit_pshufd(cp, dst, arg0, swz);

   if (neg) {
      struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
      struct x86_reg tmp = get_xmm_reg(cp);
      /* Load 1,-1,0,0
       * Use neg as arg to pshufd
       * Multiply
       */
      emit_pshufd(cp, tmp, negs,
                  SHUF((neg & 1) ? 1 : 0,
                       (neg & 2) ? 1 : 0,
                       (neg & 4) ? 1 : 0,
                       (neg & 8) ? 1 : 0));
      sse_mulps(&cp->func, dst, tmp);
   }

   return GL_TRUE;
}

/* Used to implement write masking.  This and most of the other instructions
 * here would be easier to implement if there had been a translation
 * to a 2 argument format (dst/arg0, arg1) at the shader level before
 * attempting to translate to x86/sse code.
 */
/* Hmm.  I went back to MSK from SEL to make things easier -- was that just BS?
 */
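/* emit_MSK merges the argument into the current value of the destination
 * register according to the writemask.  Single-component masks other than
 * X shuffle the selected component into lane 0 of both operands, merge
 * with MOVSS, and shuffle back; the remaining supported masks use
 * SHUFPS/MOVSS/MOVUPS combinations, and anything else FAILs.
 */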
static GLboolean emit_MSK( struct compilation *cp, union instruction op )
{
   struct x86_reg arg = get_arg(cp, op.msk.file, op.msk.idx);
   struct x86_reg dst0 = get_arg(cp, FILE_REG, op.msk.dst);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.msk.dst);

   sse_movups(&cp->func, dst, dst0);

   switch (op.msk.mask) {
   case 0:
      return GL_TRUE;

   case WRITEMASK_X:
      if (arg.file == file_XMM) {
         sse_movss(&cp->func, dst, arg);
      }
      else {
         struct x86_reg tmp = get_xmm_reg(cp);
         sse_movss(&cp->func, tmp, arg);
         sse_movss(&cp->func, dst, tmp);
      }
      return GL_TRUE;

   case WRITEMASK_Y: {
      struct x86_reg tmp = get_xmm_reg(cp);
      emit_pshufd(cp, dst, dst, SHUF(Y, X, Z, W));
      emit_pshufd(cp, tmp, arg, SHUF(Y, X, Z, W));
      sse_movss(&cp->func, dst, tmp);
      emit_pshufd(cp, dst, dst, SHUF(Y, X, Z, W));
      return GL_TRUE;
   }

   case WRITEMASK_Z: {
      struct x86_reg tmp = get_xmm_reg(cp);
      emit_pshufd(cp, dst, dst, SHUF(Z, Y, X, W));
      emit_pshufd(cp, tmp, arg, SHUF(Z, Y, X, W));
      sse_movss(&cp->func, dst, tmp);
      emit_pshufd(cp, dst, dst, SHUF(Z, Y, X, W));
      return GL_TRUE;
   }

   case WRITEMASK_W: {
      struct x86_reg tmp = get_xmm_reg(cp);
      emit_pshufd(cp, dst, dst, SHUF(W, Y, Z, X));
      emit_pshufd(cp, tmp, arg, SHUF(W, Y, Z, X));
      sse_movss(&cp->func, dst, tmp);
      emit_pshufd(cp, dst, dst, SHUF(W, Y, Z, X));
      return GL_TRUE;
   }

   case WRITEMASK_XY:
      sse_shufps(&cp->func, dst, arg, SHUF(X, Y, Z, W));
      return GL_TRUE;

   case WRITEMASK_ZW: {
      struct x86_reg tmp = get_xmm_reg(cp);
      sse_movups(&cp->func, tmp, dst);
      sse_movups(&cp->func, dst, arg);
      sse_shufps(&cp->func, dst, tmp, SHUF(X, Y, Z, W));
      return GL_TRUE;
   }

   case WRITEMASK_YZW: {
      struct x86_reg tmp = get_xmm_reg(cp);
      sse_movss(&cp->func, tmp, dst);
      sse_movups(&cp->func, dst, arg);
      sse_movss(&cp->func, dst, tmp);
      return GL_TRUE;
   }

   case WRITEMASK_XYZW:
      sse_movups(&cp->func, dst, arg);
      return GL_TRUE;

   default:
      FAIL;
   }

#if 0
   /* The catchall implementation:
    */

   /* make full width bitmask in tmp
    * dst = ~tmp
    * tmp &= arg0
    * dst &= arg1
    * dst |= tmp
    */
   {
      struct x86_reg negs = get_arg(cp, FILE_REG, REG_NEGS);
      emit_pshufd(cp, tmp, negs,
                  SHUF((op.msk.mask & 1) ? 2 : 0,
                       (op.msk.mask & 2) ? 2 : 0,
                       (op.msk.mask & 4) ? 2 : 0,
                       (op.msk.mask & 8) ? 2 : 0));
      sse_mulps(&cp->func, dst, tmp);
   }

   return GL_TRUE;
#endif
   FAIL;
}



static GLboolean emit_PRT( struct compilation *cp, union instruction op )
{
   FAIL;
}


/**
 * The traditional instructions.  All operate on internal registers
 * and ignore write masks and swizzling issues.
 */

static GLboolean emit_ABS( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);

   sse_movups(&cp->func, dst, arg0);
   sse_mulps(&cp->func, dst, neg);
   sse_maxps(&cp->func, dst, arg0);
   return GL_TRUE;
}

static GLboolean emit_ADD( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_addps(&cp->func, dst, arg1);
   return GL_TRUE;
}


/* The dot product instructions don't map very well to SSE:
 */
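/* DP3/DP4 multiply the arguments componentwise and then do a horizontal
 * add: MOVHLPS folds the upper two products onto the lower two, a shuffle
 * brings the remaining term into lane 0 for ADDSS, and a final broadcast
 * shuffle replicates the scalar result across all four lanes.
 */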
static GLboolean emit_DP3( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg tmp = get_xmm_reg(cp);

   sse_movups(&cp->func, dst, arg0);
   sse_mulps(&cp->func, dst, arg1);

   /* Now the hard bit: sum the first 3 values:
    */
   sse_movhlps(&cp->func, tmp, dst);
   sse_addss(&cp->func, dst, tmp);   /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(&cp->func, dst, tmp);
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}



static GLboolean emit_DP4( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg tmp = get_xmm_reg(cp);

   sse_movups(&cp->func, dst, arg0);
   sse_mulps(&cp->func, dst, arg1);

   /* Now the hard bit: sum the values:
    */
   sse_movhlps(&cp->func, tmp, dst);
   sse_addps(&cp->func, dst, tmp);   /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(&cp->func, dst, tmp);
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}

static GLboolean emit_DPH( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); */
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

/*    dst[0] = (arg0[0] * arg1[0] + */
/*              arg0[1] * arg1[1] + */
/*              arg0[2] * arg1[2] + */
/*              1.0     * arg1[3]); */

   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   FAIL;
}

static GLboolean emit_DST( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    dst[0] = 1.0     * 1.0F; */
/*    dst[1] = arg0[1] * arg1[1]; */
/*    dst[2] = arg0[2] * 1.0; */
/*    dst[3] = 1.0     * arg1[3]; */

   FAIL;
}


static GLboolean emit_EX2( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

/*    dst[0] = (GLfloat)RoughApproxPow2(arg0[0]); */
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   FAIL;
}

static GLboolean emit_EXP( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    GLfloat tmp = arg0[0]; */
/*    GLfloat flr_tmp = FLOORF(tmp); */
/*    dst[0] = (GLfloat) (1 << (int)flr_tmp); */
/*    dst[1] = tmp - flr_tmp; */
/*    dst[2] = RoughApproxPow2(tmp); */
/*    dst[3] = 1.0F; */
   FAIL;
}

static GLboolean emit_FLR( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    dst[0] = FLOORF(arg0[0]); */
/*    dst[1] = FLOORF(arg0[1]); */
/*    dst[2] = FLOORF(arg0[2]); */
/*    dst[3] = FLOORF(arg0[3]); */
   FAIL;
}

static GLboolean emit_FRC( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    dst[0] = arg0[0] - FLOORF(arg0[0]); */
/*    dst[1] = arg0[1] - FLOORF(arg0[1]); */
/*    dst[2] = arg0[2] - FLOORF(arg0[2]); */
/*    dst[3] = arg0[3] - FLOORF(arg0[3]); */
   FAIL;
}

static GLboolean emit_LG2( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    dst[0] = RoughApproxLog2(arg0[0]); */

/*    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X)); */
   FAIL;
}



static GLboolean emit_LIT( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    const GLfloat epsilon = 1.0F / 256.0F; */
/*    GLfloat tmp[4]; */

/*    tmp[0] = MAX2(arg0[0], 0.0F); */
/*    tmp[1] = MAX2(arg0[1], 0.0F); */
/*    tmp[3] = CLAMP(arg0[3], -(128.0F - epsilon), (128.0F - epsilon)); */

/*    dst[0] = 1.0; */
/*    dst[1] = tmp[0]; */
/*    dst[2] = (tmp[0] > 0.0) ? RoughApproxPower(tmp[1], tmp[3]) : 0.0F; */
/*    dst[3] = 1.0; */
   FAIL;
}


static GLboolean emit_LOG( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    GLfloat tmp = FABSF(arg0[0]); */
/*    int exponent; */
/*    GLfloat mantissa = FREXPF(tmp, &exponent); */
/*    dst[0] = (GLfloat) (exponent - 1); */
/*    dst[1] = 2.0 * mantissa; // map [.5, 1) -> [1, 2) */
/*    dst[2] = dst[0] + LOG2(dst[1]); */
/*    dst[3] = 1.0; */
   FAIL;
}

static GLboolean emit_MAX( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_maxps(&cp->func, dst, arg1);
   return GL_TRUE;
}


static GLboolean emit_MIN( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_minps(&cp->func, dst, arg1);
   return GL_TRUE;
}

static GLboolean emit_MOV( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   return GL_TRUE;
}

static GLboolean emit_MUL( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_mulps(&cp->func, dst, arg1);
   return GL_TRUE;
}


static GLboolean emit_POW( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); */
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

/*    dst[0] = (GLfloat)RoughApproxPower(arg0[0], arg1[0]); */

   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   FAIL;
}

static GLboolean emit_REL( struct compilation *cp, union instruction op )
{
/*    GLuint idx = (op.alu.idx0 + (GLint)cp->File[0][REG_ADDR][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS-1); */
/*    GLuint idx = 0; */
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, idx); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    dst[0] = arg0[0]; */
/*    dst[1] = arg0[1]; */
/*    dst[2] = arg0[2]; */
/*    dst[3] = arg0[3]; */

   FAIL;
}

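/* RCP: with SSE2 available, use the sse2_rcpss() rtasm helper on lane 0;
 * otherwise compute 1.0/x explicitly with MOVSS + DIVSS from the REG_ONES
 * constant.  Either way the scalar result is then broadcast to all four
 * lanes.
 */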
static GLboolean emit_RCP( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   if (cp->have_sse2) {
      sse2_rcpss(&cp->func, dst, arg0);
   }
   else {
      struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
      sse_movss(&cp->func, dst, ones);
      sse_divss(&cp->func, dst, arg0);
   }

   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}

static GLboolean emit_RSQ( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_rsqrtss(&cp->func, dst, arg0);
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}

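/* SGE/SLT use CMPPS to build an all-ones / all-zeros mask per component
 * and then AND it with the REG_ONES constant vector, yielding 1.0 where
 * the comparison holds and 0.0 elsewhere.
 */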
static GLboolean emit_SGE( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);

   sse_movups(&cp->func, dst, arg0);
   sse_cmpps(&cp->func, dst, arg1, cc_NotLessThan);
   sse_andps(&cp->func, dst, ones);
   return GL_TRUE;
}


static GLboolean emit_SLT( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);

   sse_movups(&cp->func, dst, arg0);
   sse_cmpps(&cp->func, dst, arg1, cc_LessThan);
   sse_andps(&cp->func, dst, ones);
   return GL_TRUE;
}

static GLboolean emit_SUB( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_subps(&cp->func, dst, arg1);
   return GL_TRUE;
}

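/* Cross product: dst = arg0.yzx * arg1.zxy - arg0.zxy * arg1.yzx,
 * computed with two shuffled multiplies; dst[3] is undefined.
 */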
static GLboolean emit_XPD( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg tmp0 = get_xmm_reg(cp);
   struct x86_reg tmp1 = get_xmm_reg(cp);

   /* Could avoid tmp0, tmp1 if we overwrote arg0, arg1.  Need a way
    * to invalidate registers.  This will come with better analysis
    * (liveness analysis) of the incoming program.
    */
   emit_pshufd(cp, dst, arg0, SHUF(Y, Z, X, W));
   emit_pshufd(cp, tmp1, arg1, SHUF(Z, X, Y, W));
   sse_mulps(&cp->func, dst, tmp1);
   emit_pshufd(cp, tmp0, arg0, SHUF(Z, X, Y, W));
   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
   sse_mulps(&cp->func, tmp0, tmp1);
   sse_subps(&cp->func, dst, tmp0);

/*    dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
/*    dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
/*    dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
/*    dst[3] is undef */

   return GL_TRUE;
}

static GLboolean emit_NOP( struct compilation *cp, union instruction op )
{
   return GL_TRUE;
}

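/* Dispatch table indexed by the opcode packed into union instruction; the
 * entry order must line up with the opcode enumeration used elsewhere in
 * the tnl arbprogram code.  Opcodes not implemented here FAIL, so codegen
 * returns GL_FALSE and the caller is expected to fall back to the
 * interpreted path.
 */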
static GLboolean (* const emit_func[])(struct compilation *, union instruction) =
{
   emit_ABS,
   emit_ADD,
   emit_NOP,
   emit_DP3,
   emit_DP4,
   emit_DPH,
   emit_DST,
   emit_NOP,
   emit_EX2,
   emit_EXP,
   emit_FLR,
   emit_FRC,
   emit_LG2,
   emit_LIT,
   emit_LOG,
   emit_NOP,
   emit_MAX,
   emit_MIN,
   emit_MOV,
   emit_MUL,
   emit_POW,
   emit_PRT,
   emit_NOP,
   emit_RCP,
   emit_RSQ,
   emit_SGE,
   emit_SLT,
   emit_SUB,
   emit_RSW,
   emit_XPD,
   emit_RSW,
   emit_MSK,
   emit_REL,
};

static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}

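/* Emit the whole program: load EAX/ECX with the base addresses of the
 * FILE_REG and FILE_STATE_PARAM register files (see the register-usage
 * note above), translate each instruction in turn, then spill any dirty
 * XMM registers before returning.
 */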
static GLboolean build_vertex_program( struct compilation *cp )
{
   GLuint j;

   struct x86_reg regEAX = x86_make_reg(file_REG32, reg_AX);
   struct x86_reg parmECX = x86_make_reg(file_REG32, reg_CX);

   x86_mov(&cp->func, regEAX, x86_fn_arg(&cp->func, 1));
   x86_mov(&cp->func, parmECX, regEAX);

   x86_mov(&cp->func, regEAX, x86_make_disp(regEAX, get_offset(cp->m, cp->m->File + FILE_REG)));
   x86_mov(&cp->func, parmECX, x86_make_disp(parmECX, get_offset(cp->m, cp->m->File + FILE_STATE_PARAM)));

   for (j = 0; j < cp->m->nr_instructions; j++) {
      union instruction inst = cp->m->instructions[j];
      cp->insn_counter = j + 1;   /* avoid zero */

      if (DISASSEM) {
         _mesa_printf("%p: ", cp->func.csr);
         _tnl_disassem_vba_insn( inst );
      }
      cp->func.fn = NULL;

      if (!emit_func[inst.alu.opcode]( cp, inst )) {
         return GL_FALSE;
      }
   }

   /* TODO: only for outputs:
    */
   for (j = 0; j < 8; j++) {
      if (cp->xmm[j].dirty)
         spill(cp, j);
   }


   /* Exit mmx state?
    */
   if (cp->func.need_emms)
      mmx_emms(&cp->func);

   x86_ret(&cp->func);

   return GL_TRUE;
}

/**
 * Generate native SSE code for the given vertex program.
 *
 * TODO: Integrate the t_vertex.c code here, to build machine vertices
 * directly at this point.
 *
 * TODO: Eliminate the VB struct entirely and just use
 * struct arb_vertex_machine.
 */
GLboolean
_tnl_sse_codegen_vertex_program(struct arb_vp_machine *m)
{
   struct compilation cp;

   memset(&cp, 0, sizeof(cp));
   cp.m = m;
   cp.have_sse2 = 1;

   if (m->func) {
      free((void *)m->func);
      m->func = NULL;
   }

   x86_init_func(&cp.func);

   if (!build_vertex_program(&cp)) {
      x86_release_func( &cp.func );
      return GL_FALSE;
   }

   m->func = (void (*)(struct arb_vp_machine *))x86_get_func( &cp.func );
   return GL_TRUE;
}



#else

GLboolean
_tnl_sse_codegen_vertex_program( struct arb_vp_machine *m )
{
   /* Dummy version for when USE_SSE_ASM is not defined */
   return GL_FALSE;
}

#endif