draw: Implement TGSI_OPCODE_TRUNC.
[mesa.git] / src / gallium / auxiliary / draw / draw_vs_aos.c
1 /*
2 * Mesa 3-D graphics library
3 * Version: 6.3
4 *
5 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /**
26 * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
27 * using the rtasm runtime assembler. Based on the old
28 * t_vb_arb_program_sse.c
29 */
30
31
32 #include "util/u_memory.h"
33 #include "util/u_math.h"
34 #include "pipe/p_shader_tokens.h"
35 #include "pipe/p_debug.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_util.h"
38 #include "tgsi/tgsi_exec.h"
39 #include "tgsi/tgsi_dump.h"
40
41 #include "draw_vs.h"
42 #include "draw_vs_aos.h"
43
44 #include "rtasm/rtasm_x86sse.h"
45
46 #ifdef PIPE_ARCH_X86
47 #define DISASSEM 0
48 #define FAST_MATH 1
49
/* Human-readable names for the register files, indexed by the
 * TGSI_FILE_* enum values (with AOS_FILE_INTERNAL last).  Used only
 * for debug output, e.g. in spill().
 */
static const char *files[] =
{
   "NULL",
   "CONST",
   "IN",
   "OUT",
   "TEMP",
   "SAMP",
   "ADDR",
   "IMM",
   "INTERNAL",
};
62
63 static INLINE boolean eq( struct x86_reg a,
64 struct x86_reg b )
65 {
66 return (a.file == b.file &&
67 a.idx == b.idx &&
68 a.mod == b.mod &&
69 a.disp == b.disp);
70 }
71
/* Return the x86 pointer register (EBP for slot 0, EAX for slot 1)
 * that caches the requested machine pointer (immediates, constants or
 * vertex buffers).  The register is reloaded from the aos_machine
 * struct only when it currently holds a different value.
 */
struct x86_reg aos_get_x86( struct aos_compilation *cp,
                            unsigned which_reg, /* quick hack */
                            unsigned value )
{
   struct x86_reg reg;

   if (which_reg == 0)
      reg = cp->temp_EBP;
   else
      reg = cp->tmp_EAX;

   if (cp->x86_reg[which_reg] != value) {
      unsigned offset;

      /* Each cacheable pointer lives at a fixed offset in the
       * aos_machine struct, and is pinned to one of the two slots:
       */
      switch (value) {
      case X86_IMMEDIATES:
         assert(which_reg == 0);
         offset = Offset(struct aos_machine, immediates);
         break;
      case X86_CONSTANTS:
         assert(which_reg == 1);
         offset = Offset(struct aos_machine, constants);
         break;
      case X86_BUFFERS:
         assert(which_reg == 0);
         offset = Offset(struct aos_machine, buffer);
         break;
      default:
         assert(0);
         offset = 0;
      }

      /* Emit the load and remember what is now resident:
       */
      x86_mov(cp->func, reg,
              x86_make_disp(cp->machine_EDX, offset));

      cp->x86_reg[which_reg] = value;
   }

   return reg;
}
113
114
/* Map a (file, index) shader register to an x86 addressing operand.
 * Most files live at fixed offsets from the aos_machine pointer held
 * in EDX; immediates and constants are reached through separately
 * cached base pointers (see aos_get_x86()).
 */
static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
                                  unsigned file,
                                  unsigned idx )
{
   struct x86_reg ptr = cp->machine_EDX;

   switch (file) {
   case TGSI_FILE_INPUT:
      assert(idx < MAX_INPUTS);
      return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));

   case TGSI_FILE_OUTPUT:
      return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));

   case TGSI_FILE_TEMPORARY:
      assert(idx < MAX_TEMPS);
      return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));

   case AOS_FILE_INTERNAL:
      assert(idx < MAX_INTERNALS);
      return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));

   case TGSI_FILE_IMMEDIATE:
      assert(idx < MAX_IMMEDIATES); /* just a sanity check */
      return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));

   case TGSI_FILE_CONSTANT:
      assert(idx < MAX_CONSTANTS); /* just a sanity check */
      return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));

   default:
      ERROR(cp, "unknown reg file");
      return x86_make_reg(0,0);
   }
}
150
151
152
/* Bit layout of the x87 FPU control word (loaded with FLDCW, stored
 * with FNSTCW): exception masks in bits 0-5, precision control in
 * bits 8-9, rounding control in bits 10-11.
 */
#define X87_CW_EXCEPTION_INV_OP       (1<<0)
#define X87_CW_EXCEPTION_DENORM_OP    (1<<1)
#define X87_CW_EXCEPTION_ZERO_DIVIDE  (1<<2)
#define X87_CW_EXCEPTION_OVERFLOW     (1<<3)
#define X87_CW_EXCEPTION_UNDERFLOW    (1<<4)
#define X87_CW_EXCEPTION_PRECISION    (1<<5)
#define X87_CW_PRECISION_SINGLE       (0<<8)
#define X87_CW_PRECISION_RESERVED     (1<<8)
#define X87_CW_PRECISION_DOUBLE       (2<<8)
#define X87_CW_PRECISION_DOUBLE_EXT   (3<<8)
#define X87_CW_PRECISION_MASK         (3<<8)
#define X87_CW_ROUND_NEAREST          (0<<10)
#define X87_CW_ROUND_DOWN             (1<<10)
#define X87_CW_ROUND_UP               (2<<10)
#define X87_CW_ROUND_ZERO             (3<<10)
#define X87_CW_ROUND_MASK             (3<<10)
#define X87_CW_INFINITY               (1<<12)
170
171
172
173
174 static void spill( struct aos_compilation *cp, unsigned idx )
175 {
176 if (!cp->xmm[idx].dirty ||
177 (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
178 cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
179 cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
180 ERROR(cp, "invalid spill");
181 return;
182 }
183 else {
184 struct x86_reg oldval = get_reg_ptr(cp,
185 cp->xmm[idx].file,
186 cp->xmm[idx].idx);
187
188 if (0) debug_printf("\nspill %s[%d]",
189 files[cp->xmm[idx].file],
190 cp->xmm[idx].idx);
191
192 assert(cp->xmm[idx].dirty);
193 sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
194 cp->xmm[idx].dirty = 0;
195 }
196 }
197
198
199 void aos_spill_all( struct aos_compilation *cp )
200 {
201 unsigned i;
202
203 for (i = 0; i < 8; i++) {
204 if (cp->xmm[i].dirty)
205 spill(cp, i);
206 aos_release_xmm_reg(cp, i);
207 }
208 }
209
210
211 static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
212 struct x86_reg reg )
213 {
214 if (reg.file != file_XMM ||
215 cp->xmm[reg.idx].file != TGSI_FILE_NULL)
216 {
217 struct x86_reg tmp = aos_get_xmm_reg(cp);
218 sse_movaps(cp->func, tmp, reg);
219 reg = tmp;
220 }
221
222 cp->xmm[reg.idx].last_used = cp->insn_counter;
223 return reg;
224 }
225
226 static struct x86_reg get_xmm( struct aos_compilation *cp,
227 struct x86_reg reg )
228 {
229 if (reg.file != file_XMM)
230 {
231 struct x86_reg tmp = aos_get_xmm_reg(cp);
232 sse_movaps(cp->func, tmp, reg);
233 reg = tmp;
234 }
235
236 cp->xmm[reg.idx].last_used = cp->insn_counter;
237 return reg;
238 }
239
240
/* Allocate an empty xmm register, either as a temporary or later to
 * "adopt" as a shader reg.
 */
struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
{
   unsigned i;
   unsigned oldest = 0;
   boolean found = FALSE;

   /* First choice: a register owned by no shader reg and not yet
    * touched by the current instruction.  Note there is no break, so
    * the highest-numbered eligible register is the one selected.
    */
   for (i = 0; i < 8; i++)
      if (cp->xmm[i].last_used != cp->insn_counter &&
          cp->xmm[i].file == TGSI_FILE_NULL) {
         oldest = i;
         found = TRUE;
      }

   /* Otherwise evict the least-recently-used register:
    */
   if (!found) {
      for (i = 0; i < 8; i++)
         if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
            oldest = i;
   }

   /* Need to write out the old value?
    */
   if (cp->xmm[oldest].dirty)
      spill(cp, oldest);

   /* Evicting a register used by this same instruction would corrupt
    * in-flight values:
    */
   assert(cp->xmm[oldest].last_used != cp->insn_counter);

   cp->xmm[oldest].file = TGSI_FILE_NULL;
   cp->xmm[oldest].idx = 0;
   cp->xmm[oldest].dirty = 0;
   cp->xmm[oldest].last_used = cp->insn_counter;
   return x86_make_reg(file_XMM, oldest);
}
276
277 void aos_release_xmm_reg( struct aos_compilation *cp,
278 unsigned idx )
279 {
280 cp->xmm[idx].file = TGSI_FILE_NULL;
281 cp->xmm[idx].idx = 0;
282 cp->xmm[idx].dirty = 0;
283 cp->xmm[idx].last_used = 0;
284 }
285
286
287
288
289 /* Mark an xmm reg as holding the current copy of a shader reg.
290 */
291 void aos_adopt_xmm_reg( struct aos_compilation *cp,
292 struct x86_reg reg,
293 unsigned file,
294 unsigned idx,
295 unsigned dirty )
296 {
297 unsigned i;
298
299 if (reg.file != file_XMM) {
300 assert(0);
301 return;
302 }
303
304
305 /* If any xmm reg thinks it holds this shader reg, break the
306 * illusion.
307 */
308 for (i = 0; i < 8; i++) {
309 if (cp->xmm[i].file == file &&
310 cp->xmm[i].idx == idx)
311 {
312 /* If an xmm reg is already holding this shader reg, take into account its
313 * dirty flag...
314 */
315 dirty |= cp->xmm[i].dirty;
316 aos_release_xmm_reg(cp, i);
317 }
318 }
319
320 cp->xmm[reg.idx].file = file;
321 cp->xmm[reg.idx].idx = idx;
322 cp->xmm[reg.idx].dirty = dirty;
323 cp->xmm[reg.idx].last_used = cp->insn_counter;
324 }
325
326
327 /* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
328 */
329 static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
330 unsigned file,
331 unsigned idx )
332 {
333 unsigned i;
334
335 /* Ensure the in-memory copy of this reg is up-to-date
336 */
337 for (i = 0; i < 8; i++) {
338 if (cp->xmm[i].file == file &&
339 cp->xmm[i].idx == idx &&
340 cp->xmm[i].dirty) {
341 spill(cp, i);
342 }
343 }
344
345 return get_reg_ptr( cp, file, idx );
346 }
347
348
349 /* As above, but return a pointer. Note - this pointer may alias
350 * those returned by get_arg_ptr().
351 */
352 static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
353 const struct tgsi_full_dst_register *dst )
354 {
355 unsigned file = dst->DstRegister.File;
356 unsigned idx = dst->DstRegister.Index;
357 unsigned i;
358
359
360 /* Ensure in-memory copy of this reg is up-to-date and invalidate
361 * any xmm copies.
362 */
363 for (i = 0; i < 8; i++) {
364 if (cp->xmm[i].file == file &&
365 cp->xmm[i].idx == idx)
366 {
367 if (cp->xmm[i].dirty)
368 spill(cp, i);
369
370 aos_release_xmm_reg(cp, i);
371 }
372 }
373
374 return get_reg_ptr( cp, file, idx );
375 }
376
377
378
379
380
381 /* Return an XMM reg if the argument is resident, otherwise return a
382 * base+offset pointer to the saved value.
383 */
384 struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
385 unsigned file,
386 unsigned idx )
387 {
388 unsigned i;
389
390 for (i = 0; i < 8; i++) {
391 if (cp->xmm[i].file == file &&
392 cp->xmm[i].idx == idx)
393 {
394 cp->xmm[i].last_used = cp->insn_counter;
395 return x86_make_reg(file_XMM, i);
396 }
397 }
398
399 /* If not found in the XMM register file, return an indirect
400 * reference to the in-memory copy:
401 */
402 return get_reg_ptr( cp, file, idx );
403 }
404
405
406
407 static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,
408 unsigned file,
409 unsigned idx )
410 {
411 struct x86_reg reg = get_xmm( cp,
412 aos_get_shader_reg( cp, file, idx ) );
413
414 aos_adopt_xmm_reg( cp,
415 reg,
416 file,
417 idx,
418 FALSE );
419
420 return reg;
421 }
422
423
424
/* Fetch one of the compiler-internal constant vectors (IMM_*) into an
 * xmm register.
 */
struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
                                     unsigned imm )
{
   return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
}
430
431
/* As aos_get_internal_xmm(), but the result may be either an xmm
 * register or a memory operand.
 */
struct x86_reg aos_get_internal( struct aos_compilation *cp,
                                 unsigned imm )
{
   return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
}
437
438
439
440
441
442 /* Emulate pshufd insn in regular SSE, if necessary:
443 */
444 static void emit_pshufd( struct aos_compilation *cp,
445 struct x86_reg dst,
446 struct x86_reg arg0,
447 ubyte shuf )
448 {
449 if (cp->have_sse2) {
450 sse2_pshufd(cp->func, dst, arg0, shuf);
451 }
452 else {
453 if (!eq(dst, arg0))
454 sse_movaps(cp->func, dst, arg0);
455
456 sse_shufps(cp->func, dst, dst, shuf);
457 }
458 }
459
/* load masks (pack into negs??)
 * pshufd - shuffle according to writemask
 * and - result, mask
 * nand - dest, mask
 * or - dest, result
 *
 * Emit a masked merge: channels of 'result' selected by 'mask' are
 * written into 'dst', the other channels of 'dst' are preserved.
 * Computes dst = (dst & m) | (~m & result), where m is built by
 * broadcasting lanes 2/3 of the IMM_SWZ internal constant.
 * NOTE(review): relies on IMM_SWZ lane 2 being all-zero bits and
 * lane 3 all-one bits -- set up elsewhere, confirm against the
 * machine initialization.
 */
static boolean mask_write( struct aos_compilation *cp,
                           struct x86_reg dst,
                           struct x86_reg result,
                           unsigned mask )
{
   struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   /* Build the per-channel keep/replace mask in tmp:
    */
   emit_pshufd(cp, tmp, imm_swz,
               SHUF((mask & 1) ? 2 : 3,
                    (mask & 2) ? 2 : 3,
                    (mask & 4) ? 2 : 3,
                    (mask & 8) ? 2 : 3));

   /* dst = (dst & tmp) | (~tmp & result):
    */
   sse_andps(cp->func, dst, tmp);
   sse_andnps(cp->func, tmp, result);
   sse_orps(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   return TRUE;
}
487
488
489
490
/* Helper for writemask: merge two channels of arg0 with two channels
 * of arg1 into dst, as selected by 'shuf'.  Works by shuffling both
 * args, taking the low half from arg1 and high half from arg0, then
 * applying 'shuf' again -- so 'shuf' must be its own inverse (all
 * callers pass a single-pair swap, which is).
 */
static boolean emit_shuf_copy2( struct aos_compilation *cp,
                                struct x86_reg dst,
                                struct x86_reg arg0,
                                struct x86_reg arg1,
                                ubyte shuf )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   emit_pshufd(cp, dst, arg1, shuf);
   emit_pshufd(cp, tmp, arg0, shuf);
   sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W)); /* dst.xy | tmp.zw */
   emit_pshufd(cp, dst, dst, shuf);                  /* undo the shuffle */

   aos_release_xmm_reg(cp, tmp.idx);
   return TRUE;
}
509
510
511
512 #define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
513
514
/* Locate a source register and perform any required (simple) swizzle.
 *
 * Just fail on complex swizzles at this point.
 *
 * Returns either the resident xmm register holding the (possibly
 * swizzled/negated) source, or the unmodified register when no
 * channel modifiers apply.
 */
static struct x86_reg fetch_src( struct aos_compilation *cp,
                                 const struct tgsi_full_src_register *src )
{
   struct x86_reg arg0 = aos_get_shader_reg(cp,
                                            src->SrcRegister.File,
                                            src->SrcRegister.Index);
   unsigned i;
   ubyte swz = 0;
   unsigned negs = 0;
   unsigned abs = 0;

   /* Accumulate the SSE shuffle selector plus negate/abs channel
    * masks from the per-channel TGSI modifiers:
    */
   for (i = 0; i < 4; i++) {
      unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i );
      unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );

      switch (swizzle) {
      case TGSI_EXTSWIZZLE_ZERO:
      case TGSI_EXTSWIZZLE_ONE:
         ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2");
         break;

      default:
         swz |= (swizzle & 0x3) << (i * 2);
         break;
      }

      switch (neg) {
      case TGSI_UTIL_SIGN_TOGGLE:
         negs |= (1<<i);
         break;

      case TGSI_UTIL_SIGN_KEEP:
         break;

      case TGSI_UTIL_SIGN_CLEAR:
         abs |= (1<<i);
         break;

      default:
         ERROR(cp, "unsupported sign-mode");
         break;
      }
   }

   if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
      struct x86_reg dst = aos_get_xmm_reg(cp);

      /* Get the (possibly reordered) value into a scratch register:
       */
      if (swz != SSE_SWIZZLE_NOOP)
         emit_pshufd(cp, dst, arg0, swz);
      else
         sse_movaps(cp->func, dst, arg0);

      if (negs && negs != 0xf) {
         struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
         struct x86_reg tmp = aos_get_xmm_reg(cp);

         /* Load 1,-1,0,0
          * Use neg as arg to pshufd
          * Multiply
          */
         emit_pshufd(cp, tmp, imm_swz,
                     SHUF((negs & 1) ? 1 : 0,
                          (negs & 2) ? 1 : 0,
                          (negs & 4) ? 1 : 0,
                          (negs & 8) ? 1 : 0));
         sse_mulps(cp->func, dst, tmp);

         aos_release_xmm_reg(cp, tmp.idx);
      }
      else if (negs) {
         /* All four channels negated -- one multiply by -1,-1,-1,-1:
          */
         struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
         sse_mulps(cp->func, dst, imm_negs);
      }


      if (abs && abs != 0xf) {
         ERROR(cp, "unsupported partial abs");
      }
      else if (abs) {
         /* abs(x) computed as max(x, -x):
          */
         struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
         struct x86_reg tmp = aos_get_xmm_reg(cp);

         sse_movaps(cp->func, tmp, dst);
         sse_mulps(cp->func, tmp, neg);
         sse_maxps(cp->func, dst, tmp);

         aos_release_xmm_reg(cp, tmp.idx);
      }

      return dst;
   }

   return arg0;
}
613
/* Push one channel of a source register onto the x87 stack, honouring
 * the extended swizzle (including the literal 0.0 / 1.0 channels) and
 * the per-channel sign mode.
 */
static void x87_fld_src( struct aos_compilation *cp,
                         const struct tgsi_full_src_register *src,
                         unsigned channel )
{
   struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
                                                src->SrcRegister.File,
                                                src->SrcRegister.Index);

   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel );
   unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_ZERO:
      x87_fldz( cp->func );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      x87_fld1( cp->func );
      break;

   default:
      /* Load the selected channel from the in-memory copy:
       */
      x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
      break;
   }


   switch (neg) {
   case TGSI_UTIL_SIGN_TOGGLE:
      /* Flip the sign:
       */
      x87_fchs( cp->func );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;

   case TGSI_UTIL_SIGN_CLEAR:
      x87_fabs( cp->func );
      break;

   case TGSI_UTIL_SIGN_SET:
      /* Force negative: -|x|
       */
      x87_fabs( cp->func );
      x87_fchs( cp->func );
      break;

   default:
      ERROR(cp, "unsupported sign-mode");
      break;
   }
}
664
665
666
667
668
669
/* Used to implement write masking.  This and most of the other instructions
 * here would be easier to implement if there had been a translation
 * to a 2 argument format (dst/arg0, arg1) at the shader level before
 * attempting to translate to x86/sse code.
 *
 * Merges 'result' into the destination register under the dst
 * writemask, using the cheapest available SSE merge for the common
 * mask patterns and falling back to mask_write() otherwise.
 */
static void store_dest( struct aos_compilation *cp,
                        const struct tgsi_full_dst_register *reg,
                        struct x86_reg result )
{
   struct x86_reg dst;

   switch (reg->DstRegister.WriteMask) {
   case 0:
      /* Nothing written at all:
       */
      return;

   case TGSI_WRITEMASK_XYZW:
      /* Full write: just adopt the result register as the new dirty
       * copy of the destination.
       */
      aos_adopt_xmm_reg(cp,
                        get_xmm_writable(cp, result),
                        reg->DstRegister.File,
                        reg->DstRegister.Index,
                        TRUE);
      return;
   default:
      break;
   }

   /* Partial write: need the old destination value resident so
    * unwritten channels can be preserved.
    */
   dst = aos_get_shader_reg_xmm(cp,
                                reg->DstRegister.File,
                                reg->DstRegister.Index);

   switch (reg->DstRegister.WriteMask) {
   case TGSI_WRITEMASK_X:
      /* movss replaces only the low channel:
       */
      sse_movss(cp->func, dst, get_xmm(cp, result));
      break;

   case TGSI_WRITEMASK_ZW:
      /* dst = dst.xy | result.zw
       */
      sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
      break;

   case TGSI_WRITEMASK_XY:
      /* result = result.xy | dst.zw, then adopt result as dst:
       */
      result = get_xmm_writable(cp, result);
      sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
      dst = result;
      break;

   case TGSI_WRITEMASK_YZW:
      /* Take only the x channel back from dst:
       */
      result = get_xmm_writable(cp, result);
      sse_movss(cp->func, result, dst);
      dst = result;
      break;

   default:
      /* General case -- bitwise channel select:
       */
      mask_write(cp, dst, result, reg->DstRegister.WriteMask);
      break;
   }

   aos_adopt_xmm_reg(cp,
                     dst,
                     reg->DstRegister.File,
                     reg->DstRegister.Index,
                     TRUE);

}
733
/* Write the x channel of 'result' into the channel of 'dst' selected
 * by 'swizzle': rotate that channel into position x, movss, rotate
 * back.  'swizzle' must therefore be its own inverse (callers pass a
 * single-pair swap).
 */
static void inject_scalar( struct aos_compilation *cp,
                           struct x86_reg dst,
                           struct x86_reg result,
                           ubyte swizzle )
{
   sse_shufps(cp->func, dst, dst, swizzle);
   sse_movss(cp->func, dst, result);
   sse_shufps(cp->func, dst, dst, swizzle);
}
743
744
/* Store a scalar result (valid in the x channel of 'result') to the
 * destination.  Single-channel writemasks are handled by injecting
 * the scalar directly; multi-channel masks broadcast x to all four
 * channels and defer to store_dest().
 */
static void store_scalar_dest( struct aos_compilation *cp,
                               const struct tgsi_full_dst_register *reg,
                               struct x86_reg result )
{
   unsigned writemask = reg->DstRegister.WriteMask;
   struct x86_reg dst;

   if (writemask != TGSI_WRITEMASK_X &&
       writemask != TGSI_WRITEMASK_Y &&
       writemask != TGSI_WRITEMASK_Z &&
       writemask != TGSI_WRITEMASK_W &&
       writemask != 0)
   {
      /* Multi-channel mask: broadcast x and do a regular masked
       * store.
       */
      result = get_xmm_writable(cp, result); /* already true, right? */
      sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
      store_dest(cp, reg, result);
      return;
   }

   result = get_xmm(cp, result);
   dst = aos_get_shader_reg_xmm(cp,
                                reg->DstRegister.File,
                                reg->DstRegister.Index);



   switch (reg->DstRegister.WriteMask) {
   case TGSI_WRITEMASK_X:
      sse_movss(cp->func, dst, result);
      break;

   case TGSI_WRITEMASK_Y:
      inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
      break;

   case TGSI_WRITEMASK_Z:
      inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
      break;

   case TGSI_WRITEMASK_W:
      inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
      break;

   default:
      /* writemask == 0: still falls through to mark dst dirty below.
       */
      break;
   }

   aos_adopt_xmm_reg(cp,
                     dst,
                     reg->DstRegister.File,
                     reg->DstRegister.Index,
                     TRUE);
}
798
799
800
801 static void x87_fst_or_nop( struct x86_function *func,
802 unsigned writemask,
803 unsigned channel,
804 struct x86_reg ptr )
805 {
806 assert(ptr.file == file_REG32);
807 if (writemask & (1<<channel))
808 x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
809 }
810
811 static void x87_fstp_or_pop( struct x86_function *func,
812 unsigned writemask,
813 unsigned channel,
814 struct x86_reg ptr )
815 {
816 assert(ptr.file == file_REG32);
817 if (writemask & (1<<channel))
818 x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
819 else
820 x87_fstp( func, x86_make_reg( file_x87, 0 ));
821 }
822
823
824
825 /*
826 */
827 static void x87_fstp_dest4( struct aos_compilation *cp,
828 const struct tgsi_full_dst_register *dst )
829 {
830 struct x86_reg ptr = get_dst_ptr(cp, dst);
831 unsigned writemask = dst->DstRegister.WriteMask;
832
833 x87_fst_or_nop(cp->func, writemask, 0, ptr);
834 x87_fst_or_nop(cp->func, writemask, 1, ptr);
835 x87_fst_or_nop(cp->func, writemask, 2, ptr);
836 x87_fstp_or_pop(cp->func, writemask, 3, ptr);
837 }
838
/* Save current x87 state and put it into single precision mode.
 * NOTE(review): only the control word is stored (fnstcw) into the
 * fpu_restore slot; restore_fpu_state() reloads it with fldcw.
 */
static void save_fpu_state( struct aos_compilation *cp )
{
   x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,
                                       Offset(struct aos_machine, fpu_restore)));
}
846
/* Restore the control word saved by save_fpu_state(), clearing any
 * pending exceptions first.
 */
static void restore_fpu_state( struct aos_compilation *cp )
{
   x87_fnclex(cp->func);
   x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
                                      Offset(struct aos_machine, fpu_restore)));
}
853
854 static void set_fpu_round_neg_inf( struct aos_compilation *cp )
855 {
856 if (cp->fpucntl != FPU_RND_NEG) {
857 cp->fpucntl = FPU_RND_NEG;
858 x87_fnclex(cp->func);
859 x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
860 Offset(struct aos_machine, fpu_rnd_neg_inf)));
861 }
862 }
863
864 static void set_fpu_round_nearest( struct aos_compilation *cp )
865 {
866 if (cp->fpucntl != FPU_RND_NEAREST) {
867 cp->fpucntl = FPU_RND_NEAREST;
868 x87_fnclex(cp->func);
869 x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
870 Offset(struct aos_machine, fpu_rnd_nearest)));
871 }
872 }
873
874
/* Emit code computing 2^st0 in place on the x87 stack, using the
 * identity 2^a = 2^frac(a) * 2^int(a) (f2xm1 only accepts arguments
 * in [-1,1]).  NOTE(review): the rounding-mode setup is commented
 * out, so frac(a) depends on the current FPU rounding mode --
 * presumably set by the caller; confirm.
 */
static void x87_emit_ex2( struct aos_compilation *cp )
{
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   int stack = cp->func->x87_stack;

//   set_fpu_round_neg_inf( cp );

   x87_fld(cp->func, st0);       /* a a */
   x87_fprndint( cp->func );     /* int(a) a*/
   x87_fsubr(cp->func, st1, st0);   /* int(a) frc(a) */
   x87_fxch(cp->func, st1);      /* frc(a) int(a) */
   x87_f2xm1(cp->func);          /* (2^frc(a))-1 int(a) */
   x87_fld1(cp->func);           /* 1 (2^frc(a))-1 int(a) */
   x87_faddp(cp->func, st1);     /* 2^frac(a) int(a) */
   x87_fscale(cp->func);         /* (2^frac(a)*2^int(int(a))) int(a) */
                                 /* 2^a int(a) */
   x87_fstp(cp->func, st1);      /* 2^a */

   /* Net stack effect must be zero -- one value in, one value out:
    */
   assert( stack == cp->func->x87_stack);

}
897
/* Debug helper called from the generated code: print the four floats
 * of a shader register.  Also a convenient breakpoint target.
 */
static void PIPE_CDECL print_reg( const char *msg,
                                  const float *reg )
{
   debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
}
903
/* Emit a cdecl call to print_reg() so the generated shader dumps one
 * register at runtime.  Debug-only; flushes the whole xmm file first
 * so the printed memory copy is current.
 */
static void emit_print( struct aos_compilation *cp,
                        const char *message, /* must point to a static string! */
                        unsigned file,
                        unsigned idx )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
   unsigned i;

   /* There shouldn't be anything on the x87 stack.  Can add this
    * capacity later if need be.
    */
   assert(cp->func->x87_stack == 0);

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.  We're obviously not concerned about performance on this
    * debug path, so here goes:
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);

      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );


   /* Push the arguments (right to left, cdecl):
    */
   x86_lea( cp->func, ecx, arg );
   x86_push( cp->func, ecx );
   x86_push_imm32( cp->func, (int)message );

   /* Call the helper.  Could call debug_printf directly, but
    * print_reg is a nice place to put a breakpoint if need be.
    */
   x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
   x86_call( cp->func, ecx );

   /* Pop the two pushed arguments off the stack again:
    */
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );

   /* Pop caller-save regs
    */
   x86_cdecl_caller_pop_regs( cp->func );

   /* Done...
    */
}
955
956 /**
957 * The traditional instructions. All operate on internal registers
958 * and ignore write masks and swizzling issues.
959 */
960
/* ABS: dst = |arg0|, computed as max(arg0, -arg0).
 */
static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   sse_movaps(cp->func, tmp, arg0);
   sse_mulps(cp->func, tmp, neg);    /* tmp = -arg0 */
   sse_maxps(cp->func, tmp, arg0);   /* tmp = max(-arg0, arg0) */

   store_dest(cp, &op->FullDstRegisters[0], tmp);
   return TRUE;
}
974
/* ADD: dst = arg0 + arg1, component-wise.
 */
static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_addps(cp->func, dst, arg1);

   store_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
986
/* COS: scalar cosine of channel x via the x87 fcos instruction; the
 * result is broadcast to all written channels by x87_fstp_dest4().
 */
static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
   x87_fcos(cp->func);
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
   return TRUE;
}
994
/* The dotproduct instructions don't really do that well in sse:
 * XXX: produces wrong results -- disabled.
 *
 * DP3: dst = arg0.x*arg1.x + arg0.y*arg1.y + arg0.z*arg1.z, stored as
 * a scalar.
 */
static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);
   /* Now the hard bit: sum the first 3 values:
    */
   sse_movhlps(cp->func, tmp, dst);
   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
1017
/* DP4: four-component dot product, stored as a scalar.
 */
static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);

   /* Now the hard bit: sum the values:
    */
   sse_movhlps(cp->func, tmp, dst);
   sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
1038
/* DPH: homogeneous dot product -- DP3(arg0, arg1) + arg1.w, stored as
 * a scalar.
 */
static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);

   /* Now the hard bit: sum the values (from DP3):
    */
   sse_movhlps(cp->func, tmp, dst);
   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(cp->func, dst, tmp);
   /* Finally add arg1.w:
    */
   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
1061
/* DST: distance vector -- dst = (1, arg0.y*arg1.y, arg0.z, arg1.w).
 * Built by splicing 1.0 into the unused channels of each argument and
 * multiplying.
 */
static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
    struct x86_reg dst = aos_get_xmm_reg(cp);
    struct x86_reg tmp = aos_get_xmm_reg(cp);
    struct x86_reg ones = aos_get_internal(cp, IMM_ONES);

/*    dst[0] = 1.0     * 1.0F; */
/*    dst[1] = arg0[1] * arg1[1]; */
/*    dst[2] = arg0[2] * 1.0; */
/*    dst[3] = 1.0     * arg1[3]; */

    emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
    emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
    sse_mulps(cp->func, dst, tmp);

    aos_release_xmm_reg(cp, tmp.idx);
    store_dest(cp, &op->FullDstRegisters[0], dst);
    return TRUE;
}
1083
/* LG2: log2 of channel x, using fyl2x with y == 1 so st0 becomes
 * 1 * log2(arg0); broadcast to all written channels.
 */
static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld1(cp->func);          /* 1 */
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);	/* a0 1 */
   x87_fyl2x(cp->func);	        /* log2(a0) */
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
   return TRUE;
}
1092
1093
/* EX2: 2^x of channel x (see x87_emit_ex2()), broadcast to all
 * written channels.
 */
static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
   x87_emit_ex2(cp);
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
   return TRUE;
}
1101
1102
/* FLR: per-channel floor, implemented as x87 fprndint with the FPU
 * rounding mode set to round-toward-negative-infinity.
 */
static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   int i;

   set_fpu_round_neg_inf( cp );

   /* Load all sources first to avoid aliasing
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->FullSrcRegisters[0], i);
      }
   }

   /* Channels were pushed w-first, so they pop off x-first:
    */
   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fprndint( cp->func );
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1128
1129
/* RND: per-channel round-to-nearest, same structure as emit_FLR() but
 * with the FPU in round-to-nearest mode.
 */
static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   int i;

   set_fpu_round_nearest( cp );

   /* Load all sources first to avoid aliasing
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->FullSrcRegisters[0], i);
      }
   }

   /* Channels were pushed w-first, so they pop off x-first:
    */
   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fprndint( cp->func );
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1155
1156
/* FRC: per-channel fractional part, computed as a - floor(a) with the
 * FPU in round-toward-negative-infinity mode.
 */
static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   int i;

   set_fpu_round_neg_inf( cp );

   /* suck all the source values onto the stack before writing out any
    * dst, which may alias...
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->FullSrcRegisters[0], i);
      }
   }

   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fld(cp->func, st0);     /* a a */
         x87_fprndint( cp->func );   /* flr(a) a */
         x87_fsubp(cp->func, st1);   /* frc(a) */
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1187
1188
1189
1190
1191
1192
/* LIT: lighting coefficients.  Implemented as a cdecl call out to a
 * C helper (aos_do_lit, or a per-instance specialized function looked
 * up in the machine's lit_info table).  The helper takes
 * (machine, result_ptr, arg_ptr, lit_count) pushed right-to-left.
 */
static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   unsigned lit_count = cp->lit_count++;
   struct x86_reg result, arg0;
   unsigned i;

#if 1
   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }
#endif

   /* For a partial writemask, compute into a scratch area and do a
    * masked store afterwards; otherwise write straight to the dest.
    */
   if (writemask != TGSI_WRITEMASK_XYZW)
      result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
   else
      result = get_dst_ptr(cp, &op->FullDstRegisters[0]);


   arg0 = fetch_src( cp, &op->FullSrcRegisters[0] );
   if (arg0.file == file_XMM) {
      /* The helper needs a memory operand -- stage xmm args through
       * the machine's scratch area:
       */
      struct x86_reg tmp = x86_make_disp(cp->machine_EDX,
                                         Offset(struct aos_machine, tmp[1]));
      sse_movaps( cp->func, tmp, arg0 );
      arg0 = tmp;
   }



   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Push the arguments:
    */
   x86_push_imm32( cp->func, lit_count );

   x86_lea( cp->func, ecx, arg0 );
   x86_push( cp->func, ecx );

   x86_lea( cp->func, ecx, result );
   x86_push( cp->func, ecx );

   x86_push( cp->func, cp->machine_EDX );

   /* Use the specialized per-call function pointer when one exists,
    * else fall back to the generic helper:
    */
   if (lit_count < MAX_LIT_INFO) {
      x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,
                                             Offset(struct aos_machine, lit_info) +
                                             lit_count * sizeof(struct lit_info) +
                                             Offset(struct lit_info, func)));
   }
   else {
      x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
   }

   x86_call( cp->func, ecx );

   /* Pop the four pushed arguments again:
    */
   x86_pop( cp->func, ecx );            /* fixme... */
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Partial writemask: merge the scratch result into the real
    * destination now.
    */
   if (writemask != TGSI_WRITEMASK_XYZW) {
      store_dest( cp,
                  &op->FullDstRegisters[0],
                  get_xmm_writable( cp, result ) );
   }

   return TRUE;
}
1271
#if 0
/* Disabled inline x87 implementation of LIT, kept for reference; the
 * live path is the out-of-line call in emit_LIT above.  The stack-state
 * comments on the right show the x87 stack after each instruction,
 * topmost element first.
 */
static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;

   if (writemask & TGSI_WRITEMASK_YZ) {
      struct x86_reg st1 = x86_make_reg(file_x87, 1);
      struct x86_reg st2 = x86_make_reg(file_x87, 2);

      /* a1' = a1 <= 0 ? 1 : a1;
       */
      x87_fldz(cp->func);                                /* 0 */
#if 1
      x87_fld1(cp->func);                                /* 1 0 */
#else
      /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
       */
      x87_fldz(cp->func);                                /* 0 0 */
#endif
      x87_fld_src(cp, &op->FullSrcRegisters[0], 1);      /* a1 1 0 */
      x87_fcomi(cp->func, st2);                          /* a1 1 0 */
      x87_fcmovb(cp->func, st1);                         /* a1' 1 0 */
      x87_fstp(cp->func, st1);                           /* a1' 0 */
      x87_fstp(cp->func, st1);                           /* a1' */

      x87_fld_src(cp, &op->FullSrcRegisters[0], 3);      /* a3 a1' */
      x87_fxch(cp->func, st1);                           /* a1' a3 */


      /* Compute pow(a1, a3)
       */
      x87_fyl2x(cp->func);                               /* a3*log2(a1) */
      x87_emit_ex2( cp );                                /* 2^(a3*log2(a1)) */


      /* a0' = max2(a0, 0):
       */
      x87_fldz(cp->func);                                /* 0 r2 */
      x87_fld_src(cp, &op->FullSrcRegisters[0], 0);      /* a0 0 r2 */
      x87_fcomi(cp->func, st1);
      x87_fcmovb(cp->func, st1);                         /* a0' 0 r2 */

      x87_fst_or_nop(cp->func, writemask, 1, dst);       /* result[1] = a0' */

      x87_fcomi(cp->func, st1);                          /* a0' 0 r2 */
      x87_fcmovnbe(cp->func, st2);                       /* r2' 0' r2 */

      x87_fstp_or_pop(cp->func, writemask, 2, dst);      /* 0 r2 */
      x87_fpop(cp->func);                                /* r2 */
      x87_fpop(cp->func);
   }

   /* LIT.x and LIT.w are always 1.0:
    */
   if (writemask & TGSI_WRITEMASK_XW) {
      x87_fld1(cp->func);
      x87_fst_or_nop(cp->func, writemask, 0, dst);
      x87_fstp_or_pop(cp->func, writemask, 3, dst);
   }

   return TRUE;
}
#endif
1334
1335
1336
1337 static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1338 {
1339 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1340 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1341 struct x86_reg dst = get_xmm_writable(cp, arg0);
1342
1343 sse_maxps(cp->func, dst, arg1);
1344
1345 store_dest(cp, &op->FullDstRegisters[0], dst);
1346 return TRUE;
1347 }
1348
1349
1350 static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1351 {
1352 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1353 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1354 struct x86_reg dst = get_xmm_writable(cp, arg0);
1355
1356 sse_minps(cp->func, dst, arg1);
1357
1358 store_dest(cp, &op->FullDstRegisters[0], dst);
1359 return TRUE;
1360 }
1361
1362 static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1363 {
1364 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1365 struct x86_reg dst = get_xmm_writable(cp, arg0);
1366
1367 /* potentially nothing to do */
1368
1369 store_dest(cp, &op->FullDstRegisters[0], dst);
1370 return TRUE;
1371 }
1372
1373 static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1374 {
1375 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1376 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1377 struct x86_reg dst = get_xmm_writable(cp, arg0);
1378
1379 sse_mulps(cp->func, dst, arg1);
1380
1381 store_dest(cp, &op->FullDstRegisters[0], dst);
1382 return TRUE;
1383 }
1384
1385
1386 static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1387 {
1388 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1389 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1390 struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]);
1391
1392 /* If we can't clobber old contents of arg0, get a temporary & copy
1393 * it there, then clobber it...
1394 */
1395 arg0 = get_xmm_writable(cp, arg0);
1396
1397 sse_mulps(cp->func, arg0, arg1);
1398 sse_addps(cp->func, arg0, arg2);
1399 store_dest(cp, &op->FullDstRegisters[0], arg0);
1400 return TRUE;
1401 }
1402
1403
1404
/* A wrapper for powf().
 * Makes sure it is cdecl and operates on floats so it can be called
 * directly from generated code (see emit_POW).  With FAST_MATH it
 * trades accuracy for speed via util_fast_pow().
 */
static float PIPE_CDECL _powerf( float x, float y )
{
#if FAST_MATH
   return util_fast_pow(x, y);
#else
   return powf( x, y );
#endif
}
1416
#if FAST_MATH
/* cdecl float wrapper around util_fast_exp2() so generated code can
 * call it directly (see emit_EXPBASE2).
 */
static float PIPE_CDECL _exp2(float x)
{
   return util_fast_exp2(x);
}
#endif
1423
1424
/* Really not sufficient -- need to check for conditions that could
 * generate inf/nan values, which will slow things down hugely.
 */
static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
#if 0
   /* Inline x87 version: pow(a0.x, a1.x) = 2^(a1.x * log2(a0.x)).
    * Disabled in favour of the out-of-line call below.
    */
   x87_fld_src(cp, &op->FullSrcRegisters[1], 0);  /* a1.x */
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);  /* a0.x a1.x */
   x87_fyl2x(cp->func);                           /* a1*log2(a0) */

   x87_emit_ex2( cp );                            /* 2^(a1*log2(a0)) */

   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
#else
   uint i;

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too -- the called C code may clobber any of them.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Make room for two float arguments and store them: x at [esp],
    * y at [esp+4], matching _powerf's cdecl argument layout.
    */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );

   x87_fld_src( cp, &op->FullSrcRegisters[1], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
   x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* tmp_EAX has been pushed & will be restored below */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
   x86_call( cp->func, cp->tmp_EAX );

   /* Release the argument space: */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Note retval on x87 stack: tell the assembler's stack tracking that
    * the callee left its float result in st(0).
    */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->FullDstRegisters[0] );
#endif
   return TRUE;
}
1477
1478
#if FAST_MATH
/* EXPBASE2 via a cdecl call to _exp2(); mirrors the structure of
 * emit_POW above but with a single 4-byte argument.
 */
static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   uint i;

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too -- the called C code may clobber any of them.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Make room for one float argument and store src.x there: */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) );

   x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* tmp_EAX has been pushed & will be restored below */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
   x86_call( cp->func, cp->tmp_EAX );

   /* Release the argument space: */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Note retval on x87 stack: tell the assembler's stack tracking that
    * the callee left its float result in st(0).
    */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->FullDstRegisters[0] );

   return TRUE;
}
#endif
1519
1520
1521 static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1522 {
1523 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1524 struct x86_reg dst = aos_get_xmm_reg(cp);
1525
1526 if (cp->have_sse2) {
1527 sse2_rcpss(cp->func, dst, arg0);
1528 /* extend precision here...
1529 */
1530 }
1531 else {
1532 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1533 sse_movss(cp->func, dst, ones);
1534 sse_divss(cp->func, dst, arg0);
1535 }
1536
1537 store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
1538 return TRUE;
1539 }
1540
1541
1542 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1543 * implementations, it is possible to improve its precision at
1544 * fairly low cost, using a newton/raphson step, as below:
1545 *
1546 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1547 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1548 * or:
1549 * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
1550 *
1551 *
1552 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1553 */
1554 static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1555 {
1556
1557 if (0) {
1558 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1559 struct x86_reg r = aos_get_xmm_reg(cp);
1560 sse_rsqrtss(cp->func, r, arg0);
1561 store_scalar_dest(cp, &op->FullDstRegisters[0], r);
1562 return TRUE;
1563 }
1564 else {
1565 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1566 struct x86_reg r = aos_get_xmm_reg(cp);
1567
1568 struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
1569 struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
1570 struct x86_reg src = get_xmm_writable( cp, arg0 );
1571
1572 sse_rsqrtss( cp->func, r, src ); /* rsqrtss(a) */
1573 sse_mulss( cp->func, src, neg_half ); /* -.5 * a */
1574 sse_mulss( cp->func, src, r ); /* -.5 * a * r */
1575 sse_mulss( cp->func, src, r ); /* -.5 * a * r * r */
1576 sse_addss( cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */
1577 sse_mulss( cp->func, r, src ); /* r * (1.5 - .5 * a * r * r) */
1578
1579 store_scalar_dest(cp, &op->FullDstRegisters[0], r);
1580 return TRUE;
1581 }
1582 }
1583
1584
1585 static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1586 {
1587 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1588 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1589 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1590 struct x86_reg dst = get_xmm_writable(cp, arg0);
1591
1592 sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
1593 sse_andps(cp->func, dst, ones);
1594
1595 store_dest(cp, &op->FullDstRegisters[0], dst);
1596 return TRUE;
1597 }
1598
/* Scalar sine via the x87 fsin instruction.  Computes sin(src.x);
 * x87_fstp_dest4 presumably broadcasts the popped result to the enabled
 * destination components -- confirm against its definition.  Note fsin
 * is only defined for |x| < 2^63.
 */
static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);   /* src.x */
   x87_fsin(cp->func);                             /* sin(src.x) */
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
   return TRUE;
}
1606
1607
1608
1609 static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1610 {
1611 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1612 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1613 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1614 struct x86_reg dst = get_xmm_writable(cp, arg0);
1615
1616 sse_cmpps(cp->func, dst, arg1, cc_LessThan);
1617 sse_andps(cp->func, dst, ones);
1618
1619 store_dest(cp, &op->FullDstRegisters[0], dst);
1620 return TRUE;
1621 }
1622
1623 static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1624 {
1625 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1626 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1627 struct x86_reg dst = get_xmm_writable(cp, arg0);
1628
1629 sse_subps(cp->func, dst, arg1);
1630
1631 store_dest(cp, &op->FullDstRegisters[0], dst);
1632 return TRUE;
1633 }
1634
/* TRUNC: round each component toward zero, implemented with SSE2 --
 * cvttps2dq truncates to int32, cvtdq2ps converts back to float.
 *
 * NOTE(review): emitted unconditionally although emit_RCP tests
 * cp->have_sse2; this backend always sets have_sse2 = 1, but confirm
 * before reusing on an SSE1-only path.  Inputs outside int32 range
 * produce the cvttps2dq "integer indefinite" value -- TODO confirm
 * that is acceptable here.
 */
static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg tmp0 = aos_get_xmm_reg(cp);

   sse2_cvttps2dq(cp->func, tmp0, arg0);
   sse2_cvtdq2ps(cp->func, tmp0, tmp0);

   store_dest(cp, &op->FullDstRegisters[0], tmp0);
   return TRUE;
}
1646
static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   /* Cross product using two yzxw-shuffled multiplies:
    *
    *   tmp1 = arg0 * shuf(arg1, yzxw)
    *   tmp0 = arg1 * shuf(arg0, yzxw)
    *   tmp1 = shuf(tmp1 - tmp0, yzxw)
    */
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp0 = aos_get_xmm_reg(cp);
   struct x86_reg tmp1 = aos_get_xmm_reg(cp);

   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
   sse_mulps(cp->func, tmp1, arg0);
   emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));
   sse_mulps(cp->func, tmp0, arg1);
   sse_subps(cp->func, tmp1, tmp0);
   sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));

   /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
   /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
   /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
   /* dst[3] is undef */


   /* tmp1 is handed to store_dest; only tmp0 is released here. */
   aos_release_xmm_reg(cp, tmp0.idx);
   store_dest(cp, &op->FullDstRegisters[0], tmp1);
   return TRUE;
}
1671
1672
1673
/* Dispatch one TGSI instruction to its SSE/x87 emitter.
 *
 * Returns FALSE for any opcode this backend cannot handle; the caller
 * (build_vertex_program) then abandons code generation, and the draw
 * module falls back to the generic vertex-shader path.
 */
static boolean
emit_instruction( struct aos_compilation *cp,
                  struct tgsi_full_instruction *inst )
{
   /* Every emitter must leave the x87 stack empty between instructions. */
   x87_assert_stack_empty(cp->func);

   switch( inst->Instruction.Opcode ) {
   case TGSI_OPCODE_MOV:
      return emit_MOV( cp, inst );

   case TGSI_OPCODE_LIT:
      return emit_LIT(cp, inst);

   case TGSI_OPCODE_RCP:
      return emit_RCP(cp, inst);

   case TGSI_OPCODE_RSQ:
      return emit_RSQ(cp, inst);

   case TGSI_OPCODE_EXP:
      /*return emit_EXP(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_LOG:
      /*return emit_LOG(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_MUL:
      return emit_MUL(cp, inst);

   case TGSI_OPCODE_ADD:
      return emit_ADD(cp, inst);

   case TGSI_OPCODE_DP3:
      return emit_DP3(cp, inst);

   case TGSI_OPCODE_DP4:
      return emit_DP4(cp, inst);

   case TGSI_OPCODE_DST:
      return emit_DST(cp, inst);

   case TGSI_OPCODE_MIN:
      return emit_MIN(cp, inst);

   case TGSI_OPCODE_MAX:
      return emit_MAX(cp, inst);

   case TGSI_OPCODE_SLT:
      return emit_SLT(cp, inst);

   case TGSI_OPCODE_SGE:
      return emit_SGE(cp, inst);

   case TGSI_OPCODE_MAD:
      return emit_MAD(cp, inst);

   case TGSI_OPCODE_SUB:
      return emit_SUB(cp, inst);

   case TGSI_OPCODE_LERP:
//      return emit_LERP(cp, inst);
      return FALSE;

   case TGSI_OPCODE_FRAC:
      return emit_FRC(cp, inst);

   case TGSI_OPCODE_CLAMP:
//      return emit_CLAMP(cp, inst);
      return FALSE;

   case TGSI_OPCODE_FLOOR:
      return emit_FLR(cp, inst);

   case TGSI_OPCODE_ROUND:
      return emit_RND(cp, inst);

   case TGSI_OPCODE_EXPBASE2:
#if FAST_MATH
      return emit_EXPBASE2(cp, inst);
#elif 0
      /* this seems to fail for "larger" exponents.
       * See glean tvertProg1's EX2 test.
       */
      return emit_EX2(cp, inst);
#else
      return FALSE;
#endif

   case TGSI_OPCODE_LOGBASE2:
      return emit_LG2(cp, inst);

   case TGSI_OPCODE_POWER:
      return emit_POW(cp, inst);

   case TGSI_OPCODE_CROSSPRODUCT:
      return emit_XPD(cp, inst);

   case TGSI_OPCODE_ABS:
      return emit_ABS(cp, inst);

   case TGSI_OPCODE_DPH:
      return emit_DPH(cp, inst);

   case TGSI_OPCODE_COS:
      return emit_COS(cp, inst);

   case TGSI_OPCODE_SIN:
      return emit_SIN(cp, inst);

   case TGSI_OPCODE_TRUNC:
      return emit_TRUNC(cp, inst);

   case TGSI_OPCODE_END:
      return TRUE;

   default:
      /* Unsupported opcode -- caller falls back to generic path. */
      return FALSE;
   }
}
1794
1795
1796 static boolean emit_viewport( struct aos_compilation *cp )
1797 {
1798 struct x86_reg pos = aos_get_shader_reg_xmm(cp,
1799 TGSI_FILE_OUTPUT,
1800 cp->vaos->draw->vs.position_output );
1801
1802 struct x86_reg scale = x86_make_disp(cp->machine_EDX,
1803 Offset(struct aos_machine, scale));
1804
1805 struct x86_reg translate = x86_make_disp(cp->machine_EDX,
1806 Offset(struct aos_machine, translate));
1807
1808 sse_mulps(cp->func, pos, scale);
1809 sse_addps(cp->func, pos, translate);
1810
1811 aos_adopt_xmm_reg( cp,
1812 pos,
1813 TGSI_FILE_OUTPUT,
1814 cp->vaos->draw->vs.position_output,
1815 TRUE );
1816 return TRUE;
1817 }
1818
1819
/* This is useful to be able to see the results on softpipe.  Doesn't
 * do proper clipping, just assumes the backend can do it during
 * rasterization -- for debug only...
 */
static boolean emit_rhw_viewport( struct aos_compilation *cp )
{
   /* Like emit_viewport, but also divides x/y/z by w (reciprocal
    * homogeneous w) and stores 1/w into pos.w.
    */
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg pos = aos_get_shader_reg_xmm(cp,
                                               TGSI_FILE_OUTPUT,
                                               cp->vaos->draw->vs.position_output);

   struct x86_reg scale = x86_make_disp(cp->machine_EDX,
                                        Offset(struct aos_machine, scale));

   struct x86_reg translate = x86_make_disp(cp->machine_EDX,
                                            Offset(struct aos_machine, translate));



   /* tmp = 1/pos.w broadcast to all four lanes (approximate rcpss): */
   emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
   sse2_rcpss(cp->func, tmp, tmp);
   sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));

   sse_mulps(cp->func, pos, scale);
   sse_mulps(cp->func, pos, tmp);
   sse_addps(cp->func, pos, translate);

   /* Set pos[3] = w
    */
   mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);

   /* NOTE(review): tmp is never explicitly released here -- looks like
    * this relies on running at the end of the shader; confirm against
    * aos_get_xmm_reg/aos_release_xmm_reg semantics.
    */
   aos_adopt_xmm_reg( cp,
                      pos,
                      TGSI_FILE_OUTPUT,
                      cp->vaos->draw->vs.position_output,
                      TRUE );
   return TRUE;
}
1858
1859
#if 0
/* Disabled: copy one TGSI immediate's float values into the machine's
 * immediate array at the next free slot.  Kept for reference; the
 * run functions below instead point machine->immediates at the vertex
 * shader's own immediate storage.
 */
static boolean note_immediate( struct aos_compilation *cp,
                               struct tgsi_full_immediate *imm )
{
   unsigned pos = cp->num_immediates++;
   unsigned j;

   for (j = 0; j < imm->Immediate.Size; j++) {
      cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float;
   }

   return TRUE;
}
#endif
1874
1875
1876
1877
1878 static void find_last_write_outputs( struct aos_compilation *cp )
1879 {
1880 struct tgsi_parse_context parse;
1881 unsigned this_instruction = 0;
1882 unsigned i;
1883
1884 tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
1885
1886 while (!tgsi_parse_end_of_tokens( &parse )) {
1887
1888 tgsi_parse_token( &parse );
1889
1890 if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
1891 continue;
1892
1893 for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
1894 if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File ==
1895 TGSI_FILE_OUTPUT)
1896 {
1897 unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index;
1898 cp->output_last_write[idx] = this_instruction;
1899 }
1900 }
1901
1902 this_instruction++;
1903 }
1904
1905 tgsi_parse_free( &parse );
1906 }
1907
1908
1909 #define ARG_MACHINE 1
1910 #define ARG_START_ELTS 2
1911 #define ARG_COUNT 3
1912 #define ARG_OUTBUF 4
1913
1914
static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
                                     boolean linear )
{
   /* Translate the varient's TGSI token stream into a complete x86
    * function that loops over "count" vertices, running the shader once
    * per vertex.  The generated function's arguments are described by
    * the ARG_* defines above.
    *
    * \param linear  TRUE builds the linear-fetch path (func[0]),
    *                FALSE the indexed-elts path (func[1]).
    * \return FALSE on any unsupported opcode or internal error; the
    *         caller then falls back to the generic shader path.
    */
   struct tgsi_parse_context parse;
   struct aos_compilation cp;
   unsigned fixup, label;

   util_init_math();

   tgsi_parse_init( &parse, varient->base.vs->state.tokens );

   memset(&cp, 0, sizeof(cp));

   cp.insn_counter = 1;
   cp.vaos = varient;
   cp.have_sse2 = 1;          /* NOTE(review): assumed, not detected -- confirm */
   cp.func = &varient->func[ linear ? 0 : 1 ];

   /* Fixed register roles for the whole generated function: */
   cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   cp.idx_EBX = x86_make_reg(file_REG32, reg_BX);
   cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
   cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
   cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
   cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
   cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );

   x86_init_func(cp.func);

   find_last_write_outputs(&cp);

   /* Prologue: preserve callee-save registers we use. */
   x86_push(cp.func, cp.idx_EBX);
   x86_push(cp.func, cp.count_ESI);
   x86_push(cp.func, cp.temp_EBP);


   /* Load arguments into regs:
    */
   x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
   x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
   x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
   x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));


   /* Compare count to zero and possibly bail.
    */
   x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
   x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
   fixup = x86_jcc_forward(cp.func, cc_E);   /* patched to jump past the loop */


   save_fpu_state( &cp );
   set_fpu_round_nearest( &cp );

   aos_init_inputs( &cp, linear );

   cp.x86_reg[0] = 0;
   cp.x86_reg[1] = 0;

   /* Note address for loop jump
    */
   label = x86_get_label(cp.func);
   {
      /* Fetch inputs...  TODO:  fetch lazily...
       */
      if (!aos_fetch_inputs( &cp, linear ))
         goto fail;

      /* Emit the shader:
       */
      while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
      {
         tgsi_parse_token( &parse );

         switch (parse.FullToken.Token.Type) {
         case TGSI_TOKEN_TYPE_IMMEDIATE:
#if 0
            if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
               goto fail;
#endif
            break;

         case TGSI_TOKEN_TYPE_INSTRUCTION:
            if (DISASSEM)
               tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );

            if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
               goto fail;
            break;
         }

         x87_assert_stack_empty(cp.func);
         cp.insn_counter++;

         if (DISASSEM)
            debug_printf("\n");
      }


      /* Drop any xmm registers still caching non-output values; only
       * OUTPUT-resident values survive to the store phase below.
       */
      {
         unsigned i;
         for (i = 0; i < 8; i++) {
            if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
               cp.xmm[i].file = TGSI_FILE_NULL;
               cp.xmm[i].dirty = 0;
            }
         }
      }

      if (cp.error)
         goto fail;

      if (cp.vaos->base.key.clip) {
         /* not really handling clipping, just do the rhw so we can
          * see the results...
          */
         emit_rhw_viewport(&cp);
      }
      else if (cp.vaos->base.key.viewport) {
         emit_viewport(&cp);
      }

      /* Emit output...  TODO: do this eagerly after the last write to a
       * given output.
       */
      if (!aos_emit_outputs( &cp ))
         goto fail;


      /* Next vertex:
       */
      x86_lea(cp.func,
              cp.outbuf_ECX,
              x86_make_disp(cp.outbuf_ECX,
                            cp.vaos->base.key.output_stride));

      /* Incr index
       */
      aos_incr_inputs( &cp, linear );
   }
   /* decr count, loop if not zero
    */
   x86_dec(cp.func, cp.count_ESI);
   x86_jcc(cp.func, cc_NZ, label);

   restore_fpu_state(&cp);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(cp.func, fixup);

   /* Exit mmx state?
    */
   if (cp.func->need_emms)
      mmx_emms(cp.func);

   /* Epilogue: restore callee-save registers and return. */
   x86_pop(cp.func, cp.temp_EBP);
   x86_pop(cp.func, cp.count_ESI);
   x86_pop(cp.func, cp.idx_EBX);

   x87_assert_stack_empty(cp.func);
   x86_ret(cp.func);

   tgsi_parse_free( &parse );
   return !cp.error;

 fail:
   tgsi_parse_free( &parse );
   return FALSE;
}
2084
2085
2086
2087 static void vaos_set_buffer( struct draw_vs_varient *varient,
2088 unsigned buf,
2089 const void *ptr,
2090 unsigned stride )
2091 {
2092 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2093
2094 if (buf < vaos->nr_vb) {
2095 vaos->buffer[buf].base_ptr = (char *)ptr;
2096 vaos->buffer[buf].stride = stride;
2097 }
2098
2099 if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
2100 }
2101
2102
2103
2104 static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
2105 const unsigned *elts,
2106 unsigned count,
2107 void *output_buffer )
2108 {
2109 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2110 struct aos_machine *machine = vaos->draw->vs.aos_machine;
2111
2112 if (0) debug_printf("%s %d\n", __FUNCTION__, count);
2113
2114 machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
2115 machine->constants = vaos->draw->vs.aligned_constants;
2116 machine->immediates = vaos->base.vs->immediates;
2117 machine->buffer = vaos->buffer;
2118
2119 vaos->gen_run_elts( machine,
2120 elts,
2121 count,
2122 output_buffer );
2123 }
2124
2125 static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
2126 unsigned start,
2127 unsigned count,
2128 void *output_buffer )
2129 {
2130 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2131 struct aos_machine *machine = vaos->draw->vs.aos_machine;
2132
2133 if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count,
2134 vaos->base.key.const_vbuffers);
2135
2136 machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
2137 machine->constants = vaos->draw->vs.aligned_constants;
2138 machine->immediates = vaos->base.vs->immediates;
2139 machine->buffer = vaos->buffer;
2140
2141 vaos->gen_run_linear( machine,
2142 start,
2143 count,
2144 output_buffer );
2145
2146 /* Sanity spot checks to make sure we didn't trash our constants */
2147 assert(machine->internal[IMM_ONES][0] == 1.0f);
2148 assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
2149 assert(machine->internal[IMM_NEGS][0] == -1.0f);
2150 }
2151
2152
2153
2154 static void vaos_destroy( struct draw_vs_varient *varient )
2155 {
2156 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2157
2158 FREE( vaos->buffer );
2159
2160 x86_release_func( &vaos->func[0] );
2161 x86_release_func( &vaos->func[1] );
2162
2163 FREE(vaos);
2164 }
2165
2166
2167
2168 static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
2169 const struct draw_vs_varient_key *key )
2170 {
2171 unsigned i;
2172 struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
2173
2174 if (!vaos)
2175 goto fail;
2176
2177 vaos->base.key = *key;
2178 vaos->base.vs = vs;
2179 vaos->base.set_buffer = vaos_set_buffer;
2180 vaos->base.destroy = vaos_destroy;
2181 vaos->base.run_linear = vaos_run_linear;
2182 vaos->base.run_elts = vaos_run_elts;
2183
2184 vaos->draw = vs->draw;
2185
2186 for (i = 0; i < key->nr_inputs; i++)
2187 vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
2188
2189 vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
2190 if (!vaos->buffer)
2191 goto fail;
2192
2193 debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
2194
2195 #if 0
2196 tgsi_dump(vs->state.tokens, 0);
2197 #endif
2198
2199 if (!build_vertex_program( vaos, TRUE ))
2200 goto fail;
2201
2202 if (!build_vertex_program( vaos, FALSE ))
2203 goto fail;
2204
2205 vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
2206 if (!vaos->gen_run_linear)
2207 goto fail;
2208
2209 vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
2210 if (!vaos->gen_run_elts)
2211 goto fail;
2212
2213 return &vaos->base;
2214
2215 fail:
2216 if (vaos && vaos->buffer)
2217 FREE(vaos->buffer);
2218
2219 if (vaos)
2220 x86_release_func( &vaos->func[0] );
2221
2222 if (vaos)
2223 x86_release_func( &vaos->func[1] );
2224
2225 FREE(vaos);
2226
2227 return NULL;
2228 }
2229
2230
struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
                                                 const struct draw_vs_varient_key *key )
{
   /* Public entry point: try the AOS/SSE fast path first, falling back
    * to the generic interpreted varient when code generation fails.
    */
   struct draw_vs_varient *varient = varient_aos_sse( vs, key );

   return varient ? varient : draw_vs_varient_generic( vs, key );
}
2242
2243
2244
2245 #endif /* PIPE_ARCH_X86 */