gallium: simplify tgsi_full_immediate struct
[mesa.git] / src / gallium / auxiliary / draw / draw_vs_aos.c
1 /*
2 * Mesa 3-D graphics library
3 * Version: 6.3
4 *
5 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /**
26 * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
27 * using the rtasm runtime assembler. Based on the old
28 * t_vb_arb_program_sse.c
29 */
30
31
32 #include "util/u_memory.h"
33 #include "util/u_math.h"
34 #include "pipe/p_shader_tokens.h"
35 #include "util/u_debug.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_util.h"
38 #include "tgsi/tgsi_exec.h"
39 #include "tgsi/tgsi_dump.h"
40
41 #include "draw_vs.h"
42 #include "draw_vs_aos.h"
43
44 #include "rtasm/rtasm_x86sse.h"
45
46 #ifdef PIPE_ARCH_X86
47 #define DISASSEM 0
48 #define FAST_MATH 1
49
/* Debug names for register files, indexed by TGSI_FILE_* with
 * AOS_FILE_INTERNAL ("INTERNAL") appended.  Used only by the disabled
 * debug_printf in spill().
 */
static const char *files[] =
{
   "NULL",
   "CONST",
   "IN",
   "OUT",
   "TEMP",
   "SAMP",
   "ADDR",
   "IMM",
   "INTERNAL",
};
62
63 static INLINE boolean eq( struct x86_reg a,
64 struct x86_reg b )
65 {
66 return (a.file == b.file &&
67 a.idx == b.idx &&
68 a.mod == b.mod &&
69 a.disp == b.disp);
70 }
71
/* Return an x86 register (EBP for which_reg==0, EAX for which_reg==1)
 * holding the machine pointer identified by 'value' (X86_IMMEDIATES /
 * X86_CONSTANTS / X86_BUFFERS).  Emits a reload from the aos_machine
 * struct only when the cached contents (cp->x86_reg[]) are stale.
 */
struct x86_reg aos_get_x86( struct aos_compilation *cp,
                            unsigned which_reg, /* quick hack */
                            unsigned value )
{
   struct x86_reg reg;

   if (which_reg == 0)
      reg = cp->temp_EBP;
   else
      reg = cp->tmp_EAX;

   /* Only emit the mov if this register doesn't already hold 'value'. */
   if (cp->x86_reg[which_reg] != value) {
      unsigned offset;

      switch (value) {
      case X86_IMMEDIATES:
         assert(which_reg == 0);
         offset = Offset(struct aos_machine, immediates);
         break;
      case X86_CONSTANTS:
         assert(which_reg == 1);
         offset = Offset(struct aos_machine, constants);
         break;
      case X86_BUFFERS:
         assert(which_reg == 0);
         offset = Offset(struct aos_machine, buffer);
         break;
      default:
         assert(0);
         offset = 0;
      }

      /* Load the pointer from the aos_machine struct (EDX). */
      x86_mov(cp->func, reg,
              x86_make_disp(cp->machine_EDX, offset));

      cp->x86_reg[which_reg] = value;
   }

   return reg;
}
113
114
/* Build a base+displacement memory operand addressing the in-memory
 * copy of shader register (file, idx) inside the aos_machine struct.
 * Immediates and constants live behind separately-loaded pointers
 * (see aos_get_x86); everything else is addressed off machine_EDX.
 */
static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
                                  unsigned file,
                                  unsigned idx )
{
   struct x86_reg ptr = cp->machine_EDX;

   switch (file) {
   case TGSI_FILE_INPUT:
      assert(idx < MAX_INPUTS);
      return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));

   case TGSI_FILE_OUTPUT:
      return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));

   case TGSI_FILE_TEMPORARY:
      assert(idx < MAX_TEMPS);
      return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));

   case AOS_FILE_INTERNAL:
      assert(idx < MAX_INTERNALS);
      return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));

   case TGSI_FILE_IMMEDIATE:
      assert(idx < MAX_IMMEDIATES); /* just a sanity check */
      /* 4 floats per register. */
      return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));

   case TGSI_FILE_CONSTANT:
      assert(idx < MAX_CONSTANTS); /* just a sanity check */
      return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));

   default:
      AOS_ERROR(cp, "unknown reg file");
      return x86_make_reg(0,0);
   }
}
150
151
152
/* Bit layout of the x87 FPU control word: exception masks (bits 0-5),
 * precision control (bits 8-9), rounding control (bits 10-11) and the
 * (legacy) infinity-control bit 12.
 */
#define X87_CW_EXCEPTION_INV_OP       (1<<0)
#define X87_CW_EXCEPTION_DENORM_OP    (1<<1)
#define X87_CW_EXCEPTION_ZERO_DIVIDE  (1<<2)
#define X87_CW_EXCEPTION_OVERFLOW     (1<<3)
#define X87_CW_EXCEPTION_UNDERFLOW    (1<<4)
#define X87_CW_EXCEPTION_PRECISION    (1<<5)
#define X87_CW_PRECISION_SINGLE       (0<<8)
#define X87_CW_PRECISION_RESERVED     (1<<8)
#define X87_CW_PRECISION_DOUBLE       (2<<8)
#define X87_CW_PRECISION_DOUBLE_EXT   (3<<8)
#define X87_CW_PRECISION_MASK         (3<<8)
#define X87_CW_ROUND_NEAREST          (0<<10)
#define X87_CW_ROUND_DOWN             (1<<10)
#define X87_CW_ROUND_UP               (2<<10)
#define X87_CW_ROUND_ZERO             (3<<10)
#define X87_CW_ROUND_MASK             (3<<10)
#define X87_CW_INFINITY               (1<<12)
170
171
172
173
/* Write the dirty contents of xmm register 'idx' back to the in-memory
 * copy of the shader register it caches.  Only INPUT/OUTPUT/TEMPORARY
 * values can legitimately be dirty; anything else is a compiler bug.
 */
static void spill( struct aos_compilation *cp, unsigned idx )
{
   if (!cp->xmm[idx].dirty ||
       (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
        cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
        cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
      AOS_ERROR(cp, "invalid spill");
      return;
   }
   else {
      struct x86_reg oldval = get_reg_ptr(cp,
                                          cp->xmm[idx].file,
                                          cp->xmm[idx].idx);

      /* Debug aid, normally compiled out. */
      if (0) debug_printf("\nspill %s[%d]",
                          files[cp->xmm[idx].file],
                          cp->xmm[idx].idx);

      assert(cp->xmm[idx].dirty);
      sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
      cp->xmm[idx].dirty = 0;
   }
}
197
198
199 void aos_spill_all( struct aos_compilation *cp )
200 {
201 unsigned i;
202
203 for (i = 0; i < 8; i++) {
204 if (cp->xmm[i].dirty)
205 spill(cp, i);
206 aos_release_xmm_reg(cp, i);
207 }
208 }
209
210
/* Return an xmm register holding 'reg' whose contents may be clobbered.
 * Copies into a fresh scratch register when 'reg' is a memory operand
 * or an xmm register that still caches a live shader value.
 */
static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
                                        struct x86_reg reg )
{
   if (reg.file != file_XMM ||
       cp->xmm[reg.idx].file != TGSI_FILE_NULL)
   {
      struct x86_reg tmp = aos_get_xmm_reg(cp);
      sse_movaps(cp->func, tmp, reg);
      reg = tmp;
   }

   /* Keep the returned register from being reallocated this insn. */
   cp->xmm[reg.idx].last_used = cp->insn_counter;
   return reg;
}
225
226 static struct x86_reg get_xmm( struct aos_compilation *cp,
227 struct x86_reg reg )
228 {
229 if (reg.file != file_XMM)
230 {
231 struct x86_reg tmp = aos_get_xmm_reg(cp);
232 sse_movaps(cp->func, tmp, reg);
233 reg = tmp;
234 }
235
236 cp->xmm[reg.idx].last_used = cp->insn_counter;
237 return reg;
238 }
239
240
/* Allocate an empty xmm register, either as a temporary or later to
 * "adopt" as a shader reg.
 *
 * Preference order: a register that is free (TGSI_FILE_NULL) and was
 * not already handed out this instruction; failing that, the least
 * recently used register, which is spilled first if dirty.
 */
struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
{
   unsigned i;
   unsigned oldest = 0;
   boolean found = FALSE;

   /* Scan for a free register not yet touched this instruction. */
   for (i = 0; i < 8; i++)
      if (cp->xmm[i].last_used != cp->insn_counter &&
          cp->xmm[i].file == TGSI_FILE_NULL) {
         oldest = i;
         found = TRUE;
      }

   /* None free: evict the least-recently-used one instead. */
   if (!found) {
      for (i = 0; i < 8; i++)
         if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
            oldest = i;
   }

   /* Need to write out the old value?
    */
   if (cp->xmm[oldest].dirty)
      spill(cp, oldest);

   assert(cp->xmm[oldest].last_used != cp->insn_counter);

   cp->xmm[oldest].file = TGSI_FILE_NULL;
   cp->xmm[oldest].idx = 0;
   cp->xmm[oldest].dirty = 0;
   cp->xmm[oldest].last_used = cp->insn_counter;
   return x86_make_reg(file_XMM, oldest);
}
276
277 void aos_release_xmm_reg( struct aos_compilation *cp,
278 unsigned idx )
279 {
280 cp->xmm[idx].file = TGSI_FILE_NULL;
281 cp->xmm[idx].idx = 0;
282 cp->xmm[idx].dirty = 0;
283 cp->xmm[idx].last_used = 0;
284 }
285
286
287 static void aos_soft_release_xmm( struct aos_compilation *cp,
288 struct x86_reg reg )
289 {
290 if (reg.file == file_XMM) {
291 assert(cp->xmm[reg.idx].last_used == cp->insn_counter);
292 cp->xmm[reg.idx].last_used = cp->insn_counter - 1;
293 }
294 }
295
296
297
/* Mark an xmm reg as holding the current copy of a shader reg.
 *
 * 'dirty' indicates the xmm copy is newer than the in-memory copy and
 * must eventually be spilled.  Any previous xmm cache of the same
 * shader reg is invalidated, with its dirty flag folded in.
 */
void aos_adopt_xmm_reg( struct aos_compilation *cp,
                        struct x86_reg reg,
                        unsigned file,
                        unsigned idx,
                        unsigned dirty )
{
   unsigned i;

   if (reg.file != file_XMM) {
      assert(0);
      return;
   }


   /* If any xmm reg thinks it holds this shader reg, break the
    * illusion.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file &&
          cp->xmm[i].idx == idx)
      {
         /* If an xmm reg is already holding this shader reg, take into account its
          * dirty flag...
          */
         dirty |= cp->xmm[i].dirty;
         aos_release_xmm_reg(cp, i);
      }
   }

   cp->xmm[reg.idx].file = file;
   cp->xmm[reg.idx].idx = idx;
   cp->xmm[reg.idx].dirty = dirty;
   cp->xmm[reg.idx].last_used = cp->insn_counter;
}
334
335
336 /* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
337 */
338 static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
339 unsigned file,
340 unsigned idx )
341 {
342 unsigned i;
343
344 /* Ensure the in-memory copy of this reg is up-to-date
345 */
346 for (i = 0; i < 8; i++) {
347 if (cp->xmm[i].file == file &&
348 cp->xmm[i].idx == idx &&
349 cp->xmm[i].dirty) {
350 spill(cp, i);
351 }
352 }
353
354 return get_reg_ptr( cp, file, idx );
355 }
356
357
/* As above, but return a pointer. Note - this pointer may alias
 * those returned by get_arg_ptr().
 *
 * Because the memory is about to be written through the returned
 * pointer, all xmm caches of the destination are invalidated (not
 * just spilled).
 */
static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
                                   const struct tgsi_full_dst_register *dst )
{
   unsigned file = dst->DstRegister.File;
   unsigned idx = dst->DstRegister.Index;
   unsigned i;


   /* Ensure in-memory copy of this reg is up-to-date and invalidate
    * any xmm copies.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file &&
          cp->xmm[i].idx == idx)
      {
         if (cp->xmm[i].dirty)
            spill(cp, i);

         aos_release_xmm_reg(cp, i);
      }
   }

   return get_reg_ptr( cp, file, idx );
}
385
386
387
388
389
390 /* Return an XMM reg if the argument is resident, otherwise return a
391 * base+offset pointer to the saved value.
392 */
393 struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
394 unsigned file,
395 unsigned idx )
396 {
397 unsigned i;
398
399 for (i = 0; i < 8; i++) {
400 if (cp->xmm[i].file == file &&
401 cp->xmm[i].idx == idx)
402 {
403 cp->xmm[i].last_used = cp->insn_counter;
404 return x86_make_reg(file_XMM, i);
405 }
406 }
407
408 /* If not found in the XMM register file, return an indirect
409 * reference to the in-memory copy:
410 */
411 return get_reg_ptr( cp, file, idx );
412 }
413
414
415
416 static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,
417 unsigned file,
418 unsigned idx )
419 {
420 struct x86_reg reg = get_xmm( cp,
421 aos_get_shader_reg( cp, file, idx ) );
422
423 aos_adopt_xmm_reg( cp,
424 reg,
425 file,
426 idx,
427 FALSE );
428
429 return reg;
430 }
431
432
433
/* Fetch internal constant 'imm' (IMM_* index) into an xmm register. */
struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
                                     unsigned imm )
{
   return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
}
439
440
/* Locate internal constant 'imm' — either a resident xmm register or
 * a pointer to its in-memory copy.
 */
struct x86_reg aos_get_internal( struct aos_compilation *cp,
                                 unsigned imm )
{
   return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
}
446
447
448
449
450
/* Emulate pshufd insn in regular SSE, if necessary:
 *
 * With SSE2 this is a single pshufd; otherwise fall back to movaps
 * (when dst != arg0) followed by an in-place shufps.
 */
static void emit_pshufd( struct aos_compilation *cp,
                         struct x86_reg dst,
                         struct x86_reg arg0,
                         ubyte shuf )
{
   if (cp->have_sse2) {
      sse2_pshufd(cp->func, dst, arg0, shuf);
   }
   else {
      if (!eq(dst, arg0))
         sse_movaps(cp->func, dst, arg0);

      sse_shufps(cp->func, dst, dst, shuf);
   }
}
468
/* Merge 'result' into 'dst' under an arbitrary 4-bit writemask using a
 * bitwise select built from the internal IMM_SWZ constant:
 *
 * load masks (pack into negs??)
 * pshufd - shuffle according to writemask
 * and - result, mask
 * nand - dest, mask
 * or - dest, result
 */
static boolean mask_write( struct aos_compilation *cp,
                           struct x86_reg dst,
                           struct x86_reg result,
                           unsigned mask )
{
   struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   /* Broadcast an all-ones / all-zeroes lane per writemask bit. */
   emit_pshufd(cp, tmp, imm_swz,
               SHUF((mask & 1) ? 2 : 3,
                    (mask & 2) ? 2 : 3,
                    (mask & 4) ? 2 : 3,
                    (mask & 8) ? 2 : 3));

   sse_andps(cp->func, dst, tmp);
   sse_andnps(cp->func, tmp, result);
   sse_orps(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   return TRUE;
}
496
497
498
499
/* Helper for writemask:
 *
 * Combine channels of arg0 and arg1 into dst according to 'shuf' —
 * shuffle both inputs, splice them with shufps, then undo the shuffle
 * (the SHUF patterns used by callers are self-inverse swaps).
 */
static boolean emit_shuf_copy2( struct aos_compilation *cp,
                                struct x86_reg dst,
                                struct x86_reg arg0,
                                struct x86_reg arg1,
                                ubyte shuf )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   emit_pshufd(cp, dst, arg1, shuf);
   emit_pshufd(cp, tmp, arg0, shuf);
   /* Low half from dst (arg1's channels), high half from tmp (arg0's). */
   sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
   emit_pshufd(cp, dst, dst, shuf);

   aos_release_xmm_reg(cp, tmp.idx);
   return TRUE;
}
518
519
520
/* Identity swizzle select: x,y,z,w in place. */
#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))


/* Locate a source register and perform any required (simple) swizzle.
 *
 * Just fail on complex swizzles at this point.
 */
static struct x86_reg fetch_src( struct aos_compilation *cp,
                                 const struct tgsi_full_src_register *src )
{
   struct x86_reg arg0 = aos_get_shader_reg(cp,
                                            src->SrcRegister.File,
                                            src->SrcRegister.Index);
   unsigned i;
   ubyte swz = 0;      /* packed 2-bit swizzle selects */
   unsigned negs = 0;  /* channels to negate */
   unsigned abs = 0;   /* channels to take the absolute value of */

   /* Gather per-channel swizzle and sign info into bitmasks so the
    * whole register can be fixed up in one pass below.
    */
   for (i = 0; i < 4; i++) {
      unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i );
      unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );

      switch (swizzle) {
      case TGSI_EXTSWIZZLE_ZERO:
      case TGSI_EXTSWIZZLE_ONE:
         AOS_ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2");
         break;

      default:
         swz |= (swizzle & 0x3) << (i * 2);
         break;
      }

      switch (neg) {
      case TGSI_UTIL_SIGN_TOGGLE:
         negs |= (1<<i);
         break;

      case TGSI_UTIL_SIGN_KEEP:
         break;

      case TGSI_UTIL_SIGN_CLEAR:
         abs |= (1<<i);
         break;

      default:
         AOS_ERROR(cp, "unsupported sign-mode");
         break;
      }
   }

   /* Any modifier present?  Materialize the modified value in a fresh
    * xmm register; otherwise return arg0 untouched.
    */
   if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
      struct x86_reg dst = aos_get_xmm_reg(cp);

      if (swz != SSE_SWIZZLE_NOOP)
         emit_pshufd(cp, dst, arg0, swz);
      else
         sse_movaps(cp->func, dst, arg0);

      /* Partial negate: build a per-channel {1,-1} multiplier. */
      if (negs && negs != 0xf) {
         struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
         struct x86_reg tmp = aos_get_xmm_reg(cp);

         /* Load 1,-1,0,0
          * Use neg as arg to pshufd
          * Multiply
          */
         emit_pshufd(cp, tmp, imm_swz,
                     SHUF((negs & 1) ? 1 : 0,
                          (negs & 2) ? 1 : 0,
                          (negs & 4) ? 1 : 0,
                          (negs & 8) ? 1 : 0));
         sse_mulps(cp->func, dst, tmp);

         aos_release_xmm_reg(cp, tmp.idx);
         aos_soft_release_xmm(cp, imm_swz);
      }
      else if (negs) {
         /* Full negate: multiply all channels by -1. */
         struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
         sse_mulps(cp->func, dst, imm_negs);
         aos_soft_release_xmm(cp, imm_negs);
      }


      /* abs(x) computed as max(x, -x); only all-channel abs supported. */
      if (abs && abs != 0xf) {
         AOS_ERROR(cp, "unsupported partial abs");
      }
      else if (abs) {
         struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
         struct x86_reg tmp = aos_get_xmm_reg(cp);

         sse_movaps(cp->func, tmp, dst);
         sse_mulps(cp->func, tmp, neg);
         sse_maxps(cp->func, dst, tmp);

         aos_release_xmm_reg(cp, tmp.idx);
         aos_soft_release_xmm(cp, neg);
      }

      aos_soft_release_xmm(cp, arg0);
      return dst;
   }

   return arg0;
}
626
/* Push one channel of a TGSI source operand onto the x87 stack,
 * applying its extended swizzle (ZERO/ONE handled via fldz/fld1) and
 * sign mode.  Reads from the up-to-date in-memory copy of the reg.
 */
static void x87_fld_src( struct aos_compilation *cp,
                         const struct tgsi_full_src_register *src,
                         unsigned channel )
{
   struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
                                                src->SrcRegister.File,
                                                src->SrcRegister.Index);

   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel );
   unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_ZERO:
      x87_fldz( cp->func );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      x87_fld1( cp->func );
      break;

   default:
      /* Load the selected float from memory. */
      x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
      break;
   }


   switch (neg) {
   case TGSI_UTIL_SIGN_TOGGLE:
      /* Flip the sign:
       */
      x87_fchs( cp->func );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;

   case TGSI_UTIL_SIGN_CLEAR:
      x87_fabs( cp->func );
      break;

   case TGSI_UTIL_SIGN_SET:
      /* Force negative: -|x| */
      x87_fabs( cp->func );
      x87_fchs( cp->func );
      break;

   default:
      AOS_ERROR(cp, "unsupported sign-mode");
      break;
   }
}
677
678
679
680
681
682
/* Used to implement write masking. This and most of the other instructions
 * here would be easier to implement if there had been a translation
 * to a 2 argument format (dst/arg0, arg1) at the shader level before
 * attempting to translate to x86/sse code.
 *
 * Merges 'result' into the destination register under its writemask
 * and records the merged value as the (dirty) xmm copy of the dest.
 */
static void store_dest( struct aos_compilation *cp,
                        const struct tgsi_full_dst_register *reg,
                        struct x86_reg result )
{
   struct x86_reg dst;

   switch (reg->DstRegister.WriteMask) {
   case 0:
      /* Nothing enabled — no-op. */
      return;

   case TGSI_WRITEMASK_XYZW:
      /* Full write: adopt 'result' directly as the new dirty copy. */
      aos_adopt_xmm_reg(cp,
                        get_xmm_writable(cp, result),
                        reg->DstRegister.File,
                        reg->DstRegister.Index,
                        TRUE);
      return;
   default:
      break;
   }

   /* Partial write: fetch the current destination value to merge into. */
   dst = aos_get_shader_reg_xmm(cp,
                                reg->DstRegister.File,
                                reg->DstRegister.Index);

   switch (reg->DstRegister.WriteMask) {
   case TGSI_WRITEMASK_X:
      sse_movss(cp->func, dst, get_xmm(cp, result));
      break;

   case TGSI_WRITEMASK_ZW:
      /* Keep dst's low two channels, take result's high two. */
      sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
      break;

   case TGSI_WRITEMASK_XY:
      /* Keep result's low two channels, take dst's high two. */
      result = get_xmm_writable(cp, result);
      sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
      dst = result;
      break;

   case TGSI_WRITEMASK_YZW:
      /* Copy dst's x channel into result. */
      result = get_xmm_writable(cp, result);
      sse_movss(cp->func, result, dst);
      dst = result;
      break;

   default:
      /* General masks: bitwise select. */
      mask_write(cp, dst, result, reg->DstRegister.WriteMask);
      break;
   }

   aos_adopt_xmm_reg(cp,
                     dst,
                     reg->DstRegister.File,
                     reg->DstRegister.Index,
                     TRUE);

}
746
/* Replace a single channel of 'dst' with the x channel of 'result':
 * rotate the target channel into position x, movss, rotate back.
 * 'swizzle' must be a self-inverse shuffle (a swap with channel x).
 */
static void inject_scalar( struct aos_compilation *cp,
                           struct x86_reg dst,
                           struct x86_reg result,
                           ubyte swizzle )
{
   sse_shufps(cp->func, dst, dst, swizzle);
   sse_movss(cp->func, dst, result);
   sse_shufps(cp->func, dst, dst, swizzle);
}
756
757
/* Store a scalar result (in the x channel of 'result') to the
 * destination.  Single-channel masks are injected directly; any wider
 * mask broadcasts x to all channels and defers to store_dest().
 */
static void store_scalar_dest( struct aos_compilation *cp,
                               const struct tgsi_full_dst_register *reg,
                               struct x86_reg result )
{
   unsigned writemask = reg->DstRegister.WriteMask;
   struct x86_reg dst;

   if (writemask != TGSI_WRITEMASK_X &&
       writemask != TGSI_WRITEMASK_Y &&
       writemask != TGSI_WRITEMASK_Z &&
       writemask != TGSI_WRITEMASK_W &&
       writemask != 0)
   {
      /* Multi-channel mask: replicate x and use the general path. */
      result = get_xmm_writable(cp, result); /* already true, right? */
      sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
      store_dest(cp, reg, result);
      return;
   }

   result = get_xmm(cp, result);
   dst = aos_get_shader_reg_xmm(cp,
                                reg->DstRegister.File,
                                reg->DstRegister.Index);



   switch (reg->DstRegister.WriteMask) {
   case TGSI_WRITEMASK_X:
      sse_movss(cp->func, dst, result);
      break;

   case TGSI_WRITEMASK_Y:
      inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
      break;

   case TGSI_WRITEMASK_Z:
      inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
      break;

   case TGSI_WRITEMASK_W:
      inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
      break;

   default:
      /* writemask == 0: nothing to store. */
      break;
   }

   aos_adopt_xmm_reg(cp,
                     dst,
                     reg->DstRegister.File,
                     reg->DstRegister.Index,
                     TRUE);
}
811
812
813
814 static void x87_fst_or_nop( struct x86_function *func,
815 unsigned writemask,
816 unsigned channel,
817 struct x86_reg ptr )
818 {
819 assert(ptr.file == file_REG32);
820 if (writemask & (1<<channel))
821 x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
822 }
823
824 static void x87_fstp_or_pop( struct x86_function *func,
825 unsigned writemask,
826 unsigned channel,
827 struct x86_reg ptr )
828 {
829 assert(ptr.file == file_REG32);
830 if (writemask & (1<<channel))
831 x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
832 else
833 x87_fstp( func, x86_make_reg( file_x87, 0 ));
834 }
835
836
837
/* Replicate st(0) across every enabled channel of the destination:
 * plain fst for channels 0-2, then fstp for channel 3 so the value is
 * popped exactly once.  Used for scalar x87 results (COS, LG2, ...).
 */
static void x87_fstp_dest4( struct aos_compilation *cp,
                            const struct tgsi_full_dst_register *dst )
{
   struct x86_reg ptr = get_dst_ptr(cp, dst);
   unsigned writemask = dst->DstRegister.WriteMask;

   x87_fst_or_nop(cp->func, writemask, 0, ptr);
   x87_fst_or_nop(cp->func, writemask, 1, ptr);
   x87_fst_or_nop(cp->func, writemask, 2, ptr);
   x87_fstp_or_pop(cp->func, writemask, 3, ptr);
}
851
/* Save current x87 state and put it into single precision mode.
 *
 * NOTE(review): the code only stores the current control word into
 * machine->fpu_restore (for restore_fpu_state); the actual mode
 * switching is done by the set_fpu_round_* helpers below — confirm
 * the comment above against callers.
 */
static void save_fpu_state( struct aos_compilation *cp )
{
   x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,
                                       Offset(struct aos_machine, fpu_restore)));
}
859
/* Clear pending x87 exceptions and reload the control word saved by
 * save_fpu_state().
 */
static void restore_fpu_state( struct aos_compilation *cp )
{
   x87_fnclex(cp->func);
   x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
                                      Offset(struct aos_machine, fpu_restore)));
}
866
/* Switch the x87 rounding mode to round-toward-negative-infinity,
 * skipping the (expensive) fldcw when already in that mode
 * (tracked in cp->fpucntl).
 */
static void set_fpu_round_neg_inf( struct aos_compilation *cp )
{
   if (cp->fpucntl != FPU_RND_NEG) {
      cp->fpucntl = FPU_RND_NEG;
      x87_fnclex(cp->func);
      x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
                                         Offset(struct aos_machine, fpu_rnd_neg_inf)));
   }
}
876
/* Switch the x87 rounding mode to round-to-nearest, skipping the
 * fldcw when already in that mode (tracked in cp->fpucntl).
 */
static void set_fpu_round_nearest( struct aos_compilation *cp )
{
   if (cp->fpucntl != FPU_RND_NEAREST) {
      cp->fpucntl = FPU_RND_NEAREST;
      x87_fnclex(cp->func);
      x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
                                         Offset(struct aos_machine, fpu_rnd_nearest)));
   }
}
886
#if 0
/* Disabled: emit x87 code computing 2^st(0) via the identity
 * 2^a = 2^frac(a) * 2^int(a), using f2xm1 (valid only on [-1,1])
 * and fscale.
 */
static void x87_emit_ex2( struct aos_compilation *cp )
{
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   int stack = cp->func->x87_stack;

//   set_fpu_round_neg_inf( cp );

   x87_fld(cp->func, st0); /* a a */
   x87_fprndint( cp->func ); /* int(a) a*/
   x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */
   x87_fxch(cp->func, st1); /* frc(a) int(a) */
   x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */
   x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */
   x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */
   x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */
   /* 2^a int(a) */
   x87_fstp(cp->func, st1); /* 2^a */

   /* Net stack effect must be zero. */
   assert( stack == cp->func->x87_stack);

}
#endif
911
#if 0
/* Disabled debug helper: print a 4-float register, called from
 * generated code (see emit_print below).
 */
static void PIPE_CDECL print_reg( const char *msg,
                                  const float *reg )
{
   debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
}
#endif
919
#if 0
/* Disabled debug helper: emit a cdecl call to print_reg() that dumps
 * shader register (file, idx) at runtime.
 */
static void emit_print( struct aos_compilation *cp,
                        const char *message, /* must point to a static string! */
                        unsigned file,
                        unsigned idx )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
   unsigned i;

   /* There shouldn't be anything on the x87 stack. Can add this
    * capacity later if need be.
    */
   assert(cp->func->x87_stack == 0);

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too. We're obviously not concerned about performance on this
    * debug path, so here goes:
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);

      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );


   /* Push the arguments:
    */
   x86_lea( cp->func, ecx, arg );
   x86_push( cp->func, ecx );
   x86_push_imm32( cp->func, (int)message );

   /* Call the helper. Could call debug_printf directly, but
    * print_reg is a nice place to put a breakpoint if need be.
    */
   x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
   x86_call( cp->func, ecx );
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );

   /* Pop caller-save regs
    */
   x86_cdecl_caller_pop_regs( cp->func );

   /* Done...
    */
}
#endif
973
974 /**
975 * The traditional instructions. All operate on internal registers
976 * and ignore write masks and swizzling issues.
977 */
978
/* ABS: per-channel absolute value, computed as max(x, -x) using the
 * internal all -1.0 constant.
 */
static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   sse_movaps(cp->func, tmp, arg0);
   sse_mulps(cp->func, tmp, neg);
   sse_maxps(cp->func, tmp, arg0);

   store_dest(cp, &op->FullDstRegisters[0], tmp);
   return TRUE;
}
992
/* ADD: dst = arg0 + arg1, per channel. */
static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_addps(cp->func, dst, arg1);

   store_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
1004
/* COS: scalar cosine of source channel x via x87 fcos, replicated to
 * all enabled destination channels.
 */
static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
   x87_fcos(cp->func);
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
   return TRUE;
}
1012
/* The dotproduct instructions don't really do that well in sse:
 * XXX: produces wrong results -- disabled.
 *
 * DP3: three-component dot product; multiply, then sum x, z and y.
 */
static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);
   /* Now the hard bit: sum the first 3 values:
    */
   sse_movhlps(cp->func, tmp, dst);
   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
1035
/* DP4: four-component dot product; multiply, fold high half onto low
 * half, then add the remaining pair.
 */
static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);

   /* Now the hard bit: sum the values:
    */
   sse_movhlps(cp->func, tmp, dst);
   sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
1056
/* DPH: homogeneous dot product — DP3(arg0, arg1) + arg1.w. */
static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);

   /* Now the hard bit: sum the values (from DP3):
    */
   sse_movhlps(cp->func, tmp, dst);
   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(cp->func, dst, tmp);
   /* ...then add arg1's w channel. */
   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
1079
/* DST: distance vector — dst = (1, arg0.y*arg1.y, arg0.z, arg1.w),
 * built by splicing each source with the all-ones constant and
 * multiplying the two results.
 */
static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg dst = aos_get_xmm_reg(cp);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg ones = aos_get_internal(cp, IMM_ONES);

/*    dst[0] = 1.0 * 1.0F; */
/*    dst[1] = arg0[1] * arg1[1]; */
/*    dst[2] = arg0[2] * 1.0; */
/*    dst[3] = 1.0 * arg1[3]; */

   emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
   emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
   sse_mulps(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
1101
/* LG2: scalar log2 via x87 fyl2x with y == 1, replicated across the
 * destination writemask.
 */
static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld1(cp->func); /* 1 */
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 1 */
   x87_fyl2x(cp->func); /* log2(a0) */
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
   return TRUE;
}
1110
#if 0
/* Disabled: EX2 (2^x) via the x87_emit_ex2 helper above. */
static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
   x87_emit_ex2(cp);
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
   return TRUE;
}
#endif
1120
1121
/* FLR: per-channel floor, done on the x87 with rounding mode set to
 * round-toward-negative-infinity and fprndint.
 */
static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   int i;

   set_fpu_round_neg_inf( cp );

   /* Load all sources first to avoid aliasing
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->FullSrcRegisters[0], i);
      }
   }

   /* Channels were pushed w..x, so they pop back off x..w. */
   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fprndint( cp->func );
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1147
1148
/* RND: per-channel round, identical to FLR above except the x87
 * rounding mode is round-to-nearest.
 */
static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   int i;

   set_fpu_round_nearest( cp );

   /* Load all sources first to avoid aliasing
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->FullSrcRegisters[0], i);
      }
   }

   /* Channels were pushed w..x, so they pop back off x..w. */
   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fprndint( cp->func );
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1174
1175
/* FRC: per-channel fractional part, frc(a) = a - floor(a), computed on
 * the x87 with round-toward-negative-infinity mode.
 */
static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   int i;

   set_fpu_round_neg_inf( cp );

   /* suck all the source values onto the stack before writing out any
    * dst, which may alias...
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->FullSrcRegisters[0], i);
      }
   }

   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fld(cp->func, st0); /* a a */
         x87_fprndint( cp->func ); /* flr(a) a */
         x87_fsubp(cp->func, st1); /* frc(a) */
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1206
1207
1208
1209
1210
1211
/* LIT: implemented by emitting a cdecl call out to a helper routine
 * (aos_do_lit, or a cached per-instance function pointer from the
 * machine's lit_info table when lit_count < MAX_LIT_INFO).
 *
 * Because of the call, all live XMM registers must be spilled first,
 * and caller-save integer registers are pushed around the call.
 */
static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   unsigned lit_count = cp->lit_count++;
   struct x86_reg result, arg0;
   unsigned i;

#if 1
   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }
#endif

   /* With a partial writemask, compute into a scratch slot and merge
    * into the real destination afterwards (see store_dest below).
    */
   if (writemask != TGSI_WRITEMASK_XYZW)
      result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
   else
      result = get_dst_ptr(cp, &op->FullDstRegisters[0]);


   /* The helper takes a memory operand; copy an XMM-resident source
    * out to a scratch slot so we can take its address.
    */
   arg0 = fetch_src( cp, &op->FullSrcRegisters[0] );
   if (arg0.file == file_XMM) {
      struct x86_reg tmp = x86_make_disp(cp->machine_EDX,
                                         Offset(struct aos_machine, tmp[1]));
      sse_movaps( cp->func, tmp, arg0 );
      arg0 = tmp;
   }



   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Push the arguments (cdecl, right to left):
    *   (machine, result_ptr, arg0_ptr, lit_count)
    */
   x86_push_imm32( cp->func, lit_count );

   x86_lea( cp->func, ecx, arg0 );
   x86_push( cp->func, ecx );

   x86_lea( cp->func, ecx, result );
   x86_push( cp->func, ecx );

   x86_push( cp->func, cp->machine_EDX );

   /* Load the function pointer: per-instance cached pointer from the
    * lit_info table while slots last, otherwise the generic helper.
    */
   if (lit_count < MAX_LIT_INFO) {
      x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,
                                             Offset(struct aos_machine, lit_info) +
                                             lit_count * sizeof(struct lit_info) +
                                             Offset(struct lit_info, func)));
   }
   else {
      x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
   }

   x86_call( cp->func, ecx );

   /* Pop the four pushed arguments back off the stack.
    * (fixme: an add to ESP would do the same in one instruction.)
    */
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Partial writemask: merge the scratch result into the real dst. */
   if (writemask != TGSI_WRITEMASK_XYZW) {
      store_dest( cp,
                  &op->FullDstRegisters[0],
                  get_xmm_writable( cp, result ) );
   }

   return TRUE;
}
1290
#if 0
/* Inline x87 implementation of LIT, currently compiled out in favour
 * of the out-of-line emit_LIT above.  Stack-state comments on each
 * line show ST(0) leftmost.
 */
static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;

   if (writemask & TGSI_WRITEMASK_YZ) {
      struct x86_reg st1 = x86_make_reg(file_x87, 1);
      struct x86_reg st2 = x86_make_reg(file_x87, 2);

      /* a1' = a1 <= 0 ? 1 : a1;
       */
      x87_fldz(cp->func);                           /* 0 */
#if 1
      x87_fld1(cp->func);                           /* 1 0 */
#else
      /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
       */
      x87_fldz(cp->func);                           /* 0 0 */
#endif
      x87_fld_src(cp, &op->FullSrcRegisters[0], 1); /* a1 1 0 */
      x87_fcomi(cp->func, st2);                     /* a1 1 0 */
      x87_fcmovb(cp->func, st1);                    /* a1' 1 0 */
      x87_fstp(cp->func, st1);                      /* a1' 0 */
      x87_fstp(cp->func, st1);                      /* a1' */

      x87_fld_src(cp, &op->FullSrcRegisters[0], 3); /* a3 a1' */
      x87_fxch(cp->func, st1);                      /* a1' a3 */


      /* Compute pow(a1, a3)
       */
      x87_fyl2x(cp->func);                          /* a3*log2(a1) */
      x87_emit_ex2( cp );                           /* 2^(a3*log2(a1)) */


      /* a0' = max2(a0, 0):
       */
      x87_fldz(cp->func);                           /* 0 r2 */
      x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 0 r2 */
      x87_fcomi(cp->func, st1);
      x87_fcmovb(cp->func, st1);                    /* a0' 0 r2 */

      x87_fst_or_nop(cp->func, writemask, 1, dst);  /* result[1] = a0' */

      x87_fcomi(cp->func, st1);                     /* a0' 0 r2 */
      x87_fcmovnbe(cp->func, st2);                  /* r2' 0' r2 */

      x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */
      x87_fpop(cp->func);                           /* r2 */
      x87_fpop(cp->func);
   }

   /* result[0] and result[3] are both the constant 1.0: */
   if (writemask & TGSI_WRITEMASK_XW) {
      x87_fld1(cp->func);
      x87_fst_or_nop(cp->func, writemask, 0, dst);
      x87_fstp_or_pop(cp->func, writemask, 3, dst);
   }

   return TRUE;
}
#endif
1353
1354
1355
1356 static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1357 {
1358 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1359 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1360 struct x86_reg dst = get_xmm_writable(cp, arg0);
1361
1362 sse_maxps(cp->func, dst, arg1);
1363
1364 store_dest(cp, &op->FullDstRegisters[0], dst);
1365 return TRUE;
1366 }
1367
1368
1369 static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1370 {
1371 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1372 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1373 struct x86_reg dst = get_xmm_writable(cp, arg0);
1374
1375 sse_minps(cp->func, dst, arg1);
1376
1377 store_dest(cp, &op->FullDstRegisters[0], dst);
1378 return TRUE;
1379 }
1380
1381 static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1382 {
1383 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1384 struct x86_reg dst = get_xmm_writable(cp, arg0);
1385
1386 /* potentially nothing to do */
1387
1388 store_dest(cp, &op->FullDstRegisters[0], dst);
1389 return TRUE;
1390 }
1391
1392 static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1393 {
1394 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1395 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1396 struct x86_reg dst = get_xmm_writable(cp, arg0);
1397
1398 sse_mulps(cp->func, dst, arg1);
1399
1400 store_dest(cp, &op->FullDstRegisters[0], dst);
1401 return TRUE;
1402 }
1403
1404
1405 static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1406 {
1407 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1408 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1409 struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]);
1410
1411 /* If we can't clobber old contents of arg0, get a temporary & copy
1412 * it there, then clobber it...
1413 */
1414 arg0 = get_xmm_writable(cp, arg0);
1415
1416 sse_mulps(cp->func, arg0, arg1);
1417 sse_addps(cp->func, arg0, arg2);
1418 store_dest(cp, &op->FullDstRegisters[0], arg0);
1419 return TRUE;
1420 }
1421
1422
1423
/* A wrapper for powf().
 * Makes sure it is cdecl and operates on floats, so it can be called
 * directly from the generated code (see emit_POW).
 */
static float PIPE_CDECL _powerf( float x, float y )
{
#if FAST_MATH
   /* Approximate pow -- trades accuracy for speed. */
   return util_fast_pow(x, y);
#else
   return powf( x, y );
#endif
}
1435
#if FAST_MATH
/* cdecl float wrapper around the approximate exp2, callable from the
 * generated code (see emit_EXPBASE2).
 */
static float PIPE_CDECL _exp2(float x)
{
   return util_fast_exp2(x);
}
#endif
1442
1443
/* Really not sufficient -- need to check for conditions that could
 * generate inf/nan values, which will slow things down hugely.
 *
 * POW: emitted as a cdecl call out to _powerf(x, y).  All live XMM
 * registers are spilled first, the two scalar arguments are written
 * to freshly reserved stack slots, and the float return value comes
 * back on the x87 stack.
 */
static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
#if 0
   /* Inline x87 alternative: pow(a0, a1) = 2^(a1 * log2(a0)). */
   x87_fld_src(cp, &op->FullSrcRegisters[1], 0);  /* a1.x */
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);  /* a0.x a1.x */
   x87_fyl2x(cp->func);                           /* a1*log2(a0) */

   x87_emit_ex2( cp );                            /* 2^(a1*log2(a0)) */

   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
#else
   uint i;

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Reserve 8 bytes of stack for the two float arguments. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );

   /* Store y at [esp+4] and x at [esp+0] (cdecl argument layout). */
   x87_fld_src( cp, &op->FullSrcRegisters[1], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
   x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* tmp_EAX has been pushed & will be restored below */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
   x86_call( cp->func, cp->tmp_EAX );

   /* Release the argument stack space. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Note retval on x87 stack:
    */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->FullDstRegisters[0] );
#endif
   return TRUE;
}
1496
1497
#if FAST_MATH
/* EX2: emitted as a cdecl call out to _exp2(x), following the same
 * pattern as emit_POW: spill all XMM regs, push caller-save regs,
 * pass the scalar argument on the stack, retval returns on the x87
 * stack.
 */
static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   uint i;

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Reserve 4 bytes of stack for the single float argument. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) );

   x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* tmp_EAX has been pushed & will be restored below */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
   x86_call( cp->func, cp->tmp_EAX );

   /* Release the argument stack space. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Note retval on x87 stack:
    */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->FullDstRegisters[0] );

   return TRUE;
}
#endif
1538
1539
1540 static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1541 {
1542 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1543 struct x86_reg dst = aos_get_xmm_reg(cp);
1544
1545 if (cp->have_sse2) {
1546 sse2_rcpss(cp->func, dst, arg0);
1547 /* extend precision here...
1548 */
1549 }
1550 else {
1551 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1552 sse_movss(cp->func, dst, ones);
1553 sse_divss(cp->func, dst, arg0);
1554 }
1555
1556 store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
1557 return TRUE;
1558 }
1559
1560
1561 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1562 * implementations, it is possible to improve its precision at
1563 * fairly low cost, using a newton/raphson step, as below:
1564 *
1565 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1566 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1567 * or:
1568 * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
1569 *
1570 *
1571 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1572 */
1573 static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1574 {
1575 if (0) {
1576 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1577 struct x86_reg r = aos_get_xmm_reg(cp);
1578 sse_rsqrtss(cp->func, r, arg0);
1579 store_scalar_dest(cp, &op->FullDstRegisters[0], r);
1580 return TRUE;
1581 }
1582 else {
1583 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1584 struct x86_reg r = aos_get_xmm_reg(cp);
1585
1586 struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
1587 struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
1588 struct x86_reg src = get_xmm_writable( cp, arg0 );
1589 struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
1590 struct x86_reg tmp = aos_get_xmm_reg(cp);
1591
1592 sse_movaps(cp->func, tmp, src);
1593 sse_mulps(cp->func, tmp, neg);
1594 sse_maxps(cp->func, tmp, src);
1595
1596 sse_rsqrtss( cp->func, r, tmp ); /* rsqrtss(a) */
1597 sse_mulss( cp->func, tmp, neg_half ); /* -.5 * a */
1598 sse_mulss( cp->func, tmp, r ); /* -.5 * a * r */
1599 sse_mulss( cp->func, tmp, r ); /* -.5 * a * r * r */
1600 sse_addss( cp->func, tmp, one_point_five ); /* 1.5 - .5 * a * r * r */
1601 sse_mulss( cp->func, r, tmp ); /* r * (1.5 - .5 * a * r * r) */
1602
1603 store_scalar_dest(cp, &op->FullDstRegisters[0], r);
1604
1605 aos_release_xmm_reg(cp, tmp.idx);
1606
1607 return TRUE;
1608 }
1609 }
1610
1611
1612 static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1613 {
1614 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1615 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1616 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1617 struct x86_reg dst = get_xmm_writable(cp, arg0);
1618
1619 sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
1620 sse_andps(cp->func, dst, ones);
1621
1622 store_dest(cp, &op->FullDstRegisters[0], dst);
1623 return TRUE;
1624 }
1625
/* SIN: evaluate sin(src0.x) on the x87 stack and store via
 * x87_fstp_dest4 (presumably replicating the scalar result to the
 * written channels, as the other scalar x87 ops do -- TODO confirm).
 */
static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
   x87_fsin(cp->func);
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
   return TRUE;
}
1633
1634
1635
1636 static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1637 {
1638 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1639 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1640 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1641 struct x86_reg dst = get_xmm_writable(cp, arg0);
1642
1643 sse_cmpps(cp->func, dst, arg1, cc_LessThan);
1644 sse_andps(cp->func, dst, ones);
1645
1646 store_dest(cp, &op->FullDstRegisters[0], dst);
1647 return TRUE;
1648 }
1649
1650 static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1651 {
1652 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1653 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1654 struct x86_reg dst = get_xmm_writable(cp, arg0);
1655
1656 sse_subps(cp->func, dst, arg1);
1657
1658 store_dest(cp, &op->FullDstRegisters[0], dst);
1659 return TRUE;
1660 }
1661
1662 static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1663 {
1664 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1665 struct x86_reg tmp0 = aos_get_xmm_reg(cp);
1666
1667 sse2_cvttps2dq(cp->func, tmp0, arg0);
1668 sse2_cvtdq2ps(cp->func, tmp0, tmp0);
1669
1670 store_dest(cp, &op->FullDstRegisters[0], tmp0);
1671 return TRUE;
1672 }
1673
1674 static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1675 {
1676 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1677 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1678 struct x86_reg tmp0 = aos_get_xmm_reg(cp);
1679 struct x86_reg tmp1 = aos_get_xmm_reg(cp);
1680
1681 emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
1682 sse_mulps(cp->func, tmp1, arg0);
1683 emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));
1684 sse_mulps(cp->func, tmp0, arg1);
1685 sse_subps(cp->func, tmp1, tmp0);
1686 sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));
1687
1688 /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
1689 /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
1690 /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
1691 /* dst[3] is undef */
1692
1693
1694 aos_release_xmm_reg(cp, tmp0.idx);
1695 store_dest(cp, &op->FullDstRegisters[0], tmp1);
1696 return TRUE;
1697 }
1698
1699
1700
/* Dispatch a single TGSI instruction to its code-generating emit_*
 * handler.  Returns FALSE for any opcode this backend cannot handle,
 * which makes the caller (build_vertex_program) abandon codegen and
 * fall back to the generic path.
 */
static boolean
emit_instruction( struct aos_compilation *cp,
                  struct tgsi_full_instruction *inst )
{
   /* Every instruction must leave the x87 stack balanced. */
   x87_assert_stack_empty(cp->func);

   switch( inst->Instruction.Opcode ) {
   case TGSI_OPCODE_MOV:
      return emit_MOV( cp, inst );

   case TGSI_OPCODE_LIT:
      return emit_LIT(cp, inst);

   case TGSI_OPCODE_RCP:
      return emit_RCP(cp, inst);

   case TGSI_OPCODE_RSQ:
      return emit_RSQ(cp, inst);

   case TGSI_OPCODE_EXP:
      /*return emit_EXP(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_LOG:
      /*return emit_LOG(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_MUL:
      return emit_MUL(cp, inst);

   case TGSI_OPCODE_ADD:
      return emit_ADD(cp, inst);

   case TGSI_OPCODE_DP3:
      return emit_DP3(cp, inst);

   case TGSI_OPCODE_DP4:
      return emit_DP4(cp, inst);

   case TGSI_OPCODE_DST:
      return emit_DST(cp, inst);

   case TGSI_OPCODE_MIN:
      return emit_MIN(cp, inst);

   case TGSI_OPCODE_MAX:
      return emit_MAX(cp, inst);

   case TGSI_OPCODE_SLT:
      return emit_SLT(cp, inst);

   case TGSI_OPCODE_SGE:
      return emit_SGE(cp, inst);

   case TGSI_OPCODE_MAD:
      return emit_MAD(cp, inst);

   case TGSI_OPCODE_SUB:
      return emit_SUB(cp, inst);

   case TGSI_OPCODE_LERP:
      // return emit_LERP(cp, inst);
      return FALSE;

   case TGSI_OPCODE_FRAC:
      return emit_FRC(cp, inst);

   case TGSI_OPCODE_CLAMP:
      // return emit_CLAMP(cp, inst);
      return FALSE;

   case TGSI_OPCODE_FLOOR:
      return emit_FLR(cp, inst);

   case TGSI_OPCODE_ROUND:
      return emit_RND(cp, inst);

   case TGSI_OPCODE_EXPBASE2:
#if FAST_MATH
      return emit_EXPBASE2(cp, inst);
#elif 0
      /* this seems to fail for "larger" exponents.
       * See glean tvertProg1's EX2 test.
       */
      return emit_EX2(cp, inst);
#else
      return FALSE;
#endif

   case TGSI_OPCODE_LOGBASE2:
      return emit_LG2(cp, inst);

   case TGSI_OPCODE_POWER:
      return emit_POW(cp, inst);

   case TGSI_OPCODE_CROSSPRODUCT:
      return emit_XPD(cp, inst);

   case TGSI_OPCODE_ABS:
      return emit_ABS(cp, inst);

   case TGSI_OPCODE_DPH:
      return emit_DPH(cp, inst);

   case TGSI_OPCODE_COS:
      return emit_COS(cp, inst);

   case TGSI_OPCODE_SIN:
      return emit_SIN(cp, inst);

   case TGSI_OPCODE_TRUNC:
      return emit_TRUNC(cp, inst);

   case TGSI_OPCODE_END:
      return TRUE;

   default:
      /* Unsupported opcode: force fallback to the generic VS path. */
      return FALSE;
   }
}
1821
1822
1823 static boolean emit_viewport( struct aos_compilation *cp )
1824 {
1825 struct x86_reg pos = aos_get_shader_reg_xmm(cp,
1826 TGSI_FILE_OUTPUT,
1827 cp->vaos->draw->vs.position_output );
1828
1829 struct x86_reg scale = x86_make_disp(cp->machine_EDX,
1830 Offset(struct aos_machine, scale));
1831
1832 struct x86_reg translate = x86_make_disp(cp->machine_EDX,
1833 Offset(struct aos_machine, translate));
1834
1835 sse_mulps(cp->func, pos, scale);
1836 sse_addps(cp->func, pos, translate);
1837
1838 aos_adopt_xmm_reg( cp,
1839 pos,
1840 TGSI_FILE_OUTPUT,
1841 cp->vaos->draw->vs.position_output,
1842 TRUE );
1843 return TRUE;
1844 }
1845
1846
/* This is useful to be able to see the results on softpipe. Doesn't
 * do proper clipping, just assumes the backend can do it during
 * rasterization -- for debug only...
 *
 * Like emit_viewport, but also divides x/y/z by w (perspective
 * divide) and leaves 1/w in the output's w channel.
 */
static boolean emit_rhw_viewport( struct aos_compilation *cp )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg pos = aos_get_shader_reg_xmm(cp,
                                               TGSI_FILE_OUTPUT,
                                               cp->vaos->draw->vs.position_output);

   struct x86_reg scale = x86_make_disp(cp->machine_EDX,
                                        Offset(struct aos_machine, scale));

   struct x86_reg translate = x86_make_disp(cp->machine_EDX,
                                            Offset(struct aos_machine, translate));



   /* Broadcast pos.w, take its (approximate) reciprocal and
    * re-broadcast: tmp = 1/w in all four channels.
    */
   emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
   sse2_rcpss(cp->func, tmp, tmp);
   sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));

   /* pos = pos * scale * (1/w) + translate */
   sse_mulps(cp->func, pos, scale);
   sse_mulps(cp->func, pos, tmp);
   sse_addps(cp->func, pos, translate);

   /* Set pos[3] = w
    */
   mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);

   /* Re-register the xmm reg as the (dirty) position output. */
   aos_adopt_xmm_reg( cp,
                      pos,
                      TGSI_FILE_OUTPUT,
                      cp->vaos->draw->vs.position_output,
                      TRUE );
   return TRUE;
}
1885
1886
#if 0
/* Record an immediate's float values into the machine's immediate
 * array.  Currently compiled out: the run functions instead point
 * machine->immediates at vs->immediates before each run (see
 * vaos_run_linear / vaos_run_elts below).
 */
static boolean note_immediate( struct aos_compilation *cp,
                               struct tgsi_full_immediate *imm )
{
   unsigned pos = cp->num_immediates++;
   unsigned j;

   /* NrTokens includes the immediate header token plus up to 4 data
    * tokens, hence the "4 + 1" bound and "NrTokens - 1" loop count.
    */
   assert( imm->Immediate.NrTokens <= 4 + 1 );
   for (j = 0; j < imm->Immediate.NrTokens - 1; j++) {
      cp->vaos->machine->immediate[pos][j] = imm->u[j].Float;
   }

   return TRUE;
}
#endif
1902
1903
1904
1905
1906 static void find_last_write_outputs( struct aos_compilation *cp )
1907 {
1908 struct tgsi_parse_context parse;
1909 unsigned this_instruction = 0;
1910 unsigned i;
1911
1912 tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
1913
1914 while (!tgsi_parse_end_of_tokens( &parse )) {
1915
1916 tgsi_parse_token( &parse );
1917
1918 if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
1919 continue;
1920
1921 for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
1922 if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File ==
1923 TGSI_FILE_OUTPUT)
1924 {
1925 unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index;
1926 cp->output_last_write[idx] = this_instruction;
1927 }
1928 }
1929
1930 this_instruction++;
1931 }
1932
1933 tgsi_parse_free( &parse );
1934 }
1935
1936
1937 #define ARG_MACHINE 1
1938 #define ARG_START_ELTS 2
1939 #define ARG_COUNT 3
1940 #define ARG_OUTBUF 4
1941
1942
/* Compile the varient's vertex shader into x86/SSE machine code.
 *
 * Emits a function with the signature
 *   func(machine, start_or_elts, count, output_buffer)
 * (see ARG_* above) that loops over 'count' vertices: fetch inputs,
 * run the translated TGSI instructions, optionally apply the
 * viewport/rhw transform, and store the outputs.
 *
 * \param linear  TRUE builds the linear-fetch variant into func[0],
 *                FALSE builds the indexed-fetch variant into func[1].
 * \return FALSE if any instruction could not be translated; the
 *         caller then falls back to the generic path.
 */
static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
                                     boolean linear )
{
   struct tgsi_parse_context parse;
   struct aos_compilation cp;
   unsigned fixup, label;

   util_init_math();

   tgsi_parse_init( &parse, varient->base.vs->state.tokens );

   memset(&cp, 0, sizeof(cp));

   cp.insn_counter = 1;
   cp.vaos = varient;
   cp.have_sse2 = 1;
   cp.func = &varient->func[ linear ? 0 : 1 ];

   /* Fixed register roles for the generated code: */
   cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   cp.idx_EBX = x86_make_reg(file_REG32, reg_BX);
   cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
   cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
   cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
   cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
   cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );

   x86_init_func(cp.func);

   find_last_write_outputs(&cp);

   /* Prologue: preserve callee-save registers we use. */
   x86_push(cp.func, cp.idx_EBX);
   x86_push(cp.func, cp.count_ESI);
   x86_push(cp.func, cp.temp_EBP);


   /* Load arguments into regs:
    */
   x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
   x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
   x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
   x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));


   /* Compare count to zero and possibly bail.
    */
   x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
   x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
   fixup = x86_jcc_forward(cp.func, cc_E);


   save_fpu_state( &cp );
   set_fpu_round_nearest( &cp );

   aos_init_inputs( &cp, linear );

   cp.x86_reg[0] = 0;
   cp.x86_reg[1] = 0;

   /* Note address for loop jump
    */
   label = x86_get_label(cp.func);
   {
      /* Fetch inputs... TODO: fetch lazily...
       */
      if (!aos_fetch_inputs( &cp, linear ))
         goto fail;

      /* Emit the shader:
       */
      while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
      {
         tgsi_parse_token( &parse );

         switch (parse.FullToken.Token.Type) {
         case TGSI_TOKEN_TYPE_IMMEDIATE:
#if 0
            if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
               goto fail;
#endif
            break;

         case TGSI_TOKEN_TYPE_INSTRUCTION:
            if (DISASSEM)
               tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );

            /* Unsupported opcodes make emit_instruction return FALSE
             * and abort the whole compilation.
             */
            if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
               goto fail;
            break;
         }

         x87_assert_stack_empty(cp.func);
         cp.insn_counter++;

         if (DISASSEM)
            debug_printf("\n");
      }


      /* Drop non-output values from the xmm register tracking before
       * the per-vertex epilogue.
       */
      {
         unsigned i;
         for (i = 0; i < 8; i++) {
            if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
               cp.xmm[i].file = TGSI_FILE_NULL;
               cp.xmm[i].dirty = 0;
            }
         }
      }

      if (cp.error)
         goto fail;

      if (cp.vaos->base.key.clip) {
         /* not really handling clipping, just do the rhw so we can
          * see the results...
          */
         emit_rhw_viewport(&cp);
      }
      else if (cp.vaos->base.key.viewport) {
         emit_viewport(&cp);
      }

      /* Emit output... TODO: do this eagerly after the last write to a
       * given output.
       */
      if (!aos_emit_outputs( &cp ))
         goto fail;


      /* Next vertex:
       */
      x86_lea(cp.func,
              cp.outbuf_ECX,
              x86_make_disp(cp.outbuf_ECX,
                            cp.vaos->base.key.output_stride));

      /* Incr index
       */
      aos_incr_inputs( &cp, linear );
   }
   /* decr count, loop if not zero
    */
   x86_dec(cp.func, cp.count_ESI);
   x86_jcc(cp.func, cc_NZ, label);

   restore_fpu_state(&cp);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(cp.func, fixup);

   /* Exit mmx state?
    */
   if (cp.func->need_emms)
      mmx_emms(cp.func);

   /* Epilogue: restore callee-save registers and return. */
   x86_pop(cp.func, cp.temp_EBP);
   x86_pop(cp.func, cp.count_ESI);
   x86_pop(cp.func, cp.idx_EBX);

   x87_assert_stack_empty(cp.func);
   x86_ret(cp.func);

   tgsi_parse_free( &parse );
   return !cp.error;

 fail:
   tgsi_parse_free( &parse );
   return FALSE;
}
2112
2113
2114
2115 static void vaos_set_buffer( struct draw_vs_varient *varient,
2116 unsigned buf,
2117 const void *ptr,
2118 unsigned stride )
2119 {
2120 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2121
2122 if (buf < vaos->nr_vb) {
2123 vaos->buffer[buf].base_ptr = (char *)ptr;
2124 vaos->buffer[buf].stride = stride;
2125 }
2126
2127 if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
2128 }
2129
2130
2131
2132 static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
2133 const unsigned *elts,
2134 unsigned count,
2135 void *output_buffer )
2136 {
2137 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2138 struct aos_machine *machine = vaos->draw->vs.aos_machine;
2139
2140 if (0) debug_printf("%s %d\n", __FUNCTION__, count);
2141
2142 machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
2143 machine->constants = vaos->draw->vs.aligned_constants;
2144 machine->immediates = vaos->base.vs->immediates;
2145 machine->buffer = vaos->buffer;
2146
2147 vaos->gen_run_elts( machine,
2148 elts,
2149 count,
2150 output_buffer );
2151 }
2152
2153 static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
2154 unsigned start,
2155 unsigned count,
2156 void *output_buffer )
2157 {
2158 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2159 struct aos_machine *machine = vaos->draw->vs.aos_machine;
2160
2161 if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count,
2162 vaos->base.key.const_vbuffers);
2163
2164 machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
2165 machine->constants = vaos->draw->vs.aligned_constants;
2166 machine->immediates = vaos->base.vs->immediates;
2167 machine->buffer = vaos->buffer;
2168
2169 vaos->gen_run_linear( machine,
2170 start,
2171 count,
2172 output_buffer );
2173
2174 /* Sanity spot checks to make sure we didn't trash our constants */
2175 assert(machine->internal[IMM_ONES][0] == 1.0f);
2176 assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
2177 assert(machine->internal[IMM_NEGS][0] == -1.0f);
2178 }
2179
2180
2181
2182 static void vaos_destroy( struct draw_vs_varient *varient )
2183 {
2184 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2185
2186 FREE( vaos->buffer );
2187
2188 x86_release_func( &vaos->func[0] );
2189 x86_release_func( &vaos->func[1] );
2190
2191 FREE(vaos);
2192 }
2193
2194
2195
2196 static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
2197 const struct draw_vs_varient_key *key )
2198 {
2199 unsigned i;
2200 struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
2201
2202 if (!vaos)
2203 goto fail;
2204
2205 vaos->base.key = *key;
2206 vaos->base.vs = vs;
2207 vaos->base.set_buffer = vaos_set_buffer;
2208 vaos->base.destroy = vaos_destroy;
2209 vaos->base.run_linear = vaos_run_linear;
2210 vaos->base.run_elts = vaos_run_elts;
2211
2212 vaos->draw = vs->draw;
2213
2214 for (i = 0; i < key->nr_inputs; i++)
2215 vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
2216
2217 vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
2218 if (!vaos->buffer)
2219 goto fail;
2220
2221 if (0)
2222 debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
2223
2224 #if 0
2225 tgsi_dump(vs->state.tokens, 0);
2226 #endif
2227
2228 if (!build_vertex_program( vaos, TRUE ))
2229 goto fail;
2230
2231 if (!build_vertex_program( vaos, FALSE ))
2232 goto fail;
2233
2234 vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
2235 if (!vaos->gen_run_linear)
2236 goto fail;
2237
2238 vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
2239 if (!vaos->gen_run_elts)
2240 goto fail;
2241
2242 return &vaos->base;
2243
2244 fail:
2245 if (vaos && vaos->buffer)
2246 FREE(vaos->buffer);
2247
2248 if (vaos)
2249 x86_release_func( &vaos->func[0] );
2250
2251 if (vaos)
2252 x86_release_func( &vaos->func[1] );
2253
2254 FREE(vaos);
2255
2256 return NULL;
2257 }
2258
2259
struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
                                                 const struct draw_vs_varient_key *key )
{
   /* Prefer the SSE code generator; fall back to the generic
    * interpreted varient if compilation is not possible.
    */
   struct draw_vs_varient *varient = varient_aos_sse( vs, key );

   if (!varient)
      varient = draw_vs_varient_generic( vs, key );

   return varient;
}
2271
2272
2273
2274 #endif /* PIPE_ARCH_X86 */