draw: s/varient/variant/
[mesa.git] / src / gallium / auxiliary / draw / draw_vs_aos.c
1 /*
2 * Mesa 3-D graphics library
3 * Version: 6.3
4 *
5 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /**
26 * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
27 * using the rtasm runtime assembler. Based on the old
28 * t_vb_arb_program_sse.c
29 */
30
31
32 #include "util/u_memory.h"
33 #include "util/u_math.h"
34 #include "pipe/p_shader_tokens.h"
35 #include "util/u_debug.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_util.h"
38 #include "tgsi/tgsi_exec.h"
39 #include "tgsi/tgsi_dump.h"
40
41 #include "draw_vs.h"
42 #include "draw_vs_aos.h"
43
44 #include "rtasm/rtasm_x86sse.h"
45
46 #ifdef PIPE_ARCH_X86
47 #define DISASSEM 0
48 #define FAST_MATH 1
49
/* Debug-only names for the shader register files, indexed by the
 * TGSI_FILE_* value (see the use in spill()), with AOS_FILE_INTERNAL
 * mapping to "INTERNAL".  NOTE(review): the order is assumed to match
 * the TGSI_FILE_* enum -- confirm against p_shader_tokens.h.
 */
static const char *files[] =
{
   "NULL",
   "CONST",
   "IN",
   "OUT",
   "TEMP",
   "SAMP",
   "ADDR",
   "IMM",
   "INTERNAL",
};
62
63 static INLINE boolean eq( struct x86_reg a,
64 struct x86_reg b )
65 {
66 return (a.file == b.file &&
67 a.idx == b.idx &&
68 a.mod == b.mod &&
69 a.disp == b.disp);
70 }
71
/* Return an x86 register holding one of the aos_machine pointers
 * (immediates, constants or vertex buffers), reloading it from the
 * machine struct (addressed via EDX) only when the cached contents
 * differ from 'value'.
 *
 * which_reg selects the scratch register: 0 -> EBP, 1 -> EAX.
 * Immediates and buffers are asserted to live in slot 0, constants in
 * slot 1.
 */
struct x86_reg aos_get_x86( struct aos_compilation *cp,
                            unsigned which_reg, /* quick hack */
                            unsigned value )
{
   struct x86_reg reg;

   if (which_reg == 0)
      reg = cp->temp_EBP;
   else
      reg = cp->tmp_EAX;

   /* Reload only on a cache miss: */
   if (cp->x86_reg[which_reg] != value) {
      unsigned offset;

      switch (value) {
      case X86_IMMEDIATES:
         assert(which_reg == 0);
         offset = Offset(struct aos_machine, immediates);
         break;
      case X86_CONSTANTS:
         assert(which_reg == 1);
         offset = Offset(struct aos_machine, constants);
         break;
      case X86_BUFFERS:
         assert(which_reg == 0);
         offset = Offset(struct aos_machine, buffer);
         break;
      default:
         assert(0);
         offset = 0;
      }


      /* Emit: mov reg, [EDX + offset] */
      x86_mov(cp->func, reg,
              x86_make_disp(cp->machine_EDX, offset));

      cp->x86_reg[which_reg] = value;
   }

   return reg;
}
113
114
/* Return a base+displacement pointer to the in-memory copy of a
 * shader register.  Inputs/outputs/temps/internals live directly in
 * the aos_machine struct (EDX); immediates and constants are reached
 * through separately cached pointers (see aos_get_x86()).
 */
static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
                                  unsigned file,
                                  unsigned idx )
{
   struct x86_reg ptr = cp->machine_EDX;

   switch (file) {
   case TGSI_FILE_INPUT:
      assert(idx < MAX_INPUTS);
      return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));

   case TGSI_FILE_OUTPUT:
      return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));

   case TGSI_FILE_TEMPORARY:
      assert(idx < MAX_TEMPS);
      return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));

   case AOS_FILE_INTERNAL:
      assert(idx < MAX_INTERNALS);
      return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));

   case TGSI_FILE_IMMEDIATE:
      assert(idx < MAX_IMMEDIATES); /* just a sanity check */
      return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));

   case TGSI_FILE_CONSTANT:
      assert(idx < MAX_CONSTANTS); /* just a sanity check */
      return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));

   default:
      AOS_ERROR(cp, "unknown reg file");
      return x86_make_reg(0,0);
   }
}
150
151
152
153 #define X87_CW_EXCEPTION_INV_OP (1<<0)
154 #define X87_CW_EXCEPTION_DENORM_OP (1<<1)
155 #define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2)
156 #define X87_CW_EXCEPTION_OVERFLOW (1<<3)
157 #define X87_CW_EXCEPTION_UNDERFLOW (1<<4)
158 #define X87_CW_EXCEPTION_PRECISION (1<<5)
159 #define X87_CW_PRECISION_SINGLE (0<<8)
160 #define X87_CW_PRECISION_RESERVED (1<<8)
161 #define X87_CW_PRECISION_DOUBLE (2<<8)
162 #define X87_CW_PRECISION_DOUBLE_EXT (3<<8)
163 #define X87_CW_PRECISION_MASK (3<<8)
164 #define X87_CW_ROUND_NEAREST (0<<10)
165 #define X87_CW_ROUND_DOWN (1<<10)
166 #define X87_CW_ROUND_UP (2<<10)
167 #define X87_CW_ROUND_ZERO (3<<10)
168 #define X87_CW_ROUND_MASK (3<<10)
169 #define X87_CW_INFINITY (1<<12)
170
171
172
173
/* Emit code writing the value cached in xmm register 'idx' back to
 * the in-memory copy of the shader register it holds, then mark the
 * xmm register clean.  Only input/output/temporary registers can
 * legitimately be dirty; anything else is flagged as an error.
 */
static void spill( struct aos_compilation *cp, unsigned idx )
{
   if (!cp->xmm[idx].dirty ||
       (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
        cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
        cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
      AOS_ERROR(cp, "invalid spill");
      return;
   }
   else {
      struct x86_reg oldval = get_reg_ptr(cp,
                                          cp->xmm[idx].file,
                                          cp->xmm[idx].idx);

      if (0) debug_printf("\nspill %s[%d]",
                          files[cp->xmm[idx].file],
                          cp->xmm[idx].idx);

      assert(cp->xmm[idx].dirty);
      sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
      cp->xmm[idx].dirty = 0;
   }
}
197
198
199 void aos_spill_all( struct aos_compilation *cp )
200 {
201 unsigned i;
202
203 for (i = 0; i < 8; i++) {
204 if (cp->xmm[i].dirty)
205 spill(cp, i);
206 aos_release_xmm_reg(cp, i);
207 }
208 }
209
210
211 static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
212 struct x86_reg reg )
213 {
214 if (reg.file != file_XMM ||
215 cp->xmm[reg.idx].file != TGSI_FILE_NULL)
216 {
217 struct x86_reg tmp = aos_get_xmm_reg(cp);
218 sse_movaps(cp->func, tmp, reg);
219 reg = tmp;
220 }
221
222 cp->xmm[reg.idx].last_used = cp->insn_counter;
223 return reg;
224 }
225
226 static struct x86_reg get_xmm( struct aos_compilation *cp,
227 struct x86_reg reg )
228 {
229 if (reg.file != file_XMM)
230 {
231 struct x86_reg tmp = aos_get_xmm_reg(cp);
232 sse_movaps(cp->func, tmp, reg);
233 reg = tmp;
234 }
235
236 cp->xmm[reg.idx].last_used = cp->insn_counter;
237 return reg;
238 }
239
240
/* Allocate an empty xmm register, either as a temporary or later to
 * "adopt" as a shader reg.  Evicts (spilling if dirty) the
 * least-recently-used register when none is free.
 */
struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
{
   unsigned i;
   unsigned oldest = 0;
   boolean found = FALSE;

   /* Prefer a register holding no shader value and not yet touched by
    * the current instruction.  (The scan does not break, so the
    * highest-numbered such register wins.)
    */
   for (i = 0; i < 8; i++)
      if (cp->xmm[i].last_used != cp->insn_counter &&
          cp->xmm[i].file == TGSI_FILE_NULL) {
         oldest = i;
         found = TRUE;
      }

   /* Otherwise pick the least-recently-used register for eviction:
    */
   if (!found) {
      for (i = 0; i < 8; i++)
         if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
            oldest = i;
   }

   /* Need to write out the old value?
    */
   if (cp->xmm[oldest].dirty)
      spill(cp, oldest);

   /* Registers used by the current instruction must never be evicted: */
   assert(cp->xmm[oldest].last_used != cp->insn_counter);

   cp->xmm[oldest].file = TGSI_FILE_NULL;
   cp->xmm[oldest].idx = 0;
   cp->xmm[oldest].dirty = 0;
   cp->xmm[oldest].last_used = cp->insn_counter;
   return x86_make_reg(file_XMM, oldest);
}
276
/* Forget any association between xmm register 'idx' and a shader
 * register: mark it empty, clean, and least-recently-used so it is a
 * prime candidate for reallocation.
 */
void aos_release_xmm_reg( struct aos_compilation *cp,
                          unsigned idx )
{
   cp->xmm[idx].file = TGSI_FILE_NULL;
   cp->xmm[idx].idx = 0;
   cp->xmm[idx].dirty = 0;
   cp->xmm[idx].last_used = 0;
}
285
286
/* Un-pin an xmm register from the current instruction without
 * forgetting its contents: back-date last_used by one so it becomes
 * eligible for reuse, while remaining a valid cached copy.
 */
static void aos_soft_release_xmm( struct aos_compilation *cp,
                                  struct x86_reg reg )
{
   if (reg.file == file_XMM) {
      assert(cp->xmm[reg.idx].last_used == cp->insn_counter);
      cp->xmm[reg.idx].last_used = cp->insn_counter - 1;
   }
}
295
296
297
/* Mark an xmm reg as holding the current copy of a shader reg.
 *
 * reg:      must be an xmm register.
 * file/idx: the shader register now resident in 'reg'.
 * dirty:    nonzero if the xmm copy is newer than the in-memory copy.
 */
void aos_adopt_xmm_reg( struct aos_compilation *cp,
                        struct x86_reg reg,
                        unsigned file,
                        unsigned idx,
                        unsigned dirty )
{
   unsigned i;

   if (reg.file != file_XMM) {
      assert(0);
      return;
   }


   /* If any xmm reg thinks it holds this shader reg, break the
    * illusion.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file &&
          cp->xmm[i].idx == idx)
      {
         /* If an xmm reg is already holding this shader reg, take into account its
          * dirty flag...
          */
         dirty |= cp->xmm[i].dirty;
         aos_release_xmm_reg(cp, i);
      }
   }

   cp->xmm[reg.idx].file = file;
   cp->xmm[reg.idx].idx = idx;
   cp->xmm[reg.idx].dirty = dirty;
   cp->xmm[reg.idx].last_used = cp->insn_counter;
}
334
335
336 /* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
337 */
338 static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
339 unsigned file,
340 unsigned idx )
341 {
342 unsigned i;
343
344 /* Ensure the in-memory copy of this reg is up-to-date
345 */
346 for (i = 0; i < 8; i++) {
347 if (cp->xmm[i].file == file &&
348 cp->xmm[i].idx == idx &&
349 cp->xmm[i].dirty) {
350 spill(cp, i);
351 }
352 }
353
354 return get_reg_ptr( cp, file, idx );
355 }
356
357
/* Return a pointer to the in-memory copy of a destination register,
 * spilling dirty xmm copies and invalidating ALL xmm copies (the
 * store about to happen through the pointer would make them stale).
 * Note - this pointer may alias those returned by get_arg_ptr().
 */
static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
                                   const struct tgsi_full_dst_register *dst )
{
   unsigned file = dst->Register.File;
   unsigned idx = dst->Register.Index;
   unsigned i;


   /* Ensure in-memory copy of this reg is up-to-date and invalidate
    * any xmm copies.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file &&
          cp->xmm[i].idx == idx)
      {
         if (cp->xmm[i].dirty)
            spill(cp, i);

         aos_release_xmm_reg(cp, i);
      }
   }

   return get_reg_ptr( cp, file, idx );
}
385
386
387
388
389
390 /* Return an XMM reg if the argument is resident, otherwise return a
391 * base+offset pointer to the saved value.
392 */
393 struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
394 unsigned file,
395 unsigned idx )
396 {
397 unsigned i;
398
399 for (i = 0; i < 8; i++) {
400 if (cp->xmm[i].file == file &&
401 cp->xmm[i].idx == idx)
402 {
403 cp->xmm[i].last_used = cp->insn_counter;
404 return x86_make_reg(file_XMM, i);
405 }
406 }
407
408 /* If not found in the XMM register file, return an indirect
409 * reference to the in-memory copy:
410 */
411 return get_reg_ptr( cp, file, idx );
412 }
413
414
415
416 static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,
417 unsigned file,
418 unsigned idx )
419 {
420 struct x86_reg reg = get_xmm( cp,
421 aos_get_shader_reg( cp, file, idx ) );
422
423 aos_adopt_xmm_reg( cp,
424 reg,
425 file,
426 idx,
427 FALSE );
428
429 return reg;
430 }
431
432
433
/* Fetch internal constant 'imm' (IMM_ONES, IMM_NEGS, ...) forced into
 * an xmm register.
 */
struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
                                     unsigned imm )
{
   return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
}
439
440
/* Fetch internal constant 'imm', either as a resident xmm register or
 * as a memory reference.
 */
struct x86_reg aos_get_internal( struct aos_compilation *cp,
                                 unsigned imm )
{
   return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
}
446
447
448
449
450
/* Emulate pshufd insn in regular SSE, if necessary:
 * With SSE2 this is a single non-destructive pshufd; without it we
 * copy to dst first (unless dst == arg0) and shufps dst against
 * itself.
 */
static void emit_pshufd( struct aos_compilation *cp,
                         struct x86_reg dst,
                         struct x86_reg arg0,
                         ubyte shuf )
{
   if (cp->have_sse2) {
      sse2_pshufd(cp->func, dst, arg0, shuf);
   }
   else {
      if (!eq(dst, arg0))
         sse_movaps(cp->func, dst, arg0);

      sse_shufps(cp->func, dst, dst, shuf);
   }
}
468
/* Merge 'result' into 'dst' under an arbitrary 4-bit writemask:
 *
 * pshufd - build a per-channel bit mask from the IMM_SWZ constant
 * and  - dst, mask        (keep unwritten channels)
 * nand - mask, result     (keep written channels of result)
 * or   - dst, result      (combine)
 *
 * NOTE(review): this relies on IMM_SWZ lane 2 being all-zero bits and
 * lane 3 all-one bits -- confirm against the aos_machine setup code.
 */
static boolean mask_write( struct aos_compilation *cp,
                           struct x86_reg dst,
                           struct x86_reg result,
                           unsigned mask )
{
   struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   /* Written channels select lane 2, preserved channels lane 3: */
   emit_pshufd(cp, tmp, imm_swz,
               SHUF((mask & 1) ? 2 : 3,
                    (mask & 2) ? 2 : 3,
                    (mask & 4) ? 2 : 3,
                    (mask & 8) ? 2 : 3));

   sse_andps(cp->func, dst, tmp);
   sse_andnps(cp->func, tmp, result);
   sse_orps(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   return TRUE;
}
496
497
498
499
/* Helper for writemask:
 * Build dst by shuffling arg1 and arg0, combining two lanes of each
 * (shufps takes its low half from dst, high half from the source),
 * then applying 'shuf' once more to put the lanes back in place.
 * 'shuf' is applied three times in total, so it is expected to be
 * chosen such that the net permutation is the intended one.
 */
static boolean emit_shuf_copy2( struct aos_compilation *cp,
                                struct x86_reg dst,
                                struct x86_reg arg0,
                                struct x86_reg arg1,
                                ubyte shuf )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   emit_pshufd(cp, dst, arg1, shuf);
   emit_pshufd(cp, tmp, arg0, shuf);
   sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
   emit_pshufd(cp, dst, dst, shuf);

   aos_release_xmm_reg(cp, tmp.idx);
   return TRUE;
}
518
519
520
521 #define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
522
523
/* Locate a source register and perform any required (simple) swizzle.
 *
 * Handles per-channel negate (via multiply by an IMM_SWZ-derived
 * {1,-1} vector, or IMM_NEGS when all channels are negated) and
 * full-vector abs (max(a, -a)).  Partial abs is unsupported.
 *
 * Just fail on complex swizzles at this point.
 */
static struct x86_reg fetch_src( struct aos_compilation *cp,
                                 const struct tgsi_full_src_register *src )
{
   struct x86_reg arg0 = aos_get_shader_reg(cp,
                                            src->Register.File,
                                            src->Register.Index);
   unsigned i;
   ubyte swz = 0;
   unsigned negs = 0;
   unsigned abs = 0;

   /* Collect the per-channel swizzle selects and sign modes: */
   for (i = 0; i < 4; i++) {
      unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, i );
      unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );

      swz |= (swizzle & 0x3) << (i * 2);

      switch (neg) {
      case TGSI_UTIL_SIGN_TOGGLE:
         negs |= (1<<i);
         break;

      case TGSI_UTIL_SIGN_KEEP:
         break;

      case TGSI_UTIL_SIGN_CLEAR:
         abs |= (1<<i);
         break;

      default:
         AOS_ERROR(cp, "unsupported sign-mode");
         break;
      }
   }

   /* Fast path: no modifiers at all, use the register in place. */
   if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
      struct x86_reg dst = aos_get_xmm_reg(cp);

      if (swz != SSE_SWIZZLE_NOOP)
         emit_pshufd(cp, dst, arg0, swz);
      else
         sse_movaps(cp->func, dst, arg0);

      /* Partial negate: multiply by a {1,-1} vector built from IMM_SWZ. */
      if (negs && negs != 0xf) {
         struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
         struct x86_reg tmp = aos_get_xmm_reg(cp);

         /* Load 1,-1,0,0
          * Use neg as arg to pshufd
          * Multiply
          */
         emit_pshufd(cp, tmp, imm_swz,
                     SHUF((negs & 1) ? 1 : 0,
                          (negs & 2) ? 1 : 0,
                          (negs & 4) ? 1 : 0,
                          (negs & 8) ? 1 : 0));
         sse_mulps(cp->func, dst, tmp);

         aos_release_xmm_reg(cp, tmp.idx);
         aos_soft_release_xmm(cp, imm_swz);
      }
      else if (negs) {
         /* Full negate: multiply by the all -1 constant. */
         struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
         sse_mulps(cp->func, dst, imm_negs);
         aos_soft_release_xmm(cp, imm_negs);
      }


      if (abs && abs != 0xf) {
         AOS_ERROR(cp, "unsupported partial abs");
      }
      else if (abs) {
         /* Full abs: dst = max(dst, -dst). */
         struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
         struct x86_reg tmp = aos_get_xmm_reg(cp);

         sse_movaps(cp->func, tmp, dst);
         sse_mulps(cp->func, tmp, neg);
         sse_maxps(cp->func, dst, tmp);

         aos_release_xmm_reg(cp, tmp.idx);
         aos_soft_release_xmm(cp, neg);
      }

      aos_soft_release_xmm(cp, arg0);
      return dst;
   }

   return arg0;
}
617
/* Push one channel of a source register onto the x87 stack, applying
 * the swizzle and sign mode for that channel (negate via fchs, abs
 * via fabs).
 */
static void x87_fld_src( struct aos_compilation *cp,
                         const struct tgsi_full_src_register *src,
                         unsigned channel )
{
   struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
                                                src->Register.File,
                                                src->Register.Index);

   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, channel );
   unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );

   x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );

   switch (neg) {
   case TGSI_UTIL_SIGN_TOGGLE:
      /* Flip the sign:
       */
      x87_fchs( cp->func );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;

   case TGSI_UTIL_SIGN_CLEAR:
      x87_fabs( cp->func );
      break;

   case TGSI_UTIL_SIGN_SET:
      /* Force negative: -|x| */
      x87_fabs( cp->func );
      x87_fchs( cp->func );
      break;

   default:
      AOS_ERROR(cp, "unsupported sign-mode");
      break;
   }
}
655
656
657
658
659
660
/* Used to implement write masking. This and most of the other instructions
 * here would be easier to implement if there had been a translation
 * to a 2 argument format (dst/arg0, arg1) at the shader level before
 * attempting to translate to x86/sse code.
 *
 * Full writes simply adopt (a writable copy of) the result; partial
 * writes merge the result into the current destination value using
 * movss/shufps for the cheap masks and mask_write() for the rest.
 */
static void store_dest( struct aos_compilation *cp,
                        const struct tgsi_full_dst_register *reg,
                        struct x86_reg result )
{
   struct x86_reg dst;

   switch (reg->Register.WriteMask) {
   case 0:
      return;                   /* nothing written */

   case TGSI_WRITEMASK_XYZW:
      aos_adopt_xmm_reg(cp,
                        get_xmm_writable(cp, result),
                        reg->Register.File,
                        reg->Register.Index,
                        TRUE);
      return;
   default:
      break;
   }

   /* Partial write: need the current value resident to merge into. */
   dst = aos_get_shader_reg_xmm(cp,
                                reg->Register.File,
                                reg->Register.Index);

   switch (reg->Register.WriteMask) {
   case TGSI_WRITEMASK_X:
      sse_movss(cp->func, dst, get_xmm(cp, result));
      break;

   case TGSI_WRITEMASK_ZW:
      /* dst = { dst.x, dst.y, result.z, result.w } */
      sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
      break;

   case TGSI_WRITEMASK_XY:
      /* result = { result.x, result.y, dst.z, dst.w } */
      result = get_xmm_writable(cp, result);
      sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
      dst = result;
      break;

   case TGSI_WRITEMASK_YZW:
      /* Pull the preserved x channel into the result: */
      result = get_xmm_writable(cp, result);
      sse_movss(cp->func, result, dst);
      dst = result;
      break;

   default:
      mask_write(cp, dst, result, reg->Register.WriteMask);
      break;
   }

   aos_adopt_xmm_reg(cp,
                     dst,
                     reg->Register.File,
                     reg->Register.Index,
                     TRUE);

}
724
/* Overwrite one channel of 'dst' with the scalar in result.x:
 * rotate the target channel into x, movss, rotate back.  The caller
 * must pass a 'swizzle' that is its own inverse (a transposition),
 * since it is applied twice.
 */
static void inject_scalar( struct aos_compilation *cp,
                           struct x86_reg dst,
                           struct x86_reg result,
                           ubyte swizzle )
{
   sse_shufps(cp->func, dst, dst, swizzle);
   sse_movss(cp->func, dst, result);
   sse_shufps(cp->func, dst, dst, swizzle);
}
734
735
/* Store a scalar result (in the x channel) to the destination.
 * Single-channel writemasks are handled directly with
 * movss/inject_scalar; anything wider splats x across the vector and
 * falls back to store_dest().
 */
static void store_scalar_dest( struct aos_compilation *cp,
                               const struct tgsi_full_dst_register *reg,
                               struct x86_reg result )
{
   unsigned writemask = reg->Register.WriteMask;
   struct x86_reg dst;

   if (writemask != TGSI_WRITEMASK_X &&
       writemask != TGSI_WRITEMASK_Y &&
       writemask != TGSI_WRITEMASK_Z &&
       writemask != TGSI_WRITEMASK_W &&
       writemask != 0)
   {
      result = get_xmm_writable(cp, result); /* already true, right? */
      sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
      store_dest(cp, reg, result);
      return;
   }

   result = get_xmm(cp, result);
   dst = aos_get_shader_reg_xmm(cp,
                                reg->Register.File,
                                reg->Register.Index);



   switch (reg->Register.WriteMask) {
   case TGSI_WRITEMASK_X:
      sse_movss(cp->func, dst, result);
      break;

   case TGSI_WRITEMASK_Y:
      inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
      break;

   case TGSI_WRITEMASK_Z:
      inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
      break;

   case TGSI_WRITEMASK_W:
      inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
      break;

   default:
      break;                    /* writemask == 0: no store needed */
   }

   aos_adopt_xmm_reg(cp,
                     dst,
                     reg->Register.File,
                     reg->Register.Index,
                     TRUE);
}
789
790
791
/* Store st(0) to the given channel of 'ptr' when the writemask
 * selects it, leaving the value on the x87 stack either way.
 */
static void x87_fst_or_nop( struct x86_function *func,
                            unsigned writemask,
                            unsigned channel,
                            struct x86_reg ptr )
{
   assert(ptr.file == file_REG32);
   if (writemask & (1<<channel))
      x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
}
801
/* Store-and-pop st(0) to the given channel when selected by the
 * writemask; otherwise just pop, keeping the x87 stack balanced.
 */
static void x87_fstp_or_pop( struct x86_function *func,
                             unsigned writemask,
                             unsigned channel,
                             struct x86_reg ptr )
{
   assert(ptr.file == file_REG32);
   if (writemask & (1<<channel))
      x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
   else
      x87_fstp( func, x86_make_reg( file_x87, 0 ));
}
813
814
815
/* Store the scalar currently in st(0) to every channel selected by
 * the destination writemask, popping it with the final store.
 */
static void x87_fstp_dest4( struct aos_compilation *cp,
                            const struct tgsi_full_dst_register *dst )
{
   struct x86_reg ptr = get_dst_ptr(cp, dst);
   unsigned writemask = dst->Register.WriteMask;

   x87_fst_or_nop(cp->func, writemask, 0, ptr);
   x87_fst_or_nop(cp->func, writemask, 1, ptr);
   x87_fst_or_nop(cp->func, writemask, 2, ptr);
   x87_fstp_or_pop(cp->func, writemask, 3, ptr);
}
829
/* Save the current x87 control word into the machine struct so
 * restore_fpu_state() can put it back after the shader has run.
 * (Any mode changes themselves happen via the set_fpu_round_*
 * helpers below.)
 */
static void save_fpu_state( struct aos_compilation *cp )
{
   x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,
                                       Offset(struct aos_machine, fpu_restore)));
}
837
/* Clear pending x87 exceptions and reload the control word saved by
 * save_fpu_state().
 */
static void restore_fpu_state( struct aos_compilation *cp )
{
   x87_fnclex(cp->func);
   x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
                                      Offset(struct aos_machine, fpu_restore)));
}
844
/* Switch the x87 rounding mode to round-toward-negative-infinity,
 * using the control word precomputed in the machine struct.  The
 * current mode is tracked at compile time (cp->fpucntl) so redundant
 * fldcw instructions are not emitted.
 */
static void set_fpu_round_neg_inf( struct aos_compilation *cp )
{
   if (cp->fpucntl != FPU_RND_NEG) {
      cp->fpucntl = FPU_RND_NEG;
      x87_fnclex(cp->func);
      x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
                                          Offset(struct aos_machine, fpu_rnd_neg_inf)));
   }
}
854
/* Switch the x87 rounding mode to round-to-nearest, tracked at
 * compile time to avoid redundant fldcw instructions (see
 * set_fpu_round_neg_inf()).
 */
static void set_fpu_round_nearest( struct aos_compilation *cp )
{
   if (cp->fpucntl != FPU_RND_NEAREST) {
      cp->fpucntl = FPU_RND_NEAREST;
      x87_fnclex(cp->func);
      x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
                                          Offset(struct aos_machine, fpu_rnd_nearest)));
   }
}
864
865 #if 0
/* (Currently compiled out.)  Emit x87 code computing 2^st(0) via
 * f2xm1/fscale: split into integer and fractional parts, compute
 * 2^frac with f2xm1 + 1, then scale by 2^int with fscale.
 * NOTE(review): appears to assume the rounding mode is already
 * round-toward-negative-infinity (the set_fpu_round_neg_inf call is
 * commented out) -- confirm before re-enabling.
 */
static void x87_emit_ex2( struct aos_compilation *cp )
{
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   int stack = cp->func->x87_stack;

/*    set_fpu_round_neg_inf( cp ); */

   x87_fld(cp->func, st0);      /* a a */
   x87_fprndint( cp->func );    /* int(a) a*/
   x87_fsubr(cp->func, st1, st0);    /* int(a) frc(a) */
   x87_fxch(cp->func, st1);     /* frc(a) int(a) */
   x87_f2xm1(cp->func);         /* (2^frc(a))-1 int(a)*/
   x87_fld1(cp->func);          /* 1 (2^frc(a))-1 int(a) */
   x87_faddp(cp->func, st1);    /* 2^frac(a) int(a) */
   x87_fscale(cp->func);        /* (2^frac(a)*2^int(int(a))) int(a)*/
                                /* 2^a int(a) */
   x87_fstp(cp->func, st1);     /* 2^a */

   assert( stack == cp->func->x87_stack);

}
888 #endif
889
890 #if 0
/* (Currently compiled out.)  Debug helper invoked from generated code
 * by emit_print(): dump the four floats of a shader register.
 */
static void PIPE_CDECL print_reg( const char *msg,
                                  const float *reg )
{
   debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
}
896 #endif
897
898 #if 0
/* (Currently compiled out.)  Emit a cdecl call to print_reg() from
 * the generated code, dumping a shader register at runtime.  Spills
 * and invalidates all xmm registers first so the in-memory copy is
 * current.
 */
static void emit_print( struct aos_compilation *cp,
                        const char *message, /* must point to a static string! */
                        unsigned file,
                        unsigned idx )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
   unsigned i;

   /* There shouldn't be anything on the x87 stack.  Can add this
    * capacity later if need be.
    */
   assert(cp->func->x87_stack == 0);

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.  We're obviously not concerned about performance on this
    * debug path, so here goes:
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);

      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );


   /* Push the arguments:
    */
   x86_lea( cp->func, ecx, arg );
   x86_push( cp->func, ecx );
   x86_push_imm32( cp->func, (int)message );

   /* Call the helper.  Could call debug_printf directly, but
    * print_reg is a nice place to put a breakpoint if need be.
    */
   x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
   x86_call( cp->func, ecx );
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );

   /* Pop caller-save regs
    */
   x86_cdecl_caller_pop_regs( cp->func );

   /* Done...
    */
}
950 #endif
951
952 /**
953 * The traditional instructions. All operate on internal registers
954 * and ignore write masks and swizzling issues.
955 */
956
/* dst = |arg0|, computed as max(arg0, arg0 * -1) using the IMM_NEGS
 * internal constant.
 */
static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
   struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   sse_movaps(cp->func, tmp, arg0);
   sse_mulps(cp->func, tmp, neg);
   sse_maxps(cp->func, tmp, arg0);

   store_dest(cp, &op->Dst[0], tmp);
   return TRUE;
}
970
971 static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
972 {
973 struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
974 struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
975 struct x86_reg dst = get_xmm_writable(cp, arg0);
976
977 sse_addps(cp->func, dst, arg1);
978
979 store_dest(cp, &op->Dst[0], dst);
980 return TRUE;
981 }
982
/* dst = cos(src.x) via the x87 fcos instruction, stored to every
 * channel selected by the writemask.
 */
static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->Src[0], 0);
   x87_fcos(cp->func);
   x87_fstp_dest4(cp, &op->Dst[0]);
   return TRUE;
}
990
/* The dotproduct instructions don't really do that well in sse:
 * XXX: produces wrong results -- disabled.
 *
 * 3-component dot product: multiply componentwise, then sum x, z and
 * y using movhlps/pshufd lane shuffles; result ends up as a scalar in
 * the x channel.
 */
static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);
   /* Now the hard bit: sum the first 3 values:
    */
   sse_movhlps(cp->func, tmp, dst);
   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->Dst[0], dst);
   return TRUE;
}
1013
/* 4-component dot product: multiply componentwise, fold the high pair
 * onto the low pair with movhlps+addps, then add the remaining two
 * lanes; scalar result in the x channel.
 */
static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);

   /* Now the hard bit: sum the values:
    */
   sse_movhlps(cp->func, tmp, dst);
   sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->Dst[0], dst);
   return TRUE;
}
1034
/* Homogeneous dot product: DP3(arg0, arg1) + arg1.w, i.e. arg0.w is
 * treated as 1.0.  Scalar result in the x channel.
 */
static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);

   /* Now the hard bit: sum the values (from DP3):
    */
   sse_movhlps(cp->func, tmp, dst);
   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(cp->func, dst, tmp);
   /* Add arg1.w on top: */
   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->Dst[0], dst);
   return TRUE;
}
1057
/* Distance vector: dst = { 1, arg0.y*arg1.y, arg0.z, arg1.w }.
 * Both operands have selected lanes replaced by 1.0 (via
 * emit_shuf_copy2 with the IMM_ONES constant) and are then multiplied.
 */
static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
    struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
    struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
    struct x86_reg dst = aos_get_xmm_reg(cp);
    struct x86_reg tmp = aos_get_xmm_reg(cp);
    struct x86_reg ones = aos_get_internal(cp, IMM_ONES);

/*    dst[0] = 1.0     * 1.0F; */
/*    dst[1] = arg0[1] * arg1[1]; */
/*    dst[2] = arg0[2] * 1.0; */
/*    dst[3] = 1.0     * arg1[3]; */

    emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
    emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
    sse_mulps(cp->func, dst, tmp);

    aos_release_xmm_reg(cp, tmp.idx);
    store_dest(cp, &op->Dst[0], dst);
    return TRUE;
}
1079
/* dst = log2(src.x) using x87 fyl2x (computes st1 * log2(st0), with
 * st1 preloaded to 1.0), stored to all writemask-selected channels.
 */
static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld1(cp->func);          /* 1 */
   x87_fld_src(cp, &op->Src[0], 0); /* a0 1 */
   x87_fyl2x(cp->func);         /* log2(a0) */
   x87_fstp_dest4(cp, &op->Dst[0]);
   return TRUE;
}
1088
1089 #if 0
/* (Currently compiled out.)  dst = 2^src.x via x87_emit_ex2(). */
static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->Src[0], 0);
   x87_emit_ex2(cp);
   x87_fstp_dest4(cp, &op->Dst[0]);
   return TRUE;
}
1097 #endif
1098
1099
/* dst = floor(src), per channel: switch the x87 rounding mode to
 * round-toward-negative-infinity and use frndint.
 */
static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
   unsigned writemask = op->Dst[0].Register.WriteMask;
   int i;

   set_fpu_round_neg_inf( cp );

   /* Load all sources first to avoid aliasing
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->Src[0], i);
      }
   }

   /* Stack pops in reverse order of the loads above: */
   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fprndint( cp->func );
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1125
1126
/* dst = round-to-nearest(src), per channel: same structure as
 * emit_FLR() but with the round-to-nearest control word.
 */
static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
   unsigned writemask = op->Dst[0].Register.WriteMask;
   int i;

   set_fpu_round_nearest( cp );

   /* Load all sources first to avoid aliasing
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->Src[0], i);
      }
   }

   /* Stack pops in reverse order of the loads above: */
   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fprndint( cp->func );
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1152
1153
/* dst = src - floor(src), per channel: duplicate each value on the
 * x87 stack, round one copy toward negative infinity and subtract.
 */
static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   unsigned writemask = op->Dst[0].Register.WriteMask;
   int i;

   set_fpu_round_neg_inf( cp );

   /* suck all the source values onto the stack before writing out any
    * dst, which may alias...
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->Src[0], i);
      }
   }

   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fld(cp->func, st0);     /* a a */
         x87_fprndint( cp->func );   /* flr(a) a */
         x87_fsubp(cp->func, st1);   /* frc(a) */
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1184
1185
1186
1187
1188
1189
/* Lighting coefficients: implemented by emitting a cdecl call out to
 * a C helper (a per-callsite function pointer from the lit_info
 * table, or the generic aos_do_lit fallback).  Arguments are pushed
 * right-to-left: lit_count, &arg0, &result, machine pointer --
 * NOTE(review): so the helper's signature is presumably
 * (machine, result, arg0, count); confirm against aos_do_lit.
 */
static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned writemask = op->Dst[0].Register.WriteMask;
   unsigned lit_count = cp->lit_count++;
   struct x86_reg result, arg0;
   unsigned i;

#if 1
   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }
#endif

   /* Partial writemasks go via a machine-struct scratch slot and a
    * masked store afterwards:
    */
   if (writemask != TGSI_WRITEMASK_XYZW)
      result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
   else
      result = get_dst_ptr(cp, &op->Dst[0]);


   arg0 = fetch_src( cp, &op->Src[0] );
   if (arg0.file == file_XMM) {
      /* The helper takes a memory argument; flush an xmm source: */
      struct x86_reg tmp = x86_make_disp(cp->machine_EDX,
                                         Offset(struct aos_machine, tmp[1]));
      sse_movaps( cp->func, tmp, arg0 );
      arg0 = tmp;
   }



   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Push the arguments:
    */
   x86_push_imm32( cp->func, lit_count );

   x86_lea( cp->func, ecx, arg0 );
   x86_push( cp->func, ecx );

   x86_lea( cp->func, ecx, result );
   x86_push( cp->func, ecx );

   x86_push( cp->func, cp->machine_EDX );

   /* Pick a specialized helper for this callsite if available: */
   if (lit_count < MAX_LIT_INFO) {
      x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,
                                             Offset(struct aos_machine, lit_info) +
                                             lit_count * sizeof(struct lit_info) +
                                             Offset(struct lit_info, func)));
   }
   else {
      x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
   }

   x86_call( cp->func, ecx );

   /* cdecl: caller pops the four argument slots. */
   x86_pop( cp->func, ecx );    /* fixme... */
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );

   x86_cdecl_caller_pop_regs( cp->func );

   if (writemask != TGSI_WRITEMASK_XYZW) {
      store_dest( cp,
                  &op->Dst[0],
                  get_xmm_writable( cp, result ) );
   }

   return TRUE;
}
1268
#if 0
/* Disabled inline x87 implementation of LIT, kept for reference.
 * Computes the y/z coefficients with fyl2x/ex2 on the x87 stack and
 * uses fcmov to clamp; superseded by the call-out in emit_LIT().
 */
static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
   unsigned writemask = op->Dst[0].Register.WriteMask;

   if (writemask & TGSI_WRITEMASK_YZ) {
      struct x86_reg st1 = x86_make_reg(file_x87, 1);
      struct x86_reg st2 = x86_make_reg(file_x87, 2);

      /* a1' = a1 <= 0 ? 1 : a1;
       */
      x87_fldz(cp->func);                           /* 1 0 */
#if 1
      x87_fld1(cp->func);                           /* 1 0 */
#else
      /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
       */
      x87_fldz(cp->func);                           /* 1 0 */
#endif
      x87_fld_src(cp, &op->Src[0], 1);              /* a1 1 0 */
      x87_fcomi(cp->func, st2);                     /* a1 1 0 */
      x87_fcmovb(cp->func, st1);                    /* a1' 1 0 */
      x87_fstp(cp->func, st1);                      /* a1' 0 */
      x87_fstp(cp->func, st1);                      /* a1' */

      x87_fld_src(cp, &op->Src[0], 3);              /* a3 a1' */
      x87_fxch(cp->func, st1);                      /* a1' a3 */


      /* Compute pow(a1, a3)
       */
      x87_fyl2x(cp->func);                          /* a3*log2(a1) */
      x87_emit_ex2( cp );                           /* 2^(a3*log2(a1)) */


      /* a0' = max2(a0, 0):
       */
      x87_fldz(cp->func);                           /* 0 r2 */
      x87_fld_src(cp, &op->Src[0], 0);              /* a0 0 r2 */
      x87_fcomi(cp->func, st1);
      x87_fcmovb(cp->func, st1);                    /* a0' 0 r2 */

      x87_fst_or_nop(cp->func, writemask, 1, dst);  /* result[1] = a0' */

      x87_fcomi(cp->func, st1);                     /* a0' 0 r2 */
      x87_fcmovnbe(cp->func, st2);                  /* r2' 0' r2 */

      x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */
      x87_fpop(cp->func);                           /* r2 */
      x87_fpop(cp->func);
   }

   /* x and w channels of LIT are constant 1.0. */
   if (writemask & TGSI_WRITEMASK_XW) {
      x87_fld1(cp->func);
      x87_fst_or_nop(cp->func, writemask, 0, dst);
      x87_fstp_or_pop(cp->func, writemask, 3, dst);
   }

   return TRUE;
}
#endif
1331
1332
1333
1334 static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1335 {
1336 struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1337 struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1338 struct x86_reg dst = get_xmm_writable(cp, arg0);
1339
1340 sse_maxps(cp->func, dst, arg1);
1341
1342 store_dest(cp, &op->Dst[0], dst);
1343 return TRUE;
1344 }
1345
1346
1347 static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1348 {
1349 struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1350 struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1351 struct x86_reg dst = get_xmm_writable(cp, arg0);
1352
1353 sse_minps(cp->func, dst, arg1);
1354
1355 store_dest(cp, &op->Dst[0], dst);
1356 return TRUE;
1357 }
1358
1359 static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1360 {
1361 struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1362 struct x86_reg dst = get_xmm_writable(cp, arg0);
1363
1364 /* potentially nothing to do */
1365
1366 store_dest(cp, &op->Dst[0], dst);
1367 return TRUE;
1368 }
1369
1370 static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1371 {
1372 struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1373 struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1374 struct x86_reg dst = get_xmm_writable(cp, arg0);
1375
1376 sse_mulps(cp->func, dst, arg1);
1377
1378 store_dest(cp, &op->Dst[0], dst);
1379 return TRUE;
1380 }
1381
1382
1383 static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1384 {
1385 struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1386 struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1387 struct x86_reg arg2 = fetch_src(cp, &op->Src[2]);
1388
1389 /* If we can't clobber old contents of arg0, get a temporary & copy
1390 * it there, then clobber it...
1391 */
1392 arg0 = get_xmm_writable(cp, arg0);
1393
1394 sse_mulps(cp->func, arg0, arg1);
1395 sse_addps(cp->func, arg0, arg2);
1396 store_dest(cp, &op->Dst[0], arg0);
1397 return TRUE;
1398 }
1399
1400
1401
/* A wrapper for powf().
 * Makes sure it is cdecl and operates on floats.
 * With FAST_MATH enabled, the lower-precision util_fast_pow()
 * approximation is used instead of libm powf().
 */
static float PIPE_CDECL _powerf( float x, float y )
{
#if FAST_MATH
   return util_fast_pow(x, y);
#else
   return powf( x, y );
#endif
}
1413
#if FAST_MATH
/* cdecl wrapper over the fast exp2 approximation; called from the
 * code generated by emit_EXPBASE2().
 * NOTE(review): the identifier _exp2 is in the implementation's
 * reserved namespace and may collide with a libm symbol on some
 * platforms -- consider renaming.
 */
static float PIPE_CDECL _exp2(float x)
{
   return util_fast_exp2(x);
}
#endif
1420
1421
/* POW: dst = pow(src0.x, src1.x), broadcast via x87_fstp_dest4().
 *
 * Really not sufficient -- need to check for conditions that could
 * generate inf/nan values, which will slow things down hugely.
 */
static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
#if 0
   /* Disabled inline x87 version (fyl2x + ex2). */
   x87_fld_src(cp, &op->Src[1], 0);  /* a1.x */
   x87_fld_src(cp, &op->Src[0], 0);  /* a0.x a1.x */
   x87_fyl2x(cp->func);              /* a1*log2(a0) */

   x87_emit_ex2( cp );               /* 2^(a1*log2(a0)) */

   x87_fstp_dest4(cp, &op->Dst[0]);
#else
   uint i;

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too -- the callee may clobber any of them.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Make room for the two float arguments on the stack... */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );

   /* ...and store src1.x (exponent, at offset 4) and src0.x (base,
    * at offset 0) there via the x87 stack.
    */
   x87_fld_src( cp, &op->Src[1], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
   x87_fld_src( cp, &op->Src[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* tmp_EAX has been pushed & will be restored below */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
   x86_call( cp->func, cp->tmp_EAX );

   /* Pop the two arguments back off the stack. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Note retval on x87 stack:
    */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->Dst[0] );
#endif
   return TRUE;
}
1474
1475
#if FAST_MATH
/* EX2 (fast path): dst = 2^src.x via a call out to _exp2(), with the
 * result broadcast through x87_fstp_dest4().  Same call-out pattern
 * as emit_POW().
 */
static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   uint i;

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too -- the callee may clobber any of them.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Make room for the single float argument and store src.x there. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) );

   x87_fld_src( cp, &op->Src[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* tmp_EAX has been pushed & will be restored below */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
   x86_call( cp->func, cp->tmp_EAX );

   /* Pop the argument back off the stack. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Note retval on x87 stack:
    */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->Dst[0] );

   return TRUE;
}
#endif
1516
1517
1518 static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1519 {
1520 struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1521 struct x86_reg dst = aos_get_xmm_reg(cp);
1522
1523 if (cp->have_sse2) {
1524 sse2_rcpss(cp->func, dst, arg0);
1525 /* extend precision here...
1526 */
1527 }
1528 else {
1529 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1530 sse_movss(cp->func, dst, ones);
1531 sse_divss(cp->func, dst, arg0);
1532 }
1533
1534 store_scalar_dest(cp, &op->Dst[0], dst);
1535 return TRUE;
1536 }
1537
1538
/* Although rsqrtps() and rcpps() are low precision on some/all SSE
 * implementations, it is possible to improve its precision at
 * fairly low cost, using a newton/raphson step, as below:
 *
 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
 * or:
 * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
 *
 *
 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   if (0) {
      /* Disabled: raw low-precision rsqrtss, no refinement. */
      struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
      struct x86_reg r = aos_get_xmm_reg(cp);
      sse_rsqrtss(cp->func, r, arg0);
      store_scalar_dest(cp, &op->Dst[0], r);
      return TRUE;
   }
   else {
      struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
      struct x86_reg r = aos_get_xmm_reg(cp);

      /* IMM_RSQ holds {-0.5, 1.5} -- the two constants of the
       * refinement formula above.
       */
      struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
      struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
      struct x86_reg src = get_xmm_writable( cp, arg0 );
      struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
      struct x86_reg tmp = aos_get_xmm_reg(cp);

      /* tmp = max(src * -1, src) == fabs(src): guards the rsqrt
       * against negative inputs.
       */
      sse_movaps(cp->func, tmp, src);
      sse_mulps(cp->func, tmp, neg);
      sse_maxps(cp->func, tmp, src);

      /* One newton/raphson step on the rsqrtss estimate: */
      sse_rsqrtss( cp->func, r, tmp );             /* rsqrtss(a) */
      sse_mulss( cp->func, tmp, neg_half );        /* -.5 * a */
      sse_mulss( cp->func, tmp, r );               /* -.5 * a * r */
      sse_mulss( cp->func, tmp, r );               /* -.5 * a * r * r */
      sse_addss( cp->func, tmp, one_point_five );  /* 1.5 - .5 * a * r * r */
      sse_mulss( cp->func, r, tmp );               /* r * (1.5 - .5 * a * r * r) */

      store_scalar_dest(cp, &op->Dst[0], r);

      aos_release_xmm_reg(cp, tmp.idx);

      return TRUE;
   }
}
1588
1589
1590 static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1591 {
1592 struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1593 struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1594 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1595 struct x86_reg dst = get_xmm_writable(cp, arg0);
1596
1597 sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
1598 sse_andps(cp->func, dst, ones);
1599
1600 store_dest(cp, &op->Dst[0], dst);
1601 return TRUE;
1602 }
1603
/* SIN: scalar sine of src.x via the x87 fsin instruction, stored to
 * the destination through x87_fstp_dest4().
 */
static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->Src[0], 0);
   x87_fsin(cp->func);
   x87_fstp_dest4(cp, &op->Dst[0]);
   return TRUE;
}
1611
1612
1613
1614 static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1615 {
1616 struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1617 struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1618 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1619 struct x86_reg dst = get_xmm_writable(cp, arg0);
1620
1621 sse_cmpps(cp->func, dst, arg1, cc_LessThan);
1622 sse_andps(cp->func, dst, ones);
1623
1624 store_dest(cp, &op->Dst[0], dst);
1625 return TRUE;
1626 }
1627
1628 static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1629 {
1630 struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1631 struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1632 struct x86_reg dst = get_xmm_writable(cp, arg0);
1633
1634 sse_subps(cp->func, dst, arg1);
1635
1636 store_dest(cp, &op->Dst[0], dst);
1637 return TRUE;
1638 }
1639
1640 static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1641 {
1642 struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1643 struct x86_reg tmp0 = aos_get_xmm_reg(cp);
1644
1645 sse2_cvttps2dq(cp->func, tmp0, arg0);
1646 sse2_cvtdq2ps(cp->func, tmp0, tmp0);
1647
1648 store_dest(cp, &op->Dst[0], tmp0);
1649 return TRUE;
1650 }
1651
/* XPD: 3-component cross product, dst.xyz = src0 x src1 (dst.w is
 * left undefined).  Two YZXW shuffles let a single subtract produce
 * all three components, which are then rotated into place.
 */
static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
   struct x86_reg tmp0 = aos_get_xmm_reg(cp);
   struct x86_reg tmp1 = aos_get_xmm_reg(cp);

   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));   /* tmp1 = arg1.yzxw */
   sse_mulps(cp->func, tmp1, arg0);                 /* tmp1 = arg0 * arg1.yzxw */
   emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));   /* tmp0 = arg0.yzxw */
   sse_mulps(cp->func, tmp0, arg1);                 /* tmp0 = arg0.yzxw * arg1 */
   sse_subps(cp->func, tmp1, tmp0);
   sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));  /* rotate into xyz order */

   /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
   /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
   /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
   /* dst[3] is undef */


   aos_release_xmm_reg(cp, tmp0.idx);
   store_dest(cp, &op->Dst[0], tmp1);
   return TRUE;
}
1676
1677
1678
/* Translate a single TGSI instruction into SSE/x87 code.
 *
 * Returns FALSE for any opcode this backend does not handle, which
 * makes build_vertex_program() abandon the whole compilation so the
 * caller can fall back to a different shader variant.
 */
static boolean
emit_instruction( struct aos_compilation *cp,
                  struct tgsi_full_instruction *inst )
{
   /* Each emitter is expected to leave the x87 stack balanced. */
   x87_assert_stack_empty(cp->func);

   switch( inst->Instruction.Opcode ) {
   case TGSI_OPCODE_MOV:
      return emit_MOV( cp, inst );

   case TGSI_OPCODE_LIT:
      return emit_LIT(cp, inst);

   case TGSI_OPCODE_RCP:
      return emit_RCP(cp, inst);

   case TGSI_OPCODE_RSQ:
      return emit_RSQ(cp, inst);

   case TGSI_OPCODE_EXP:
      /*return emit_EXP(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_LOG:
      /*return emit_LOG(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_MUL:
      return emit_MUL(cp, inst);

   case TGSI_OPCODE_ADD:
      return emit_ADD(cp, inst);

   case TGSI_OPCODE_DP3:
      return emit_DP3(cp, inst);

   case TGSI_OPCODE_DP4:
      return emit_DP4(cp, inst);

   case TGSI_OPCODE_DST:
      return emit_DST(cp, inst);

   case TGSI_OPCODE_MIN:
      return emit_MIN(cp, inst);

   case TGSI_OPCODE_MAX:
      return emit_MAX(cp, inst);

   case TGSI_OPCODE_SLT:
      return emit_SLT(cp, inst);

   case TGSI_OPCODE_SGE:
      return emit_SGE(cp, inst);

   case TGSI_OPCODE_MAD:
      return emit_MAD(cp, inst);

   case TGSI_OPCODE_SUB:
      return emit_SUB(cp, inst);

   case TGSI_OPCODE_LRP:
      /*return emit_LERP(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_FRC:
      return emit_FRC(cp, inst);

   case TGSI_OPCODE_CLAMP:
      /*return emit_CLAMP(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_FLR:
      return emit_FLR(cp, inst);

   case TGSI_OPCODE_ROUND:
      return emit_RND(cp, inst);

   case TGSI_OPCODE_EX2:
#if FAST_MATH
      return emit_EXPBASE2(cp, inst);
#elif 0
      /* this seems to fail for "larger" exponents.
       * See glean tvertProg1's EX2 test.
       */
      return emit_EX2(cp, inst);
#else
      return FALSE;
#endif

   case TGSI_OPCODE_LG2:
      return emit_LG2(cp, inst);

   case TGSI_OPCODE_POW:
      return emit_POW(cp, inst);

   case TGSI_OPCODE_XPD:
      return emit_XPD(cp, inst);

   case TGSI_OPCODE_ABS:
      return emit_ABS(cp, inst);

   case TGSI_OPCODE_DPH:
      return emit_DPH(cp, inst);

   case TGSI_OPCODE_COS:
      return emit_COS(cp, inst);

   case TGSI_OPCODE_SIN:
      return emit_SIN(cp, inst);

   case TGSI_OPCODE_TRUNC:
      return emit_TRUNC(cp, inst);

   case TGSI_OPCODE_END:
      return TRUE;

   default:
      /* Unhandled opcode: bail out of the compilation. */
      return FALSE;
   }
}
1799
1800
1801 static boolean emit_viewport( struct aos_compilation *cp )
1802 {
1803 struct x86_reg pos = aos_get_shader_reg_xmm(cp,
1804 TGSI_FILE_OUTPUT,
1805 cp->vaos->draw->vs.position_output );
1806
1807 struct x86_reg scale = x86_make_disp(cp->machine_EDX,
1808 Offset(struct aos_machine, scale));
1809
1810 struct x86_reg translate = x86_make_disp(cp->machine_EDX,
1811 Offset(struct aos_machine, translate));
1812
1813 sse_mulps(cp->func, pos, scale);
1814 sse_addps(cp->func, pos, translate);
1815
1816 aos_adopt_xmm_reg( cp,
1817 pos,
1818 TGSI_FILE_OUTPUT,
1819 cp->vaos->draw->vs.position_output,
1820 TRUE );
1821 return TRUE;
1822 }
1823
1824
/* This is useful to be able to see the results on softpipe. Doesn't
 * do proper clipping, just assumes the backend can do it during
 * rasterization -- for debug only...
 *
 * Performs the perspective divide and viewport transform:
 * pos = pos * scale * (1/pos.w) + translate, with pos.w replaced by
 * the reciprocal (rhw).
 */
static boolean emit_rhw_viewport( struct aos_compilation *cp )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg pos = aos_get_shader_reg_xmm(cp,
                                               TGSI_FILE_OUTPUT,
                                               cp->vaos->draw->vs.position_output);

   struct x86_reg scale = x86_make_disp(cp->machine_EDX,
                                        Offset(struct aos_machine, scale));

   struct x86_reg translate = x86_make_disp(cp->machine_EDX,
                                            Offset(struct aos_machine, translate));



   /* tmp = 1/pos.w broadcast to all four channels */
   emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
   sse2_rcpss(cp->func, tmp, tmp);
   sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));

   /* pos = pos * scale * (1/w) + translate */
   sse_mulps(cp->func, pos, scale);
   sse_mulps(cp->func, pos, tmp);
   sse_addps(cp->func, pos, translate);

   /* Set pos[3] = 1/w (tmp still holds the broadcast reciprocal).
    */
   mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);

   aos_adopt_xmm_reg( cp,
                      pos,
                      TGSI_FILE_OUTPUT,
                      cp->vaos->draw->vs.position_output,
                      TRUE );
   return TRUE;
}
1863
1864
#if 0
/* Disabled: copy an immediate's float values into the machine's
 * immediate table during parsing.  The active code instead points
 * machine->immediates at the shader's own immediates array (see
 * vaos_run_linear / vaos_run_elts).
 */
static boolean note_immediate( struct aos_compilation *cp,
                               struct tgsi_full_immediate *imm )
{
   unsigned pos = cp->num_immediates++;
   unsigned j;

   assert( imm->Immediate.NrTokens <= 4 + 1 );
   for (j = 0; j < imm->Immediate.NrTokens - 1; j++) {
      cp->vaos->machine->immediate[pos][j] = imm->u[j].Float;
   }

   return TRUE;
}
#endif
1880
1881
1882
1883
1884 static void find_last_write_outputs( struct aos_compilation *cp )
1885 {
1886 struct tgsi_parse_context parse;
1887 unsigned this_instruction = 0;
1888 unsigned i;
1889
1890 tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
1891
1892 while (!tgsi_parse_end_of_tokens( &parse )) {
1893
1894 tgsi_parse_token( &parse );
1895
1896 if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
1897 continue;
1898
1899 for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
1900 if (parse.FullToken.FullInstruction.Dst[i].Register.File ==
1901 TGSI_FILE_OUTPUT)
1902 {
1903 unsigned idx = parse.FullToken.FullInstruction.Dst[i].Register.Index;
1904 cp->output_last_write[idx] = this_instruction;
1905 }
1906 }
1907
1908 this_instruction++;
1909 }
1910
1911 tgsi_parse_free( &parse );
1912 }
1913
1914
/* 1-based positions of the cdecl arguments to the generated
 * functions; used with x86_fn_arg() in build_vertex_program().
 */
#define ARG_MACHINE 1
#define ARG_START_ELTS 2
#define ARG_COUNT 3
#define ARG_OUTBUF 4
1919
1920
/* JIT-compile one complete vertex-shader run function for a variant.
 *
 * Generated code layout: save callee-save regs, load the four cdecl
 * arguments, early-out when count == 0, save/set FPU state, then per
 * vertex: fetch inputs, run the translated TGSI body, apply the
 * optional rhw/viewport transform, emit outputs, and advance to the
 * next vertex until count reaches zero.
 *
 * \param linear  TRUE builds func[0] (linear fetch),
 *                FALSE builds func[1] (indexed/elts fetch).
 * \return FALSE if any instruction could not be translated.
 */
static boolean build_vertex_program( struct draw_vs_variant_aos_sse *variant,
                                     boolean linear )
{
   struct tgsi_parse_context parse;
   struct aos_compilation cp;
   unsigned fixup, label;

   util_init_math();

   tgsi_parse_init( &parse, variant->base.vs->state.tokens );

   memset(&cp, 0, sizeof(cp));

   cp.insn_counter = 1;
   cp.vaos = variant;
   cp.have_sse2 = 1;
   cp.func = &variant->func[ linear ? 0 : 1 ];

   /* Fixed roles for the x86 registers used by the generated code: */
   cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   cp.idx_EBX = x86_make_reg(file_REG32, reg_BX);
   cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
   cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
   cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
   cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
   cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );

   x86_init_func(cp.func);

   find_last_write_outputs(&cp);

   /* Save the callee-save registers we are about to clobber. */
   x86_push(cp.func, cp.idx_EBX);
   x86_push(cp.func, cp.count_ESI);
   x86_push(cp.func, cp.temp_EBP);


   /* Load arguments into regs:
    */
   x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
   x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
   x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
   x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));


   /* Compare count to zero and possibly bail.
    */
   x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
   x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
   fixup = x86_jcc_forward(cp.func, cc_E);


   save_fpu_state( &cp );
   set_fpu_round_nearest( &cp );

   aos_init_inputs( &cp, linear );

   cp.x86_reg[0] = 0;
   cp.x86_reg[1] = 0;

   /* Note address for loop jump
    */
   label = x86_get_label(cp.func);
   {
      /* Fetch inputs...  TODO:  fetch lazily...
       */
      if (!aos_fetch_inputs( &cp, linear ))
         goto fail;

      /* Emit the shader:
       */
      while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
      {
         tgsi_parse_token( &parse );

         switch (parse.FullToken.Token.Type) {
         case TGSI_TOKEN_TYPE_IMMEDIATE:
#if 0
            if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
               goto fail;
#endif
            break;

         case TGSI_TOKEN_TYPE_INSTRUCTION:
            if (DISASSEM)
               tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );

            if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
               goto fail;
            break;
         }

         x87_assert_stack_empty(cp.func);
         cp.insn_counter++;

         if (DISASSEM)
            debug_printf("\n");
      }


      {
         /* Drop xmm-cached values that are not OUTPUTs; only output
          * values need to survive to the emit stage below.
          */
         unsigned i;
         for (i = 0; i < 8; i++) {
            if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
               cp.xmm[i].file = TGSI_FILE_NULL;
               cp.xmm[i].dirty = 0;
            }
         }
      }

      if (cp.error)
         goto fail;

      if (cp.vaos->base.key.clip) {
         /* not really handling clipping, just do the rhw so we can
          * see the results...
          */
         emit_rhw_viewport(&cp);
      }
      else if (cp.vaos->base.key.viewport) {
         emit_viewport(&cp);
      }

      /* Emit output...  TODO: do this eagerly after the last write to a
       * given output.
       */
      if (!aos_emit_outputs( &cp ))
         goto fail;


      /* Next vertex:
       */
      x86_lea(cp.func,
              cp.outbuf_ECX,
              x86_make_disp(cp.outbuf_ECX,
                            cp.vaos->base.key.output_stride));

      /* Incr index
       */
      aos_incr_inputs( &cp, linear );
   }
   /* decr count, loop if not zero
    */
   x86_dec(cp.func, cp.count_ESI);
   x86_jcc(cp.func, cc_NZ, label);

   restore_fpu_state(&cp);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(cp.func, fixup);

   /* Exit mmx state?
    */
   if (cp.func->need_emms)
      mmx_emms(cp.func);

   /* Restore callee-save registers and return. */
   x86_pop(cp.func, cp.temp_EBP);
   x86_pop(cp.func, cp.count_ESI);
   x86_pop(cp.func, cp.idx_EBX);

   x87_assert_stack_empty(cp.func);
   x86_ret(cp.func);

   tgsi_parse_free( &parse );
   return !cp.error;

 fail:
   tgsi_parse_free( &parse );
   return FALSE;
}
2090
2091
2092 /** cast wrapper */
2093 static INLINE struct draw_vs_variant_aos_sse *
2094 draw_vs_variant_aos_sse(struct draw_vs_variant *variant)
2095 {
2096 return (struct draw_vs_variant_aos_sse *) variant;
2097 }
2098
2099
2100 static void vaos_set_buffer( struct draw_vs_variant *variant,
2101 unsigned buf,
2102 const void *ptr,
2103 unsigned stride,
2104 unsigned max_stride)
2105 {
2106 struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
2107
2108 if (buf < vaos->nr_vb) {
2109 vaos->buffer[buf].base_ptr = (char *)ptr;
2110 vaos->buffer[buf].stride = stride;
2111 }
2112
2113 if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
2114 }
2115
2116
2117
2118 static void PIPE_CDECL vaos_run_elts( struct draw_vs_variant *variant,
2119 const unsigned *elts,
2120 unsigned count,
2121 void *output_buffer )
2122 {
2123 struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
2124 struct aos_machine *machine = vaos->draw->vs.aos_machine;
2125 unsigned i;
2126
2127 if (0) debug_printf("%s %d\n", __FUNCTION__, count);
2128
2129 machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
2130 for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
2131 machine->constants[i] = vaos->draw->vs.aligned_constants[i];
2132 }
2133 machine->immediates = vaos->base.vs->immediates;
2134 machine->buffer = vaos->buffer;
2135
2136 vaos->gen_run_elts( machine,
2137 elts,
2138 count,
2139 output_buffer );
2140 }
2141
2142 static void PIPE_CDECL vaos_run_linear( struct draw_vs_variant *variant,
2143 unsigned start,
2144 unsigned count,
2145 void *output_buffer )
2146 {
2147 struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
2148 struct aos_machine *machine = vaos->draw->vs.aos_machine;
2149 unsigned i;
2150
2151 if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count,
2152 vaos->base.key.const_vbuffers);
2153
2154 machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
2155 for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
2156 machine->constants[i] = vaos->draw->vs.aligned_constants[i];
2157 }
2158 machine->immediates = vaos->base.vs->immediates;
2159 machine->buffer = vaos->buffer;
2160
2161 vaos->gen_run_linear( machine,
2162 start,
2163 count,
2164 output_buffer );
2165
2166 /* Sanity spot checks to make sure we didn't trash our constants */
2167 assert(machine->internal[IMM_ONES][0] == 1.0f);
2168 assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
2169 assert(machine->internal[IMM_NEGS][0] == -1.0f);
2170 }
2171
2172
2173
2174 static void vaos_destroy( struct draw_vs_variant *variant )
2175 {
2176 struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
2177
2178 FREE( vaos->buffer );
2179
2180 x86_release_func( &vaos->func[0] );
2181 x86_release_func( &vaos->func[1] );
2182
2183 FREE(vaos);
2184 }
2185
2186
2187
2188 static struct draw_vs_variant *variant_aos_sse( struct draw_vertex_shader *vs,
2189 const struct draw_vs_variant_key *key )
2190 {
2191 unsigned i;
2192 struct draw_vs_variant_aos_sse *vaos = CALLOC_STRUCT(draw_vs_variant_aos_sse);
2193
2194 if (!vaos)
2195 goto fail;
2196
2197 vaos->base.key = *key;
2198 vaos->base.vs = vs;
2199 vaos->base.set_buffer = vaos_set_buffer;
2200 vaos->base.destroy = vaos_destroy;
2201 vaos->base.run_linear = vaos_run_linear;
2202 vaos->base.run_elts = vaos_run_elts;
2203
2204 vaos->draw = vs->draw;
2205
2206 for (i = 0; i < key->nr_inputs; i++)
2207 vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
2208
2209 vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
2210 if (!vaos->buffer)
2211 goto fail;
2212
2213 if (0)
2214 debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
2215
2216 #if 0
2217 tgsi_dump(vs->state.tokens, 0);
2218 #endif
2219
2220 if (!build_vertex_program( vaos, TRUE ))
2221 goto fail;
2222
2223 if (!build_vertex_program( vaos, FALSE ))
2224 goto fail;
2225
2226 vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
2227 if (!vaos->gen_run_linear)
2228 goto fail;
2229
2230 vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
2231 if (!vaos->gen_run_elts)
2232 goto fail;
2233
2234 return &vaos->base;
2235
2236 fail:
2237 if (vaos && vaos->buffer)
2238 FREE(vaos->buffer);
2239
2240 if (vaos)
2241 x86_release_func( &vaos->func[0] );
2242
2243 if (vaos)
2244 x86_release_func( &vaos->func[1] );
2245
2246 FREE(vaos);
2247
2248 return NULL;
2249 }
2250
2251
/* Public entry point: try to build an AOS/SSE variant, falling back
 * to the generic (interpreted) variant when code generation fails.
 */
struct draw_vs_variant *
draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs,
                                const struct draw_vs_variant_key *key )
{
   struct draw_vs_variant *variant = variant_aos_sse( vs, key );

   if (!variant)
      variant = draw_vs_create_variant_generic( vs, key );

   return variant;
}
2264
2265
2266
2267 #endif /* PIPE_ARCH_X86 */