/* draw: michal's patch for calling powf... teapot still not quite right
 * [mesa.git] src/gallium/auxiliary/draw/draw_vs_aos.c
 */
1 /*
2 * Mesa 3-D graphics library
3 * Version: 6.3
4 *
5 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /**
26 * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
27 * using the rtasm runtime assembler. Based on the old
28 * t_vb_arb_program_sse.c
29 */
30
31
32 #include "pipe/p_util.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "tgsi/util/tgsi_parse.h"
35 #include "tgsi/util/tgsi_util.h"
36 #include "tgsi/exec/tgsi_exec.h"
37 #include "tgsi/util/tgsi_dump.h"
38
39 #include "draw_vs.h"
40 #include "draw_vs_aos.h"
41
42 #include "rtasm/rtasm_x86sse.h"
43
44 #ifdef PIPE_ARCH_X86
45 #define DISASSEM 0
46
/* Debug names for the register files, indexed by the TGSI_FILE_* enum
 * values (the trailing "INTERNAL" entry corresponds to AOS_FILE_INTERNAL).
 * Used only for debug printouts, e.g. in spill().
 */
static const char *files[] =
{
   "NULL",
   "CONST",
   "IN",
   "OUT",
   "TEMP",
   "SAMP",
   "ADDR",
   "IMM",
   "INTERNAL",
};
59
60 static INLINE boolean eq( struct x86_reg a,
61 struct x86_reg b )
62 {
63 return (a.file == b.file &&
64 a.idx == b.idx &&
65 a.mod == b.mod &&
66 a.disp == b.disp);
67 }
68
69 struct x86_reg aos_get_x86( struct aos_compilation *cp,
70 unsigned value )
71 {
72 if (cp->ebp != value) {
73 unsigned offset;
74
75 switch (value) {
76 case X86_IMMEDIATES:
77 offset = Offset(struct aos_machine, immediates);
78 break;
79 case X86_CONSTANTS:
80 offset = Offset(struct aos_machine, constants);
81 break;
82 case X86_ATTRIBS:
83 offset = Offset(struct aos_machine, attrib);
84 break;
85 default:
86 assert(0);
87 offset = 0;
88 }
89
90 x86_mov(cp->func, cp->temp_EBP,
91 x86_make_disp(cp->machine_EDX, offset));
92 /* x86_deref(x86_make_disp(cp->machine_EDX, offset))); */
93
94 cp->ebp = value;
95 }
96
97 return cp->temp_EBP;
98 }
99
100
/* Return a base+displacement pointer to the in-memory home of shader
 * register file[idx].  Inputs, outputs, temporaries and internals live
 * directly inside the aos_machine struct (addressed off EDX);
 * immediates and constants live in separately-addressed arrays reached
 * through the cached EBP pointer (see aos_get_x86).
 */
static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
                                  unsigned file,
                                  unsigned idx )
{
   struct x86_reg ptr = cp->machine_EDX;

   switch (file) {
   case TGSI_FILE_INPUT:
      return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));

   case TGSI_FILE_OUTPUT:
      return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));

   case TGSI_FILE_TEMPORARY:
      return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));

   case AOS_FILE_INTERNAL:
      return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));

   case TGSI_FILE_IMMEDIATE:
      /* 4 floats per register: */
      return x86_make_disp(aos_get_x86(cp, X86_IMMEDIATES), idx * 4 * sizeof(float));

   case TGSI_FILE_CONSTANT:
      return x86_make_disp(aos_get_x86(cp, X86_CONSTANTS), idx * 4 * sizeof(float));

   default:
      ERROR(cp, "unknown reg file");
      return x86_make_reg(0,0);
   }
}
131
132
133
/* x87 FPU control word layout (see the FLDCW/FNSTCW descriptions in the
 * IA-32 manuals): exception masks occupy bits 0-5, precision control
 * bits 8-9, rounding control bits 10-11, infinity control bit 12.
 */
#define X87_CW_EXCEPTION_INV_OP       (1<<0)
#define X87_CW_EXCEPTION_DENORM_OP    (1<<1)
#define X87_CW_EXCEPTION_ZERO_DIVIDE  (1<<2)
#define X87_CW_EXCEPTION_OVERFLOW     (1<<3)
#define X87_CW_EXCEPTION_UNDERFLOW    (1<<4)
#define X87_CW_EXCEPTION_PRECISION    (1<<5)
#define X87_CW_PRECISION_SINGLE       (0<<8)
#define X87_CW_PRECISION_RESERVED     (1<<8)
#define X87_CW_PRECISION_DOUBLE       (2<<8)
#define X87_CW_PRECISION_DOUBLE_EXT   (3<<8)
#define X87_CW_PRECISION_MASK         (3<<8)
#define X87_CW_ROUND_NEAREST          (0<<10)
#define X87_CW_ROUND_DOWN             (1<<10)
#define X87_CW_ROUND_UP               (2<<10)
#define X87_CW_ROUND_ZERO             (3<<10)
#define X87_CW_ROUND_MASK             (3<<10)
#define X87_CW_INFINITY               (1<<12)
151
152
153
154
155 static void spill( struct aos_compilation *cp, unsigned idx )
156 {
157 if (!cp->xmm[idx].dirty ||
158 (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
159 cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
160 cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
161 ERROR(cp, "invalid spill");
162 return;
163 }
164 else {
165 struct x86_reg oldval = get_reg_ptr(cp,
166 cp->xmm[idx].file,
167 cp->xmm[idx].idx);
168
169 if (0) debug_printf("\nspill %s[%d]",
170 files[cp->xmm[idx].file],
171 cp->xmm[idx].idx);
172
173 assert(cp->xmm[idx].dirty);
174 sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
175 cp->xmm[idx].dirty = 0;
176 }
177 }
178
179
180 static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
181 struct x86_reg reg )
182 {
183 if (reg.file != file_XMM ||
184 cp->xmm[reg.idx].file != TGSI_FILE_NULL)
185 {
186 struct x86_reg tmp = aos_get_xmm_reg(cp);
187 sse_movaps(cp->func, tmp, reg);
188 reg = tmp;
189 }
190
191 cp->xmm[reg.idx].last_used = cp->insn_counter;
192 return reg;
193 }
194
195 static struct x86_reg get_xmm( struct aos_compilation *cp,
196 struct x86_reg reg )
197 {
198 if (reg.file != file_XMM)
199 {
200 struct x86_reg tmp = aos_get_xmm_reg(cp);
201 sse_movaps(cp->func, tmp, reg);
202 reg = tmp;
203 }
204
205 cp->xmm[reg.idx].last_used = cp->insn_counter;
206 return reg;
207 }
208
209
/* Allocate an empty xmm register, either as a temporary or later to
 * "adopt" as a shader reg.
 *
 * Preference: a register that is free (TGSI_FILE_NULL) and not already
 * touched by the current instruction; failing that, evict the
 * least-recently-used register, spilling its value first if dirty.
 * Note the first scan deliberately keeps the *last* qualifying free
 * register (no break) rather than the first.
 */
struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
{
   unsigned i;
   unsigned oldest = 0;
   boolean found = FALSE;

   for (i = 0; i < 8; i++)
      if (cp->xmm[i].last_used != cp->insn_counter &&
          cp->xmm[i].file == TGSI_FILE_NULL) {
         oldest = i;
         found = TRUE;
      }

   if (!found) {
      /* No free register: fall back to LRU eviction. */
      for (i = 0; i < 8; i++)
         if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
            oldest = i;
   }

   /* Need to write out the old value?
    */
   if (cp->xmm[oldest].dirty)
      spill(cp, oldest);

   /* Evicting a register used by the current instruction would corrupt
    * in-flight values:
    */
   assert(cp->xmm[oldest].last_used != cp->insn_counter);

   cp->xmm[oldest].file = TGSI_FILE_NULL;
   cp->xmm[oldest].idx = 0;
   cp->xmm[oldest].dirty = 0;
   cp->xmm[oldest].last_used = cp->insn_counter;
   return x86_make_reg(file_XMM, oldest);
}
245
246 void aos_release_xmm_reg( struct aos_compilation *cp,
247 unsigned idx )
248 {
249 cp->xmm[idx].file = TGSI_FILE_NULL;
250 cp->xmm[idx].idx = 0;
251 cp->xmm[idx].dirty = 0;
252 cp->xmm[idx].last_used = 0;
253 }
254
255
256
257
258 /* Mark an xmm reg as holding the current copy of a shader reg.
259 */
260 void aos_adopt_xmm_reg( struct aos_compilation *cp,
261 struct x86_reg reg,
262 unsigned file,
263 unsigned idx,
264 unsigned dirty )
265 {
266 unsigned i;
267
268 if (reg.file != file_XMM) {
269 assert(0);
270 return;
271 }
272
273
274 /* If any xmm reg thinks it holds this shader reg, break the
275 * illusion.
276 */
277 for (i = 0; i < 8; i++) {
278 if (cp->xmm[i].file == file &&
279 cp->xmm[i].idx == idx)
280 {
281 /* If an xmm reg is already holding this shader reg, take into account its
282 * dirty flag...
283 */
284 dirty |= cp->xmm[i].dirty;
285 aos_release_xmm_reg(cp, i);
286 }
287 }
288
289 cp->xmm[reg.idx].file = file;
290 cp->xmm[reg.idx].idx = idx;
291 cp->xmm[reg.idx].dirty = dirty;
292 cp->xmm[reg.idx].last_used = cp->insn_counter;
293 }
294
295
296 /* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
297 */
298 static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
299 unsigned file,
300 unsigned idx )
301 {
302 unsigned i;
303
304 /* Ensure the in-memory copy of this reg is up-to-date
305 */
306 for (i = 0; i < 8; i++) {
307 if (cp->xmm[i].file == file &&
308 cp->xmm[i].idx == idx &&
309 cp->xmm[i].dirty) {
310 spill(cp, i);
311 }
312 }
313
314 return get_reg_ptr( cp, file, idx );
315 }
316
317
318 /* As above, but return a pointer. Note - this pointer may alias
319 * those returned by get_arg_ptr().
320 */
321 static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
322 const struct tgsi_full_dst_register *dst )
323 {
324 unsigned file = dst->DstRegister.File;
325 unsigned idx = dst->DstRegister.Index;
326 unsigned i;
327
328
329 /* Ensure in-memory copy of this reg is up-to-date and invalidate
330 * any xmm copies.
331 */
332 for (i = 0; i < 8; i++) {
333 if (cp->xmm[i].file == file &&
334 cp->xmm[i].idx == idx)
335 {
336 if (cp->xmm[i].dirty)
337 spill(cp, i);
338
339 aos_release_xmm_reg(cp, i);
340 }
341 }
342
343 return get_reg_ptr( cp, file, idx );
344 }
345
346
347
348
349
350 /* Return an XMM reg if the argument is resident, otherwise return a
351 * base+offset pointer to the saved value.
352 */
353 struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
354 unsigned file,
355 unsigned idx )
356 {
357 unsigned i;
358
359 for (i = 0; i < 8; i++) {
360 if (cp->xmm[i].file == file &&
361 cp->xmm[i].idx == idx)
362 {
363 cp->xmm[i].last_used = cp->insn_counter;
364 return x86_make_reg(file_XMM, i);
365 }
366 }
367
368 /* If not found in the XMM register file, return an indirect
369 * reference to the in-memory copy:
370 */
371 return get_reg_ptr( cp, file, idx );
372 }
373
374
375
376 static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,
377 unsigned file,
378 unsigned idx )
379 {
380 struct x86_reg reg = get_xmm( cp,
381 aos_get_shader_reg( cp, file, idx ) );
382
383 aos_adopt_xmm_reg( cp,
384 reg,
385 file,
386 idx,
387 FALSE );
388
389 return reg;
390 }
391
392
393
/* Fetch internal constant #imm (IMM_ONES, IMM_NEGS, IMM_SWZ, ...) into
 * an xmm register, adopting it there.
 */
struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
                                     unsigned imm )
{
   return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
}
399
400
/* As above, but allow a memory operand to be returned when the internal
 * constant is not already resident in an xmm register.
 */
struct x86_reg aos_get_internal( struct aos_compilation *cp,
                                 unsigned imm )
{
   return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
}
406
407
408
409
410
411 /* Emulate pshufd insn in regular SSE, if necessary:
412 */
413 static void emit_pshufd( struct aos_compilation *cp,
414 struct x86_reg dst,
415 struct x86_reg arg0,
416 ubyte shuf )
417 {
418 if (cp->have_sse2) {
419 sse2_pshufd(cp->func, dst, arg0, shuf);
420 }
421 else {
422 if (!eq(dst, arg0))
423 sse_movaps(cp->func, dst, arg0);
424
425 sse_shufps(cp->func, dst, dst, shuf);
426 }
427 }
428
/* Writemasked blend: dst = (dst & keep-mask) | (result & ~keep-mask),
 * emitted as:
 *   pshufd - shuffle according to writemask
 *   and    - result, mask
 *   nand   - dest, mask
 *   or     - dest, result
 * (original note: "load masks (pack into negs??)")
 */
static boolean mask_write( struct aos_compilation *cp,
                           struct x86_reg dst,
                           struct x86_reg result,
                           unsigned mask )
{
   struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   /* Broadcast elements 2/3 of IMM_SWZ into a per-channel select mask.
    * NOTE(review): assumes element 2 holds the all-zero and element 3
    * the all-ones bit pattern -- confirm against the aos_machine
    * internal-constant setup.
    */
   emit_pshufd(cp, tmp, imm_swz,
               SHUF((mask & 1) ? 2 : 3,
                    (mask & 2) ? 2 : 3,
                    (mask & 4) ? 2 : 3,
                    (mask & 8) ? 2 : 3));

   sse_andps(cp->func, dst, tmp);       /* keep dst where tmp is ones */
   sse_andnps(cp->func, tmp, result);   /* tmp = ~tmp & result */
   sse_orps(cp->func, dst, tmp);        /* merge */

   aos_release_xmm_reg(cp, tmp.idx);
   return TRUE;
}
456
457
458
459
/* Helper for writemask: blend arg0 and arg1 into dst.  Shuffle both
 * args by 'shuf', take the low half from arg1 and the high half from
 * arg0, then shuffle back.  NOTE(review): relies on callers passing a
 * self-inverse 'shuf' (pairwise swaps) so the final pshufd undoes the
 * first -- confirm for any new call site.
 */
static boolean emit_shuf_copy2( struct aos_compilation *cp,
                                struct x86_reg dst,
                                struct x86_reg arg0,
                                struct x86_reg arg1,
                                ubyte shuf )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   emit_pshufd(cp, dst, arg1, shuf);
   emit_pshufd(cp, tmp, arg0, shuf);
   sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));  /* dst.xy | tmp.zw */
   emit_pshufd(cp, dst, dst, shuf);

   aos_release_xmm_reg(cp, tmp.idx);
   return TRUE;
}
478
479
480
/* pshufd/shufps immediate that leaves all four channels in place. */
#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
482
483
484 /* Locate a source register and perform any required (simple) swizzle.
485 *
486 * Just fail on complex swizzles at this point.
487 */
488 static struct x86_reg fetch_src( struct aos_compilation *cp,
489 const struct tgsi_full_src_register *src )
490 {
491 struct x86_reg arg0 = aos_get_shader_reg(cp,
492 src->SrcRegister.File,
493 src->SrcRegister.Index);
494 unsigned i;
495 unsigned swz = 0;
496 unsigned negs = 0;
497 unsigned abs = 0;
498
499 for (i = 0; i < 4; i++) {
500 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i );
501 unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
502
503 switch (swizzle) {
504 case TGSI_EXTSWIZZLE_ZERO:
505 case TGSI_EXTSWIZZLE_ONE:
506 ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2");
507 break;
508
509 default:
510 swz |= (swizzle & 0x3) << (i * 2);
511 break;
512 }
513
514 switch (neg) {
515 case TGSI_UTIL_SIGN_TOGGLE:
516 negs |= (1<<i);
517 break;
518
519 case TGSI_UTIL_SIGN_KEEP:
520 break;
521
522 case TGSI_UTIL_SIGN_CLEAR:
523 abs |= (1<<i);
524 break;
525
526 default:
527 ERROR(cp, "unsupported sign-mode");
528 break;
529 }
530 }
531
532 if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
533 struct x86_reg dst = aos_get_xmm_reg(cp);
534
535 if (swz != SSE_SWIZZLE_NOOP) {
536 emit_pshufd(cp, dst, arg0, swz);
537 arg0 = dst;
538 }
539
540 if (negs && negs != 0xf) {
541 struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
542 struct x86_reg tmp = aos_get_xmm_reg(cp);
543
544 /* Load 1,-1,0,0
545 * Use neg as arg to pshufd
546 * Multiply
547 */
548 emit_pshufd(cp, tmp, imm_swz,
549 SHUF((negs & 1) ? 1 : 0,
550 (negs & 2) ? 1 : 0,
551 (negs & 4) ? 1 : 0,
552 (negs & 8) ? 1 : 0));
553 sse_mulps(cp->func, dst, arg0);
554
555 aos_release_xmm_reg(cp, tmp.idx);
556 arg0 = dst;
557 }
558 else if (negs) {
559 struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
560 sse_mulps(cp->func, dst, imm_negs);
561 arg0 = dst;
562 }
563
564
565 if (abs && abs != 0xf) {
566 ERROR(cp, "unsupported partial abs");
567 }
568 else if (abs) {
569 struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
570 struct x86_reg tmp = aos_get_xmm_reg(cp);
571
572 sse_movaps(cp->func, tmp, arg0);
573 sse_mulps(cp->func, tmp, neg);
574 sse_maxps(cp->func, dst, arg0);
575
576 aos_release_xmm_reg(cp, tmp.idx);
577 arg0 = dst;
578 }
579 }
580
581 return arg0;
582 }
583
/* Push one channel of a TGSI source register onto the x87 stack,
 * honouring extended swizzle (ZERO/ONE) and the per-channel sign mode.
 */
static void x87_fld_src( struct aos_compilation *cp,
                         const struct tgsi_full_src_register *src,
                         unsigned channel )
{
   struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
                                                src->SrcRegister.File,
                                                src->SrcRegister.Index);

   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel );
   unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_ZERO:
      x87_fldz( cp->func );     /* push 0.0 */
      break;

   case TGSI_EXTSWIZZLE_ONE:
      x87_fld1( cp->func );     /* push 1.0 */
      break;

   default:
      /* Push the swizzle-selected channel from memory: */
      x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
      break;
   }


   switch (neg) {
   case TGSI_UTIL_SIGN_TOGGLE:
      /* Flip the sign:
       */
      x87_fchs( cp->func );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;

   case TGSI_UTIL_SIGN_CLEAR:
      x87_fabs( cp->func );
      break;

   case TGSI_UTIL_SIGN_SET:
      /* Force negative: -|x| */
      x87_fabs( cp->func );
      x87_fchs( cp->func );
      break;

   default:
      ERROR(cp, "unsupported sign-mode");
      break;
   }
}
634
635
636
637
638
639
/* Used to implement write masking.  This and most of the other instructions
 * here would be easier to implement if there had been a translation
 * to a 2 argument format (dst/arg0, arg1) at the shader level before
 * attempting to translate to x86/sse code.
 *
 * Merges 'result' into the destination register under the writemask,
 * using cheap movss/shufps forms for the common masks and falling back
 * to mask_write() for arbitrary ones.  The merged value is adopted as
 * the dirty cached copy of the destination.
 */
static void store_dest( struct aos_compilation *cp,
                        const struct tgsi_full_dst_register *reg,
                        struct x86_reg result )
{
   struct x86_reg dst;

   switch (reg->DstRegister.WriteMask) {
   case 0:
      return;                   /* nothing written */

   case TGSI_WRITEMASK_XYZW:
      /* Whole-register write: no merging needed, just adopt result. */
      aos_adopt_xmm_reg(cp,
                        get_xmm_writable(cp, result),
                        reg->DstRegister.File,
                        reg->DstRegister.Index,
                        TRUE);
      return;
   default:
      break;
   }

   /* Partial write: fetch the current value and blend. */
   dst = aos_get_shader_reg_xmm(cp,
                                reg->DstRegister.File,
                                reg->DstRegister.Index);

   switch (reg->DstRegister.WriteMask) {
   case TGSI_WRITEMASK_X:
      /* movss replaces only the x channel: */
      sse_movss(cp->func, dst, get_xmm(cp, result));
      break;

   case TGSI_WRITEMASK_ZW:
      /* dst = { dst.x, dst.y, result.z, result.w } */
      sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
      break;

   case TGSI_WRITEMASK_XY:
      /* result = { result.x, result.y, dst.z, dst.w } */
      result = get_xmm_writable(cp, result);
      sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
      dst = result;
      break;

   case TGSI_WRITEMASK_YZW:
      /* keep only dst.x: result.x = dst.x */
      result = get_xmm_writable(cp, result);
      sse_movss(cp->func, result, dst);
      dst = result;
      break;

   default:
      /* Arbitrary masks: general bitwise blend. */
      mask_write(cp, dst, result, reg->DstRegister.WriteMask);
      break;
   }

   aos_adopt_xmm_reg(cp,
                     dst,
                     reg->DstRegister.File,
                     reg->DstRegister.Index,
                     TRUE);

}
703
/* Overwrite a single channel of dst with the scalar in result.x: rotate
 * the target channel into position x, movss the scalar in, rotate back.
 * 'swizzle' must be self-inverse (a single two-channel swap), which is
 * true for all call sites in store_scalar_dest().
 */
static void inject_scalar( struct aos_compilation *cp,
                           struct x86_reg dst,
                           struct x86_reg result,
                           unsigned swizzle )
{
   sse_shufps(cp->func, dst, dst, swizzle);
   sse_movss(cp->func, dst, result);
   sse_shufps(cp->func, dst, dst, swizzle);
}
713
714
/* Store a scalar result (valid in result.x) to the destination.  For a
 * single-channel writemask, inject the scalar into that channel; for
 * multi-channel masks, broadcast x to all channels and defer to
 * store_dest().
 */
static void store_scalar_dest( struct aos_compilation *cp,
                               const struct tgsi_full_dst_register *reg,
                               struct x86_reg result )
{
   unsigned writemask = reg->DstRegister.WriteMask;
   struct x86_reg dst;

   if (writemask != TGSI_WRITEMASK_X &&
       writemask != TGSI_WRITEMASK_Y &&
       writemask != TGSI_WRITEMASK_Z &&
       writemask != TGSI_WRITEMASK_W &&
       writemask != 0)
   {
      /* Multi-channel mask: splat x across the register and do a normal
       * masked store.
       */
      result = get_xmm_writable(cp, result); /* already true, right? */
      sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
      store_dest(cp, reg, result);
      return;
   }

   result = get_xmm(cp, result);
   dst = aos_get_shader_reg_xmm(cp,
                                reg->DstRegister.File,
                                reg->DstRegister.Index);



   switch (reg->DstRegister.WriteMask) {
   case TGSI_WRITEMASK_X:
      sse_movss(cp->func, dst, result);
      break;

   case TGSI_WRITEMASK_Y:
      inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
      break;

   case TGSI_WRITEMASK_Z:
      inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
      break;

   case TGSI_WRITEMASK_W:
      inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
      break;

   default:
      /* writemask == 0: value discarded, but dst is still re-adopted
       * below (harmless, it is unchanged).
       */
      break;
   }

   aos_adopt_xmm_reg(cp,
                     dst,
                     reg->DstRegister.File,
                     reg->DstRegister.Index,
                     TRUE);
}
768
769
770
771 static void x87_fst_or_nop( struct x86_function *func,
772 unsigned writemask,
773 unsigned channel,
774 struct x86_reg ptr )
775 {
776 assert(ptr.file == file_REG32);
777 if (writemask & (1<<channel))
778 x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
779 }
780
781 static void x87_fstp_or_pop( struct x86_function *func,
782 unsigned writemask,
783 unsigned channel,
784 struct x86_reg ptr )
785 {
786 assert(ptr.file == file_REG32);
787 if (writemask & (1<<channel))
788 x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
789 else
790 x87_fstp( func, x86_make_reg( file_x87, 0 ));
791 }
792
793
794
/* Broadcast the scalar in x87 st0 to every written channel of 'dst':
 * non-popping stores for channels 0-2, a final popping store (or bare
 * pop) for channel 3 so the x87 stack ends balanced.
 */
static void x87_fstp_dest4( struct aos_compilation *cp,
                            const struct tgsi_full_dst_register *dst )
{
   struct x86_reg ptr = get_dst_ptr(cp, dst);
   unsigned writemask = dst->DstRegister.WriteMask;

   x87_fst_or_nop(cp->func, writemask, 0, ptr);
   x87_fst_or_nop(cp->func, writemask, 1, ptr);
   x87_fst_or_nop(cp->func, writemask, 2, ptr);
   x87_fstp_or_pop(cp->func, writemask, 3, ptr);
}
808
/* Save the current x87 control word into the machine struct so that
 * restore_fpu_state() can reinstate it.  (Only saves state; rounding /
 * precision modes are changed separately by the set_fpu_round_*
 * helpers.)
 */
static void save_fpu_state( struct aos_compilation *cp )
{
   x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,
                                       Offset(struct aos_machine, fpu_restore)));
}
816
/* Clear any pending x87 exceptions, then reload the control word that
 * save_fpu_state() captured.
 */
static void restore_fpu_state( struct aos_compilation *cp )
{
   x87_fnclex(cp->func);
   x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
                                      Offset(struct aos_machine, fpu_restore)));
}
823
/* Switch the x87 rounding mode to round-toward-negative-infinity (so
 * fprndint behaves like floor, as needed by FLR/FRC).  cp->fpucntl
 * caches the current mode to avoid redundant fldcw instructions.
 */
static void set_fpu_round_neg_inf( struct aos_compilation *cp )
{
   if (cp->fpucntl != FPU_RND_NEG) {
      cp->fpucntl = FPU_RND_NEG;
      x87_fnclex(cp->func);
      x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
                                         Offset(struct aos_machine, fpu_rnd_neg_inf)));
   }
}
833
/* Switch the x87 rounding mode to round-to-nearest (used by RND),
 * loading a pre-built control word from the machine struct.  The mode
 * is cached in cp->fpucntl to skip redundant fldcw instructions.
 */
static void set_fpu_round_nearest( struct aos_compilation *cp )
{
   if (cp->fpucntl != FPU_RND_NEAREST) {
      cp->fpucntl = FPU_RND_NEAREST;
      x87_fnclex(cp->func);
      x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
                                         Offset(struct aos_machine, fpu_rnd_nearest)));
   }
}
843
844
/* Compute 2^st0 in place on the x87 stack via the identity
 * 2^a = 2^int(a) * 2^frac(a).  f2xm1 only accepts arguments in [-1,1],
 * so the integer part is split off first and recombined with fscale.
 * Net x87 stack depth is unchanged (asserted below).
 */
static void x87_emit_ex2( struct aos_compilation *cp )
{
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   int stack = cp->func->x87_stack;

//   set_fpu_round_neg_inf( cp );

   x87_fld(cp->func, st0);      /* a a */
   x87_fprndint( cp->func );    /* int(a) a*/
   x87_fsubr(cp->func, st1, st0);    /* int(a) frc(a) */
   x87_fxch(cp->func, st1);     /* frc(a) int(a) */
   x87_f2xm1(cp->func);         /* (2^frc(a))-1 int(a)*/
   x87_fld1(cp->func);          /* 1 (2^frc(a))-1 int(a) */
   x87_faddp(cp->func, st1);    /* 2^frac(a) int(a) */
   x87_fscale(cp->func);        /* (2^frac(a)*2^int(int(a))) int(a) */
                                /* 2^a int(a) */
   x87_fstp(cp->func, st1);     /* 2^a */

   assert( stack == cp->func->x87_stack);

}
867
/* Debug helper invoked from generated code: print the four floats of a
 * shader register.  PIPE_CDECL matches the calling convention used by
 * the pushes in emit_print(); this is also a convenient breakpoint site.
 */
static void PIPE_CDECL print_reg( const char *msg,
                                  const float *reg )
{
   debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
}
873
/* Emit code that prints shader register file[idx] at runtime by calling
 * print_reg() with a cdecl frame built by hand.  All xmm state is
 * flushed first so the in-memory copy is current.
 */
static void emit_print( struct aos_compilation *cp,
                        const char *message, /* must point to a static string! */
                        unsigned file,
                        unsigned idx )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
   unsigned i;

   /* There shouldn't be anything on the x87 stack.  Can add this
    * capacity later if need be.
    */
   assert(cp->func->x87_stack == 0);

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.  We're obviously not concerned about performance on this
    * debug path, so here goes:
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);

      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );


   /* Push the arguments:
    */
   x86_lea( cp->func, ecx, arg );
   x86_push( cp->func, ecx );
   x86_push_imm32( cp->func, (int)message );

   /* Call the helper.  Could call debug_printf directly, but
    * print_reg is a nice place to put a breakpoint if need be.
    */
   x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
   x86_call( cp->func, ecx );
   /* Pop the two arguments off the stack (cdecl: caller cleans up): */
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );

   /* Pop caller-save regs
    */
   x86_cdecl_caller_pop_regs( cp->func );

   /* Done...
    */
}
925
926 /**
927 * The traditional instructions. All operate on internal registers
928 * and ignore write masks and swizzling issues.
929 */
930
/* ABS: dst = max(a, -a), computed per channel in a scratch xmm. */
static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   sse_movaps(cp->func, tmp, arg0);   /* tmp = a */
   sse_mulps(cp->func, tmp, neg);     /* tmp = -a */
   sse_maxps(cp->func, tmp, arg0);    /* tmp = max(a, -a) */

   store_dest(cp, &op->FullDstRegisters[0], tmp);
   return TRUE;
}
944
/* ADD: dst = arg0 + arg1, per channel. */
static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   /* Reuse arg0's register as the accumulator (copying if not writable): */
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_addps(cp->func, dst, arg1);

   store_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
956
/* COS: scalar cosine of channel 0 via x87 fcos, broadcast to all
 * written channels.  (x87 fcos is only defined for |x| < 2^63; no
 * argument reduction is performed here.)
 */
static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
   x87_fcos(cp->func);
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
   return TRUE;
}
964
965
/* The dotproduct instructions don't really do that well in sse:
 */
/* DP3: dst = a.x*b.x + a.y*b.y + a.z*b.z, result valid in channel x. */
static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);
   /* Now the hard bit: sum the first 3 values:
    */
   sse_movhlps(cp->func, tmp, dst);         /* bring z into tmp.x */
   sse_addss(cp->func, dst, tmp);           /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); /* bring y into tmp.x */
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
987
988
989
/* DP4: full 4-component dot product, result valid in channel x. */
static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);

   /* Now the hard bit: sum the values:
    */
   sse_movhlps(cp->func, tmp, dst);          /* fold high pair onto low */
   sse_addps(cp->func, dst, tmp);            /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); /* bring partial sum into x */
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
1010
/* DPH: homogeneous dot product, dst = a.x*b.x + a.y*b.y + a.z*b.z + b.w
 * (arg0.w treated as 1.0).  Result valid in channel x.
 */
static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);

   /* Now the hard bit: sum the values (from DP3):
    */
   sse_movhlps(cp->func, tmp, dst);
   sse_addss(cp->func, dst, tmp);            /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(cp->func, dst, tmp);
   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W)); /* add arg1.w on its own */
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
1033
/* DST (distance vector): dst = { 1, a.y*b.y, a.z, b.w }, built by
 * blending each argument with the IMM_ONES constant and multiplying.
 * 'dst' is not explicitly released here; store_dest() adopts it.
 */
static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
    struct x86_reg dst = aos_get_xmm_reg(cp);
    struct x86_reg tmp = aos_get_xmm_reg(cp);
    struct x86_reg ones = aos_get_internal(cp, IMM_ONES);

/*    dst[0] = 1.0 * 1.0F; */
/*    dst[1] = arg0[1] * arg1[1]; */
/*    dst[2] = arg0[2] * 1.0; */
/*    dst[3] = 1.0 * arg1[3]; */

    emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));  /* dst = {1, a.y, a.z, 1} */
    emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));  /* tmp = {1, b.y, 1, b.w} */
    sse_mulps(cp->func, dst, tmp);

    aos_release_xmm_reg(cp, tmp.idx);
    store_dest(cp, &op->FullDstRegisters[0], dst);
    return TRUE;
}
1055
/* LG2: scalar log2 via x87 fyl2x (computes st1 * log2(st0); st1 is
 * pre-loaded with 1.0).  Result broadcast to all written channels.
 */
static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld1(cp->func);          /* 1 */
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);   /* a0 1 */
   x87_fyl2x(cp->func);         /* log2(a0) */
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
   return TRUE;
}
1064
1065
/* EX2: scalar 2^x via the x87 sequence in x87_emit_ex2(), broadcast to
 * all written channels.
 */
static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
   x87_emit_ex2(cp);
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
   return TRUE;
}
1073
1074
/* FLR: per-channel floor, implemented with x87 fprndint under the
 * round-toward-negative-infinity control word.
 */
static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   int i;

   set_fpu_round_neg_inf( cp );

   /* Load all sources first to avoid aliasing
    * (dst may overlap the source in memory; loads are pushed in reverse
    * channel order so stores below pop them in forward order).
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->FullSrcRegisters[0], i);
      }
   }

   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fprndint( cp->func );
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1100
1101
/* RND: per-channel round-to-nearest, same structure as emit_FLR but
 * under the round-to-nearest control word.
 */
static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   int i;

   set_fpu_round_nearest( cp );

   /* Load all sources first to avoid aliasing
    * (reverse order so the stack pops in forward channel order).
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->FullSrcRegisters[0], i);
      }
   }

   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fprndint( cp->func );
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1127
1128
/* FRC: per-channel fractional part, frc(a) = a - floor(a), using x87
 * fprndint under round-toward-negative-infinity.
 */
static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   int i;

   set_fpu_round_neg_inf( cp );

   /* suck all the source values onto the stack before writing out any
    * dst, which may alias...
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->FullSrcRegisters[0], i);
      }
   }

   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fld(cp->func, st0);    /* a a */
         x87_fprndint( cp->func );  /* flr(a) a */
         x87_fsubp(cp->func, st1);  /* frc(a) */
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1159
1160
1161
1162
1163
1164
/* LIT: implemented by calling out to a C helper rather than emitting
 * inline code.  Builds a cdecl call frame by hand:
 *   (machine, result_ptr, src_ptr, lit_count)
 * and dispatches either through a per-call-site function pointer stored
 * in machine->lit_info (when lit_count < MAX_LIT_INFO) or directly to
 * aos_do_lit.  Partial writemasks go via a machine-struct temporary and
 * a masked store afterwards.
 */
static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   unsigned lit_count = cp->lit_count++;
   struct x86_reg result, arg0;
   unsigned i;

#if 1
   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }
#endif

   /* Partial writemask: have the helper write to a scratch slot, then
    * merge below.  Full mask: write straight to the destination.
    */
   if (writemask != TGSI_WRITEMASK_XYZW)
      result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
   else
      result = get_dst_ptr(cp, &op->FullDstRegisters[0]);


   arg0 = fetch_src( cp, &op->FullSrcRegisters[0] );
   if (arg0.file == file_XMM) {
      /* The helper takes a pointer, so an xmm-resident source must be
       * written out to memory first:
       */
      struct x86_reg tmp = x86_make_disp(cp->machine_EDX,
                                         Offset(struct aos_machine, tmp[1]));
      sse_movaps( cp->func, tmp, arg0 );
      arg0 = tmp;
   }



   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Push the arguments:
    */
   x86_push_imm32( cp->func, lit_count );

   x86_lea( cp->func, ecx, arg0 );
   x86_push( cp->func, ecx );

   x86_lea( cp->func, ecx, result );
   x86_push( cp->func, ecx );

   x86_push( cp->func, cp->machine_EDX );

   if (lit_count < MAX_LIT_INFO) {
      /* Indirect call through the per-site lit_info function pointer: */
      x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,
                                             Offset(struct aos_machine, lit_info) +
                                             lit_count * sizeof(struct lit_info) +
                                             Offset(struct lit_info, func)));
   }
   else {
      x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
   }

   x86_call( cp->func, ecx );

   /* cdecl: caller pops the four pushed arguments. */
   x86_pop( cp->func, ecx );            /* fixme... */
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );

   x86_cdecl_caller_pop_regs( cp->func );

   if (writemask != TGSI_WRITEMASK_XYZW) {
      /* Merge the scratch result into the real destination: */
      store_dest( cp,
                  &op->FullDstRegisters[0],
                  get_xmm_writable( cp, result ) );
   }

   return TRUE;
}
1243
#if 0
/* Disabled inline x87 implementation of LIT (kept for reference; the
 * live path calls out to a C helper -- see emit_LIT above).  The
 * trailing comments track the x87 stack contents, top-of-stack first.
 */
static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;

   if (writemask & TGSI_WRITEMASK_YZ) {
      struct x86_reg st1 = x86_make_reg(file_x87, 1);
      struct x86_reg st2 = x86_make_reg(file_x87, 2);

      /* a1' = a1 <= 0 ? 1 : a1;
       */
      x87_fldz(cp->func);                           /* 0 */
#if 1
      x87_fld1(cp->func);                           /* 1 0 */
#else
      /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
       */
      x87_fldz(cp->func);                           /* 0 0 */
#endif
      x87_fld_src(cp, &op->FullSrcRegisters[0], 1); /* a1 1 0 */
      x87_fcomi(cp->func, st2);                     /* a1 1 0 */
      x87_fcmovb(cp->func, st1);                    /* a1' 1 0 */
      x87_fstp(cp->func, st1);                      /* a1' 0 */
      x87_fstp(cp->func, st1);                      /* a1' */

      x87_fld_src(cp, &op->FullSrcRegisters[0], 3); /* a3 a1' */
      x87_fxch(cp->func, st1);                      /* a1' a3 */


      /* Compute pow(a1, a3)
       */
      x87_fyl2x(cp->func);                          /* a3*log2(a1) */
      x87_emit_ex2( cp );                           /* 2^(a3*log2(a1)) */


      /* a0' = max2(a0, 0):
       */
      x87_fldz(cp->func);                           /* 0 r2 */
      x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 0 r2 */
      x87_fcomi(cp->func, st1);
      x87_fcmovb(cp->func, st1);                    /* a0' 0 r2 */

      x87_fst_or_nop(cp->func, writemask, 1, dst);  /* result[1] = a0' */

      x87_fcomi(cp->func, st1);                     /* a0' 0 r2 */
      x87_fcmovnbe(cp->func, st2);                  /* r2' 0' r2 */

      x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */
      x87_fpop(cp->func);                           /* r2 */
      x87_fpop(cp->func);
   }

   if (writemask & TGSI_WRITEMASK_XW) {
      /* result[0] and result[3] are always 1.0:
       */
      x87_fld1(cp->func);
      x87_fst_or_nop(cp->func, writemask, 0, dst);
      x87_fstp_or_pop(cp->func, writemask, 3, dst);
   }

   return TRUE;
}
#endif
1306
1307
1308
1309 static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1310 {
1311 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1312 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1313 struct x86_reg dst = get_xmm_writable(cp, arg0);
1314
1315 sse_maxps(cp->func, dst, arg1);
1316
1317 store_dest(cp, &op->FullDstRegisters[0], dst);
1318 return TRUE;
1319 }
1320
1321
1322 static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1323 {
1324 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1325 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1326 struct x86_reg dst = get_xmm_writable(cp, arg0);
1327
1328 sse_minps(cp->func, dst, arg1);
1329
1330 store_dest(cp, &op->FullDstRegisters[0], dst);
1331 return TRUE;
1332 }
1333
1334 static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1335 {
1336 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1337 struct x86_reg dst = get_xmm_writable(cp, arg0);
1338
1339 /* potentially nothing to do */
1340
1341 store_dest(cp, &op->FullDstRegisters[0], dst);
1342 return TRUE;
1343 }
1344
1345 static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1346 {
1347 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1348 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1349 struct x86_reg dst = get_xmm_writable(cp, arg0);
1350
1351 sse_mulps(cp->func, dst, arg1);
1352
1353 store_dest(cp, &op->FullDstRegisters[0], dst);
1354 return TRUE;
1355 }
1356
1357
1358 static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1359 {
1360 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1361 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1362 struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]);
1363
1364 /* If we can't clobber old contents of arg0, get a temporary & copy
1365 * it there, then clobber it...
1366 */
1367 arg0 = get_xmm_writable(cp, arg0);
1368
1369 sse_mulps(cp->func, arg0, arg1);
1370 sse_addps(cp->func, arg0, arg2);
1371 store_dest(cp, &op->FullDstRegisters[0], arg0);
1372 return TRUE;
1373 }
1374
1375 /* A wrapper for powf().
1376 * Makes sure it is cdecl and operates on floats.
1377 */
1378 static float PIPE_CDECL _powerf( float x, float y )
1379 {
1380 return powf( x, y );
1381 }
1382
/* Really not sufficient -- need to check for conditions that could
 * generate inf/nan values, which will slow things down hugely.
 */
static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
#if 0
   /* Inline x87 version: pow(a,b) = 2^(b * log2(a)).
    */
   x87_fld_src(cp, &op->FullSrcRegisters[1], 0);	/* a1.x */
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);	/* a0.x a1.x */
   x87_fyl2x(cp->func);	                                /* a1*log2(a0) */

   x87_emit_ex2( cp );		                        /* 2^(a1*log2(a0)) */

   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
#else
   uint i;

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Make 8 bytes of stack room for the two float arguments and store
    * them via the x87 stack: y at [esp+4], x at [esp+0].
    */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );

   x87_fld_src( cp, &op->FullSrcRegisters[1], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
   x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* Call the C wrapper _powerf (cdecl, float args on the stack).
    */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
   x86_call( cp->func, cp->tmp_EAX );

   /* Caller cleans up the argument space.
    */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Note retval on x87 stack:
    */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->FullDstRegisters[0] );
#endif
   return TRUE;
}
1434
1435
1436 static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1437 {
1438 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1439 struct x86_reg dst = aos_get_xmm_reg(cp);
1440
1441 if (cp->have_sse2) {
1442 sse2_rcpss(cp->func, dst, arg0);
1443 /* extend precision here...
1444 */
1445 }
1446 else {
1447 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1448 sse_movss(cp->func, dst, ones);
1449 sse_divss(cp->func, dst, arg0);
1450 }
1451
1452 store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
1453 return TRUE;
1454 }
1455
1456
/* Although rsqrtps() and rcpps() are low precision on some/all SSE
 * implementations, it is possible to improve its precision at
 * fairly low cost, using a newton/raphson step, as below:
 *
 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
 * or:
 *   x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
 *
 *
 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{

   /* Disabled path: raw rsqrtss with no refinement.
    */
   if (0) {
      struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
      struct x86_reg r = aos_get_xmm_reg(cp);
      sse_rsqrtss(cp->func, r, arg0);
      store_scalar_dest(cp, &op->FullDstRegisters[0], r);
      return TRUE;
   }
   else {
      /* Live path: rsqrtss followed by one Newton-Raphson step using
       * the (-0.5, 1.5) constant pair at IMM_RSQ.
       */
      struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
      struct x86_reg r = aos_get_xmm_reg(cp);

      struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
      struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
      struct x86_reg src = get_xmm_writable( cp, arg0 );

      sse_rsqrtss( cp->func, r, src  );             /* rsqrtss(a) */
      sse_mulss(   cp->func, src, neg_half  );      /* -.5 * a */
      sse_mulss(   cp->func, src, r  );             /* -.5 * a * r */
      sse_mulss(   cp->func, src, r  );             /* -.5 * a * r * r */
      sse_addss(   cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */
      sse_mulss(   cp->func, r, src );              /* r * (1.5 - .5 * a * r * r) */

      store_scalar_dest(cp, &op->FullDstRegisters[0], r);
      return TRUE;
   }
}
1498
1499
1500 static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1501 {
1502 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1503 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1504 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1505 struct x86_reg dst = get_xmm_writable(cp, arg0);
1506
1507 sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
1508 sse_andps(cp->func, dst, ones);
1509
1510 store_dest(cp, &op->FullDstRegisters[0], dst);
1511 return TRUE;
1512 }
1513
1514 static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1515 {
1516 x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
1517 x87_fsin(cp->func);
1518 x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
1519 return TRUE;
1520 }
1521
1522
1523
1524 static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1525 {
1526 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1527 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1528 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1529 struct x86_reg dst = get_xmm_writable(cp, arg0);
1530
1531 sse_cmpps(cp->func, dst, arg1, cc_LessThan);
1532 sse_andps(cp->func, dst, ones);
1533
1534 store_dest(cp, &op->FullDstRegisters[0], dst);
1535 return TRUE;
1536 }
1537
1538 static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1539 {
1540 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1541 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1542 struct x86_reg dst = get_xmm_writable(cp, arg0);
1543
1544 sse_subps(cp->func, dst, arg1);
1545
1546 store_dest(cp, &op->FullDstRegisters[0], dst);
1547 return TRUE;
1548 }
1549
1550
/* XPD: 3-component cross product, computed in a rotated frame via
 * pshufd and rotated back by the final shufps.  Per the component
 * comments below, dst.w is left undefined.
 */
static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp0 = aos_get_xmm_reg(cp);
   struct x86_reg tmp1 = aos_get_xmm_reg(cp);

   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));   /* tmp1 = arg1.yzxw */
   sse_mulps(cp->func, tmp1, arg0);                 /* tmp1 = arg0 * arg1.yzxw */
   emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));   /* tmp0 = arg0.yzxw */
   sse_mulps(cp->func, tmp0, arg1);                 /* tmp0 = arg0.yzxw * arg1 */
   sse_subps(cp->func, tmp1, tmp0);                 /* rotated cross product */
   sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W)); /* rotate back into place */

   /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
   /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
   /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
   /* dst[3] is undef */


   aos_release_xmm_reg(cp, tmp0.idx);
   store_dest(cp, &op->FullDstRegisters[0], tmp1);
   return TRUE;
}
1575
1576
1577
/* Dispatch one TGSI instruction to its emit function.
 *
 * Returns FALSE for opcodes this backend does not handle; the caller
 * (build_vertex_program) then abandons SSE code generation entirely
 * and a fallback path is used instead.
 */
static boolean
emit_instruction( struct aos_compilation *cp,
                  struct tgsi_full_instruction *inst )
{
   /* Each emitter is expected to leave the x87 stack empty.
    */
   x87_assert_stack_empty(cp->func);

   switch( inst->Instruction.Opcode ) {
   case TGSI_OPCODE_MOV:
      return emit_MOV( cp, inst );

   case TGSI_OPCODE_LIT:
      return emit_LIT(cp, inst);

   case TGSI_OPCODE_RCP:
      return emit_RCP(cp, inst);

   case TGSI_OPCODE_RSQ:
      return emit_RSQ(cp, inst);

   case TGSI_OPCODE_EXP:
      /*return emit_EXP(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_LOG:
      /*return emit_LOG(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_MUL:
      return emit_MUL(cp, inst);

   case TGSI_OPCODE_ADD:
      return emit_ADD(cp, inst);

   case TGSI_OPCODE_DP3:
      return emit_DP3(cp, inst);

   case TGSI_OPCODE_DP4:
      return emit_DP4(cp, inst);

   case TGSI_OPCODE_DST:
      return emit_DST(cp, inst);

   case TGSI_OPCODE_MIN:
      return emit_MIN(cp, inst);

   case TGSI_OPCODE_MAX:
      return emit_MAX(cp, inst);

   case TGSI_OPCODE_SLT:
      return emit_SLT(cp, inst);

   case TGSI_OPCODE_SGE:
      return emit_SGE(cp, inst);

   case TGSI_OPCODE_MAD:
      return emit_MAD(cp, inst);

   case TGSI_OPCODE_SUB:
      return emit_SUB(cp, inst);

   case TGSI_OPCODE_LERP:
      //      return emit_LERP(cp, inst);
      return FALSE;

   case TGSI_OPCODE_FRAC:
      return emit_FRC(cp, inst);

   case TGSI_OPCODE_CLAMP:
      //      return emit_CLAMP(cp, inst);
      return FALSE;

   case TGSI_OPCODE_FLOOR:
      return emit_FLR(cp, inst);

   case TGSI_OPCODE_ROUND:
      return emit_RND(cp, inst);

   case TGSI_OPCODE_EXPBASE2:
      return emit_EX2(cp, inst);

   case TGSI_OPCODE_LOGBASE2:
      return emit_LG2(cp, inst);

   case TGSI_OPCODE_POWER:
      return emit_POW(cp, inst);

   case TGSI_OPCODE_CROSSPRODUCT:
      return emit_XPD(cp, inst);

   case TGSI_OPCODE_ABS:
      return emit_ABS(cp, inst);

   case TGSI_OPCODE_DPH:
      return emit_DPH(cp, inst);

   case TGSI_OPCODE_COS:
      return emit_COS(cp, inst);

   case TGSI_OPCODE_SIN:
      return emit_SIN(cp, inst);

   case TGSI_OPCODE_END:
      return TRUE;

   default:
      /* Unsupported opcode -- no SSE codegen for this shader.
       */
      return FALSE;
   }
}
1686
1687
1688 static boolean emit_viewport( struct aos_compilation *cp )
1689 {
1690 struct x86_reg pos = aos_get_shader_reg_xmm(cp,
1691 TGSI_FILE_OUTPUT,
1692 0);
1693
1694 struct x86_reg scale = x86_make_disp(cp->machine_EDX,
1695 Offset(struct aos_machine, scale));
1696
1697 struct x86_reg translate = x86_make_disp(cp->machine_EDX,
1698 Offset(struct aos_machine, translate));
1699
1700 sse_mulps(cp->func, pos, scale);
1701 sse_addps(cp->func, pos, translate);
1702
1703 aos_adopt_xmm_reg( cp,
1704 pos,
1705 TGSI_FILE_OUTPUT,
1706 0,
1707 TRUE );
1708 return TRUE;
1709 }
1710
1711
/* This is useful to be able to see the results on softpipe.  Doesn't
 * do proper clipping, just assumes the backend can do it during
 * rasterization -- for debug only...
 */
static boolean emit_rhw_viewport( struct aos_compilation *cp )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg pos = aos_get_shader_reg_xmm(cp,
                                               TGSI_FILE_OUTPUT,
                                               0);

   struct x86_reg scale = x86_make_disp(cp->machine_EDX,
                                        Offset(struct aos_machine, scale));

   struct x86_reg translate = x86_make_disp(cp->machine_EDX,
                                            Offset(struct aos_machine, translate));



   /* tmp = 1/pos.w, broadcast to all four channels:
    */
   emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
   sse2_rcpss(cp->func, tmp, tmp);
   sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));

   /* pos = pos * scale * (1/w) + translate
    */
   sse_mulps(cp->func, pos, scale);
   sse_mulps(cp->func, pos, tmp);
   sse_addps(cp->func, pos, translate);

   /* Set pos[3] = 1/w (the reciprocal homogeneous w held in tmp):
    */
   mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);

   aos_adopt_xmm_reg( cp,
                      pos,
                      TGSI_FILE_OUTPUT,
                      0,
                      TRUE );
   return TRUE;
}
1750
1751
#if 0
/* Copy an immediate's float payload into the machine's immediate
 * store.  Currently disabled -- the run functions point
 * machine->immediates at the shader's own immediate array instead.
 */
static boolean note_immediate( struct aos_compilation *cp,
                               struct tgsi_full_immediate *imm )
{
   unsigned pos = cp->num_immediates++;
   unsigned j;

   for (j = 0; j < imm->Immediate.Size; j++) {
      cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float;
   }

   return TRUE;
}
#endif
1766
1767
1768
1769
1770 static void find_last_write_outputs( struct aos_compilation *cp )
1771 {
1772 struct tgsi_parse_context parse;
1773 unsigned this_instruction = 0;
1774 unsigned i;
1775
1776 tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
1777
1778 while (!tgsi_parse_end_of_tokens( &parse )) {
1779
1780 tgsi_parse_token( &parse );
1781
1782 if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
1783 continue;
1784
1785 for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
1786 if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File ==
1787 TGSI_FILE_OUTPUT)
1788 {
1789 unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index;
1790 cp->output_last_write[idx] = this_instruction;
1791 }
1792 }
1793
1794 this_instruction++;
1795 }
1796
1797 tgsi_parse_free( &parse );
1798 }
1799
1800
1801 #define ARG_MACHINE 1
1802 #define ARG_START_ELTS 2
1803 #define ARG_COUNT 3
1804 #define ARG_OUTBUF 4
1805
1806
/* Generate one of the two code variants for this vertex shader:
 * linear fetch (varient->func[0]) or indexed fetch (varient->func[1]).
 *
 * The emitted function has the cdecl signature
 *    func(machine, start_or_elts, count, output_buffer)
 * and loops 'count' times doing fetch-inputs / run-shader /
 * emit-outputs per vertex.  Returns FALSE if any construct in the
 * shader is unsupported.
 */
static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
                                     boolean linear )
{
   struct tgsi_parse_context parse;
   struct aos_compilation cp;
   unsigned fixup, label;

   tgsi_parse_init( &parse, varient->base.vs->state.tokens );

   memset(&cp, 0, sizeof(cp));

   cp.insn_counter = 1;
   cp.vaos = varient;
   cp.have_sse2 = 1;
   cp.func = &varient->func[ linear ? 0 : 1 ];

   /* Fixed register roles for the generated code:
    */
   cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   cp.idx_EBX = x86_make_reg(file_REG32, reg_BX);
   cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
   cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
   cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
   cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
   cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );

   x86_init_func(cp.func);

   find_last_write_outputs(&cp);

   /* Save callee-save registers we are about to use:
    */
   x86_push(cp.func, cp.idx_EBX);
   x86_push(cp.func, cp.count_ESI);
   x86_push(cp.func, cp.temp_EBP);


   /* Load arguments into regs:
    */
   x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
   x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
   x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
   x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));


   /* Compare count to zero and possibly bail.
    */
   x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
   x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
   fixup = x86_jcc_forward(cp.func, cc_E);


   save_fpu_state( &cp );
   set_fpu_round_nearest( &cp );

   /* Note address for loop jump
    */
   label = x86_get_label(cp.func);
   {
      /* Fetch inputs...  TODO:  fetch lazily...
       */
      if (!aos_fetch_inputs( &cp, linear ))
         goto fail;

      /* Emit the shader:
       */
      while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
      {
         tgsi_parse_token( &parse );

         switch (parse.FullToken.Token.Type) {
         case TGSI_TOKEN_TYPE_IMMEDIATE:
#if 0
            if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
               goto fail;
#endif
            break;

         case TGSI_TOKEN_TYPE_INSTRUCTION:
            if (DISASSEM)
               tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );

            if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
               goto fail;
            break;
         }

         x87_assert_stack_empty(cp.func);
         cp.insn_counter++;

         if (DISASSEM)
            debug_printf("\n");
      }


      /* Drop cached values that aren't outputs -- only output
       * registers survive to the emit stage.
       */
      {
         unsigned i;
         for (i = 0; i < 8; i++) {
            if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
               cp.xmm[i].file = TGSI_FILE_NULL;
               cp.xmm[i].dirty = 0;
            }
         }
      }

      if (cp.error)
         goto fail;

      if (cp.vaos->base.key.clip) {
         /* not really handling clipping, just do the rhw so we can
          * see the results...
          */
         emit_rhw_viewport(&cp);
      }
      else if (cp.vaos->base.key.viewport) {
         emit_viewport(&cp);
      }

      /* Emit output...  TODO: do this eagerly after the last write to a
       * given output.
       */
      if (!aos_emit_outputs( &cp ))
         goto fail;


      /* Next vertex:
       */
      x86_lea(cp.func,
              cp.outbuf_ECX,
              x86_make_disp(cp.outbuf_ECX,
                            cp.vaos->base.key.output_stride));

      /* Incr index (by one vertex when linear, by one element pointer
       * -- 4 bytes -- when indexed):
       */
      if (linear) {
         x86_inc(cp.func, cp.idx_EBX);
      }
      else {
         x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4));
      }

   }
   /* decr count, loop if not zero
    */
   x86_dec(cp.func, cp.count_ESI);
   x86_jcc(cp.func, cc_NZ, label);

   restore_fpu_state(&cp);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(cp.func, fixup);

   /* Exit mmx state?
    */
   if (cp.func->need_emms)
      mmx_emms(cp.func);

   /* Restore callee-save registers and return:
    */
   x86_pop(cp.func, cp.temp_EBP);
   x86_pop(cp.func, cp.count_ESI);
   x86_pop(cp.func, cp.idx_EBX);

   x87_assert_stack_empty(cp.func);
   x86_ret(cp.func);

   tgsi_parse_free( &parse );
   return !cp.error;

 fail:
   tgsi_parse_free( &parse );
   return FALSE;
}
1975
1976
1977
1978 static void vaos_set_buffer( struct draw_vs_varient *varient,
1979 unsigned buf,
1980 const void *ptr,
1981 unsigned stride )
1982 {
1983 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
1984 unsigned i;
1985
1986 for (i = 0; i < vaos->base.key.nr_inputs; i++) {
1987 if (vaos->base.key.element[i].in.buffer == buf) {
1988 vaos->attrib[i].input_ptr = ((char *)ptr +
1989 vaos->base.key.element[i].in.offset);
1990 vaos->attrib[i].input_stride = stride;
1991 }
1992 }
1993 }
1994
1995
1996
1997 static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
1998 const unsigned *elts,
1999 unsigned count,
2000 void *output_buffer )
2001 {
2002 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2003 struct aos_machine *machine = vaos->draw->vs.aos_machine;
2004
2005 machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
2006 machine->constants = vaos->draw->vs.aligned_constants;
2007 machine->immediates = vaos->base.vs->immediates;
2008 machine->attrib = vaos->attrib;
2009
2010 vaos->gen_run_elts( machine,
2011 elts,
2012 count,
2013 output_buffer );
2014 }
2015
2016 static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
2017 unsigned start,
2018 unsigned count,
2019 void *output_buffer )
2020 {
2021 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2022 struct aos_machine *machine = vaos->draw->vs.aos_machine;
2023
2024 machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
2025 machine->constants = vaos->draw->vs.aligned_constants;
2026 machine->immediates = vaos->base.vs->immediates;
2027 machine->attrib = vaos->attrib;
2028
2029 vaos->gen_run_linear( machine,
2030 start,
2031 count,
2032 output_buffer );
2033 }
2034
2035
2036
2037 static void vaos_destroy( struct draw_vs_varient *varient )
2038 {
2039 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2040
2041 FREE( vaos->attrib );
2042
2043 x86_release_func( &vaos->func[0] );
2044 x86_release_func( &vaos->func[1] );
2045
2046 FREE(vaos);
2047 }
2048
2049
2050
2051 static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
2052 const struct draw_vs_varient_key *key )
2053 {
2054 struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
2055
2056 if (!vaos)
2057 goto fail;
2058
2059 vaos->base.key = *key;
2060 vaos->base.vs = vs;
2061 vaos->base.set_input = vaos_set_buffer;
2062 vaos->base.destroy = vaos_destroy;
2063 vaos->base.run_linear = vaos_run_linear;
2064 vaos->base.run_elts = vaos_run_elts;
2065
2066 vaos->draw = vs->draw;
2067
2068 vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) );
2069 if (!vaos->attrib)
2070 goto fail;
2071
2072 tgsi_dump(vs->state.tokens, 0);
2073
2074 if (!build_vertex_program( vaos, TRUE ))
2075 goto fail;
2076
2077 if (!build_vertex_program( vaos, FALSE ))
2078 goto fail;
2079
2080 vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
2081 if (!vaos->gen_run_linear)
2082 goto fail;
2083
2084 vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
2085 if (!vaos->gen_run_elts)
2086 goto fail;
2087
2088 return &vaos->base;
2089
2090 fail:
2091 if (vaos && vaos->attrib)
2092 FREE(vaos->attrib);
2093
2094 if (vaos)
2095 x86_release_func( &vaos->func[0] );
2096
2097 if (vaos)
2098 x86_release_func( &vaos->func[1] );
2099
2100 FREE(vaos);
2101
2102 return NULL;
2103 }
2104
2105
2106 struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
2107 const struct draw_vs_varient_key *key )
2108 {
2109 struct draw_vs_varient *varient = varient_aos_sse( vs, key );
2110
2111 if (varient == NULL) {
2112 assert(0);
2113 varient = draw_vs_varient_generic( vs, key );
2114 }
2115
2116 return varient;
2117 }
2118
2119
2120
2121 #endif