gallium: fix SSE codegen for instructions that use both a CONSTANT and IMMEDIATE
[mesa.git] / src / gallium / auxiliary / draw / draw_vs_aos.c
1 /*
2 * Mesa 3-D graphics library
3 * Version: 6.3
4 *
5 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /**
26 * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
27 * using the rtasm runtime assembler. Based on the old
28 * t_vb_arb_program_sse.c
29 */
30
31
32 #include "pipe/p_util.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "tgsi/util/tgsi_parse.h"
35 #include "tgsi/util/tgsi_util.h"
36 #include "tgsi/exec/tgsi_exec.h"
37 #include "tgsi/util/tgsi_dump.h"
38
39 #include "draw_vs.h"
40 #include "draw_vs_aos.h"
41
42 #include "rtasm/rtasm_x86sse.h"
43
44 #ifdef PIPE_ARCH_X86
45 #define DISASSEM 0
46
/* Debug names for the register files, indexed by the TGSI_FILE_*
 * enum values (with AOS_FILE_INTERNAL as the final entry).  Used
 * only for debug_printf output (see spill()).
 */
static const char *files[] =
{
   "NULL",
   "CONST",
   "IN",
   "OUT",
   "TEMP",
   "SAMP",
   "ADDR",
   "IMM",
   "INTERNAL",
};
59
60 static INLINE boolean eq( struct x86_reg a,
61 struct x86_reg b )
62 {
63 return (a.file == b.file &&
64 a.idx == b.idx &&
65 a.mod == b.mod &&
66 a.disp == b.disp);
67 }
68
/* Return the x86 scratch register (EBP or EAX) holding the base
 * pointer for the given table (X86_IMMEDIATES, X86_CONSTANTS or
 * X86_ATTRIBS), emitting a load from the aos_machine struct if the
 * register does not already cache that pointer.
 *
 * Note the fixed pairing: immediates and attribs go through EBP
 * (which_reg 0), constants through EAX (which_reg 1).  This lets a
 * single instruction reference both a CONSTANT and an IMMEDIATE
 * without the two fighting over one scratch register.
 */
struct x86_reg aos_get_x86( struct aos_compilation *cp,
                            unsigned which_reg, /* quick hack */
                            unsigned value )
{
   struct x86_reg reg;

   if (which_reg == 0)
      reg = cp->temp_EBP;
   else
      reg = cp->tmp_EAX;

   /* Only emit the pointer load if the register currently caches a
    * different table:
    */
   if (cp->x86_reg[which_reg] != value) {
      unsigned offset;

      switch (value) {
      case X86_IMMEDIATES:
         assert(which_reg == 0);
         offset = Offset(struct aos_machine, immediates);
         break;
      case X86_CONSTANTS:
         assert(which_reg == 1);
         offset = Offset(struct aos_machine, constants);
         break;
      case X86_ATTRIBS:
         assert(which_reg == 0);
         offset = Offset(struct aos_machine, attrib);
         break;
      default:
         assert(0);
         offset = 0;
      }


      x86_mov(cp->func, reg,
              x86_make_disp(cp->machine_EDX, offset));

      cp->x86_reg[which_reg] = value;
   }

   return reg;
}
110
111
/* Build a base+displacement memory operand addressing the in-memory
 * copy of a shader register.  Most files live at fixed offsets from
 * the machine pointer in EDX; immediates and constants are reached
 * through the cached base pointers maintained by aos_get_x86().
 */
static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
                                  unsigned file,
                                  unsigned idx )
{
   struct x86_reg ptr = cp->machine_EDX;

   switch (file) {
   case TGSI_FILE_INPUT:
      return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));

   case TGSI_FILE_OUTPUT:
      return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));

   case TGSI_FILE_TEMPORARY:
      return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));

   case AOS_FILE_INTERNAL:
      return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));

   case TGSI_FILE_IMMEDIATE:
      /* idx * 16 bytes: each register is a float[4] */
      return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));

   case TGSI_FILE_CONSTANT:
      return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));

   default:
      ERROR(cp, "unknown reg file");
      return x86_make_reg(0,0);
   }
}
142
143
144
/* x87 FPU control word layout: exception masks (bits 0-5), precision
 * control (bits 8-9) and rounding control (bits 10-11), matching the
 * Intel architecture manuals.  The aos machine struct holds
 * pre-built control words (fpu_rnd_neg_inf / fpu_rnd_nearest) that
 * the generated code loads via fldcw.
 */
#define X87_CW_EXCEPTION_INV_OP       (1<<0)
#define X87_CW_EXCEPTION_DENORM_OP    (1<<1)
#define X87_CW_EXCEPTION_ZERO_DIVIDE  (1<<2)
#define X87_CW_EXCEPTION_OVERFLOW     (1<<3)
#define X87_CW_EXCEPTION_UNDERFLOW    (1<<4)
#define X87_CW_EXCEPTION_PRECISION    (1<<5)
#define X87_CW_PRECISION_SINGLE       (0<<8)
#define X87_CW_PRECISION_RESERVED     (1<<8)
#define X87_CW_PRECISION_DOUBLE       (2<<8)
#define X87_CW_PRECISION_DOUBLE_EXT   (3<<8)
#define X87_CW_PRECISION_MASK         (3<<8)
#define X87_CW_ROUND_NEAREST          (0<<10)
#define X87_CW_ROUND_DOWN             (1<<10)
#define X87_CW_ROUND_UP               (2<<10)
#define X87_CW_ROUND_ZERO             (3<<10)
#define X87_CW_ROUND_MASK             (3<<10)
#define X87_CW_INFINITY               (1<<12)
162
163
164
165
/* Write the value cached in xmm register IDX back to its in-memory
 * shader register.  Only INPUT, OUTPUT and TEMPORARY registers have
 * a memory image that may legitimately be dirty; anything else is a
 * compilation error.
 */
static void spill( struct aos_compilation *cp, unsigned idx )
{
   if (!cp->xmm[idx].dirty ||
       (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
        cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
        cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
      ERROR(cp, "invalid spill");
      return;
   }
   else {
      struct x86_reg oldval = get_reg_ptr(cp,
                                          cp->xmm[idx].file,
                                          cp->xmm[idx].idx);

      if (0) debug_printf("\nspill %s[%d]",
                          files[cp->xmm[idx].file],
                          cp->xmm[idx].idx);

      assert(cp->xmm[idx].dirty);
      sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
      cp->xmm[idx].dirty = 0;
   }
}
189
190
191 static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
192 struct x86_reg reg )
193 {
194 if (reg.file != file_XMM ||
195 cp->xmm[reg.idx].file != TGSI_FILE_NULL)
196 {
197 struct x86_reg tmp = aos_get_xmm_reg(cp);
198 sse_movaps(cp->func, tmp, reg);
199 reg = tmp;
200 }
201
202 cp->xmm[reg.idx].last_used = cp->insn_counter;
203 return reg;
204 }
205
206 static struct x86_reg get_xmm( struct aos_compilation *cp,
207 struct x86_reg reg )
208 {
209 if (reg.file != file_XMM)
210 {
211 struct x86_reg tmp = aos_get_xmm_reg(cp);
212 sse_movaps(cp->func, tmp, reg);
213 reg = tmp;
214 }
215
216 cp->xmm[reg.idx].last_used = cp->insn_counter;
217 return reg;
218 }
219
220
/* Allocate an empty xmm register, either as a temporary or later to
 * "adopt" as a shader reg.
 *
 * Preference order:
 *   1) a register not referenced by the current instruction and not
 *      bound to any shader register (the scan has no break, so the
 *      highest-numbered such register wins);
 *   2) otherwise, the least-recently-used register, spilling its
 *      contents first if dirty.
 */
struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
{
   unsigned i;
   unsigned oldest = 0;
   boolean found = FALSE;

   for (i = 0; i < 8; i++)
      if (cp->xmm[i].last_used != cp->insn_counter &&
          cp->xmm[i].file == TGSI_FILE_NULL) {
         oldest = i;
         found = TRUE;
      }

   if (!found) {
      for (i = 0; i < 8; i++)
         if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
            oldest = i;
   }

   /* Need to write out the old value?
    */
   if (cp->xmm[oldest].dirty)
      spill(cp, oldest);

   /* Must never evict a register still referenced by the current
    * instruction:
    */
   assert(cp->xmm[oldest].last_used != cp->insn_counter);

   cp->xmm[oldest].file = TGSI_FILE_NULL;
   cp->xmm[oldest].idx = 0;
   cp->xmm[oldest].dirty = 0;
   cp->xmm[oldest].last_used = cp->insn_counter;
   return x86_make_reg(file_XMM, oldest);
}
256
257 void aos_release_xmm_reg( struct aos_compilation *cp,
258 unsigned idx )
259 {
260 cp->xmm[idx].file = TGSI_FILE_NULL;
261 cp->xmm[idx].idx = 0;
262 cp->xmm[idx].dirty = 0;
263 cp->xmm[idx].last_used = 0;
264 }
265
266
267
268
/* Mark an xmm reg as holding the current copy of a shader reg.
 * DIRTY indicates whether the register differs from the in-memory
 * copy and will eventually need to be spilled.
 */
void aos_adopt_xmm_reg( struct aos_compilation *cp,
                        struct x86_reg reg,
                        unsigned file,
                        unsigned idx,
                        unsigned dirty )
{
   unsigned i;

   if (reg.file != file_XMM) {
      assert(0);
      return;
   }


   /* If any xmm reg thinks it holds this shader reg, break the
    * illusion.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file &&
          cp->xmm[i].idx == idx)
      {
         /* If an xmm reg is already holding this shader reg, take into account its
          * dirty flag... the new copy inherits the obligation to spill.
          */
         dirty |= cp->xmm[i].dirty;
         aos_release_xmm_reg(cp, i);
      }
   }

   cp->xmm[reg.idx].file = file;
   cp->xmm[reg.idx].idx = idx;
   cp->xmm[reg.idx].dirty = dirty;
   cp->xmm[reg.idx].last_used = cp->insn_counter;
}
305
306
307 /* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
308 */
309 static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
310 unsigned file,
311 unsigned idx )
312 {
313 unsigned i;
314
315 /* Ensure the in-memory copy of this reg is up-to-date
316 */
317 for (i = 0; i < 8; i++) {
318 if (cp->xmm[i].file == file &&
319 cp->xmm[i].idx == idx &&
320 cp->xmm[i].dirty) {
321 spill(cp, i);
322 }
323 }
324
325 return get_reg_ptr( cp, file, idx );
326 }
327
328
329 /* As above, but return a pointer. Note - this pointer may alias
330 * those returned by get_arg_ptr().
331 */
332 static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
333 const struct tgsi_full_dst_register *dst )
334 {
335 unsigned file = dst->DstRegister.File;
336 unsigned idx = dst->DstRegister.Index;
337 unsigned i;
338
339
340 /* Ensure in-memory copy of this reg is up-to-date and invalidate
341 * any xmm copies.
342 */
343 for (i = 0; i < 8; i++) {
344 if (cp->xmm[i].file == file &&
345 cp->xmm[i].idx == idx)
346 {
347 if (cp->xmm[i].dirty)
348 spill(cp, i);
349
350 aos_release_xmm_reg(cp, i);
351 }
352 }
353
354 return get_reg_ptr( cp, file, idx );
355 }
356
357
358
359
360
361 /* Return an XMM reg if the argument is resident, otherwise return a
362 * base+offset pointer to the saved value.
363 */
364 struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
365 unsigned file,
366 unsigned idx )
367 {
368 unsigned i;
369
370 for (i = 0; i < 8; i++) {
371 if (cp->xmm[i].file == file &&
372 cp->xmm[i].idx == idx)
373 {
374 cp->xmm[i].last_used = cp->insn_counter;
375 return x86_make_reg(file_XMM, i);
376 }
377 }
378
379 /* If not found in the XMM register file, return an indirect
380 * reference to the in-memory copy:
381 */
382 return get_reg_ptr( cp, file, idx );
383 }
384
385
386
387 static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,
388 unsigned file,
389 unsigned idx )
390 {
391 struct x86_reg reg = get_xmm( cp,
392 aos_get_shader_reg( cp, file, idx ) );
393
394 aos_adopt_xmm_reg( cp,
395 reg,
396 file,
397 idx,
398 FALSE );
399
400 return reg;
401 }
402
403
404
405 struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
406 unsigned imm )
407 {
408 return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
409 }
410
411
412 struct x86_reg aos_get_internal( struct aos_compilation *cp,
413 unsigned imm )
414 {
415 return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
416 }
417
418
419
420
421
422 /* Emulate pshufd insn in regular SSE, if necessary:
423 */
424 static void emit_pshufd( struct aos_compilation *cp,
425 struct x86_reg dst,
426 struct x86_reg arg0,
427 ubyte shuf )
428 {
429 if (cp->have_sse2) {
430 sse2_pshufd(cp->func, dst, arg0, shuf);
431 }
432 else {
433 if (!eq(dst, arg0))
434 sse_movaps(cp->func, dst, arg0);
435
436 sse_shufps(cp->func, dst, dst, shuf);
437 }
438 }
439
/* Merge RESULT into DST under a writemask:
 *    dst = (dst & mask) | (result & ~mask)
 * The mask is built by shuffling words out of the IMM_SWZ internal
 * constant: written channels select element 2, unwritten channels
 * element 3 (presumably 0-bits / all-1-bits respectively -- see the
 * IMM_SWZ setup).
 *    pshufd - shuffle according to writemask
 *    and    - dst, mask
 *    andn   - mask, result
 *    or     - dst, masked result
 */
static boolean mask_write( struct aos_compilation *cp,
                           struct x86_reg dst,
                           struct x86_reg result,
                           unsigned mask )
{
   struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   emit_pshufd(cp, tmp, imm_swz,
               SHUF((mask & 1) ? 2 : 3,
                    (mask & 2) ? 2 : 3,
                    (mask & 4) ? 2 : 3,
                    (mask & 8) ? 2 : 3));

   sse_andps(cp->func, dst, tmp);      /* keep unwritten channels of dst */
   sse_andnps(cp->func, tmp, result);  /* keep written channels of result */
   sse_orps(cp->func, dst, tmp);       /* combine */

   aos_release_xmm_reg(cp, tmp.idx);
   return TRUE;
}
467
468
469
470
/* Helper for writemask: compose DST from two channels of ARG0 and
 * two of ARG1.  SHUF is applied to both args, the low two
 * (post-shuffle) channels are taken from ARG1 and the high two from
 * ARG0, then SHUF is applied again to restore channel order.
 * Callers pass self-inverse shuffles (e.g. SHUF(X,W,Z,Y)), so the
 * second application undoes the first.
 */
static boolean emit_shuf_copy2( struct aos_compilation *cp,
                                struct x86_reg dst,
                                struct x86_reg arg0,
                                struct x86_reg arg1,
                                ubyte shuf )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   emit_pshufd(cp, dst, arg1, shuf);                  /* shuffled arg1 */
   emit_pshufd(cp, tmp, arg0, shuf);                  /* shuffled arg0 */
   sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));  /* dst.xy | tmp.zw */
   emit_pshufd(cp, dst, dst, shuf);                   /* undo the shuffle */

   aos_release_xmm_reg(cp, tmp.idx);
   return TRUE;
}
489
490
491
492 #define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
493
494
/* Locate a source register and perform any required (simple) swizzle.
 *
 * Just fail on complex (extended ZERO/ONE) swizzles at this point.
 * Negation and absolute-value modifiers are applied with multiplies
 * and maxps against internal constants.
 */
static struct x86_reg fetch_src( struct aos_compilation *cp,
                                 const struct tgsi_full_src_register *src )
{
   struct x86_reg arg0 = aos_get_shader_reg(cp,
                                            src->SrcRegister.File,
                                            src->SrcRegister.Index);
   unsigned i;
   ubyte swz = 0;       /* packed 2-bit-per-channel shuffle code */
   unsigned negs = 0;   /* bitmask of negated channels */
   unsigned abs = 0;    /* bitmask of abs() channels */

   /* Gather per-channel swizzle and sign-mode information:
    */
   for (i = 0; i < 4; i++) {
      unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i );
      unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );

      switch (swizzle) {
      case TGSI_EXTSWIZZLE_ZERO:
      case TGSI_EXTSWIZZLE_ONE:
         ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2");
         break;

      default:
         swz |= (swizzle & 0x3) << (i * 2);
         break;
      }

      switch (neg) {
      case TGSI_UTIL_SIGN_TOGGLE:
         negs |= (1<<i);
         break;

      case TGSI_UTIL_SIGN_KEEP:
         break;

      case TGSI_UTIL_SIGN_CLEAR:
         abs |= (1<<i);
         break;

      default:
         /* note: TGSI_UTIL_SIGN_SET also lands here */
         ERROR(cp, "unsupported sign-mode");
         break;
      }
   }

   /* If any modifier is present, build the modified value in a fresh
    * xmm temporary:
    */
   if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
      struct x86_reg dst = aos_get_xmm_reg(cp);

      if (swz != SSE_SWIZZLE_NOOP)
         emit_pshufd(cp, dst, arg0, swz);
      else
         sse_movaps(cp->func, dst, arg0);

      if (negs && negs != 0xf) {
         struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
         struct x86_reg tmp = aos_get_xmm_reg(cp);

         /* Load 1,-1,0,0
          * Use neg as arg to pshufd
          * Multiply
          */
         emit_pshufd(cp, tmp, imm_swz,
                     SHUF((negs & 1) ? 1 : 0,
                          (negs & 2) ? 1 : 0,
                          (negs & 4) ? 1 : 0,
                          (negs & 8) ? 1 : 0));
         sse_mulps(cp->func, dst, tmp);

         aos_release_xmm_reg(cp, tmp.idx);
      }
      else if (negs) {
         /* All four channels negated: single multiply by IMM_NEGS.
          */
         struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
         sse_mulps(cp->func, dst, imm_negs);
      }


      if (abs && abs != 0xf) {
         ERROR(cp, "unsupported partial abs");
      }
      else if (abs) {
         /* abs(x) = max(x, -x) on every channel:
          */
         struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
         struct x86_reg tmp = aos_get_xmm_reg(cp);

         sse_movaps(cp->func, tmp, dst);
         sse_mulps(cp->func, tmp, neg);
         sse_maxps(cp->func, dst, tmp);

         aos_release_xmm_reg(cp, tmp.idx);
      }

      return dst;
   }

   return arg0;
}
593
/* Push one channel of a source register onto the x87 stack, applying
 * that channel's extended swizzle (including the ZERO/ONE selectors)
 * and sign mode.
 */
static void x87_fld_src( struct aos_compilation *cp,
                         const struct tgsi_full_src_register *src,
                         unsigned channel )
{
   struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
                                                src->SrcRegister.File,
                                                src->SrcRegister.Index);

   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel );
   unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_ZERO:
      x87_fldz( cp->func );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      x87_fld1( cp->func );
      break;

   default:
      /* Load the selected channel from the in-memory copy:
       */
      x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
      break;
   }


   switch (neg) {
   case TGSI_UTIL_SIGN_TOGGLE:
      /* Flip the sign:
       */
      x87_fchs( cp->func );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;

   case TGSI_UTIL_SIGN_CLEAR:
      x87_fabs( cp->func );
      break;

   case TGSI_UTIL_SIGN_SET:
      /* Force negative: abs then negate.
       */
      x87_fabs( cp->func );
      x87_fchs( cp->func );
      break;

   default:
      ERROR(cp, "unsupported sign-mode");
      break;
   }
}
644
645
646
647
648
649
/* Used to implement write masking.  This and most of the other instructions
 * here would be easier to implement if there had been a translation
 * to a 2 argument format (dst/arg0, arg1) at the shader level before
 * attempting to translate to x86/sse code.
 *
 * Merges RESULT into the destination register under its writemask,
 * leaving the merged value adopted (dirty) in an xmm register.
 */
static void store_dest( struct aos_compilation *cp,
                        const struct tgsi_full_dst_register *reg,
                        struct x86_reg result )
{
   struct x86_reg dst;

   switch (reg->DstRegister.WriteMask) {
   case 0:
      return;                   /* nothing to write */

   case TGSI_WRITEMASK_XYZW:
      /* Full write: adopt the result directly as the new dirty copy.
       */
      aos_adopt_xmm_reg(cp,
                        get_xmm_writable(cp, result),
                        reg->DstRegister.File,
                        reg->DstRegister.Index,
                        TRUE);
      return;
   default:
      break;
   }

   /* Partial write: need the current destination value in an xmm
    * register to build the merged value.
    */
   dst = aos_get_shader_reg_xmm(cp,
                                reg->DstRegister.File,
                                reg->DstRegister.Index);

   switch (reg->DstRegister.WriteMask) {
   case TGSI_WRITEMASK_X:
      /* movss writes only the low channel:
       */
      sse_movss(cp->func, dst, get_xmm(cp, result));
      break;

   case TGSI_WRITEMASK_ZW:
      /* shufps keeps dst.xy and takes result.zw:
       */
      sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
      break;

   case TGSI_WRITEMASK_XY:
      /* keep result.xy, take dst.zw:
       */
      result = get_xmm_writable(cp, result);
      sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
      dst = result;
      break;

   case TGSI_WRITEMASK_YZW:
      /* overwrite result.x with dst.x:
       */
      result = get_xmm_writable(cp, result);
      sse_movss(cp->func, result, dst);
      dst = result;
      break;

   default:
      /* General case via and/andn/or masking:
       */
      mask_write(cp, dst, result, reg->DstRegister.WriteMask);
      break;
   }

   aos_adopt_xmm_reg(cp,
                     dst,
                     reg->DstRegister.File,
                     reg->DstRegister.Index,
                     TRUE);

}
713
/* Write the scalar in RESULT.x into a single channel of DST, leaving
 * the other channels untouched.  SWIZZLE must be a self-inverse
 * permutation that brings the target channel into position x (e.g.
 * SHUF(Y,X,Z,W) to write channel y): shuffle target into x, movss,
 * shuffle back.
 */
static void inject_scalar( struct aos_compilation *cp,
                           struct x86_reg dst,
                           struct x86_reg result,
                           ubyte swizzle )
{
   sse_shufps(cp->func, dst, dst, swizzle);
   sse_movss(cp->func, dst, result);
   sse_shufps(cp->func, dst, dst, swizzle);
}
723
724
/* Store a scalar result (valid only in channel x of RESULT) to the
 * destination.  Single-channel writemasks are handled with
 * movss/inject_scalar; anything wider broadcasts x to all channels
 * and falls back to store_dest().
 */
static void store_scalar_dest( struct aos_compilation *cp,
                               const struct tgsi_full_dst_register *reg,
                               struct x86_reg result )
{
   unsigned writemask = reg->DstRegister.WriteMask;
   struct x86_reg dst;

   if (writemask != TGSI_WRITEMASK_X &&
       writemask != TGSI_WRITEMASK_Y &&
       writemask != TGSI_WRITEMASK_Z &&
       writemask != TGSI_WRITEMASK_W &&
       writemask != 0)
   {
      result = get_xmm_writable(cp, result); /* already true, right? */
      sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
      store_dest(cp, reg, result);
      return;
   }

   result = get_xmm(cp, result);
   dst = aos_get_shader_reg_xmm(cp,
                                reg->DstRegister.File,
                                reg->DstRegister.Index);



   switch (reg->DstRegister.WriteMask) {
   case TGSI_WRITEMASK_X:
      sse_movss(cp->func, dst, result);
      break;

   case TGSI_WRITEMASK_Y:
      inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
      break;

   case TGSI_WRITEMASK_Z:
      inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
      break;

   case TGSI_WRITEMASK_W:
      inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
      break;

   default:
      break;                    /* writemask == 0: no store needed */
   }

   aos_adopt_xmm_reg(cp,
                     dst,
                     reg->DstRegister.File,
                     reg->DstRegister.Index,
                     TRUE);
}
778
779
780
781 static void x87_fst_or_nop( struct x86_function *func,
782 unsigned writemask,
783 unsigned channel,
784 struct x86_reg ptr )
785 {
786 assert(ptr.file == file_REG32);
787 if (writemask & (1<<channel))
788 x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
789 }
790
791 static void x87_fstp_or_pop( struct x86_function *func,
792 unsigned writemask,
793 unsigned channel,
794 struct x86_reg ptr )
795 {
796 assert(ptr.file == file_REG32);
797 if (writemask & (1<<channel))
798 x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
799 else
800 x87_fstp( func, x86_make_reg( file_x87, 0 ));
801 }
802
803
804
805 /*
806 */
807 static void x87_fstp_dest4( struct aos_compilation *cp,
808 const struct tgsi_full_dst_register *dst )
809 {
810 struct x86_reg ptr = get_dst_ptr(cp, dst);
811 unsigned writemask = dst->DstRegister.WriteMask;
812
813 x87_fst_or_nop(cp->func, writemask, 0, ptr);
814 x87_fst_or_nop(cp->func, writemask, 1, ptr);
815 x87_fst_or_nop(cp->func, writemask, 2, ptr);
816 x87_fstp_or_pop(cp->func, writemask, 3, ptr);
817 }
818
819 /* Save current x87 state and put it into single precision mode.
820 */
821 static void save_fpu_state( struct aos_compilation *cp )
822 {
823 x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,
824 Offset(struct aos_machine, fpu_restore)));
825 }
826
827 static void restore_fpu_state( struct aos_compilation *cp )
828 {
829 x87_fnclex(cp->func);
830 x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
831 Offset(struct aos_machine, fpu_restore)));
832 }
833
834 static void set_fpu_round_neg_inf( struct aos_compilation *cp )
835 {
836 if (cp->fpucntl != FPU_RND_NEG) {
837 cp->fpucntl = FPU_RND_NEG;
838 x87_fnclex(cp->func);
839 x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
840 Offset(struct aos_machine, fpu_rnd_neg_inf)));
841 }
842 }
843
844 static void set_fpu_round_nearest( struct aos_compilation *cp )
845 {
846 if (cp->fpucntl != FPU_RND_NEAREST) {
847 cp->fpucntl = FPU_RND_NEAREST;
848 x87_fnclex(cp->func);
849 x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
850 Offset(struct aos_machine, fpu_rnd_nearest)));
851 }
852 }
853
854
/* Compute 2^st0 in place on the x87 stack, using the identity
 * 2^a = 2^int(a) * 2^frc(a): f2xm1 handles the fractional part
 * (its argument must stay within [-1,1]) and fscale applies the
 * integer exponent.  The net x87 stack depth is unchanged.
 *
 * NOTE(review): fprndint uses the current FPU rounding mode; the
 * set_fpu_round_neg_inf() call is commented out here -- presumably
 * round-to-nearest keeps frc(a) within f2xm1's domain.  Confirm.
 */
static void x87_emit_ex2( struct aos_compilation *cp )
{
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   int stack = cp->func->x87_stack;

//   set_fpu_round_neg_inf( cp );

   x87_fld(cp->func, st0);           /* a a */
   x87_fprndint( cp->func );         /* int(a) a*/
   x87_fsubr(cp->func, st1, st0);    /* int(a) frc(a) */
   x87_fxch(cp->func, st1);          /* frc(a) int(a) */
   x87_f2xm1(cp->func);              /* (2^frc(a))-1 int(a)*/
   x87_fld1(cp->func);               /* 1 (2^frc(a))-1 int(a) */
   x87_faddp(cp->func, st1);         /* 2^frac(a) int(a) */
   x87_fscale(cp->func);             /* (2^frac(a)*2^int(int(a))) int(a) */
                                     /* 2^a int(a) */
   x87_fstp(cp->func, st1);          /* 2^a */

   assert( stack == cp->func->x87_stack);

}
877
/* Runtime helper called from generated code (see emit_print()):
 * prints a labelled float[4].  Also a convenient breakpoint site.
 */
static void PIPE_CDECL print_reg( const char *msg,
                                  const float *reg )
{
   debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
}
883
/* Debug aid: emit code that calls print_reg() at runtime with a
 * pointer to the current value of the given shader register.
 * MESSAGE must point to a static string -- only the pointer is baked
 * into the generated code.
 */
static void emit_print( struct aos_compilation *cp,
                        const char *message, /* must point to a static string! */
                        unsigned file,
                        unsigned idx )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
   unsigned i;

   /* There shouldn't be anything on the x87 stack.  Can add this
    * capacity later if need be.
    */
   assert(cp->func->x87_stack == 0);

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.  We're obviously not concerned about performance on this
    * debug path, so here goes:
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);

      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );


   /* Push the arguments (cdecl: right to left):
    */
   x86_lea( cp->func, ecx, arg );
   x86_push( cp->func, ecx );
   x86_push_imm32( cp->func, (int)message );

   /* Call the helper.  Could call debug_printf directly, but
    * print_reg is a nice place to put a breakpoint if need be.
    */
   x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
   x86_call( cp->func, ecx );

   /* Caller cleans up the two pushed arguments:
    */
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );

   /* Pop caller-save regs
    */
   x86_cdecl_caller_pop_regs( cp->func );

   /* Done...
    */
}
935
936 /**
937 * The traditional instructions. All operate on internal registers
938 * and ignore write masks and swizzling issues.
939 */
940
941 static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
942 {
943 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
944 struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
945 struct x86_reg tmp = aos_get_xmm_reg(cp);
946
947 sse_movaps(cp->func, tmp, arg0);
948 sse_mulps(cp->func, tmp, neg);
949 sse_maxps(cp->func, tmp, arg0);
950
951 store_dest(cp, &op->FullDstRegisters[0], tmp);
952 return TRUE;
953 }
954
955 static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
956 {
957 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
958 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
959 struct x86_reg dst = get_xmm_writable(cp, arg0);
960
961 sse_addps(cp->func, dst, arg1);
962
963 store_dest(cp, &op->FullDstRegisters[0], dst);
964 return TRUE;
965 }
966
967 static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
968 {
969 x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
970 x87_fcos(cp->func);
971 x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
972 return TRUE;
973 }
974
/* The dotproduct instructions don't really do that well in sse:
 * XXX: produces wrong results -- disabled.
 *
 * DP3: 3-component dot product, scalar result in channel x.
 */
static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);
   /* Now the hard bit: sum the first 3 values:
    */
   sse_movhlps(cp->func, tmp, dst);          /* bring z,w into the low half */
   sse_addss(cp->func, dst, tmp);            /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); /* move b*y into channel x */
   sse_addss(cp->func, dst, tmp);            /* a*x+b*y+c*z in channel x */

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
997
/* DP4: 4-component dot product, scalar result in channel x.
 */
static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);

   /* Now the hard bit: sum the values:
    */
   sse_movhlps(cp->func, tmp, dst);          /* bring z,w into the low half */
   sse_addps(cp->func, dst, tmp);            /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); /* move b*y+d*w into channel x */
   sse_addss(cp->func, dst, tmp);            /* full sum in channel x */

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
1018
/* DPH: homogeneous dot product -- DP3 of the two args plus arg1.w
 * (arg0.w treated as 1.0).  Scalar result in channel x.
 */
static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_mulps(cp->func, dst, arg1);

   /* Now the hard bit: sum the values (from DP3):
    */
   sse_movhlps(cp->func, tmp, dst);
   sse_addss(cp->func, dst, tmp);            /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(cp->func, dst, tmp);            /* 3-component sum in channel x */
   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
   sse_addss(cp->func, dst, tmp);            /* ... plus arg1.w */

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
1041
/* DST: distance attenuation vector, built by merging each source
 * with the IMM_ONES constant via emit_shuf_copy2() so a single
 * multiply produces all four channels.
 */
static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
    struct x86_reg dst = aos_get_xmm_reg(cp);
    struct x86_reg tmp = aos_get_xmm_reg(cp);
    struct x86_reg ones = aos_get_internal(cp, IMM_ONES);

/*    dst[0] = 1.0     * 1.0F; */
/*    dst[1] = arg0[1] * arg1[1]; */
/*    dst[2] = arg0[2] * 1.0; */
/*    dst[3] = 1.0     * arg1[3]; */

    emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y)); /* 1, a0.y, a0.z, 1 */
    emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W)); /* 1, a1.y, 1, a1.w */
    sse_mulps(cp->func, dst, tmp);

    aos_release_xmm_reg(cp, tmp.idx);
    store_dest(cp, &op->FullDstRegisters[0], dst);
    return TRUE;
}
1063
1064 static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1065 {
1066 x87_fld1(cp->func); /* 1 */
1067 x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 1 */
1068 x87_fyl2x(cp->func); /* log2(a0) */
1069 x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
1070 return TRUE;
1071 }
1072
1073
1074 static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1075 {
1076 x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
1077 x87_emit_ex2(cp);
1078 x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
1079 return TRUE;
1080 }
1081
1082
/* FLR: per-channel floor(), implemented with x87 fprndint while the
 * FPU rounding mode is forced to round-towards-minus-infinity.
 */
static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   int i;

   set_fpu_round_neg_inf( cp );

   /* Load all sources first to avoid aliasing with the destination.
    * Loading in reverse order leaves channel 0 on top of the stack.
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->FullSrcRegisters[0], i);
      }
   }

   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fprndint( cp->func );
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1108
1109
/* RND: per-channel round-to-nearest, implemented like FLR but with
 * the FPU in round-to-nearest mode.
 */
static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   int i;

   set_fpu_round_nearest( cp );

   /* Load all sources first to avoid aliasing with the destination.
    * Loading in reverse order leaves channel 0 on top of the stack.
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->FullSrcRegisters[0], i);
      }
   }

   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fprndint( cp->func );
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1135
1136
/* FRC: per-channel fractional part, computed as a - floor(a) using
 * fprndint in round-towards-minus-infinity mode.
 */
static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   int i;

   set_fpu_round_neg_inf( cp );

   /* suck all the source values onto the stack before writing out any
    * dst, which may alias...  Reverse order leaves channel 0 on top.
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->FullSrcRegisters[0], i);
      }
   }

   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fld(cp->func, st0);     /* a a */
         x87_fprndint( cp->func );   /* flr(a) a */
         x87_fsubp(cp->func, st1);   /* frc(a) */
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}
1167
1168
1169
1170
1171
1172
/* LIT: too awkward to do inline -- emit a call out to a C helper.
 * The cdecl arguments pushed (right to left) are: lit_count, a
 * pointer to the (memory-resident) source, a pointer to the result,
 * and the machine pointer.  The helper address is taken from the
 * per-shader lit_info table when available, falling back to
 * aos_do_lit().
 */
static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
   unsigned lit_count = cp->lit_count++;
   struct x86_reg result, arg0;
   unsigned i;

#if 1
   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }
#endif

   /* With a partial writemask, compute into a scratch area and merge
    * into the real destination afterwards:
    */
   if (writemask != TGSI_WRITEMASK_XYZW)
      result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
   else
      result = get_dst_ptr(cp, &op->FullDstRegisters[0]);


   /* The helper needs a memory operand; flush an xmm-resident source
    * to a scratch slot:
    */
   arg0 = fetch_src( cp, &op->FullSrcRegisters[0] );
   if (arg0.file == file_XMM) {
      struct x86_reg tmp = x86_make_disp(cp->machine_EDX,
                                         Offset(struct aos_machine, tmp[1]));
      sse_movaps( cp->func, tmp, arg0 );
      arg0 = tmp;
   }



   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Push the arguments:
    */
   x86_push_imm32( cp->func, lit_count );

   x86_lea( cp->func, ecx, arg0 );
   x86_push( cp->func, ecx );

   x86_lea( cp->func, ecx, result );
   x86_push( cp->func, ecx );

   x86_push( cp->func, cp->machine_EDX );

   if (lit_count < MAX_LIT_INFO) {
      x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,
                                             Offset(struct aos_machine, lit_info) +
                                             lit_count * sizeof(struct lit_info) +
                                             Offset(struct lit_info, func)));
   }
   else {
      x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
   }

   x86_call( cp->func, ecx );

   /* Caller pops the four pushed arguments:
    */
   x86_pop( cp->func, ecx );          /* fixme... */
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );
   x86_pop( cp->func, ecx );

   x86_cdecl_caller_pop_regs( cp->func );

   if (writemask != TGSI_WRITEMASK_XYZW) {
      /* Merge the scratch result under the partial writemask:
       */
      store_dest( cp,
                  &op->FullDstRegisters[0],
                  get_xmm_writable( cp, result ) );
   }

   return TRUE;
}
1251
#if 0
/* Disabled inline x87 implementation of LIT; the out-of-line call in
 * emit_LIT() is used instead.  Kept for reference.  Trailing comments
 * show the x87 stack contents after each instruction (top first).
 */
static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;

   if (writemask & TGSI_WRITEMASK_YZ) {
      struct x86_reg st1 = x86_make_reg(file_x87, 1);
      struct x86_reg st2 = x86_make_reg(file_x87, 2);

      /* a1' = a1 <= 0 ? 1 : a1;
       */
      x87_fldz(cp->func);                             /* 1 0 */
#if 1
      x87_fld1(cp->func);                             /* 1 0 */
#else
      /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
       */
      x87_fldz(cp->func);                             /* 1 0 */
#endif
      x87_fld_src(cp, &op->FullSrcRegisters[0], 1);   /* a1 1 0 */
      x87_fcomi(cp->func, st2);                       /* a1 1 0 */
      x87_fcmovb(cp->func, st1);                      /* a1' 1 0 */
      x87_fstp(cp->func, st1);                        /* a1' 0 */
      x87_fstp(cp->func, st1);                        /* a1' */

      x87_fld_src(cp, &op->FullSrcRegisters[0], 3);   /* a3 a1' */
      x87_fxch(cp->func, st1);                        /* a1' a3 */


      /* Compute pow(a1, a3)
       */
      x87_fyl2x(cp->func);                            /* a3*log2(a1) */
      x87_emit_ex2( cp );                             /* 2^(a3*log2(a1)) */


      /* a0' = max2(a0, 0):
       */
      x87_fldz(cp->func);                             /* 0 r2 */
      x87_fld_src(cp, &op->FullSrcRegisters[0], 0);   /* a0 0 r2 */
      x87_fcomi(cp->func, st1);
      x87_fcmovb(cp->func, st1);                      /* a0' 0 r2 */

      x87_fst_or_nop(cp->func, writemask, 1, dst);    /* result[1] = a0' */

      x87_fcomi(cp->func, st1);                       /* a0' 0 r2 */
      x87_fcmovnbe(cp->func, st2);                    /* r2' 0' r2 */

      x87_fstp_or_pop(cp->func, writemask, 2, dst);   /* 0 r2 */
      x87_fpop(cp->func);                             /* r2 */
      x87_fpop(cp->func);
   }

   if (writemask & TGSI_WRITEMASK_XW) {
      /* LIT.x and LIT.w are constant 1.0.
       */
      x87_fld1(cp->func);
      x87_fst_or_nop(cp->func, writemask, 0, dst);
      x87_fstp_or_pop(cp->func, writemask, 3, dst);
   }

   return TRUE;
}
#endif
1314
1315
1316
1317 static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1318 {
1319 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1320 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1321 struct x86_reg dst = get_xmm_writable(cp, arg0);
1322
1323 sse_maxps(cp->func, dst, arg1);
1324
1325 store_dest(cp, &op->FullDstRegisters[0], dst);
1326 return TRUE;
1327 }
1328
1329
1330 static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1331 {
1332 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1333 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1334 struct x86_reg dst = get_xmm_writable(cp, arg0);
1335
1336 sse_minps(cp->func, dst, arg1);
1337
1338 store_dest(cp, &op->FullDstRegisters[0], dst);
1339 return TRUE;
1340 }
1341
1342 static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1343 {
1344 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1345 struct x86_reg dst = get_xmm_writable(cp, arg0);
1346
1347 /* potentially nothing to do */
1348
1349 store_dest(cp, &op->FullDstRegisters[0], dst);
1350 return TRUE;
1351 }
1352
1353 static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1354 {
1355 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1356 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1357 struct x86_reg dst = get_xmm_writable(cp, arg0);
1358
1359 sse_mulps(cp->func, dst, arg1);
1360
1361 store_dest(cp, &op->FullDstRegisters[0], dst);
1362 return TRUE;
1363 }
1364
1365
1366 static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1367 {
1368 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1369 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1370 struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]);
1371
1372 /* If we can't clobber old contents of arg0, get a temporary & copy
1373 * it there, then clobber it...
1374 */
1375 arg0 = get_xmm_writable(cp, arg0);
1376
1377 sse_mulps(cp->func, arg0, arg1);
1378 sse_addps(cp->func, arg0, arg2);
1379 store_dest(cp, &op->FullDstRegisters[0], arg0);
1380 return TRUE;
1381 }
1382
1383 /* A wrapper for powf().
1384 * Makes sure it is cdecl and operates on floats.
1385 */
1386 static float PIPE_CDECL _powerf( float x, float y )
1387 {
1388 return powf( x, y );
1389 }
1390
1391 /* Really not sufficient -- need to check for conditions that could
1392 * generate inf/nan values, which will slow things down hugely.
1393 */
static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
#if 0
   /* Disabled inline x87 variant: 2^(a1 * log2(a0)).
    */
   x87_fld_src(cp, &op->FullSrcRegisters[1], 0);  /* a1.x */
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);  /* a0.x a1.x */
   x87_fyl2x(cp->func);                           /* a1*log2(a0) */

   x87_emit_ex2( cp );                            /* 2^(a1*log2(a0)) */

   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
#else
   uint i;

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);
      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Reserve 8 bytes of stack for the two float arguments.
    */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );

   /* Store src1.x at [esp+4] and src0.x at [esp+0] -- cdecl argument
    * order for _powerf(x, y).
    */
   x87_fld_src( cp, &op->FullSrcRegisters[1], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
   x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* tmp_EAX has been pushed & will be restored below */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
   x86_call( cp->func, cp->tmp_EAX );

   /* Release the argument space.
    */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Note retval on x87 stack:
    */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->FullDstRegisters[0] );
#endif
   return TRUE;
}
1443
1444
1445 static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1446 {
1447 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1448 struct x86_reg dst = aos_get_xmm_reg(cp);
1449
1450 if (cp->have_sse2) {
1451 sse2_rcpss(cp->func, dst, arg0);
1452 /* extend precision here...
1453 */
1454 }
1455 else {
1456 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1457 sse_movss(cp->func, dst, ones);
1458 sse_divss(cp->func, dst, arg0);
1459 }
1460
1461 store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
1462 return TRUE;
1463 }
1464
1465
1466 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1467 * implementations, it is possible to improve its precision at
1468 * fairly low cost, using a newton/raphson step, as below:
1469 *
1470 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1471 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1472 * or:
1473 * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
1474 *
1475 *
1476 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1477 */
/* RSQ: scalar reciprocal square root, refined with one Newton-Raphson
 * step (see the comment block above):
 *    r' = r * (1.5 - 0.5 * a * r * r),  where r = rsqrtss(a)
 */
static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{

   if (0) {
      /* Unrefined (low precision) path, currently disabled.
       */
      struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
      struct x86_reg r = aos_get_xmm_reg(cp);
      sse_rsqrtss(cp->func, r, arg0);
      store_scalar_dest(cp, &op->FullDstRegisters[0], r);
      return TRUE;
   }
   else {
      struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
      struct x86_reg r = aos_get_xmm_reg(cp);

      /* IMM_RSQ holds the constants {-0.5, 1.5} back to back.
       */
      struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
      struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );

      /* NOTE: the refinement clobbers the fetched source value, hence
       * the writable copy.
       */
      struct x86_reg src = get_xmm_writable( cp, arg0 );

      sse_rsqrtss( cp->func, r, src  );             /* rsqrtss(a) */
      sse_mulss(   cp->func, src, neg_half  );      /* -.5 * a */
      sse_mulss(   cp->func, src, r );              /* -.5 * a * r */
      sse_mulss(   cp->func, src, r );              /* -.5 * a * r * r */
      sse_addss(   cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */
      sse_mulss(   cp->func, r, src );              /* r * (1.5 - .5 * a * r * r) */

      store_scalar_dest(cp, &op->FullDstRegisters[0], r);
      return TRUE;
   }
}
1507
1508
1509 static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1510 {
1511 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1512 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1513 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1514 struct x86_reg dst = get_xmm_writable(cp, arg0);
1515
1516 sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
1517 sse_andps(cp->func, dst, ones);
1518
1519 store_dest(cp, &op->FullDstRegisters[0], dst);
1520 return TRUE;
1521 }
1522
/* SIN: x87 fsin on src.x; the result is written to every enabled dst
 * channel (x87_fstp_dest4 handles the replication and writemask --
 * confirm against its definition).
 */
static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);   /* push src.x */
   x87_fsin(cp->func);                             /* st0 = sin(st0) */
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);   /* pop into dst */
   return TRUE;
}
1530
1531
1532
1533 static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1534 {
1535 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1536 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1537 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1538 struct x86_reg dst = get_xmm_writable(cp, arg0);
1539
1540 sse_cmpps(cp->func, dst, arg1, cc_LessThan);
1541 sse_andps(cp->func, dst, ones);
1542
1543 store_dest(cp, &op->FullDstRegisters[0], dst);
1544 return TRUE;
1545 }
1546
1547 static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1548 {
1549 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1550 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1551 struct x86_reg dst = get_xmm_writable(cp, arg0);
1552
1553 sse_subps(cp->func, dst, arg1);
1554
1555 store_dest(cp, &op->FullDstRegisters[0], dst);
1556 return TRUE;
1557 }
1558
1559
/* XPD: 3-component cross product using two YZXW-shuffled multiplies,
 * a subtract, and a final YZXW shuffle to rotate the lanes back.
 */
static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp0 = aos_get_xmm_reg(cp);
   struct x86_reg tmp1 = aos_get_xmm_reg(cp);

   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));   /* tmp1 = arg1.yzxw */
   sse_mulps(cp->func, tmp1, arg0);                 /* tmp1 = arg0 * arg1.yzxw */
   emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));   /* tmp0 = arg0.yzxw */
   sse_mulps(cp->func, tmp0, arg1);                 /* tmp0 = arg0.yzxw * arg1 */
   sse_subps(cp->func, tmp1, tmp0);
   sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));

   /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
   /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
   /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
   /* dst[3] is undef */


   /* Only tmp0 is released here; tmp1 carries the result into
    * store_dest().
    */
   aos_release_xmm_reg(cp, tmp0.idx);
   store_dest(cp, &op->FullDstRegisters[0], tmp1);
   return TRUE;
}
1584
1585
1586
/* Translate a single TGSI instruction to machine code.
 *
 * Returns FALSE for any opcode this backend cannot handle; the caller
 * then abandons codegen for the whole shader (see
 * build_vertex_program), allowing a fallback path to take over.
 */
static boolean
emit_instruction( struct aos_compilation *cp,
                  struct tgsi_full_instruction *inst )
{
   /* Every instruction must leave the x87 stack empty.
    */
   x87_assert_stack_empty(cp->func);

   switch( inst->Instruction.Opcode ) {
   case TGSI_OPCODE_MOV:
      return emit_MOV( cp, inst );

   case TGSI_OPCODE_LIT:
      return emit_LIT(cp, inst);

   case TGSI_OPCODE_RCP:
      return emit_RCP(cp, inst);

   case TGSI_OPCODE_RSQ:
      return emit_RSQ(cp, inst);

   case TGSI_OPCODE_EXP:
      /*return emit_EXP(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_LOG:
      /*return emit_LOG(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_MUL:
      return emit_MUL(cp, inst);

   case TGSI_OPCODE_ADD:
      return emit_ADD(cp, inst);

   case TGSI_OPCODE_DP3:
      return emit_DP3(cp, inst);

   case TGSI_OPCODE_DP4:
      return emit_DP4(cp, inst);

   case TGSI_OPCODE_DST:
      return emit_DST(cp, inst);

   case TGSI_OPCODE_MIN:
      return emit_MIN(cp, inst);

   case TGSI_OPCODE_MAX:
      return emit_MAX(cp, inst);

   case TGSI_OPCODE_SLT:
      return emit_SLT(cp, inst);

   case TGSI_OPCODE_SGE:
      return emit_SGE(cp, inst);

   case TGSI_OPCODE_MAD:
      return emit_MAD(cp, inst);

   case TGSI_OPCODE_SUB:
      return emit_SUB(cp, inst);

   case TGSI_OPCODE_LERP:
      //      return emit_LERP(cp, inst);
      return FALSE;

   case TGSI_OPCODE_FRAC:
      return emit_FRC(cp, inst);

   case TGSI_OPCODE_CLAMP:
      //      return emit_CLAMP(cp, inst);
      return FALSE;

   case TGSI_OPCODE_FLOOR:
      return emit_FLR(cp, inst);

   case TGSI_OPCODE_ROUND:
      return emit_RND(cp, inst);

   case TGSI_OPCODE_EXPBASE2:
      return emit_EX2(cp, inst);

   case TGSI_OPCODE_LOGBASE2:
      return emit_LG2(cp, inst);

   case TGSI_OPCODE_POWER:
      return emit_POW(cp, inst);

   case TGSI_OPCODE_CROSSPRODUCT:
      return emit_XPD(cp, inst);

   case TGSI_OPCODE_ABS:
      return emit_ABS(cp, inst);

   case TGSI_OPCODE_DPH:
      return emit_DPH(cp, inst);

   case TGSI_OPCODE_COS:
      return emit_COS(cp, inst);

   case TGSI_OPCODE_SIN:
      return emit_SIN(cp, inst);

   case TGSI_OPCODE_END:
      return TRUE;

   default:
      return FALSE;
   }
}
1695
1696
/* Apply the viewport transform to the position output:
 *    pos = pos * machine->scale + machine->translate
 * (No perspective divide here -- see emit_rhw_viewport for the debug
 * rhw path.)
 */
static boolean emit_viewport( struct aos_compilation *cp )
{
   struct x86_reg pos = aos_get_shader_reg_xmm(cp,
                                               TGSI_FILE_OUTPUT,
                                               cp->vaos->draw->vs.position_output );

   struct x86_reg scale = x86_make_disp(cp->machine_EDX,
                                        Offset(struct aos_machine, scale));

   struct x86_reg translate = x86_make_disp(cp->machine_EDX,
                                            Offset(struct aos_machine, translate));

   sse_mulps(cp->func, pos, scale);
   sse_addps(cp->func, pos, translate);

   /* Re-register the transformed value as the position output (final
    * TRUE presumably marks it modified -- confirm against
    * aos_adopt_xmm_reg).
    */
   aos_adopt_xmm_reg( cp,
                      pos,
                      TGSI_FILE_OUTPUT,
                      cp->vaos->draw->vs.position_output,
                      TRUE );
   return TRUE;
}
1719
1720
1721 /* This is useful to be able to see the results on softpipe. Doesn't
1722 * do proper clipping, just assumes the backend can do it during
1723 * rasterization -- for debug only...
1724 */
static boolean emit_rhw_viewport( struct aos_compilation *cp )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg pos = aos_get_shader_reg_xmm(cp,
                                               TGSI_FILE_OUTPUT,
                                               cp->vaos->draw->vs.position_output);

   struct x86_reg scale = x86_make_disp(cp->machine_EDX,
                                        Offset(struct aos_machine, scale));

   struct x86_reg translate = x86_make_disp(cp->machine_EDX,
                                            Offset(struct aos_machine, translate));



   /* tmp = 1/pos.w broadcast to all channels.  rcpss is low precision,
    * which is acceptable for this debug-only path.
    */
   emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
   sse2_rcpss(cp->func, tmp, tmp);
   sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));

   /* pos = pos * scale * (1/w) + translate
    */
   sse_mulps(cp->func, pos, scale);
   sse_mulps(cp->func, pos, tmp);
   sse_addps(cp->func, pos, translate);

   /* Set pos[3] = 1/w (the rhw; tmp holds the reciprocal computed
    * above)
    */
   mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);

   aos_adopt_xmm_reg( cp,
                      pos,
                      TGSI_FILE_OUTPUT,
                      cp->vaos->draw->vs.position_output,
                      TRUE );
   return TRUE;
}
1759
1760
#if 0
/* Disabled: copy one TGSI immediate's float payload into the machine's
 * immediate store.  Immediates are instead taken from vs->immediates
 * at run time (see vaos_run_elts/vaos_run_linear).
 */
static boolean note_immediate( struct aos_compilation *cp,
                               struct tgsi_full_immediate *imm )
{
   unsigned pos = cp->num_immediates++;
   unsigned j;

   for (j = 0; j < imm->Immediate.Size; j++) {
      cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float;
   }

   return TRUE;
}
#endif
1775
1776
1777
1778
1779 static void find_last_write_outputs( struct aos_compilation *cp )
1780 {
1781 struct tgsi_parse_context parse;
1782 unsigned this_instruction = 0;
1783 unsigned i;
1784
1785 tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
1786
1787 while (!tgsi_parse_end_of_tokens( &parse )) {
1788
1789 tgsi_parse_token( &parse );
1790
1791 if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
1792 continue;
1793
1794 for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
1795 if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File ==
1796 TGSI_FILE_OUTPUT)
1797 {
1798 unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index;
1799 cp->output_last_write[idx] = this_instruction;
1800 }
1801 }
1802
1803 this_instruction++;
1804 }
1805
1806 tgsi_parse_free( &parse );
1807 }
1808
1809
1810 #define ARG_MACHINE 1
1811 #define ARG_START_ELTS 2
1812 #define ARG_COUNT 3
1813 #define ARG_OUTBUF 4
1814
1815
/* Generate one of the two varient entry points: linear fetch (func[0])
 * or indexed/elts fetch (func[1]).  Emits prologue, a per-vertex loop
 * (fetch inputs, translate the TGSI stream, optional viewport/rhw,
 * store outputs), and epilogue.  Returns FALSE if any instruction
 * could not be translated.
 */
static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
                                     boolean linear )
{
   struct tgsi_parse_context parse;
   struct aos_compilation cp;
   unsigned fixup, label;

   tgsi_parse_init( &parse, varient->base.vs->state.tokens );

   memset(&cp, 0, sizeof(cp));

   cp.insn_counter = 1;
   cp.vaos = varient;
   cp.have_sse2 = 1;
   cp.func = &varient->func[ linear ? 0 : 1 ];

   /* Fixed register roles for the generated code:
    */
   cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   cp.idx_EBX = x86_make_reg(file_REG32, reg_BX);
   cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
   cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
   cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
   cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
   cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );

   x86_init_func(cp.func);

   find_last_write_outputs(&cp);

   /* Save callee-save registers used below.
    */
   x86_push(cp.func, cp.idx_EBX);
   x86_push(cp.func, cp.count_ESI);
   x86_push(cp.func, cp.temp_EBP);


   /* Load arguments into regs:
    */
   x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
   x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
   x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
   x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));


   /* Compare count to zero and possibly bail.
    */
   x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
   x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
   fixup = x86_jcc_forward(cp.func, cc_E);


   save_fpu_state( &cp );
   set_fpu_round_nearest( &cp );

   /* Note address for loop jump
    */
   label = x86_get_label(cp.func);
   {
      /* Fetch inputs... TODO: fetch lazily...
       */
      if (!aos_fetch_inputs( &cp, linear ))
         goto fail;

      /* Emit the shader:
       */
      while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
      {
         tgsi_parse_token( &parse );

         switch (parse.FullToken.Token.Type) {
         case TGSI_TOKEN_TYPE_IMMEDIATE:
#if 0
            if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
               goto fail;
#endif
            break;

         case TGSI_TOKEN_TYPE_INSTRUCTION:
            if (DISASSEM)
               tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );

            if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
               goto fail;
            break;
         }

         x87_assert_stack_empty(cp.func);
         cp.insn_counter++;

         if (DISASSEM)
            debug_printf("\n");
      }


      /* Invalidate any XMM-cached values other than shader OUTPUTs
       * before the output stores below -- NOTE(review): presumably so
       * stale temporaries don't carry into the next loop iteration;
       * confirm intent.
       */
      {
         unsigned i;
         for (i = 0; i < 8; i++) {
            if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
               cp.xmm[i].file = TGSI_FILE_NULL;
               cp.xmm[i].dirty = 0;
            }
         }
      }

      if (cp.error)
         goto fail;

      if (cp.vaos->base.key.clip) {
         /* not really handling clipping, just do the rhw so we can
          * see the results...
          */
         emit_rhw_viewport(&cp);
      }
      else if (cp.vaos->base.key.viewport) {
         emit_viewport(&cp);
      }

      /* Emit output... TODO: do this eagerly after the last write to a
       * given output.
       */
      if (!aos_emit_outputs( &cp ))
         goto fail;


      /* Next vertex:
       */
      x86_lea(cp.func,
              cp.outbuf_ECX,
              x86_make_disp(cp.outbuf_ECX,
                            cp.vaos->base.key.output_stride));

      /* Incr index: by one vertex (linear) or one 4-byte element
       * pointer (elts).
       */
      if (linear) {
         x86_inc(cp.func, cp.idx_EBX);
      }
      else {
         x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4));
      }

   }
   /* decr count, loop if not zero
    */
   x86_dec(cp.func, cp.count_ESI);
   x86_jcc(cp.func, cc_NZ, label);

   restore_fpu_state(&cp);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(cp.func, fixup);

   /* Exit mmx state?
    */
   if (cp.func->need_emms)
      mmx_emms(cp.func);

   /* Restore callee-save regs (reverse of the pushes above) and return.
    */
   x86_pop(cp.func, cp.temp_EBP);
   x86_pop(cp.func, cp.count_ESI);
   x86_pop(cp.func, cp.idx_EBX);

   x87_assert_stack_empty(cp.func);
   x86_ret(cp.func);

   tgsi_parse_free( &parse );
   return !cp.error;

 fail:
   tgsi_parse_free( &parse );
   return FALSE;
}
1984
1985
1986
1987 static void vaos_set_buffer( struct draw_vs_varient *varient,
1988 unsigned buf,
1989 const void *ptr,
1990 unsigned stride )
1991 {
1992 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
1993 unsigned i;
1994
1995 for (i = 0; i < vaos->base.key.nr_inputs; i++) {
1996 if (vaos->base.key.element[i].in.buffer == buf) {
1997 vaos->attrib[i].input_ptr = ((char *)ptr +
1998 vaos->base.key.element[i].in.offset);
1999 vaos->attrib[i].input_stride = stride;
2000 }
2001 }
2002 }
2003
2004
2005
2006 static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
2007 const unsigned *elts,
2008 unsigned count,
2009 void *output_buffer )
2010 {
2011 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2012 struct aos_machine *machine = vaos->draw->vs.aos_machine;
2013
2014 machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
2015 machine->constants = vaos->draw->vs.aligned_constants;
2016 machine->immediates = vaos->base.vs->immediates;
2017 machine->attrib = vaos->attrib;
2018
2019 vaos->gen_run_elts( machine,
2020 elts,
2021 count,
2022 output_buffer );
2023 }
2024
2025 static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
2026 unsigned start,
2027 unsigned count,
2028 void *output_buffer )
2029 {
2030 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2031 struct aos_machine *machine = vaos->draw->vs.aos_machine;
2032
2033 machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
2034 machine->constants = vaos->draw->vs.aligned_constants;
2035 machine->immediates = vaos->base.vs->immediates;
2036 machine->attrib = vaos->attrib;
2037
2038 vaos->gen_run_linear( machine,
2039 start,
2040 count,
2041 output_buffer );
2042 }
2043
2044
2045
2046 static void vaos_destroy( struct draw_vs_varient *varient )
2047 {
2048 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
2049
2050 FREE( vaos->attrib );
2051
2052 x86_release_func( &vaos->func[0] );
2053 x86_release_func( &vaos->func[1] );
2054
2055 FREE(vaos);
2056 }
2057
2058
2059
2060 static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
2061 const struct draw_vs_varient_key *key )
2062 {
2063 struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
2064
2065 if (!vaos)
2066 goto fail;
2067
2068 vaos->base.key = *key;
2069 vaos->base.vs = vs;
2070 vaos->base.set_input = vaos_set_buffer;
2071 vaos->base.destroy = vaos_destroy;
2072 vaos->base.run_linear = vaos_run_linear;
2073 vaos->base.run_elts = vaos_run_elts;
2074
2075 vaos->draw = vs->draw;
2076
2077 vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) );
2078 if (!vaos->attrib)
2079 goto fail;
2080
2081 #if 0
2082 tgsi_dump(vs->state.tokens, 0);
2083 #endif
2084
2085 if (!build_vertex_program( vaos, TRUE ))
2086 goto fail;
2087
2088 if (!build_vertex_program( vaos, FALSE ))
2089 goto fail;
2090
2091 vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
2092 if (!vaos->gen_run_linear)
2093 goto fail;
2094
2095 vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
2096 if (!vaos->gen_run_elts)
2097 goto fail;
2098
2099 return &vaos->base;
2100
2101 fail:
2102 if (vaos && vaos->attrib)
2103 FREE(vaos->attrib);
2104
2105 if (vaos)
2106 x86_release_func( &vaos->func[0] );
2107
2108 if (vaos)
2109 x86_release_func( &vaos->func[1] );
2110
2111 FREE(vaos);
2112
2113 return NULL;
2114 }
2115
2116
/* Public entry point: try the SSE codegen path first; if it cannot
 * handle this shader/key, transparently fall back to the generic
 * interpreted varient.
 */
struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
                                                 const struct draw_vs_varient_key *key )
{
   struct draw_vs_varient *varient = varient_aos_sse( vs, key );

   if (!varient)
      varient = draw_vs_varient_generic( vs, key );

   return varient;
}
2128
2129
2130
2131 #endif