2 * Mesa 3-D graphics library
5 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
27 * using the rtasm runtime assembler. Based on the old
28 * t_vb_arb_program_sse.c
32 #include "util/u_memory.h"
33 #include "util/u_math.h"
34 #include "pipe/p_shader_tokens.h"
35 #include "util/u_debug.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_util.h"
38 #include "tgsi/tgsi_exec.h"
39 #include "tgsi/tgsi_dump.h"
42 #include "draw_vs_aos.h"
44 #include "rtasm/rtasm_x86sse.h"
/* Debug-print names for the TGSI register files, indexed by file enum.
 * NOTE(review): the initializer list is missing from this extraction —
 * recover it from the upstream Mesa draw_vs_aos.c.
 */
50 static const char *files
[] =
/* Compare two x86_reg descriptors for equality by field.
 * NOTE(review): extraction is garbled — the second parameter, the opening
 * brace, and the tail of the comparison (presumably idx/mod/disp — confirm
 * against upstream) are missing here.
 */
63 static INLINE boolean
eq( struct x86_reg a
,
66 return (a
.file
== b
.file
&&
/* Lazily load a base pointer (immediates / constants / buffer member of
 * struct aos_machine) into a cached x86 register.
 *
 * cp->x86_reg[which_reg] remembers which value the register currently
 * holds; the mov from [machine_EDX + offset] is emitted only on a cache
 * miss.  NOTE(review): the switch statement selecting among the three
 * assert/offset arms, and the declarations of 'reg'/'value'/'offset',
 * are missing from this extraction.
 */
72 struct x86_reg
aos_get_x86( struct aos_compilation
*cp
,
73 unsigned which_reg
, /* quick hack */
83 if (cp
->x86_reg
[which_reg
] != value
) {
88 assert(which_reg
== 0);
89 offset
= Offset(struct aos_machine
, immediates
);
92 assert(which_reg
== 1);
93 offset
= Offset(struct aos_machine
, constants
);
96 assert(which_reg
== 0);
97 offset
= Offset(struct aos_machine
, buffer
);
/* Cache miss: emit the load and record what the register now holds. */
105 x86_mov(cp
->func
, reg
,
106 x86_make_disp(cp
->machine_EDX
, offset
));
108 cp
->x86_reg
[which_reg
] = value
;
/* Build an addressable memory reference (base + displacement) for a shader
 * register (file, idx).  Most files live at fixed offsets inside
 * struct aos_machine addressed off machine_EDX; immediates and constants
 * are reached through pointers cached by aos_get_x86().
 * NOTE(review): the switch head and some closing braces are missing from
 * this extraction; the case labels below imply a switch on the file enum.
 */
115 static struct x86_reg
get_reg_ptr(struct aos_compilation
*cp
,
119 struct x86_reg ptr
= cp
->machine_EDX
;
122 case TGSI_FILE_INPUT
:
123 assert(idx
< MAX_INPUTS
);
124 return x86_make_disp(ptr
, Offset(struct aos_machine
, input
[idx
]));
126 case TGSI_FILE_OUTPUT
:
127 return x86_make_disp(ptr
, Offset(struct aos_machine
, output
[idx
]));
129 case TGSI_FILE_TEMPORARY
:
130 assert(idx
< MAX_TEMPS
);
131 return x86_make_disp(ptr
, Offset(struct aos_machine
, temp
[idx
]));
133 case AOS_FILE_INTERNAL
:
134 assert(idx
< MAX_INTERNALS
);
135 return x86_make_disp(ptr
, Offset(struct aos_machine
, internal
[idx
]));
137 case TGSI_FILE_IMMEDIATE
:
138 assert(idx
< MAX_IMMEDIATES
); /* just a sanity check */
/* Immediates/constants are indirect: idx scales by one float[4] vector. */
139 return x86_make_disp(aos_get_x86(cp
, 0, X86_IMMEDIATES
), idx
* 4 * sizeof(float));
141 case TGSI_FILE_CONSTANT
:
142 assert(idx
< MAX_CONSTANTS
); /* just a sanity check */
143 return x86_make_disp(aos_get_x86(cp
, 1, X86_CONSTANTS
), idx
* 4 * sizeof(float));
146 AOS_ERROR(cp
, "unknown reg file");
147 return x86_make_reg(0,0);
/* x87 FPU control-word layout: exception mask bits (0-5), precision
 * control field (bits 8-9) and rounding control field (bits 10-11),
 * plus the infinity-control bit.  Used when building the fldcw images
 * stored in struct aos_machine.
 */
153 #define X87_CW_EXCEPTION_INV_OP (1<<0)
154 #define X87_CW_EXCEPTION_DENORM_OP (1<<1)
155 #define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2)
156 #define X87_CW_EXCEPTION_OVERFLOW (1<<3)
157 #define X87_CW_EXCEPTION_UNDERFLOW (1<<4)
158 #define X87_CW_EXCEPTION_PRECISION (1<<5)
159 #define X87_CW_PRECISION_SINGLE (0<<8)
160 #define X87_CW_PRECISION_RESERVED (1<<8)
161 #define X87_CW_PRECISION_DOUBLE (2<<8)
162 #define X87_CW_PRECISION_DOUBLE_EXT (3<<8)
163 #define X87_CW_PRECISION_MASK (3<<8)
164 #define X87_CW_ROUND_NEAREST (0<<10)
165 #define X87_CW_ROUND_DOWN (1<<10)
166 #define X87_CW_ROUND_UP (2<<10)
167 #define X87_CW_ROUND_ZERO (3<<10)
168 #define X87_CW_ROUND_MASK (3<<10)
169 #define X87_CW_INFINITY (1<<12)
/* Write the dirty contents of xmm register 'idx' back to its in-memory
 * shader-register slot.  Only INPUT/OUTPUT/TEMPORARY files are legal to
 * spill; anything else (or a non-dirty reg) is flagged as an error.
 * NOTE(review): closing braces and part of the get_reg_ptr()/debug_printf()
 * argument lists are missing from this extraction.
 */
174 static void spill( struct aos_compilation
*cp
, unsigned idx
)
176 if (!cp
->xmm
[idx
].dirty
||
177 (cp
->xmm
[idx
].file
!= TGSI_FILE_INPUT
&& /* inputs are fetched into xmm & set dirty */
178 cp
->xmm
[idx
].file
!= TGSI_FILE_OUTPUT
&&
179 cp
->xmm
[idx
].file
!= TGSI_FILE_TEMPORARY
)) {
180 AOS_ERROR(cp
, "invalid spill");
184 struct x86_reg oldval
= get_reg_ptr(cp
,
188 if (0) debug_printf("\nspill %s[%d]",
189 files
[cp
->xmm
[idx
].file
],
192 assert(cp
->xmm
[idx
].dirty
);
/* Emit the store and mark the register clean. */
193 sse_movaps(cp
->func
, oldval
, x86_make_reg(file_XMM
, idx
));
194 cp
->xmm
[idx
].dirty
= 0;
/* Spill every dirty xmm register back to memory and release all eight
 * xmm slots, so nothing remains resident in the register file.
 * NOTE(review): the spill(cp, i) call inside the if-body appears to be
 * missing from this extraction — confirm against upstream.
 */
199 void aos_spill_all( struct aos_compilation
*cp
)
203 for (i
= 0; i
< 8; i
++) {
204 if (cp
->xmm
[i
].dirty
)
206 aos_release_xmm_reg(cp
, i
);
/* Return an xmm register that may be clobbered.  If 'reg' is not an xmm
 * register, or is an xmm register currently adopted by a shader reg
 * (file != TGSI_FILE_NULL), copy it into a fresh temporary first.
 * NOTE(review): the 'reg = tmp;' assignment and return statement are
 * missing from this extraction.
 */
211 static struct x86_reg
get_xmm_writable( struct aos_compilation
*cp
,
214 if (reg
.file
!= file_XMM
||
215 cp
->xmm
[reg
.idx
].file
!= TGSI_FILE_NULL
)
217 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
218 sse_movaps(cp
->func
, tmp
, reg
);
/* Touch the LRU stamp so this reg isn't evicted within the same insn. */
222 cp
->xmm
[reg
.idx
].last_used
= cp
->insn_counter
;
/* Ensure 'reg' lives in an xmm register: if it is a memory reference,
 * load it into a freshly allocated xmm temporary.  Unlike
 * get_xmm_writable(), an already-resident xmm reg is returned as-is even
 * if adopted by a shader register.
 * NOTE(review): the 'reg = tmp;' assignment and return statement are
 * missing from this extraction.
 */
226 static struct x86_reg
get_xmm( struct aos_compilation
*cp
,
229 if (reg
.file
!= file_XMM
)
231 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
232 sse_movaps(cp
->func
, tmp
, reg
);
/* Refresh the LRU stamp for the returned register. */
236 cp
->xmm
[reg
.idx
].last_used
= cp
->insn_counter
;
241 /* Allocate an empty xmm register, either as a temporary or later to
242 * "adopt" as a shader reg.
 *
 * First pass: look for a free slot (file == TGSI_FILE_NULL) not already
 * used by the current instruction.  Failing that, evict the
 * least-recently-used register, spilling it first if dirty.
 * NOTE(review): loop bodies, the 'oldest' initialization and the
 * spill(cp, oldest) call are partially missing from this extraction.
 */
244 struct x86_reg
aos_get_xmm_reg( struct aos_compilation
*cp
)
248 boolean found
= FALSE
;
250 for (i
= 0; i
< 8; i
++)
251 if (cp
->xmm
[i
].last_used
!= cp
->insn_counter
&&
252 cp
->xmm
[i
].file
== TGSI_FILE_NULL
) {
/* LRU scan over the register file. */
258 for (i
= 0; i
< 8; i
++)
259 if (cp
->xmm
[i
].last_used
< cp
->xmm
[oldest
].last_used
)
263 /* Need to write out the old value?
265 if (cp
->xmm
[oldest
].dirty
)
/* Must never evict a register the current instruction is still using. */
268 assert(cp
->xmm
[oldest
].last_used
!= cp
->insn_counter
);
/* Reset the slot's bookkeeping and hand it back. */
270 cp
->xmm
[oldest
].file
= TGSI_FILE_NULL
;
271 cp
->xmm
[oldest
].idx
= 0;
272 cp
->xmm
[oldest
].dirty
= 0;
273 cp
->xmm
[oldest
].last_used
= cp
->insn_counter
;
274 return x86_make_reg(file_XMM
, oldest
);
/* Forget any association between xmm slot 'idx' and a shader register:
 * clear file/idx/dirty and zero the LRU stamp so it becomes the first
 * eviction candidate.  Does NOT spill — callers spill first if needed.
 */
277 void aos_release_xmm_reg( struct aos_compilation
*cp
,
280 cp
->xmm
[idx
].file
= TGSI_FILE_NULL
;
281 cp
->xmm
[idx
].idx
= 0;
282 cp
->xmm
[idx
].dirty
= 0;
283 cp
->xmm
[idx
].last_used
= 0;
/* "Soft" release: if 'reg' is an xmm register, back-date its LRU stamp by
 * one so it may be reused by a later allocation in this instruction,
 * without clearing its file/idx association.  No-op for memory operands.
 */
287 static void aos_soft_release_xmm( struct aos_compilation
*cp
,
290 if (reg
.file
== file_XMM
) {
291 assert(cp
->xmm
[reg
.idx
].last_used
== cp
->insn_counter
);
292 cp
->xmm
[reg
.idx
].last_used
= cp
->insn_counter
- 1;
298 /* Mark an xmm reg as holding the current copy of a shader reg.
 *
 * Any other xmm slot that previously claimed (file, idx) is released,
 * with its dirty bit folded into the new owner so a pending writeback is
 * not lost.  Rejects non-xmm 'reg' arguments.
 * NOTE(review): the error-return for the non-xmm case and several braces
 * are missing from this extraction.
 */
300 void aos_adopt_xmm_reg( struct aos_compilation
*cp
,
308 if (reg
.file
!= file_XMM
) {
314 /* If any xmm reg thinks it holds this shader reg, break the
317 for (i
= 0; i
< 8; i
++) {
318 if (cp
->xmm
[i
].file
== file
&&
319 cp
->xmm
[i
].idx
== idx
)
321 /* If an xmm reg is already holding this shader reg, take into account its
324 dirty
|= cp
->xmm
[i
].dirty
;
325 aos_release_xmm_reg(cp
, i
);
/* Record the new ownership in the adopting slot. */
329 cp
->xmm
[reg
.idx
].file
= file
;
330 cp
->xmm
[reg
.idx
].idx
= idx
;
331 cp
->xmm
[reg
.idx
].dirty
= dirty
;
332 cp
->xmm
[reg
.idx
].last_used
= cp
->insn_counter
;
336 /* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
 *
 * Scans the xmm file for a dirty resident copy of (file, idx); the spill
 * call inside the loop appears to be missing from this extraction —
 * confirm against upstream.  Finally returns the memory reference.
 */
338 static struct x86_reg
aos_get_shader_reg_ptr( struct aos_compilation
*cp
,
344 /* Ensure the in-memory copy of this reg is up-to-date
346 for (i
= 0; i
< 8; i
++) {
347 if (cp
->xmm
[i
].file
== file
&&
348 cp
->xmm
[i
].idx
== idx
&&
354 return get_reg_ptr( cp
, file
, idx
);
358 /* As above, but return a pointer. Note - this pointer may alias
359 * those returned by get_arg_ptr().
 *
 * Destination variant: because the memory slot is about to be written
 * through the returned pointer, any xmm register caching (file, idx) is
 * released so it cannot shadow the new value.  The spill call for the
 * dirty case appears to be missing from this extraction.
 */
361 static struct x86_reg
get_dst_ptr( struct aos_compilation
*cp
,
362 const struct tgsi_full_dst_register
*dst
)
364 unsigned file
= dst
->DstRegister
.File
;
365 unsigned idx
= dst
->DstRegister
.Index
;
369 /* Ensure in-memory copy of this reg is up-to-date and invalidate
372 for (i
= 0; i
< 8; i
++) {
373 if (cp
->xmm
[i
].file
== file
&&
374 cp
->xmm
[i
].idx
== idx
)
376 if (cp
->xmm
[i
].dirty
)
379 aos_release_xmm_reg(cp
, i
);
383 return get_reg_ptr( cp
, file
, idx
);
390 /* Return an XMM reg if the argument is resident, otherwise return a
391 * base+offset pointer to the saved value.
 *
 * Resident hit refreshes the LRU stamp; miss falls through to the
 * in-memory reference built by get_reg_ptr().
 */
393 struct x86_reg
aos_get_shader_reg( struct aos_compilation
*cp
,
399 for (i
= 0; i
< 8; i
++) {
400 if (cp
->xmm
[i
].file
== file
&&
401 cp
->xmm
[i
].idx
== idx
)
403 cp
->xmm
[i
].last_used
= cp
->insn_counter
;
404 return x86_make_reg(file_XMM
, i
);
408 /* If not found in the XMM register file, return an indirect
409 * reference to the in-memory copy:
411 return get_reg_ptr( cp
, file
, idx
);
/* Fetch shader reg (file, idx) into an xmm register and adopt that
 * register as the current holder of the shader reg.
 * NOTE(review): the tail of the aos_adopt_xmm_reg() argument list and
 * the return statement are missing from this extraction.
 */
416 static struct x86_reg
aos_get_shader_reg_xmm( struct aos_compilation
*cp
,
420 struct x86_reg reg
= get_xmm( cp
,
421 aos_get_shader_reg( cp
, file
, idx
) );
423 aos_adopt_xmm_reg( cp
,
/* Convenience wrapper: load internal constant 'imm' (AOS_FILE_INTERNAL)
 * into a resident, adopted xmm register.
 */
434 struct x86_reg
aos_get_internal_xmm( struct aos_compilation
*cp
,
437 return aos_get_shader_reg_xmm( cp
, AOS_FILE_INTERNAL
, imm
);
/* Convenience wrapper: reference internal constant 'imm'
 * (AOS_FILE_INTERNAL) — xmm register if resident, else memory operand.
 */
441 struct x86_reg
aos_get_internal( struct aos_compilation
*cp
,
444 return aos_get_shader_reg( cp
, AOS_FILE_INTERNAL
, imm
);
451 /* Emulate pshufd insn in regular SSE, if necessary:
 *
 * Uses SSE2 pshufd when available, otherwise movaps + shufps with the
 * same shuffle immediate.  NOTE(review): the if/else testing SSE2
 * availability is missing from this extraction — the two paths below are
 * presumably its branches; confirm against upstream.
 */
453 static void emit_pshufd( struct aos_compilation
*cp
,
459 sse2_pshufd(cp
->func
, dst
, arg0
, shuf
);
463 sse_movaps(cp
->func
, dst
, arg0
);
465 sse_shufps(cp
->func
, dst
, dst
, shuf
);
469 /* load masks (pack into negs??)
470 * pshufd - shuffle according to writemask
475 static boolean
mask_write( struct aos_compilation
*cp
,
477 struct x86_reg result
,
480 struct x86_reg imm_swz
= aos_get_internal_xmm(cp
, IMM_SWZ
);
481 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
483 emit_pshufd(cp
, tmp
, imm_swz
,
484 SHUF((mask
& 1) ? 2 : 3,
487 (mask
& 8) ? 2 : 3));
489 sse_andps(cp
->func
, dst
, tmp
);
490 sse_andnps(cp
->func
, tmp
, result
);
491 sse_orps(cp
->func
, dst
, tmp
);
493 aos_release_xmm_reg(cp
, tmp
.idx
);
500 /* Helper for writemask:
502 static boolean
emit_shuf_copy2( struct aos_compilation
*cp
,
508 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
510 emit_pshufd(cp
, dst
, arg1
, shuf
);
511 emit_pshufd(cp
, tmp
, arg0
, shuf
);
512 sse_shufps(cp
->func
, dst
, tmp
, SHUF(X
, Y
, Z
, W
));
513 emit_pshufd(cp
, dst
, dst
, shuf
);
515 aos_release_xmm_reg(cp
, tmp
.idx
);
521 #define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
524 /* Locate a source register and perform any required (simple) swizzle.
526 * Just fail on complex swizzles at this point.
528 static struct x86_reg
fetch_src( struct aos_compilation
*cp
,
529 const struct tgsi_full_src_register
*src
)
531 struct x86_reg arg0
= aos_get_shader_reg(cp
,
532 src
->SrcRegister
.File
,
533 src
->SrcRegister
.Index
);
539 for (i
= 0; i
< 4; i
++) {
540 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( src
, i
);
541 unsigned neg
= tgsi_util_get_full_src_register_sign_mode( src
, i
);
544 case TGSI_EXTSWIZZLE_ZERO
:
545 case TGSI_EXTSWIZZLE_ONE
:
546 AOS_ERROR(cp
, "not supporting full swizzles yet in tgsi_aos_sse2");
550 swz
|= (swizzle
& 0x3) << (i
* 2);
555 case TGSI_UTIL_SIGN_TOGGLE
:
559 case TGSI_UTIL_SIGN_KEEP
:
562 case TGSI_UTIL_SIGN_CLEAR
:
567 AOS_ERROR(cp
, "unsupported sign-mode");
572 if (swz
!= SSE_SWIZZLE_NOOP
|| negs
!= 0 || abs
!= 0) {
573 struct x86_reg dst
= aos_get_xmm_reg(cp
);
575 if (swz
!= SSE_SWIZZLE_NOOP
)
576 emit_pshufd(cp
, dst
, arg0
, swz
);
578 sse_movaps(cp
->func
, dst
, arg0
);
580 if (negs
&& negs
!= 0xf) {
581 struct x86_reg imm_swz
= aos_get_internal_xmm(cp
, IMM_SWZ
);
582 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
585 * Use neg as arg to pshufd
588 emit_pshufd(cp
, tmp
, imm_swz
,
589 SHUF((negs
& 1) ? 1 : 0,
592 (negs
& 8) ? 1 : 0));
593 sse_mulps(cp
->func
, dst
, tmp
);
595 aos_release_xmm_reg(cp
, tmp
.idx
);
596 aos_soft_release_xmm(cp
, imm_swz
);
599 struct x86_reg imm_negs
= aos_get_internal_xmm(cp
, IMM_NEGS
);
600 sse_mulps(cp
->func
, dst
, imm_negs
);
601 aos_soft_release_xmm(cp
, imm_negs
);
605 if (abs
&& abs
!= 0xf) {
606 AOS_ERROR(cp
, "unsupported partial abs");
609 struct x86_reg neg
= aos_get_internal(cp
, IMM_NEGS
);
610 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
612 sse_movaps(cp
->func
, tmp
, dst
);
613 sse_mulps(cp
->func
, tmp
, neg
);
614 sse_maxps(cp
->func
, dst
, tmp
);
616 aos_release_xmm_reg(cp
, tmp
.idx
);
617 aos_soft_release_xmm(cp
, neg
);
620 aos_soft_release_xmm(cp
, arg0
);
627 static void x87_fld_src( struct aos_compilation
*cp
,
628 const struct tgsi_full_src_register
*src
,
631 struct x86_reg arg0
= aos_get_shader_reg_ptr(cp
,
632 src
->SrcRegister
.File
,
633 src
->SrcRegister
.Index
);
635 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( src
, channel
);
636 unsigned neg
= tgsi_util_get_full_src_register_sign_mode( src
, channel
);
639 case TGSI_EXTSWIZZLE_ZERO
:
640 x87_fldz( cp
->func
);
643 case TGSI_EXTSWIZZLE_ONE
:
644 x87_fld1( cp
->func
);
648 x87_fld( cp
->func
, x86_make_disp(arg0
, (swizzle
& 3) * sizeof(float)) );
654 case TGSI_UTIL_SIGN_TOGGLE
:
657 x87_fchs( cp
->func
);
660 case TGSI_UTIL_SIGN_KEEP
:
663 case TGSI_UTIL_SIGN_CLEAR
:
664 x87_fabs( cp
->func
);
667 case TGSI_UTIL_SIGN_SET
:
668 x87_fabs( cp
->func
);
669 x87_fchs( cp
->func
);
673 AOS_ERROR(cp
, "unsupported sign-mode");
683 /* Used to implement write masking. This and most of the other instructions
684 * here would be easier to implement if there had been a translation
685 * to a 2 argument format (dst/arg0, arg1) at the shader level before
686 * attempting to translate to x86/sse code.
688 static void store_dest( struct aos_compilation
*cp
,
689 const struct tgsi_full_dst_register
*reg
,
690 struct x86_reg result
)
694 switch (reg
->DstRegister
.WriteMask
) {
698 case TGSI_WRITEMASK_XYZW
:
699 aos_adopt_xmm_reg(cp
,
700 get_xmm_writable(cp
, result
),
701 reg
->DstRegister
.File
,
702 reg
->DstRegister
.Index
,
709 dst
= aos_get_shader_reg_xmm(cp
,
710 reg
->DstRegister
.File
,
711 reg
->DstRegister
.Index
);
713 switch (reg
->DstRegister
.WriteMask
) {
714 case TGSI_WRITEMASK_X
:
715 sse_movss(cp
->func
, dst
, get_xmm(cp
, result
));
718 case TGSI_WRITEMASK_ZW
:
719 sse_shufps(cp
->func
, dst
, get_xmm(cp
, result
), SHUF(X
, Y
, Z
, W
));
722 case TGSI_WRITEMASK_XY
:
723 result
= get_xmm_writable(cp
, result
);
724 sse_shufps(cp
->func
, result
, dst
, SHUF(X
, Y
, Z
, W
));
728 case TGSI_WRITEMASK_YZW
:
729 result
= get_xmm_writable(cp
, result
);
730 sse_movss(cp
->func
, result
, dst
);
735 mask_write(cp
, dst
, result
, reg
->DstRegister
.WriteMask
);
739 aos_adopt_xmm_reg(cp
,
741 reg
->DstRegister
.File
,
742 reg
->DstRegister
.Index
,
747 static void inject_scalar( struct aos_compilation
*cp
,
749 struct x86_reg result
,
752 sse_shufps(cp
->func
, dst
, dst
, swizzle
);
753 sse_movss(cp
->func
, dst
, result
);
754 sse_shufps(cp
->func
, dst
, dst
, swizzle
);
758 static void store_scalar_dest( struct aos_compilation
*cp
,
759 const struct tgsi_full_dst_register
*reg
,
760 struct x86_reg result
)
762 unsigned writemask
= reg
->DstRegister
.WriteMask
;
765 if (writemask
!= TGSI_WRITEMASK_X
&&
766 writemask
!= TGSI_WRITEMASK_Y
&&
767 writemask
!= TGSI_WRITEMASK_Z
&&
768 writemask
!= TGSI_WRITEMASK_W
&&
771 result
= get_xmm_writable(cp
, result
); /* already true, right? */
772 sse_shufps(cp
->func
, result
, result
, SHUF(X
,X
,X
,X
));
773 store_dest(cp
, reg
, result
);
777 result
= get_xmm(cp
, result
);
778 dst
= aos_get_shader_reg_xmm(cp
,
779 reg
->DstRegister
.File
,
780 reg
->DstRegister
.Index
);
784 switch (reg
->DstRegister
.WriteMask
) {
785 case TGSI_WRITEMASK_X
:
786 sse_movss(cp
->func
, dst
, result
);
789 case TGSI_WRITEMASK_Y
:
790 inject_scalar(cp
, dst
, result
, SHUF(Y
, X
, Z
, W
));
793 case TGSI_WRITEMASK_Z
:
794 inject_scalar(cp
, dst
, result
, SHUF(Z
, Y
, X
, W
));
797 case TGSI_WRITEMASK_W
:
798 inject_scalar(cp
, dst
, result
, SHUF(W
, Y
, Z
, X
));
805 aos_adopt_xmm_reg(cp
,
807 reg
->DstRegister
.File
,
808 reg
->DstRegister
.Index
,
814 static void x87_fst_or_nop( struct x86_function
*func
,
819 assert(ptr
.file
== file_REG32
);
820 if (writemask
& (1<<channel
))
821 x87_fst( func
, x86_make_disp(ptr
, channel
* sizeof(float)) );
824 static void x87_fstp_or_pop( struct x86_function
*func
,
829 assert(ptr
.file
== file_REG32
);
830 if (writemask
& (1<<channel
))
831 x87_fstp( func
, x86_make_disp(ptr
, channel
* sizeof(float)) );
833 x87_fstp( func
, x86_make_reg( file_x87
, 0 ));
840 static void x87_fstp_dest4( struct aos_compilation
*cp
,
841 const struct tgsi_full_dst_register
*dst
)
843 struct x86_reg ptr
= get_dst_ptr(cp
, dst
);
844 unsigned writemask
= dst
->DstRegister
.WriteMask
;
846 x87_fst_or_nop(cp
->func
, writemask
, 0, ptr
);
847 x87_fst_or_nop(cp
->func
, writemask
, 1, ptr
);
848 x87_fst_or_nop(cp
->func
, writemask
, 2, ptr
);
849 x87_fstp_or_pop(cp
->func
, writemask
, 3, ptr
);
852 /* Save current x87 state and put it into single precision mode.
 *
 * Emits fnstcw to stash the caller's control word in
 * aos_machine.fpu_restore for later restore_fpu_state().
 * NOTE(review): the part that actually switches precision mode is not
 * visible in this extraction.
 */
854 static void save_fpu_state( struct aos_compilation
*cp
)
856 x87_fnstcw( cp
->func
, x86_make_disp(cp
->machine_EDX
,
857 Offset(struct aos_machine
, fpu_restore
)));
/* Restore the control word saved by save_fpu_state(): clear pending
 * exceptions (fnclex) then reload aos_machine.fpu_restore via fldcw.
 */
860 static void restore_fpu_state( struct aos_compilation
*cp
)
862 x87_fnclex(cp
->func
);
863 x87_fldcw( cp
->func
, x86_make_disp(cp
->machine_EDX
,
864 Offset(struct aos_machine
, fpu_restore
)));
/* Switch the x87 rounding mode to round-toward-negative-infinity (for
 * floor-style rounding).  cp->fpucntl tracks the currently-loaded control
 * word so the fldcw is emitted only when the mode actually changes.
 */
867 static void set_fpu_round_neg_inf( struct aos_compilation
*cp
)
869 if (cp
->fpucntl
!= FPU_RND_NEG
) {
870 cp
->fpucntl
= FPU_RND_NEG
;
871 x87_fnclex(cp
->func
);
872 x87_fldcw( cp
->func
, x86_make_disp(cp
->machine_EDX
,
873 Offset(struct aos_machine
, fpu_rnd_neg_inf
)));
/* Switch the x87 rounding mode to round-to-nearest, mirroring
 * set_fpu_round_neg_inf(); no-op if that mode is already loaded.
 */
877 static void set_fpu_round_nearest( struct aos_compilation
*cp
)
879 if (cp
->fpucntl
!= FPU_RND_NEAREST
) {
880 cp
->fpucntl
= FPU_RND_NEAREST
;
881 x87_fnclex(cp
->func
);
882 x87_fldcw( cp
->func
, x86_make_disp(cp
->machine_EDX
,
883 Offset(struct aos_machine
, fpu_rnd_nearest
)));
/* Emit x87 code computing 2^st0 in place, via the classic
 * f2xm1/fscale decomposition: 2^a = 2^frac(a) * 2^int(a).
 * f2xm1 is only valid for |x| <= 1, hence the split.  The stack-depth
 * assert guards that the sequence is balanced.
 */
888 static void x87_emit_ex2( struct aos_compilation
*cp
)
890 struct x86_reg st0
= x86_make_reg(file_x87
, 0);
891 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
892 int stack
= cp
->func
->x87_stack
;
894 // set_fpu_round_neg_inf( cp );
896 x87_fld(cp
->func
, st0
); /* a a */
897 x87_fprndint( cp
->func
); /* int(a) a*/
898 x87_fsubr(cp
->func
, st1
, st0
); /* int(a) frc(a) */
899 x87_fxch(cp
->func
, st1
); /* frc(a) int(a) */
900 x87_f2xm1(cp
->func
); /* (2^frc(a))-1 int(a) */
901 x87_fld1(cp
->func
); /* 1 (2^frc(a))-1 int(a) */
902 x87_faddp(cp
->func
, st1
); /* 2^frac(a) int(a) */
903 x87_fscale(cp
->func
); /* (2^frac(a)*2^int(int(a))) int(a) */
905 x87_fstp(cp
->func
, st1
); /* 2^a */
907 assert( stack
== cp
->func
->x87_stack
);
913 static void PIPE_CDECL
print_reg( const char *msg
,
916 debug_printf("%s: %f %f %f %f\n", msg
, reg
[0], reg
[1], reg
[2], reg
[3]);
921 static void emit_print( struct aos_compilation
*cp
,
922 const char *message
, /* must point to a static string! */
926 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
927 struct x86_reg arg
= aos_get_shader_reg_ptr( cp
, file
, idx
);
930 /* There shouldn't be anything on the x87 stack. Can add this
931 * capacity later if need be.
933 assert(cp
->func
->x87_stack
== 0);
935 /* For absolute correctness, need to spill/invalidate all XMM regs
936 * too. We're obviously not concerned about performance on this
937 * debug path, so here goes:
939 for (i
= 0; i
< 8; i
++) {
940 if (cp
->xmm
[i
].dirty
)
943 aos_release_xmm_reg(cp
, i
);
946 /* Push caller-save (ie scratch) regs.
948 x86_cdecl_caller_push_regs( cp
->func
);
951 /* Push the arguments:
953 x86_lea( cp
->func
, ecx
, arg
);
954 x86_push( cp
->func
, ecx
);
955 x86_push_imm32( cp
->func
, (int)message
);
957 /* Call the helper. Could call debug_printf directly, but
958 * print_reg is a nice place to put a breakpoint if need be.
960 x86_mov_reg_imm( cp
->func
, ecx
, (int)print_reg
);
961 x86_call( cp
->func
, ecx
);
962 x86_pop( cp
->func
, ecx
);
963 x86_pop( cp
->func
, ecx
);
965 /* Pop caller-save regs
967 x86_cdecl_caller_pop_regs( cp
->func
);
975 * The traditional instructions. All operate on internal registers
976 * and ignore write masks and swizzling issues.
/* TGSI ABS: |arg0| computed as max(arg0, -arg0), using the IMM_NEGS
 * internal constant to negate (SSE has no packed-float abs here).
 * NOTE(review): the return statement and closing brace are missing from
 * this extraction.
 */
979 static boolean
emit_ABS( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
981 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
982 struct x86_reg neg
= aos_get_internal(cp
, IMM_NEGS
);
983 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
985 sse_movaps(cp
->func
, tmp
, arg0
);
986 sse_mulps(cp
->func
, tmp
, neg
);
987 sse_maxps(cp
->func
, tmp
, arg0
);
989 store_dest(cp
, &op
->FullDstRegisters
[0], tmp
);
/* TGSI ADD: dst = arg0 + arg1 (packed single addps), clobbering a
 * writable copy of arg0 and storing through the write mask.
 * NOTE(review): the return statement and closing brace are missing from
 * this extraction.
 */
993 static boolean
emit_ADD( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
995 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
996 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
997 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
999 sse_addps(cp
->func
, dst
, arg1
);
1001 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1005 static boolean
emit_COS( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1007 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0);
1009 x87_fstp_dest4(cp
, &op
->FullDstRegisters
[0]);
1013 /* The dotproduct instructions don't really do that well in sse:
1014 * XXX: produces wrong results -- disabled.
1016 static boolean
emit_DP3( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1018 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1019 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1020 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
1021 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1023 sse_mulps(cp
->func
, dst
, arg1
);
1024 /* Now the hard bit: sum the first 3 values:
1026 sse_movhlps(cp
->func
, tmp
, dst
);
1027 sse_addss(cp
->func
, dst
, tmp
); /* a*x+c*z, b*y, ?, ? */
1028 emit_pshufd(cp
, tmp
, dst
, SHUF(Y
,X
,W
,Z
));
1029 sse_addss(cp
->func
, dst
, tmp
);
1031 aos_release_xmm_reg(cp
, tmp
.idx
);
1032 store_scalar_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1036 static boolean
emit_DP4( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1038 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1039 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1040 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
1041 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1043 sse_mulps(cp
->func
, dst
, arg1
);
1045 /* Now the hard bit: sum the values:
1047 sse_movhlps(cp
->func
, tmp
, dst
);
1048 sse_addps(cp
->func
, dst
, tmp
); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
1049 emit_pshufd(cp
, tmp
, dst
, SHUF(Y
,X
,W
,Z
));
1050 sse_addss(cp
->func
, dst
, tmp
);
1052 aos_release_xmm_reg(cp
, tmp
.idx
);
1053 store_scalar_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1057 static boolean
emit_DPH( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1059 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1060 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1061 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
1062 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1064 sse_mulps(cp
->func
, dst
, arg1
);
1066 /* Now the hard bit: sum the values (from DP3):
1068 sse_movhlps(cp
->func
, tmp
, dst
);
1069 sse_addss(cp
->func
, dst
, tmp
); /* a*x+c*z, b*y, ?, ? */
1070 emit_pshufd(cp
, tmp
, dst
, SHUF(Y
,X
,W
,Z
));
1071 sse_addss(cp
->func
, dst
, tmp
);
1072 emit_pshufd(cp
, tmp
, arg1
, SHUF(W
,W
,W
,W
));
1073 sse_addss(cp
->func
, dst
, tmp
);
1075 aos_release_xmm_reg(cp
, tmp
.idx
);
1076 store_scalar_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1080 static boolean
emit_DST( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1082 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1083 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1084 struct x86_reg dst
= aos_get_xmm_reg(cp
);
1085 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
1086 struct x86_reg ones
= aos_get_internal(cp
, IMM_ONES
);
1088 /* dst[0] = 1.0 * 1.0F; */
1089 /* dst[1] = arg0[1] * arg1[1]; */
1090 /* dst[2] = arg0[2] * 1.0; */
1091 /* dst[3] = 1.0 * arg1[3]; */
1093 emit_shuf_copy2(cp
, dst
, arg0
, ones
, SHUF(X
,W
,Z
,Y
));
1094 emit_shuf_copy2(cp
, tmp
, arg1
, ones
, SHUF(X
,Z
,Y
,W
));
1095 sse_mulps(cp
->func
, dst
, tmp
);
1097 aos_release_xmm_reg(cp
, tmp
.idx
);
1098 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1102 static boolean
emit_LG2( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1104 x87_fld1(cp
->func
); /* 1 */
1105 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0); /* a0 1 */
1106 x87_fyl2x(cp
->func
); /* log2(a0) */
1107 x87_fstp_dest4(cp
, &op
->FullDstRegisters
[0]);
1112 static boolean
emit_EX2( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1114 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0);
1116 x87_fstp_dest4(cp
, &op
->FullDstRegisters
[0]);
1122 static boolean
emit_FLR( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1124 struct x86_reg dst
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1125 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1128 set_fpu_round_neg_inf( cp
);
1130 /* Load all sources first to avoid aliasing
1132 for (i
= 3; i
>= 0; i
--) {
1133 if (writemask
& (1<<i
)) {
1134 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], i
);
1138 for (i
= 0; i
< 4; i
++) {
1139 if (writemask
& (1<<i
)) {
1140 x87_fprndint( cp
->func
);
1141 x87_fstp(cp
->func
, x86_make_disp(dst
, i
*4));
1149 static boolean
emit_RND( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1151 struct x86_reg dst
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1152 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1155 set_fpu_round_nearest( cp
);
1157 /* Load all sources first to avoid aliasing
1159 for (i
= 3; i
>= 0; i
--) {
1160 if (writemask
& (1<<i
)) {
1161 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], i
);
1165 for (i
= 0; i
< 4; i
++) {
1166 if (writemask
& (1<<i
)) {
1167 x87_fprndint( cp
->func
);
1168 x87_fstp(cp
->func
, x86_make_disp(dst
, i
*4));
1176 static boolean
emit_FRC( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1178 struct x86_reg dst
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1179 struct x86_reg st0
= x86_make_reg(file_x87
, 0);
1180 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
1181 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1184 set_fpu_round_neg_inf( cp
);
1186 /* suck all the source values onto the stack before writing out any
1187 * dst, which may alias...
1189 for (i
= 3; i
>= 0; i
--) {
1190 if (writemask
& (1<<i
)) {
1191 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], i
);
1195 for (i
= 0; i
< 4; i
++) {
1196 if (writemask
& (1<<i
)) {
1197 x87_fld(cp
->func
, st0
); /* a a */
1198 x87_fprndint( cp
->func
); /* flr(a) a */
1199 x87_fsubp(cp
->func
, st1
); /* frc(a) */
1200 x87_fstp(cp
->func
, x86_make_disp(dst
, i
*4));
1212 static boolean
emit_LIT( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1214 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
1215 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1216 unsigned lit_count
= cp
->lit_count
++;
1217 struct x86_reg result
, arg0
;
1221 /* For absolute correctness, need to spill/invalidate all XMM regs
1224 for (i
= 0; i
< 8; i
++) {
1225 if (cp
->xmm
[i
].dirty
)
1227 aos_release_xmm_reg(cp
, i
);
1231 if (writemask
!= TGSI_WRITEMASK_XYZW
)
1232 result
= x86_make_disp(cp
->machine_EDX
, Offset(struct aos_machine
, tmp
[0]));
1234 result
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1237 arg0
= fetch_src( cp
, &op
->FullSrcRegisters
[0] );
1238 if (arg0
.file
== file_XMM
) {
1239 struct x86_reg tmp
= x86_make_disp(cp
->machine_EDX
,
1240 Offset(struct aos_machine
, tmp
[1]));
1241 sse_movaps( cp
->func
, tmp
, arg0
);
1247 /* Push caller-save (ie scratch) regs.
1249 x86_cdecl_caller_push_regs( cp
->func
);
1251 /* Push the arguments:
1253 x86_push_imm32( cp
->func
, lit_count
);
1255 x86_lea( cp
->func
, ecx
, arg0
);
1256 x86_push( cp
->func
, ecx
);
1258 x86_lea( cp
->func
, ecx
, result
);
1259 x86_push( cp
->func
, ecx
);
1261 x86_push( cp
->func
, cp
->machine_EDX
);
1263 if (lit_count
< MAX_LIT_INFO
) {
1264 x86_mov( cp
->func
, ecx
, x86_make_disp( cp
->machine_EDX
,
1265 Offset(struct aos_machine
, lit_info
) +
1266 lit_count
* sizeof(struct lit_info
) +
1267 Offset(struct lit_info
, func
)));
1270 x86_mov_reg_imm( cp
->func
, ecx
, (int)aos_do_lit
);
1273 x86_call( cp
->func
, ecx
);
1275 x86_pop( cp
->func
, ecx
); /* fixme... */
1276 x86_pop( cp
->func
, ecx
);
1277 x86_pop( cp
->func
, ecx
);
1278 x86_pop( cp
->func
, ecx
);
1280 x86_cdecl_caller_pop_regs( cp
->func
);
1282 if (writemask
!= TGSI_WRITEMASK_XYZW
) {
1284 &op
->FullDstRegisters
[0],
1285 get_xmm_writable( cp
, result
) );
1292 static boolean
emit_inline_LIT( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1294 struct x86_reg dst
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1295 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1297 if (writemask
& TGSI_WRITEMASK_YZ
) {
1298 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
1299 struct x86_reg st2
= x86_make_reg(file_x87
, 2);
1301 /* a1' = a1 <= 0 ? 1 : a1;
1303 x87_fldz(cp
->func
); /* 1 0 */
1305 x87_fld1(cp
->func
); /* 1 0 */
1307 /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
1309 x87_fldz(cp
->func
); /* 1 0 */
1311 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 1); /* a1 1 0 */
1312 x87_fcomi(cp
->func
, st2
); /* a1 1 0 */
1313 x87_fcmovb(cp
->func
, st1
); /* a1' 1 0 */
1314 x87_fstp(cp
->func
, st1
); /* a1' 0 */
1315 x87_fstp(cp
->func
, st1
); /* a1' */
1317 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 3); /* a3 a1' */
1318 x87_fxch(cp
->func
, st1
); /* a1' a3 */
1321 /* Compute pow(a1, a3)
1323 x87_fyl2x(cp
->func
); /* a3*log2(a1) */
1324 x87_emit_ex2( cp
); /* 2^(a3*log2(a1)) */
1327 /* a0' = max2(a0, 0):
1329 x87_fldz(cp
->func
); /* 0 r2 */
1330 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0); /* a0 0 r2 */
1331 x87_fcomi(cp
->func
, st1
);
1332 x87_fcmovb(cp
->func
, st1
); /* a0' 0 r2 */
1334 x87_fst_or_nop(cp
->func
, writemask
, 1, dst
); /* result[1] = a0' */
1336 x87_fcomi(cp
->func
, st1
); /* a0' 0 r2 */
1337 x87_fcmovnbe(cp
->func
, st2
); /* r2' 0' r2 */
1339 x87_fstp_or_pop(cp
->func
, writemask
, 2, dst
); /* 0 r2 */
1340 x87_fpop(cp
->func
); /* r2 */
1344 if (writemask
& TGSI_WRITEMASK_XW
) {
1346 x87_fst_or_nop(cp
->func
, writemask
, 0, dst
);
1347 x87_fstp_or_pop(cp
->func
, writemask
, 3, dst
);
1356 static boolean
emit_MAX( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1358 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1359 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1360 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1362 sse_maxps(cp
->func
, dst
, arg1
);
1364 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1369 static boolean
emit_MIN( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1371 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1372 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1373 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1375 sse_minps(cp
->func
, dst
, arg1
);
1377 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
/* TGSI MOV: copy arg0 to the destination; the swizzle/negate work is
 * done inside fetch_src(), so only the masked store remains.
 * NOTE(review): the return statement and closing brace are missing from
 * this extraction.
 */
1381 static boolean
emit_MOV( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1383 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1384 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1386 /* potentially nothing to do */
1388 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1392 static boolean
emit_MUL( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1394 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1395 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1396 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1398 sse_mulps(cp
->func
, dst
, arg1
);
1400 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1405 static boolean
emit_MAD( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1407 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1408 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1409 struct x86_reg arg2
= fetch_src(cp
, &op
->FullSrcRegisters
[2]);
1411 /* If we can't clobber old contents of arg0, get a temporary & copy
1412 * it there, then clobber it...
1414 arg0
= get_xmm_writable(cp
, arg0
);
1416 sse_mulps(cp
->func
, arg0
, arg1
);
1417 sse_addps(cp
->func
, arg0
, arg2
);
1418 store_dest(cp
, &op
->FullDstRegisters
[0], arg0
);
1424 /* A wrapper for powf().
1425 * Makes sure it is cdecl and operates on floats.
1427 static float PIPE_CDECL
_powerf( float x
, float y
)
1430 return util_fast_pow(x
, y
);
1432 return powf( x
, y
);
1437 static float PIPE_CDECL
_exp2(float x
)
1439 return util_fast_exp2(x
);
1444 /* Really not sufficient -- need to check for conditions that could
1445 * generate inf/nan values, which will slow things down hugely.
1447 static boolean
emit_POW( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1450 x87_fld_src(cp
, &op
->FullSrcRegisters
[1], 0); /* a1.x */
1451 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0); /* a0.x a1.x */
1452 x87_fyl2x(cp
->func
); /* a1*log2(a0) */
1454 x87_emit_ex2( cp
); /* 2^(a1*log2(a0)) */
1456 x87_fstp_dest4(cp
, &op
->FullDstRegisters
[0]);
1460 /* For absolute correctness, need to spill/invalidate all XMM regs
1463 for (i
= 0; i
< 8; i
++) {
1464 if (cp
->xmm
[i
].dirty
)
1466 aos_release_xmm_reg(cp
, i
);
1469 /* Push caller-save (ie scratch) regs.
1471 x86_cdecl_caller_push_regs( cp
->func
);
1473 x86_lea( cp
->func
, cp
->stack_ESP
, x86_make_disp(cp
->stack_ESP
, -8) );
1475 x87_fld_src( cp
, &op
->FullSrcRegisters
[1], 0 );
1476 x87_fstp( cp
->func
, x86_make_disp( cp
->stack_ESP
, 4 ) );
1477 x87_fld_src( cp
, &op
->FullSrcRegisters
[0], 0 );
1478 x87_fstp( cp
->func
, x86_make_disp( cp
->stack_ESP
, 0 ) );
1480 /* tmp_EAX has been pushed & will be restored below */
1481 x86_mov_reg_imm( cp
->func
, cp
->tmp_EAX
, (unsigned long) _powerf
);
1482 x86_call( cp
->func
, cp
->tmp_EAX
);
1484 x86_lea( cp
->func
, cp
->stack_ESP
, x86_make_disp(cp
->stack_ESP
, 8) );
1486 x86_cdecl_caller_pop_regs( cp
->func
);
1488 /* Note retval on x87 stack:
1490 cp
->func
->x87_stack
++;
1492 x87_fstp_dest4( cp
, &op
->FullDstRegisters
[0] );
1499 static boolean
emit_EXPBASE2( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1503 /* For absolute correctness, need to spill/invalidate all XMM regs
1506 for (i
= 0; i
< 8; i
++) {
1507 if (cp
->xmm
[i
].dirty
)
1509 aos_release_xmm_reg(cp
, i
);
1512 /* Push caller-save (ie scratch) regs.
1514 x86_cdecl_caller_push_regs( cp
->func
);
1516 x86_lea( cp
->func
, cp
->stack_ESP
, x86_make_disp(cp
->stack_ESP
, -4) );
1518 x87_fld_src( cp
, &op
->FullSrcRegisters
[0], 0 );
1519 x87_fstp( cp
->func
, x86_make_disp( cp
->stack_ESP
, 0 ) );
1521 /* tmp_EAX has been pushed & will be restored below */
1522 x86_mov_reg_imm( cp
->func
, cp
->tmp_EAX
, (unsigned long) _exp2
);
1523 x86_call( cp
->func
, cp
->tmp_EAX
);
1525 x86_lea( cp
->func
, cp
->stack_ESP
, x86_make_disp(cp
->stack_ESP
, 4) );
1527 x86_cdecl_caller_pop_regs( cp
->func
);
1529 /* Note retval on x87 stack:
1531 cp
->func
->x87_stack
++;
1533 x87_fstp_dest4( cp
, &op
->FullDstRegisters
[0] );
1540 static boolean
emit_RCP( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1542 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1543 struct x86_reg dst
= aos_get_xmm_reg(cp
);
1545 if (cp
->have_sse2
) {
1546 sse2_rcpss(cp
->func
, dst
, arg0
);
1547 /* extend precision here...
1551 struct x86_reg ones
= aos_get_internal(cp
, IMM_ONES
);
1552 sse_movss(cp
->func
, dst
, ones
);
1553 sse_divss(cp
->func
, dst
, arg0
);
1556 store_scalar_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1561 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1562 * implementations, it is possible to improve its precision at
1563 * fairly low cost, using a newton/raphson step, as below:
1565 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1566 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1568 * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
1571 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1573 static boolean
emit_RSQ( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1576 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1577 struct x86_reg r
= aos_get_xmm_reg(cp
);
1578 sse_rsqrtss(cp
->func
, r
, arg0
);
1579 store_scalar_dest(cp
, &op
->FullDstRegisters
[0], r
);
1583 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1584 struct x86_reg r
= aos_get_xmm_reg(cp
);
1586 struct x86_reg neg_half
= get_reg_ptr( cp
, AOS_FILE_INTERNAL
, IMM_RSQ
);
1587 struct x86_reg one_point_five
= x86_make_disp( neg_half
, 4 );
1588 struct x86_reg src
= get_xmm_writable( cp
, arg0
);
1589 struct x86_reg neg
= aos_get_internal(cp
, IMM_NEGS
);
1590 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
1592 sse_movaps(cp
->func
, tmp
, src
);
1593 sse_mulps(cp
->func
, tmp
, neg
);
1594 sse_maxps(cp
->func
, tmp
, src
);
1596 sse_rsqrtss( cp
->func
, r
, tmp
); /* rsqrtss(a) */
1597 sse_mulss( cp
->func
, tmp
, neg_half
); /* -.5 * a */
1598 sse_mulss( cp
->func
, tmp
, r
); /* -.5 * a * r */
1599 sse_mulss( cp
->func
, tmp
, r
); /* -.5 * a * r * r */
1600 sse_addss( cp
->func
, tmp
, one_point_five
); /* 1.5 - .5 * a * r * r */
1601 sse_mulss( cp
->func
, r
, tmp
); /* r * (1.5 - .5 * a * r * r) */
1603 store_scalar_dest(cp
, &op
->FullDstRegisters
[0], r
);
1605 aos_release_xmm_reg(cp
, tmp
.idx
);
1612 static boolean
emit_SGE( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1614 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1615 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1616 struct x86_reg ones
= aos_get_internal(cp
, IMM_ONES
);
1617 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1619 sse_cmpps(cp
->func
, dst
, arg1
, cc_NotLessThan
);
1620 sse_andps(cp
->func
, dst
, ones
);
1622 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1626 static boolean
emit_SIN( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1628 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0);
1630 x87_fstp_dest4(cp
, &op
->FullDstRegisters
[0]);
1636 static boolean
emit_SLT( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1638 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1639 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1640 struct x86_reg ones
= aos_get_internal(cp
, IMM_ONES
);
1641 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1643 sse_cmpps(cp
->func
, dst
, arg1
, cc_LessThan
);
1644 sse_andps(cp
->func
, dst
, ones
);
1646 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1650 static boolean
emit_SUB( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1652 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1653 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1654 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1656 sse_subps(cp
->func
, dst
, arg1
);
1658 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1662 static boolean
emit_TRUNC( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1664 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1665 struct x86_reg tmp0
= aos_get_xmm_reg(cp
);
1667 sse2_cvttps2dq(cp
->func
, tmp0
, arg0
);
1668 sse2_cvtdq2ps(cp
->func
, tmp0
, tmp0
);
1670 store_dest(cp
, &op
->FullDstRegisters
[0], tmp0
);
1674 static boolean
emit_XPD( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1676 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1677 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1678 struct x86_reg tmp0
= aos_get_xmm_reg(cp
);
1679 struct x86_reg tmp1
= aos_get_xmm_reg(cp
);
1681 emit_pshufd(cp
, tmp1
, arg1
, SHUF(Y
, Z
, X
, W
));
1682 sse_mulps(cp
->func
, tmp1
, arg0
);
1683 emit_pshufd(cp
, tmp0
, arg0
, SHUF(Y
, Z
, X
, W
));
1684 sse_mulps(cp
->func
, tmp0
, arg1
);
1685 sse_subps(cp
->func
, tmp1
, tmp0
);
1686 sse_shufps(cp
->func
, tmp1
, tmp1
, SHUF(Y
, Z
, X
, W
));
1688 /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
1689 /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
1690 /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
1691 /* dst[3] is undef */
1694 aos_release_xmm_reg(cp
, tmp0
.idx
);
1695 store_dest(cp
, &op
->FullDstRegisters
[0], tmp1
);
1702 emit_instruction( struct aos_compilation
*cp
,
1703 struct tgsi_full_instruction
*inst
)
1705 x87_assert_stack_empty(cp
->func
);
1707 switch( inst
->Instruction
.Opcode
) {
1708 case TGSI_OPCODE_MOV
:
1709 return emit_MOV( cp
, inst
);
1711 case TGSI_OPCODE_LIT
:
1712 return emit_LIT(cp
, inst
);
1714 case TGSI_OPCODE_RCP
:
1715 return emit_RCP(cp
, inst
);
1717 case TGSI_OPCODE_RSQ
:
1718 return emit_RSQ(cp
, inst
);
1720 case TGSI_OPCODE_EXP
:
1721 /*return emit_EXP(cp, inst);*/
1724 case TGSI_OPCODE_LOG
:
1725 /*return emit_LOG(cp, inst);*/
1728 case TGSI_OPCODE_MUL
:
1729 return emit_MUL(cp
, inst
);
1731 case TGSI_OPCODE_ADD
:
1732 return emit_ADD(cp
, inst
);
1734 case TGSI_OPCODE_DP3
:
1735 return emit_DP3(cp
, inst
);
1737 case TGSI_OPCODE_DP4
:
1738 return emit_DP4(cp
, inst
);
1740 case TGSI_OPCODE_DST
:
1741 return emit_DST(cp
, inst
);
1743 case TGSI_OPCODE_MIN
:
1744 return emit_MIN(cp
, inst
);
1746 case TGSI_OPCODE_MAX
:
1747 return emit_MAX(cp
, inst
);
1749 case TGSI_OPCODE_SLT
:
1750 return emit_SLT(cp
, inst
);
1752 case TGSI_OPCODE_SGE
:
1753 return emit_SGE(cp
, inst
);
1755 case TGSI_OPCODE_MAD
:
1756 return emit_MAD(cp
, inst
);
1758 case TGSI_OPCODE_SUB
:
1759 return emit_SUB(cp
, inst
);
1761 case TGSI_OPCODE_LERP
:
1762 // return emit_LERP(cp, inst);
1765 case TGSI_OPCODE_FRAC
:
1766 return emit_FRC(cp
, inst
);
1768 case TGSI_OPCODE_CLAMP
:
1769 // return emit_CLAMP(cp, inst);
1772 case TGSI_OPCODE_FLOOR
:
1773 return emit_FLR(cp
, inst
);
1775 case TGSI_OPCODE_ROUND
:
1776 return emit_RND(cp
, inst
);
1778 case TGSI_OPCODE_EXPBASE2
:
1780 return emit_EXPBASE2(cp
, inst
);
1782 /* this seems to fail for "larger" exponents.
1783 * See glean tvertProg1's EX2 test.
1785 return emit_EX2(cp
, inst
);
1790 case TGSI_OPCODE_LOGBASE2
:
1791 return emit_LG2(cp
, inst
);
1793 case TGSI_OPCODE_POWER
:
1794 return emit_POW(cp
, inst
);
1796 case TGSI_OPCODE_CROSSPRODUCT
:
1797 return emit_XPD(cp
, inst
);
1799 case TGSI_OPCODE_ABS
:
1800 return emit_ABS(cp
, inst
);
1802 case TGSI_OPCODE_DPH
:
1803 return emit_DPH(cp
, inst
);
1805 case TGSI_OPCODE_COS
:
1806 return emit_COS(cp
, inst
);
1808 case TGSI_OPCODE_SIN
:
1809 return emit_SIN(cp
, inst
);
1811 case TGSI_OPCODE_TRUNC
:
1812 return emit_TRUNC(cp
, inst
);
1814 case TGSI_OPCODE_END
:
1823 static boolean
emit_viewport( struct aos_compilation
*cp
)
1825 struct x86_reg pos
= aos_get_shader_reg_xmm(cp
,
1827 cp
->vaos
->draw
->vs
.position_output
);
1829 struct x86_reg scale
= x86_make_disp(cp
->machine_EDX
,
1830 Offset(struct aos_machine
, scale
));
1832 struct x86_reg translate
= x86_make_disp(cp
->machine_EDX
,
1833 Offset(struct aos_machine
, translate
));
1835 sse_mulps(cp
->func
, pos
, scale
);
1836 sse_addps(cp
->func
, pos
, translate
);
1838 aos_adopt_xmm_reg( cp
,
1841 cp
->vaos
->draw
->vs
.position_output
,
1847 /* This is useful to be able to see the results on softpipe. Doesn't
1848 * do proper clipping, just assumes the backend can do it during
1849 * rasterization -- for debug only...
1851 static boolean
emit_rhw_viewport( struct aos_compilation
*cp
)
1853 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
1854 struct x86_reg pos
= aos_get_shader_reg_xmm(cp
,
1856 cp
->vaos
->draw
->vs
.position_output
);
1858 struct x86_reg scale
= x86_make_disp(cp
->machine_EDX
,
1859 Offset(struct aos_machine
, scale
));
1861 struct x86_reg translate
= x86_make_disp(cp
->machine_EDX
,
1862 Offset(struct aos_machine
, translate
));
1866 emit_pshufd(cp
, tmp
, pos
, SHUF(W
, W
, W
, W
));
1867 sse2_rcpss(cp
->func
, tmp
, tmp
);
1868 sse_shufps(cp
->func
, tmp
, tmp
, SHUF(X
, X
, X
, X
));
1870 sse_mulps(cp
->func
, pos
, scale
);
1871 sse_mulps(cp
->func
, pos
, tmp
);
1872 sse_addps(cp
->func
, pos
, translate
);
1876 mask_write(cp
, pos
, tmp
, TGSI_WRITEMASK_W
);
1878 aos_adopt_xmm_reg( cp
,
1881 cp
->vaos
->draw
->vs
.position_output
,
1888 static boolean
note_immediate( struct aos_compilation
*cp
,
1889 struct tgsi_full_immediate
*imm
)
1891 unsigned pos
= cp
->num_immediates
++;
1894 for (j
= 0; j
< imm
->Immediate
.NrTokens
- 1; j
++) {
1895 cp
->vaos
->machine
->immediate
[pos
][j
] = imm
->u
.ImmediateFloat32
[j
].Float
;
1905 static void find_last_write_outputs( struct aos_compilation
*cp
)
1907 struct tgsi_parse_context parse
;
1908 unsigned this_instruction
= 0;
1911 tgsi_parse_init( &parse
, cp
->vaos
->base
.vs
->state
.tokens
);
1913 while (!tgsi_parse_end_of_tokens( &parse
)) {
1915 tgsi_parse_token( &parse
);
1917 if (parse
.FullToken
.Token
.Type
!= TGSI_TOKEN_TYPE_INSTRUCTION
)
1920 for (i
= 0; i
< TGSI_FULL_MAX_DST_REGISTERS
; i
++) {
1921 if (parse
.FullToken
.FullInstruction
.FullDstRegisters
[i
].DstRegister
.File
==
1924 unsigned idx
= parse
.FullToken
.FullInstruction
.FullDstRegisters
[i
].DstRegister
.Index
;
1925 cp
->output_last_write
[idx
] = this_instruction
;
1932 tgsi_parse_free( &parse
);
/* cdecl argument slots of the generated function: */
#define ARG_MACHINE    1
#define ARG_START_ELTS 2
#define ARG_COUNT      3   /* restored: referenced by build_vertex_program() */
#define ARG_OUTBUF     4
1942 static boolean
build_vertex_program( struct draw_vs_varient_aos_sse
*varient
,
1945 struct tgsi_parse_context parse
;
1946 struct aos_compilation cp
;
1947 unsigned fixup
, label
;
1951 tgsi_parse_init( &parse
, varient
->base
.vs
->state
.tokens
);
1953 memset(&cp
, 0, sizeof(cp
));
1955 cp
.insn_counter
= 1;
1958 cp
.func
= &varient
->func
[ linear
? 0 : 1 ];
1960 cp
.tmp_EAX
= x86_make_reg(file_REG32
, reg_AX
);
1961 cp
.idx_EBX
= x86_make_reg(file_REG32
, reg_BX
);
1962 cp
.outbuf_ECX
= x86_make_reg(file_REG32
, reg_CX
);
1963 cp
.machine_EDX
= x86_make_reg(file_REG32
, reg_DX
);
1964 cp
.count_ESI
= x86_make_reg(file_REG32
, reg_SI
);
1965 cp
.temp_EBP
= x86_make_reg(file_REG32
, reg_BP
);
1966 cp
.stack_ESP
= x86_make_reg( file_REG32
, reg_SP
);
1968 x86_init_func(cp
.func
);
1970 find_last_write_outputs(&cp
);
1972 x86_push(cp
.func
, cp
.idx_EBX
);
1973 x86_push(cp
.func
, cp
.count_ESI
);
1974 x86_push(cp
.func
, cp
.temp_EBP
);
1977 /* Load arguments into regs:
1979 x86_mov(cp
.func
, cp
.machine_EDX
, x86_fn_arg(cp
.func
, ARG_MACHINE
));
1980 x86_mov(cp
.func
, cp
.idx_EBX
, x86_fn_arg(cp
.func
, ARG_START_ELTS
));
1981 x86_mov(cp
.func
, cp
.count_ESI
, x86_fn_arg(cp
.func
, ARG_COUNT
));
1982 x86_mov(cp
.func
, cp
.outbuf_ECX
, x86_fn_arg(cp
.func
, ARG_OUTBUF
));
1985 /* Compare count to zero and possibly bail.
1987 x86_xor(cp
.func
, cp
.tmp_EAX
, cp
.tmp_EAX
);
1988 x86_cmp(cp
.func
, cp
.count_ESI
, cp
.tmp_EAX
);
1989 fixup
= x86_jcc_forward(cp
.func
, cc_E
);
1992 save_fpu_state( &cp
);
1993 set_fpu_round_nearest( &cp
);
1995 aos_init_inputs( &cp
, linear
);
2000 /* Note address for loop jump
2002 label
= x86_get_label(cp
.func
);
2004 /* Fetch inputs... TODO: fetch lazily...
2006 if (!aos_fetch_inputs( &cp
, linear
))
2011 while( !tgsi_parse_end_of_tokens( &parse
) && !cp
.error
)
2013 tgsi_parse_token( &parse
);
2015 switch (parse
.FullToken
.Token
.Type
) {
2016 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2018 if (!note_immediate( &cp
, &parse
.FullToken
.FullImmediate
))
2023 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2025 tgsi_dump_instruction( &parse
.FullToken
.FullInstruction
, cp
.insn_counter
);
2027 if (!emit_instruction( &cp
, &parse
.FullToken
.FullInstruction
))
2032 x87_assert_stack_empty(cp
.func
);
2042 for (i
= 0; i
< 8; i
++) {
2043 if (cp
.xmm
[i
].file
!= TGSI_FILE_OUTPUT
) {
2044 cp
.xmm
[i
].file
= TGSI_FILE_NULL
;
2045 cp
.xmm
[i
].dirty
= 0;
2053 if (cp
.vaos
->base
.key
.clip
) {
2054 /* not really handling clipping, just do the rhw so we can
2055 * see the results...
2057 emit_rhw_viewport(&cp
);
2059 else if (cp
.vaos
->base
.key
.viewport
) {
2063 /* Emit output... TODO: do this eagerly after the last write to a
2066 if (!aos_emit_outputs( &cp
))
2074 x86_make_disp(cp
.outbuf_ECX
,
2075 cp
.vaos
->base
.key
.output_stride
));
2079 aos_incr_inputs( &cp
, linear
);
2081 /* decr count, loop if not zero
2083 x86_dec(cp
.func
, cp
.count_ESI
);
2084 x86_jcc(cp
.func
, cc_NZ
, label
);
2086 restore_fpu_state(&cp
);
2088 /* Land forward jump here:
2090 x86_fixup_fwd_jump(cp
.func
, fixup
);
2094 if (cp
.func
->need_emms
)
2097 x86_pop(cp
.func
, cp
.temp_EBP
);
2098 x86_pop(cp
.func
, cp
.count_ESI
);
2099 x86_pop(cp
.func
, cp
.idx_EBX
);
2101 x87_assert_stack_empty(cp
.func
);
2104 tgsi_parse_free( &parse
);
2108 tgsi_parse_free( &parse
);
2114 static void vaos_set_buffer( struct draw_vs_varient
*varient
,
2119 struct draw_vs_varient_aos_sse
*vaos
= (struct draw_vs_varient_aos_sse
*)varient
;
2121 if (buf
< vaos
->nr_vb
) {
2122 vaos
->buffer
[buf
].base_ptr
= (char *)ptr
;
2123 vaos
->buffer
[buf
].stride
= stride
;
2126 if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__
, buf
, vaos
->nr_vb
, ptr
, stride
);
2131 static void PIPE_CDECL
vaos_run_elts( struct draw_vs_varient
*varient
,
2132 const unsigned *elts
,
2134 void *output_buffer
)
2136 struct draw_vs_varient_aos_sse
*vaos
= (struct draw_vs_varient_aos_sse
*)varient
;
2137 struct aos_machine
*machine
= vaos
->draw
->vs
.aos_machine
;
2139 if (0) debug_printf("%s %d\n", __FUNCTION__
, count
);
2141 machine
->internal
[IMM_PSIZE
][0] = vaos
->draw
->rasterizer
->point_size
;
2142 machine
->constants
= vaos
->draw
->vs
.aligned_constants
;
2143 machine
->immediates
= vaos
->base
.vs
->immediates
;
2144 machine
->buffer
= vaos
->buffer
;
2146 vaos
->gen_run_elts( machine
,
2152 static void PIPE_CDECL
vaos_run_linear( struct draw_vs_varient
*varient
,
2155 void *output_buffer
)
2157 struct draw_vs_varient_aos_sse
*vaos
= (struct draw_vs_varient_aos_sse
*)varient
;
2158 struct aos_machine
*machine
= vaos
->draw
->vs
.aos_machine
;
2160 if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__
, start
, count
,
2161 vaos
->base
.key
.const_vbuffers
);
2163 machine
->internal
[IMM_PSIZE
][0] = vaos
->draw
->rasterizer
->point_size
;
2164 machine
->constants
= vaos
->draw
->vs
.aligned_constants
;
2165 machine
->immediates
= vaos
->base
.vs
->immediates
;
2166 machine
->buffer
= vaos
->buffer
;
2168 vaos
->gen_run_linear( machine
,
2173 /* Sanity spot checks to make sure we didn't trash our constants */
2174 assert(machine
->internal
[IMM_ONES
][0] == 1.0f
);
2175 assert(machine
->internal
[IMM_IDENTITY
][0] == 0.0f
);
2176 assert(machine
->internal
[IMM_NEGS
][0] == -1.0f
);
2181 static void vaos_destroy( struct draw_vs_varient
*varient
)
2183 struct draw_vs_varient_aos_sse
*vaos
= (struct draw_vs_varient_aos_sse
*)varient
;
2185 FREE( vaos
->buffer
);
2187 x86_release_func( &vaos
->func
[0] );
2188 x86_release_func( &vaos
->func
[1] );
2195 static struct draw_vs_varient
*varient_aos_sse( struct draw_vertex_shader
*vs
,
2196 const struct draw_vs_varient_key
*key
)
2199 struct draw_vs_varient_aos_sse
*vaos
= CALLOC_STRUCT(draw_vs_varient_aos_sse
);
2204 vaos
->base
.key
= *key
;
2206 vaos
->base
.set_buffer
= vaos_set_buffer
;
2207 vaos
->base
.destroy
= vaos_destroy
;
2208 vaos
->base
.run_linear
= vaos_run_linear
;
2209 vaos
->base
.run_elts
= vaos_run_elts
;
2211 vaos
->draw
= vs
->draw
;
2213 for (i
= 0; i
< key
->nr_inputs
; i
++)
2214 vaos
->nr_vb
= MAX2( vaos
->nr_vb
, key
->element
[i
].in
.buffer
+ 1 );
2216 vaos
->buffer
= MALLOC( vaos
->nr_vb
* sizeof(vaos
->buffer
[0]) );
2221 debug_printf("nr_vb: %d const: %x\n", vaos
->nr_vb
, vaos
->base
.key
.const_vbuffers
);
2224 tgsi_dump(vs
->state
.tokens
, 0);
2227 if (!build_vertex_program( vaos
, TRUE
))
2230 if (!build_vertex_program( vaos
, FALSE
))
2233 vaos
->gen_run_linear
= (vaos_run_linear_func
)x86_get_func(&vaos
->func
[0]);
2234 if (!vaos
->gen_run_linear
)
2237 vaos
->gen_run_elts
= (vaos_run_elts_func
)x86_get_func(&vaos
->func
[1]);
2238 if (!vaos
->gen_run_elts
)
2244 if (vaos
&& vaos
->buffer
)
2248 x86_release_func( &vaos
->func
[0] );
2251 x86_release_func( &vaos
->func
[1] );
2259 struct draw_vs_varient
*draw_vs_varient_aos_sse( struct draw_vertex_shader
*vs
,
2260 const struct draw_vs_varient_key
*key
)
2262 struct draw_vs_varient
*varient
= varient_aos_sse( vs
, key
);
2264 if (varient
== NULL
) {
2265 varient
= draw_vs_varient_generic( vs
, key
);
2273 #endif /* PIPE_ARCH_X86 */