2 * Mesa 3-D graphics library
5 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
27 * using the rtasm runtime assembler. Based on the old
28 * t_vb_arb_program_sse.c
32 #include "pipe/p_util.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "tgsi/util/tgsi_parse.h"
35 #include "tgsi/util/tgsi_util.h"
36 #include "tgsi/exec/tgsi_exec.h"
37 #include "tgsi/util/tgsi_dump.h"
40 #include "draw_vs_aos.h"
42 #include "rtasm/rtasm_x86sse.h"
47 static const char *files
[] =
60 static INLINE boolean
eq( struct x86_reg a
,
63 return (a
.file
== b
.file
&&
69 struct x86_reg
aos_get_x86( struct aos_compilation
*cp
,
72 if (cp
->ebp
!= value
) {
77 offset
= Offset(struct aos_machine
, immediates
);
80 offset
= Offset(struct aos_machine
, constants
);
83 offset
= Offset(struct aos_machine
, attrib
);
90 x86_mov(cp
->func
, cp
->temp_EBP
,
91 x86_make_disp(cp
->machine_EDX
, offset
));
92 /* x86_deref(x86_make_disp(cp->machine_EDX, offset))); */
101 static struct x86_reg
get_reg_ptr(struct aos_compilation
*cp
,
105 struct x86_reg ptr
= cp
->machine_EDX
;
108 case TGSI_FILE_INPUT
:
109 return x86_make_disp(ptr
, Offset(struct aos_machine
, input
[idx
]));
111 case TGSI_FILE_OUTPUT
:
112 return x86_make_disp(ptr
, Offset(struct aos_machine
, output
[idx
]));
114 case TGSI_FILE_TEMPORARY
:
115 return x86_make_disp(ptr
, Offset(struct aos_machine
, temp
[idx
]));
117 case AOS_FILE_INTERNAL
:
118 return x86_make_disp(ptr
, Offset(struct aos_machine
, internal
[idx
]));
120 case TGSI_FILE_IMMEDIATE
:
121 return x86_make_disp(aos_get_x86(cp
, X86_IMMEDIATES
), idx
* 4 * sizeof(float));
123 case TGSI_FILE_CONSTANT
:
124 return x86_make_disp(aos_get_x86(cp
, X86_CONSTANTS
), idx
* 4 * sizeof(float));
127 ERROR(cp
, "unknown reg file");
128 return x86_make_reg(0,0);
134 #define X87_CW_EXCEPTION_INV_OP (1<<0)
135 #define X87_CW_EXCEPTION_DENORM_OP (1<<1)
136 #define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2)
137 #define X87_CW_EXCEPTION_OVERFLOW (1<<3)
138 #define X87_CW_EXCEPTION_UNDERFLOW (1<<4)
139 #define X87_CW_EXCEPTION_PRECISION (1<<5)
140 #define X87_CW_PRECISION_SINGLE (0<<8)
141 #define X87_CW_PRECISION_RESERVED (1<<8)
142 #define X87_CW_PRECISION_DOUBLE (2<<8)
143 #define X87_CW_PRECISION_DOUBLE_EXT (3<<8)
144 #define X87_CW_PRECISION_MASK (3<<8)
145 #define X87_CW_ROUND_NEAREST (0<<10)
146 #define X87_CW_ROUND_DOWN (1<<10)
147 #define X87_CW_ROUND_UP (2<<10)
148 #define X87_CW_ROUND_ZERO (3<<10)
149 #define X87_CW_ROUND_MASK (3<<10)
150 #define X87_CW_INFINITY (1<<12)
155 static void spill( struct aos_compilation
*cp
, unsigned idx
)
157 if (!cp
->xmm
[idx
].dirty
||
158 (cp
->xmm
[idx
].file
!= TGSI_FILE_INPUT
&& /* inputs are fetched into xmm & set dirty */
159 cp
->xmm
[idx
].file
!= TGSI_FILE_OUTPUT
&&
160 cp
->xmm
[idx
].file
!= TGSI_FILE_TEMPORARY
)) {
161 ERROR(cp
, "invalid spill");
165 struct x86_reg oldval
= get_reg_ptr(cp
,
169 if (0) debug_printf("\nspill %s[%d]",
170 files
[cp
->xmm
[idx
].file
],
173 assert(cp
->xmm
[idx
].dirty
);
174 sse_movaps(cp
->func
, oldval
, x86_make_reg(file_XMM
, idx
));
175 cp
->xmm
[idx
].dirty
= 0;
180 static struct x86_reg
get_xmm_writable( struct aos_compilation
*cp
,
183 if (reg
.file
!= file_XMM
||
184 cp
->xmm
[reg
.idx
].file
!= TGSI_FILE_NULL
)
186 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
187 sse_movaps(cp
->func
, tmp
, reg
);
191 cp
->xmm
[reg
.idx
].last_used
= cp
->insn_counter
;
195 static struct x86_reg
get_xmm( struct aos_compilation
*cp
,
198 if (reg
.file
!= file_XMM
)
200 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
201 sse_movaps(cp
->func
, tmp
, reg
);
205 cp
->xmm
[reg
.idx
].last_used
= cp
->insn_counter
;
210 /* Allocate an empty xmm register, either as a temporary or later to
211 * "adopt" as a shader reg.
213 struct x86_reg
aos_get_xmm_reg( struct aos_compilation
*cp
)
217 boolean found
= FALSE
;
219 for (i
= 0; i
< 8; i
++)
220 if (cp
->xmm
[i
].last_used
!= cp
->insn_counter
&&
221 cp
->xmm
[i
].file
== TGSI_FILE_NULL
) {
227 for (i
= 0; i
< 8; i
++)
228 if (cp
->xmm
[i
].last_used
< cp
->xmm
[oldest
].last_used
)
232 /* Need to write out the old value?
234 if (cp
->xmm
[oldest
].dirty
)
237 assert(cp
->xmm
[oldest
].last_used
!= cp
->insn_counter
);
239 cp
->xmm
[oldest
].file
= TGSI_FILE_NULL
;
240 cp
->xmm
[oldest
].idx
= 0;
241 cp
->xmm
[oldest
].dirty
= 0;
242 cp
->xmm
[oldest
].last_used
= cp
->insn_counter
;
243 return x86_make_reg(file_XMM
, oldest
);
246 void aos_release_xmm_reg( struct aos_compilation
*cp
,
249 cp
->xmm
[idx
].file
= TGSI_FILE_NULL
;
250 cp
->xmm
[idx
].idx
= 0;
251 cp
->xmm
[idx
].dirty
= 0;
252 cp
->xmm
[idx
].last_used
= 0;
258 /* Mark an xmm reg as holding the current copy of a shader reg.
260 void aos_adopt_xmm_reg( struct aos_compilation
*cp
,
268 if (reg
.file
!= file_XMM
) {
274 /* If any xmm reg thinks it holds this shader reg, break the
277 for (i
= 0; i
< 8; i
++) {
278 if (cp
->xmm
[i
].file
== file
&&
279 cp
->xmm
[i
].idx
== idx
)
281 /* If an xmm reg is already holding this shader reg, take into account its
284 dirty
|= cp
->xmm
[i
].dirty
;
285 aos_release_xmm_reg(cp
, i
);
289 cp
->xmm
[reg
.idx
].file
= file
;
290 cp
->xmm
[reg
.idx
].idx
= idx
;
291 cp
->xmm
[reg
.idx
].dirty
= dirty
;
292 cp
->xmm
[reg
.idx
].last_used
= cp
->insn_counter
;
296 /* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
298 static struct x86_reg
aos_get_shader_reg_ptr( struct aos_compilation
*cp
,
304 /* Ensure the in-memory copy of this reg is up-to-date
306 for (i
= 0; i
< 8; i
++) {
307 if (cp
->xmm
[i
].file
== file
&&
308 cp
->xmm
[i
].idx
== idx
&&
314 return get_reg_ptr( cp
, file
, idx
);
318 /* As above, but return a pointer. Note - this pointer may alias
319 * those returned by get_arg_ptr().
321 static struct x86_reg
get_dst_ptr( struct aos_compilation
*cp
,
322 const struct tgsi_full_dst_register
*dst
)
324 unsigned file
= dst
->DstRegister
.File
;
325 unsigned idx
= dst
->DstRegister
.Index
;
329 /* Ensure in-memory copy of this reg is up-to-date and invalidate
332 for (i
= 0; i
< 8; i
++) {
333 if (cp
->xmm
[i
].file
== file
&&
334 cp
->xmm
[i
].idx
== idx
)
336 if (cp
->xmm
[i
].dirty
)
339 aos_release_xmm_reg(cp
, i
);
343 return get_reg_ptr( cp
, file
, idx
);
350 /* Return an XMM reg if the argument is resident, otherwise return a
351 * base+offset pointer to the saved value.
353 struct x86_reg
aos_get_shader_reg( struct aos_compilation
*cp
,
359 for (i
= 0; i
< 8; i
++) {
360 if (cp
->xmm
[i
].file
== file
&&
361 cp
->xmm
[i
].idx
== idx
)
363 cp
->xmm
[i
].last_used
= cp
->insn_counter
;
364 return x86_make_reg(file_XMM
, i
);
368 /* If not found in the XMM register file, return an indirect
369 * reference to the in-memory copy:
371 return get_reg_ptr( cp
, file
, idx
);
376 static struct x86_reg
aos_get_shader_reg_xmm( struct aos_compilation
*cp
,
380 struct x86_reg reg
= get_xmm( cp
,
381 aos_get_shader_reg( cp
, file
, idx
) );
383 aos_adopt_xmm_reg( cp
,
394 struct x86_reg
aos_get_internal_xmm( struct aos_compilation
*cp
,
397 return aos_get_shader_reg_xmm( cp
, AOS_FILE_INTERNAL
, imm
);
401 struct x86_reg
aos_get_internal( struct aos_compilation
*cp
,
404 return aos_get_shader_reg( cp
, AOS_FILE_INTERNAL
, imm
);
411 /* Emulate pshufd insn in regular SSE, if necessary:
413 static void emit_pshufd( struct aos_compilation
*cp
,
419 sse2_pshufd(cp
->func
, dst
, arg0
, shuf
);
423 sse_movaps(cp
->func
, dst
, arg0
);
425 sse_shufps(cp
->func
, dst
, dst
, shuf
);
429 /* load masks (pack into negs??)
430 * pshufd - shuffle according to writemask
435 static boolean
mask_write( struct aos_compilation
*cp
,
437 struct x86_reg result
,
440 struct x86_reg imm_swz
= aos_get_internal_xmm(cp
, IMM_SWZ
);
441 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
443 emit_pshufd(cp
, tmp
, imm_swz
,
444 SHUF((mask
& 1) ? 2 : 3,
447 (mask
& 8) ? 2 : 3));
449 sse_andps(cp
->func
, dst
, tmp
);
450 sse_andnps(cp
->func
, tmp
, result
);
451 sse_orps(cp
->func
, dst
, tmp
);
453 aos_release_xmm_reg(cp
, tmp
.idx
);
460 /* Helper for writemask:
462 static boolean
emit_shuf_copy2( struct aos_compilation
*cp
,
468 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
470 emit_pshufd(cp
, dst
, arg1
, shuf
);
471 emit_pshufd(cp
, tmp
, arg0
, shuf
);
472 sse_shufps(cp
->func
, dst
, tmp
, SHUF(X
, Y
, Z
, W
));
473 emit_pshufd(cp
, dst
, dst
, shuf
);
475 aos_release_xmm_reg(cp
, tmp
.idx
);
481 #define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
484 /* Locate a source register and perform any required (simple) swizzle.
486 * Just fail on complex swizzles at this point.
488 static struct x86_reg
fetch_src( struct aos_compilation
*cp
,
489 const struct tgsi_full_src_register
*src
)
491 struct x86_reg arg0
= aos_get_shader_reg(cp
,
492 src
->SrcRegister
.File
,
493 src
->SrcRegister
.Index
);
499 for (i
= 0; i
< 4; i
++) {
500 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( src
, i
);
501 unsigned neg
= tgsi_util_get_full_src_register_sign_mode( src
, i
);
504 case TGSI_EXTSWIZZLE_ZERO
:
505 case TGSI_EXTSWIZZLE_ONE
:
506 ERROR(cp
, "not supporting full swizzles yet in tgsi_aos_sse2");
510 swz
|= (swizzle
& 0x3) << (i
* 2);
515 case TGSI_UTIL_SIGN_TOGGLE
:
519 case TGSI_UTIL_SIGN_KEEP
:
522 case TGSI_UTIL_SIGN_CLEAR
:
527 ERROR(cp
, "unsupported sign-mode");
532 if (swz
!= SSE_SWIZZLE_NOOP
|| negs
!= 0 || abs
!= 0) {
533 struct x86_reg dst
= aos_get_xmm_reg(cp
);
535 if (swz
!= SSE_SWIZZLE_NOOP
) {
536 emit_pshufd(cp
, dst
, arg0
, swz
);
540 if (negs
&& negs
!= 0xf) {
541 struct x86_reg imm_swz
= aos_get_internal_xmm(cp
, IMM_SWZ
);
542 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
545 * Use neg as arg to pshufd
548 emit_pshufd(cp
, tmp
, imm_swz
,
549 SHUF((negs
& 1) ? 1 : 0,
552 (negs
& 8) ? 1 : 0));
553 sse_mulps(cp
->func
, dst
, arg0
);
555 aos_release_xmm_reg(cp
, tmp
.idx
);
559 struct x86_reg imm_negs
= aos_get_internal_xmm(cp
, IMM_NEGS
);
560 sse_mulps(cp
->func
, dst
, imm_negs
);
565 if (abs
&& abs
!= 0xf) {
566 ERROR(cp
, "unsupported partial abs");
569 struct x86_reg neg
= aos_get_internal(cp
, IMM_NEGS
);
570 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
572 sse_movaps(cp
->func
, tmp
, arg0
);
573 sse_mulps(cp
->func
, tmp
, neg
);
574 sse_maxps(cp
->func
, dst
, arg0
);
576 aos_release_xmm_reg(cp
, tmp
.idx
);
584 static void x87_fld_src( struct aos_compilation
*cp
,
585 const struct tgsi_full_src_register
*src
,
588 struct x86_reg arg0
= aos_get_shader_reg_ptr(cp
,
589 src
->SrcRegister
.File
,
590 src
->SrcRegister
.Index
);
592 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( src
, channel
);
593 unsigned neg
= tgsi_util_get_full_src_register_sign_mode( src
, channel
);
596 case TGSI_EXTSWIZZLE_ZERO
:
597 x87_fldz( cp
->func
);
600 case TGSI_EXTSWIZZLE_ONE
:
601 x87_fld1( cp
->func
);
605 x87_fld( cp
->func
, x86_make_disp(arg0
, (swizzle
& 3) * sizeof(float)) );
611 case TGSI_UTIL_SIGN_TOGGLE
:
614 x87_fchs( cp
->func
);
617 case TGSI_UTIL_SIGN_KEEP
:
620 case TGSI_UTIL_SIGN_CLEAR
:
621 x87_fabs( cp
->func
);
624 case TGSI_UTIL_SIGN_SET
:
625 x87_fabs( cp
->func
);
626 x87_fchs( cp
->func
);
630 ERROR(cp
, "unsupported sign-mode");
640 /* Used to implement write masking. This and most of the other instructions
641 * here would be easier to implement if there had been a translation
642 * to a 2 argument format (dst/arg0, arg1) at the shader level before
643 * attempting to translate to x86/sse code.
645 static void store_dest( struct aos_compilation
*cp
,
646 const struct tgsi_full_dst_register
*reg
,
647 struct x86_reg result
)
651 switch (reg
->DstRegister
.WriteMask
) {
655 case TGSI_WRITEMASK_XYZW
:
656 aos_adopt_xmm_reg(cp
,
657 get_xmm_writable(cp
, result
),
658 reg
->DstRegister
.File
,
659 reg
->DstRegister
.Index
,
666 dst
= aos_get_shader_reg_xmm(cp
,
667 reg
->DstRegister
.File
,
668 reg
->DstRegister
.Index
);
670 switch (reg
->DstRegister
.WriteMask
) {
671 case TGSI_WRITEMASK_X
:
672 sse_movss(cp
->func
, dst
, get_xmm(cp
, result
));
675 case TGSI_WRITEMASK_ZW
:
676 sse_shufps(cp
->func
, dst
, get_xmm(cp
, result
), SHUF(X
, Y
, Z
, W
));
679 case TGSI_WRITEMASK_XY
:
680 result
= get_xmm_writable(cp
, result
);
681 sse_shufps(cp
->func
, result
, dst
, SHUF(X
, Y
, Z
, W
));
685 case TGSI_WRITEMASK_YZW
:
686 result
= get_xmm_writable(cp
, result
);
687 sse_movss(cp
->func
, result
, dst
);
692 mask_write(cp
, dst
, result
, reg
->DstRegister
.WriteMask
);
696 aos_adopt_xmm_reg(cp
,
698 reg
->DstRegister
.File
,
699 reg
->DstRegister
.Index
,
704 static void inject_scalar( struct aos_compilation
*cp
,
706 struct x86_reg result
,
709 sse_shufps(cp
->func
, dst
, dst
, swizzle
);
710 sse_movss(cp
->func
, dst
, result
);
711 sse_shufps(cp
->func
, dst
, dst
, swizzle
);
715 static void store_scalar_dest( struct aos_compilation
*cp
,
716 const struct tgsi_full_dst_register
*reg
,
717 struct x86_reg result
)
719 unsigned writemask
= reg
->DstRegister
.WriteMask
;
722 if (writemask
!= TGSI_WRITEMASK_X
&&
723 writemask
!= TGSI_WRITEMASK_Y
&&
724 writemask
!= TGSI_WRITEMASK_Z
&&
725 writemask
!= TGSI_WRITEMASK_W
&&
728 result
= get_xmm_writable(cp
, result
); /* already true, right? */
729 sse_shufps(cp
->func
, result
, result
, SHUF(X
,X
,X
,X
));
730 store_dest(cp
, reg
, result
);
734 result
= get_xmm(cp
, result
);
735 dst
= aos_get_shader_reg_xmm(cp
,
736 reg
->DstRegister
.File
,
737 reg
->DstRegister
.Index
);
741 switch (reg
->DstRegister
.WriteMask
) {
742 case TGSI_WRITEMASK_X
:
743 sse_movss(cp
->func
, dst
, result
);
746 case TGSI_WRITEMASK_Y
:
747 inject_scalar(cp
, dst
, result
, SHUF(Y
, X
, Z
, W
));
750 case TGSI_WRITEMASK_Z
:
751 inject_scalar(cp
, dst
, result
, SHUF(Z
, Y
, X
, W
));
754 case TGSI_WRITEMASK_W
:
755 inject_scalar(cp
, dst
, result
, SHUF(W
, Y
, Z
, X
));
762 aos_adopt_xmm_reg(cp
,
764 reg
->DstRegister
.File
,
765 reg
->DstRegister
.Index
,
771 static void x87_fst_or_nop( struct x86_function
*func
,
776 assert(ptr
.file
== file_REG32
);
777 if (writemask
& (1<<channel
))
778 x87_fst( func
, x86_make_disp(ptr
, channel
* sizeof(float)) );
781 static void x87_fstp_or_pop( struct x86_function
*func
,
786 assert(ptr
.file
== file_REG32
);
787 if (writemask
& (1<<channel
))
788 x87_fstp( func
, x86_make_disp(ptr
, channel
* sizeof(float)) );
790 x87_fstp( func
, x86_make_reg( file_x87
, 0 ));
797 static void x87_fstp_dest4( struct aos_compilation
*cp
,
798 const struct tgsi_full_dst_register
*dst
)
800 struct x86_reg ptr
= get_dst_ptr(cp
, dst
);
801 unsigned writemask
= dst
->DstRegister
.WriteMask
;
803 x87_fst_or_nop(cp
->func
, writemask
, 0, ptr
);
804 x87_fst_or_nop(cp
->func
, writemask
, 1, ptr
);
805 x87_fst_or_nop(cp
->func
, writemask
, 2, ptr
);
806 x87_fstp_or_pop(cp
->func
, writemask
, 3, ptr
);
809 /* Save current x87 state and put it into single precision mode.
811 static void save_fpu_state( struct aos_compilation
*cp
)
813 x87_fnstcw( cp
->func
, x86_make_disp(cp
->machine_EDX
,
814 Offset(struct aos_machine
, fpu_restore
)));
817 static void restore_fpu_state( struct aos_compilation
*cp
)
819 x87_fnclex(cp
->func
);
820 x87_fldcw( cp
->func
, x86_make_disp(cp
->machine_EDX
,
821 Offset(struct aos_machine
, fpu_restore
)));
824 static void set_fpu_round_neg_inf( struct aos_compilation
*cp
)
826 if (cp
->fpucntl
!= FPU_RND_NEG
) {
827 cp
->fpucntl
= FPU_RND_NEG
;
828 x87_fnclex(cp
->func
);
829 x87_fldcw( cp
->func
, x86_make_disp(cp
->machine_EDX
,
830 Offset(struct aos_machine
, fpu_rnd_neg_inf
)));
834 static void set_fpu_round_nearest( struct aos_compilation
*cp
)
836 if (cp
->fpucntl
!= FPU_RND_NEAREST
) {
837 cp
->fpucntl
= FPU_RND_NEAREST
;
838 x87_fnclex(cp
->func
);
839 x87_fldcw( cp
->func
, x86_make_disp(cp
->machine_EDX
,
840 Offset(struct aos_machine
, fpu_rnd_nearest
)));
845 static void x87_emit_ex2( struct aos_compilation
*cp
)
847 struct x86_reg st0
= x86_make_reg(file_x87
, 0);
848 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
849 int stack
= cp
->func
->x87_stack
;
851 // set_fpu_round_neg_inf( cp );
853 x87_fld(cp
->func
, st0
); /* a a */
854 x87_fprndint( cp
->func
); /* int(a) a*/
855 x87_fsubr(cp
->func
, st1
, st0
); /* int(a) frc(a) */
856 x87_fxch(cp
->func
, st1
); /* frc(a) int(a) */
857 x87_f2xm1(cp
->func
); /* (2^frc(a))-1 int(a) */
858 x87_fld1(cp
->func
); /* 1 (2^frc(a))-1 int(a) */
859 x87_faddp(cp
->func
, st1
); /* 2^frac(a) int(a) */
860 x87_fscale(cp
->func
); /* (2^frac(a)*2^int(int(a))) int(a) */
862 x87_fstp(cp
->func
, st1
); /* 2^a */
864 assert( stack
== cp
->func
->x87_stack
);
868 static void PIPE_CDECL
print_reg( const char *msg
,
871 debug_printf("%s: %f %f %f %f\n", msg
, reg
[0], reg
[1], reg
[2], reg
[3]);
874 static void emit_print( struct aos_compilation
*cp
,
875 const char *message
, /* must point to a static string! */
879 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
880 struct x86_reg arg
= aos_get_shader_reg_ptr( cp
, file
, idx
);
883 /* There shouldn't be anything on the x87 stack. Can add this
884 * capacity later if need be.
886 assert(cp
->func
->x87_stack
== 0);
888 /* For absolute correctness, need to spill/invalidate all XMM regs
889 * too. We're obviously not concerned about performance on this
890 * debug path, so here goes:
892 for (i
= 0; i
< 8; i
++) {
893 if (cp
->xmm
[i
].dirty
)
896 aos_release_xmm_reg(cp
, i
);
899 /* Push caller-save (ie scratch) regs.
901 x86_cdecl_caller_push_regs( cp
->func
);
904 /* Push the arguments:
906 x86_lea( cp
->func
, ecx
, arg
);
907 x86_push( cp
->func
, ecx
);
908 x86_push_imm32( cp
->func
, (int)message
);
910 /* Call the helper. Could call debug_printf directly, but
911 * print_reg is a nice place to put a breakpoint if need be.
913 x86_mov_reg_imm( cp
->func
, ecx
, (int)print_reg
);
914 x86_call( cp
->func
, ecx
);
915 x86_pop( cp
->func
, ecx
);
916 x86_pop( cp
->func
, ecx
);
918 /* Pop caller-save regs
920 x86_cdecl_caller_pop_regs( cp
->func
);
927 * The traditional instructions. All operate on internal registers
928 * and ignore write masks and swizzling issues.
931 static boolean
emit_ABS( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
933 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
934 struct x86_reg neg
= aos_get_internal(cp
, IMM_NEGS
);
935 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
937 sse_movaps(cp
->func
, tmp
, arg0
);
938 sse_mulps(cp
->func
, tmp
, neg
);
939 sse_maxps(cp
->func
, tmp
, arg0
);
941 store_dest(cp
, &op
->FullDstRegisters
[0], tmp
);
945 static boolean
emit_ADD( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
947 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
948 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
949 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
951 sse_addps(cp
->func
, dst
, arg1
);
953 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
957 static boolean
emit_COS( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
959 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0);
961 x87_fstp_dest4(cp
, &op
->FullDstRegisters
[0]);
966 /* The dotproduct instructions don't really do that well in sse:
968 static boolean
emit_DP3( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
970 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
971 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
972 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
973 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
975 sse_mulps(cp
->func
, dst
, arg1
);
976 /* Now the hard bit: sum the first 3 values:
978 sse_movhlps(cp
->func
, tmp
, dst
);
979 sse_addss(cp
->func
, dst
, tmp
); /* a*x+c*z, b*y, ?, ? */
980 emit_pshufd(cp
, tmp
, dst
, SHUF(Y
,X
,W
,Z
));
981 sse_addss(cp
->func
, dst
, tmp
);
983 aos_release_xmm_reg(cp
, tmp
.idx
);
984 store_scalar_dest(cp
, &op
->FullDstRegisters
[0], dst
);
990 static boolean
emit_DP4( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
992 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
993 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
994 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
995 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
997 sse_mulps(cp
->func
, dst
, arg1
);
999 /* Now the hard bit: sum the values:
1001 sse_movhlps(cp
->func
, tmp
, dst
);
1002 sse_addps(cp
->func
, dst
, tmp
); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
1003 emit_pshufd(cp
, tmp
, dst
, SHUF(Y
,X
,W
,Z
));
1004 sse_addss(cp
->func
, dst
, tmp
);
1006 aos_release_xmm_reg(cp
, tmp
.idx
);
1007 store_scalar_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1011 static boolean
emit_DPH( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1013 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1014 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1015 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
1016 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1018 sse_mulps(cp
->func
, dst
, arg1
);
1020 /* Now the hard bit: sum the values (from DP3):
1022 sse_movhlps(cp
->func
, tmp
, dst
);
1023 sse_addss(cp
->func
, dst
, tmp
); /* a*x+c*z, b*y, ?, ? */
1024 emit_pshufd(cp
, tmp
, dst
, SHUF(Y
,X
,W
,Z
));
1025 sse_addss(cp
->func
, dst
, tmp
);
1026 emit_pshufd(cp
, tmp
, arg1
, SHUF(W
,W
,W
,W
));
1027 sse_addss(cp
->func
, dst
, tmp
);
1029 aos_release_xmm_reg(cp
, tmp
.idx
);
1030 store_scalar_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1034 static boolean
emit_DST( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1036 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1037 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1038 struct x86_reg dst
= aos_get_xmm_reg(cp
);
1039 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
1040 struct x86_reg ones
= aos_get_internal(cp
, IMM_ONES
);
1042 /* dst[0] = 1.0 * 1.0F; */
1043 /* dst[1] = arg0[1] * arg1[1]; */
1044 /* dst[2] = arg0[2] * 1.0; */
1045 /* dst[3] = 1.0 * arg1[3]; */
1047 emit_shuf_copy2(cp
, dst
, arg0
, ones
, SHUF(X
,W
,Z
,Y
));
1048 emit_shuf_copy2(cp
, tmp
, arg1
, ones
, SHUF(X
,Z
,Y
,W
));
1049 sse_mulps(cp
->func
, dst
, tmp
);
1051 aos_release_xmm_reg(cp
, tmp
.idx
);
1052 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1056 static boolean
emit_LG2( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1058 x87_fld1(cp
->func
); /* 1 */
1059 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0); /* a0 1 */
1060 x87_fyl2x(cp
->func
); /* log2(a0) */
1061 x87_fstp_dest4(cp
, &op
->FullDstRegisters
[0]);
1066 static boolean
emit_EX2( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1068 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0);
1070 x87_fstp_dest4(cp
, &op
->FullDstRegisters
[0]);
1075 static boolean
emit_FLR( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1077 struct x86_reg dst
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1078 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1081 set_fpu_round_neg_inf( cp
);
1083 /* Load all sources first to avoid aliasing
1085 for (i
= 3; i
>= 0; i
--) {
1086 if (writemask
& (1<<i
)) {
1087 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], i
);
1091 for (i
= 0; i
< 4; i
++) {
1092 if (writemask
& (1<<i
)) {
1093 x87_fprndint( cp
->func
);
1094 x87_fstp(cp
->func
, x86_make_disp(dst
, i
*4));
1102 static boolean
emit_RND( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1104 struct x86_reg dst
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1105 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1108 set_fpu_round_nearest( cp
);
1110 /* Load all sources first to avoid aliasing
1112 for (i
= 3; i
>= 0; i
--) {
1113 if (writemask
& (1<<i
)) {
1114 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], i
);
1118 for (i
= 0; i
< 4; i
++) {
1119 if (writemask
& (1<<i
)) {
1120 x87_fprndint( cp
->func
);
1121 x87_fstp(cp
->func
, x86_make_disp(dst
, i
*4));
1129 static boolean
emit_FRC( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1131 struct x86_reg dst
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1132 struct x86_reg st0
= x86_make_reg(file_x87
, 0);
1133 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
1134 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1137 set_fpu_round_neg_inf( cp
);
1139 /* suck all the source values onto the stack before writing out any
1140 * dst, which may alias...
1142 for (i
= 3; i
>= 0; i
--) {
1143 if (writemask
& (1<<i
)) {
1144 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], i
);
1148 for (i
= 0; i
< 4; i
++) {
1149 if (writemask
& (1<<i
)) {
1150 x87_fld(cp
->func
, st0
); /* a a */
1151 x87_fprndint( cp
->func
); /* flr(a) a */
1152 x87_fsubp(cp
->func
, st1
); /* frc(a) */
1153 x87_fstp(cp
->func
, x86_make_disp(dst
, i
*4));
1165 static boolean
emit_LIT( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1167 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
1168 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1169 unsigned lit_count
= cp
->lit_count
++;
1170 struct x86_reg result
, arg0
;
1174 /* For absolute correctness, need to spill/invalidate all XMM regs
1177 for (i
= 0; i
< 8; i
++) {
1178 if (cp
->xmm
[i
].dirty
)
1180 aos_release_xmm_reg(cp
, i
);
1184 if (writemask
!= TGSI_WRITEMASK_XYZW
)
1185 result
= x86_make_disp(cp
->machine_EDX
, Offset(struct aos_machine
, tmp
[0]));
1187 result
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1190 arg0
= fetch_src( cp
, &op
->FullSrcRegisters
[0] );
1191 if (arg0
.file
== file_XMM
) {
1192 struct x86_reg tmp
= x86_make_disp(cp
->machine_EDX
,
1193 Offset(struct aos_machine
, tmp
[1]));
1194 sse_movaps( cp
->func
, tmp
, arg0
);
1200 /* Push caller-save (ie scratch) regs.
1202 x86_cdecl_caller_push_regs( cp
->func
);
1204 /* Push the arguments:
1206 x86_push_imm32( cp
->func
, lit_count
);
1208 x86_lea( cp
->func
, ecx
, arg0
);
1209 x86_push( cp
->func
, ecx
);
1211 x86_lea( cp
->func
, ecx
, result
);
1212 x86_push( cp
->func
, ecx
);
1214 x86_push( cp
->func
, cp
->machine_EDX
);
1216 if (lit_count
< MAX_LIT_INFO
) {
1217 x86_mov( cp
->func
, ecx
, x86_make_disp( cp
->machine_EDX
,
1218 Offset(struct aos_machine
, lit_info
) +
1219 lit_count
* sizeof(struct lit_info
) +
1220 Offset(struct lit_info
, func
)));
1223 x86_mov_reg_imm( cp
->func
, ecx
, (int)aos_do_lit
);
1226 x86_call( cp
->func
, ecx
);
1228 x86_pop( cp
->func
, ecx
); /* fixme... */
1229 x86_pop( cp
->func
, ecx
);
1230 x86_pop( cp
->func
, ecx
);
1231 x86_pop( cp
->func
, ecx
);
1233 x86_cdecl_caller_pop_regs( cp
->func
);
1235 if (writemask
!= TGSI_WRITEMASK_XYZW
) {
1237 &op
->FullDstRegisters
[0],
1238 get_xmm_writable( cp
, result
) );
1245 static boolean
emit_inline_LIT( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1247 struct x86_reg dst
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1248 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1250 if (writemask
& TGSI_WRITEMASK_YZ
) {
1251 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
1252 struct x86_reg st2
= x86_make_reg(file_x87
, 2);
1254 /* a1' = a1 <= 0 ? 1 : a1;
1256 x87_fldz(cp
->func
); /* 1 0 */
1258 x87_fld1(cp
->func
); /* 1 0 */
1260 /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
1262 x87_fldz(cp
->func
); /* 1 0 */
1264 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 1); /* a1 1 0 */
1265 x87_fcomi(cp
->func
, st2
); /* a1 1 0 */
1266 x87_fcmovb(cp
->func
, st1
); /* a1' 1 0 */
1267 x87_fstp(cp
->func
, st1
); /* a1' 0 */
1268 x87_fstp(cp
->func
, st1
); /* a1' */
1270 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 3); /* a3 a1' */
1271 x87_fxch(cp
->func
, st1
); /* a1' a3 */
1274 /* Compute pow(a1, a3)
1276 x87_fyl2x(cp
->func
); /* a3*log2(a1) */
1277 x87_emit_ex2( cp
); /* 2^(a3*log2(a1)) */
1280 /* a0' = max2(a0, 0):
1282 x87_fldz(cp
->func
); /* 0 r2 */
1283 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0); /* a0 0 r2 */
1284 x87_fcomi(cp
->func
, st1
);
1285 x87_fcmovb(cp
->func
, st1
); /* a0' 0 r2 */
1287 x87_fst_or_nop(cp
->func
, writemask
, 1, dst
); /* result[1] = a0' */
1289 x87_fcomi(cp
->func
, st1
); /* a0' 0 r2 */
1290 x87_fcmovnbe(cp
->func
, st2
); /* r2' 0' r2 */
1292 x87_fstp_or_pop(cp
->func
, writemask
, 2, dst
); /* 0 r2 */
1293 x87_fpop(cp
->func
); /* r2 */
1297 if (writemask
& TGSI_WRITEMASK_XW
) {
1299 x87_fst_or_nop(cp
->func
, writemask
, 0, dst
);
1300 x87_fstp_or_pop(cp
->func
, writemask
, 3, dst
);
1309 static boolean
emit_MAX( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1311 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1312 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1313 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1315 sse_maxps(cp
->func
, dst
, arg1
);
1317 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1322 static boolean
emit_MIN( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1324 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1325 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1326 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1328 sse_minps(cp
->func
, dst
, arg1
);
1330 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1334 static boolean
emit_MOV( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1336 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1337 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1339 /* potentially nothing to do */
1341 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1345 static boolean
emit_MUL( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1347 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1348 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1349 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1351 sse_mulps(cp
->func
, dst
, arg1
);
1353 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1358 static boolean
emit_MAD( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1360 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1361 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1362 struct x86_reg arg2
= fetch_src(cp
, &op
->FullSrcRegisters
[2]);
1364 /* If we can't clobber old contents of arg0, get a temporary & copy
1365 * it there, then clobber it...
1367 arg0
= get_xmm_writable(cp
, arg0
);
1369 sse_mulps(cp
->func
, arg0
, arg1
);
1370 sse_addps(cp
->func
, arg0
, arg2
);
1371 store_dest(cp
, &op
->FullDstRegisters
[0], arg0
);
1375 /* A wrapper for powf().
1376 * Makes sure it is cdecl and operates on floats.
1378 static float PIPE_CDECL
_powerf( float x
, float y
)
1380 return powf( x
, y
);
/* Really not sufficient -- need to check for conditions that could
 * generate inf/nan values, which will slow things down hugely.
 *
 * NOTE(review): the preprocessor split between the two variants below
 * (and one line of the spill loop) was lost in extraction and has been
 * reconstructed -- verify against upstream Mesa.
 */
static boolean
emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
#if 0
   /* In-line x87 variant: pow(a0, a1) == 2^(a1 * log2(a0)).
    */
   x87_fld_src(cp, &op->FullSrcRegisters[1], 0);  /* a1.x */
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);  /* a0.x a1.x */
   x87_fyl2x(cp->func);                           /* a1*log2(a0) */

   x87_emit_ex2( cp );                            /* 2^(a1*log2(a0)) */

   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
#else
   /* Out-of-line variant: emit a call to the cdecl wrapper _powerf().
    */
   unsigned i;

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too, since the callee may clobber them.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);          /* NOTE(review): line hidden in extraction -- presumably a spill; confirm */
      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Reserve 8 bytes of stack for the two float arguments.
    */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );

   /* Store src[1].x then src[0].x into the argument slots via the
    * x87 stack (cdecl: leftmost arg at lowest address).
    */
   x87_fld_src( cp, &op->FullSrcRegisters[1], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
   x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* Call _powerf through EAX.
    */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
   x86_call( cp->func, cp->tmp_EAX );

   /* Pop the argument area (caller cleans up under cdecl).
    */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Note retval on x87 stack:
    */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->FullDstRegisters[0] );
#endif

   return TRUE;
}
/* RCP: scalar reciprocal, broadcast by store_scalar_dest().
 * With SSE2 use the fast low-precision rcpss; otherwise compute
 * 1.0 / a exactly with divss.
 */
static boolean
emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg dst = aos_get_xmm_reg(cp);

   if (cp->have_sse2) {
      sse2_rcpss(cp->func, dst, arg0);
      /* extend precision here...
       */
   }
   else {
      /* NOTE(review): the 'else' line itself was hidden in extraction;
       * reconstructed from the surrounding structure.
       */
      struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
      sse_movss(cp->func, dst, ones);
      sse_divss(cp->func, dst, arg0);
   }

   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);

   return TRUE;
}
/* Although rsqrtps() and rcpps() are low precision on some/all SSE
 * implementations, it is possible to improve its precision at
 * fairly low cost, using a newton/raphson step, as below:
 *
 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
 * or:
 *   x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
 *
 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static boolean
emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   /* NOTE(review): the branch separating the two variants below was
    * hidden in extraction; reconstructed as a disabled first variant.
    */
   if (0) {
      /* Cheap, low-precision variant: raw rsqrtss.
       */
      struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
      struct x86_reg r = aos_get_xmm_reg(cp);
      sse_rsqrtss(cp->func, r, arg0);
      store_scalar_dest(cp, &op->FullDstRegisters[0], r);
      return TRUE;
   }
   else {
      /* rsqrtss plus one newton/raphson refinement step.  The two
       * constants live back-to-back in internal slot IMM_RSQ:
       * -0.5 at offset 0, 1.5 at offset 4.
       */
      struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
      struct x86_reg r = aos_get_xmm_reg(cp);

      struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
      struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
      struct x86_reg src = get_xmm_writable( cp, arg0 );

      sse_rsqrtss( cp->func, r, src );              /* rsqrtss(a) */
      sse_mulss(   cp->func, src, neg_half );       /* -.5 * a */
      sse_mulss(   cp->func, src, r );              /* -.5 * a * r */
      sse_mulss(   cp->func, src, r );              /* -.5 * a * r * r */
      sse_addss(   cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */
      sse_mulss(   cp->func, r, src );              /* r * (1.5 - .5 * a * r * r) */

      store_scalar_dest(cp, &op->FullDstRegisters[0], r);

      return TRUE;
   }
}
/* SGE: per-channel dst = (arg0 >= arg1) ? 1.0 : 0.0.
 * Built as a cmpps(NotLessThan) all-ones/all-zeros mask ANDed with
 * the constant 1.0f vector.  (NLT also reads true for unordered/NaN
 * operands -- inherent in the cmpps predicate.)
 */
static boolean
emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
   sse_andps(cp->func, dst, ones);

   store_dest(cp, &op->FullDstRegisters[0], dst);

   return TRUE;   /* NOTE(review): function tail lost in extraction; reconstructed */
}
/* SIN: scalar x87 sine of src[0].x; x87_fstp_dest4() presumably
 * replicates the scalar result to the written dest channels.
 */
static boolean
emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
   x87_fsin(cp->func);   /* NOTE(review): line hidden in extraction -- presumably fsin; confirm */
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);

   return TRUE;   /* NOTE(review): function tail lost in extraction; reconstructed */
}
/* SLT: per-channel dst = (arg0 < arg1) ? 1.0 : 0.0.
 * cmpps(LessThan) produces an all-ones/all-zeros mask which is ANDed
 * with the constant 1.0f vector.
 */
static boolean
emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_cmpps(cp->func, dst, arg1, cc_LessThan);
   sse_andps(cp->func, dst, ones);

   store_dest(cp, &op->FullDstRegisters[0], dst);

   return TRUE;   /* NOTE(review): function tail lost in extraction; reconstructed */
}
/* SUB: per-channel dst = arg0 - arg1 (subps).
 */
static boolean
emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg dst = get_xmm_writable(cp, arg0);

   sse_subps(cp->func, dst, arg1);

   store_dest(cp, &op->FullDstRegisters[0], dst);

   return TRUE;   /* NOTE(review): function tail lost in extraction; reconstructed */
}
/* XPD: 3-component cross product via two rotated multiplies and a
 * subtract, with a final shuffle to rotate the result into place.
 */
static boolean
emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp0 = aos_get_xmm_reg(cp);
   struct x86_reg tmp1 = aos_get_xmm_reg(cp);

   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));   /* arg1 rotated left */
   sse_mulps(cp->func, tmp1, arg0);
   emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));   /* arg0 rotated left */
   sse_mulps(cp->func, tmp0, arg1);
   sse_subps(cp->func, tmp1, tmp0);
   sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));  /* rotate result back */

   /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
   /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
   /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
   /* dst[3] is undef */

   aos_release_xmm_reg(cp, tmp0.idx);   /* tmp1 is handed off to store_dest */
   store_dest(cp, &op->FullDstRegisters[0], tmp1);

   return TRUE;   /* NOTE(review): function tail lost in extraction; reconstructed */
}
/* Dispatch one TGSI instruction to its SSE/x87 emit routine.
 * Returning FALSE (unsupported or deliberately disabled opcode) aborts
 * the compilation so the caller can fall back to another path.
 *
 * NOTE(review): the 'static boolean' line, the 'return FALSE;' bodies
 * of the disabled cases, and the function tail were lost in extraction
 * and have been reconstructed -- verify against upstream Mesa.
 */
static boolean
emit_instruction( struct aos_compilation *cp,
                  struct tgsi_full_instruction *inst )
{
   /* Every instruction must leave the x87 stack balanced. */
   x87_assert_stack_empty(cp->func);

   switch( inst->Instruction.Opcode ) {
   case TGSI_OPCODE_MOV:
      return emit_MOV( cp, inst );

   case TGSI_OPCODE_LIT:
      return emit_LIT(cp, inst);

   case TGSI_OPCODE_RCP:
      return emit_RCP(cp, inst);

   case TGSI_OPCODE_RSQ:
      return emit_RSQ(cp, inst);

   case TGSI_OPCODE_EXP:
      /*return emit_EXP(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_LOG:
      /*return emit_LOG(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_MUL:
      return emit_MUL(cp, inst);

   case TGSI_OPCODE_ADD:
      return emit_ADD(cp, inst);

   case TGSI_OPCODE_DP3:
      return emit_DP3(cp, inst);

   case TGSI_OPCODE_DP4:
      return emit_DP4(cp, inst);

   case TGSI_OPCODE_DST:
      return emit_DST(cp, inst);

   case TGSI_OPCODE_MIN:
      return emit_MIN(cp, inst);

   case TGSI_OPCODE_MAX:
      return emit_MAX(cp, inst);

   case TGSI_OPCODE_SLT:
      return emit_SLT(cp, inst);

   case TGSI_OPCODE_SGE:
      return emit_SGE(cp, inst);

   case TGSI_OPCODE_MAD:
      return emit_MAD(cp, inst);

   case TGSI_OPCODE_SUB:
      return emit_SUB(cp, inst);

   case TGSI_OPCODE_LERP:
      // return emit_LERP(cp, inst);
      return FALSE;

   case TGSI_OPCODE_FRAC:
      return emit_FRC(cp, inst);

   case TGSI_OPCODE_CLAMP:
      // return emit_CLAMP(cp, inst);
      return FALSE;

   case TGSI_OPCODE_FLOOR:
      return emit_FLR(cp, inst);

   case TGSI_OPCODE_ROUND:
      return emit_RND(cp, inst);

   case TGSI_OPCODE_EXPBASE2:
      return emit_EX2(cp, inst);

   case TGSI_OPCODE_LOGBASE2:
      return emit_LG2(cp, inst);

   case TGSI_OPCODE_POWER:
      return emit_POW(cp, inst);

   case TGSI_OPCODE_CROSSPRODUCT:
      return emit_XPD(cp, inst);

   case TGSI_OPCODE_ABS:
      return emit_ABS(cp, inst);

   case TGSI_OPCODE_DPH:
      return emit_DPH(cp, inst);

   case TGSI_OPCODE_COS:
      return emit_COS(cp, inst);

   case TGSI_OPCODE_SIN:
      return emit_SIN(cp, inst);

   case TGSI_OPCODE_END:
      return TRUE;

   default:
      return FALSE;
   }
}
/* Apply the viewport transform in place to the position output:
 * pos = pos * scale + translate, with scale/translate read from the
 * aos_machine struct addressed through EDX.
 */
static boolean
emit_viewport( struct aos_compilation *cp )
{
   struct x86_reg pos = aos_get_shader_reg_xmm(cp,
                                               TGSI_FILE_OUTPUT,
                                               0);   /* NOTE(review): args hidden in extraction -- presumably output 0 (position); confirm */

   struct x86_reg scale = x86_make_disp(cp->machine_EDX,
                                        Offset(struct aos_machine, scale));

   struct x86_reg translate = x86_make_disp(cp->machine_EDX,
                                            Offset(struct aos_machine, translate));

   sse_mulps(cp->func, pos, scale);
   sse_addps(cp->func, pos, translate);

   /* Hand the transformed value back to the register allocator as the
    * (dirty) position output.
    */
   aos_adopt_xmm_reg( cp,
                      pos,
                      TGSI_FILE_OUTPUT,
                      0,
                      TRUE );   /* NOTE(review): args hidden in extraction; reconstructed */

   return TRUE;
}
/* This is useful to be able to see the results on softpipe.  Doesn't
 * do proper clipping, just assumes the backend can do it during
 * rasterization -- for debug only...
 *
 * Computes 1/w, applies viewport scale * (1/w) + translate, and writes
 * 1/w back into the position's W channel.
 */
static boolean
emit_rhw_viewport( struct aos_compilation *cp )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg pos = aos_get_shader_reg_xmm(cp,
                                               TGSI_FILE_OUTPUT,
                                               0);   /* NOTE(review): args hidden in extraction -- presumably output 0; confirm */

   struct x86_reg scale = x86_make_disp(cp->machine_EDX,
                                        Offset(struct aos_machine, scale));

   struct x86_reg translate = x86_make_disp(cp->machine_EDX,
                                            Offset(struct aos_machine, translate));

   /* tmp = splat(1 / pos.w) */
   emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
   sse2_rcpss(cp->func, tmp, tmp);
   sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));

   sse_mulps(cp->func, pos, scale);
   sse_mulps(cp->func, pos, tmp);
   sse_addps(cp->func, pos, translate);

   /* Store 1/w in the W channel of the result. */
   mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);

   aos_adopt_xmm_reg( cp,
                      pos,
                      TGSI_FILE_OUTPUT,
                      0,
                      TRUE );   /* NOTE(review): args hidden in extraction; reconstructed */

   return TRUE;
}
/* Copy a TGSI immediate's float values into the aos_machine's
 * immediate storage, assigning it the next sequential slot, so the
 * generated code can address it by offset.
 */
static boolean
note_immediate( struct aos_compilation *cp,
                struct tgsi_full_immediate *imm )
{
   unsigned pos = cp->num_immediates++;
   unsigned j;   /* NOTE(review): declaration hidden in extraction; reconstructed */

   for (j = 0; j < imm->Immediate.Size; j++) {
      cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float;
   }

   return TRUE;   /* NOTE(review): function tail lost in extraction; reconstructed */
}
/* Pre-pass over the shader tokens recording, for each OUTPUT register,
 * the index of the last instruction that writes it.  Intended to allow
 * eager emission of outputs after their final write.
 */
static void find_last_write_outputs( struct aos_compilation *cp )
{
   struct tgsi_parse_context parse;
   unsigned this_instruction = 0;
   unsigned i;   /* NOTE(review): declaration hidden in extraction; reconstructed */

   tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );

   while (!tgsi_parse_end_of_tokens( &parse )) {

      tgsi_parse_token( &parse );

      /* Only instruction tokens can write outputs. */
      if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
         continue;   /* NOTE(review): statement hidden in extraction; reconstructed */

      for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
         if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File ==
             TGSI_FILE_OUTPUT)
         {
            unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index;
            cp->output_last_write[idx] = this_instruction;
         }
      }

      this_instruction++;   /* NOTE(review): increment hidden in extraction; reconstructed */
   }

   tgsi_parse_free( &parse );
}
/* Positional (cdecl) argument indices of the generated function, used
 * with x86_fn_arg() in build_vertex_program().  ARG_COUNT (3) is
 * defined on a line lost in extraction.
 */
#define ARG_MACHINE    1   /* struct aos_machine * */
#define ARG_START_ELTS 2   /* start index (linear) or elt-list pointer (indexed) */
#define ARG_OUTBUF     4   /* output vertex buffer pointer */
/* Compile the varient's TGSI vertex shader into one of its two x86
 * functions: func[0] for linear (consecutive index) fetch, func[1]
 * for indexed (elt list) fetch.
 *
 * Generated code layout: prologue (save regs, load args), zero-count
 * bail-out, then a per-vertex loop of fetch inputs -> shader body ->
 * optional viewport/rhw -> store outputs -> advance pointers.
 *
 * NOTE(review): many short lines of this function (struct-field
 * initialization, loop epilogue, error paths, return) fell into gaps
 * in the extracted text and have been reconstructed from context --
 * verify against upstream Mesa before relying on details.
 */
static boolean
build_vertex_program( struct draw_vs_varient_aos_sse *varient,
                      boolean linear )
{
   struct tgsi_parse_context parse;
   struct aos_compilation cp;
   unsigned fixup, label;

   tgsi_parse_init( &parse, varient->base.vs->state.tokens );

   memset(&cp, 0, sizeof(cp));

   cp.insn_counter = 1;
   cp.vaos = varient;              /* NOTE(review): hidden line; reconstructed */

   cp.func = &varient->func[ linear ? 0 : 1 ];

   /* Fixed register assignments for the generated code: */
   cp.tmp_EAX     = x86_make_reg(file_REG32, reg_AX);
   cp.idx_EBX     = x86_make_reg(file_REG32, reg_BX);
   cp.outbuf_ECX  = x86_make_reg(file_REG32, reg_CX);
   cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
   cp.count_ESI   = x86_make_reg(file_REG32, reg_SI);
   cp.temp_EBP    = x86_make_reg(file_REG32, reg_BP);
   cp.stack_ESP   = x86_make_reg( file_REG32, reg_SP );

   x86_init_func(cp.func);

   find_last_write_outputs(&cp);

   /* Save callee-save registers we use. */
   x86_push(cp.func, cp.idx_EBX);
   x86_push(cp.func, cp.count_ESI);
   x86_push(cp.func, cp.temp_EBP);

   /* Load arguments into regs:
    */
   x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
   x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
   x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
   x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));

   /* Compare count to zero and possibly bail.
    */
   x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
   x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
   fixup = x86_jcc_forward(cp.func, cc_E);

   save_fpu_state( &cp );
   set_fpu_round_nearest( &cp );

   /* Note address for loop jump
    */
   label = x86_get_label(cp.func);
   {
      /* Fetch inputs...  TODO:  fetch lazily...
       */
      if (!aos_fetch_inputs( &cp, linear ))
         goto fail;            /* NOTE(review): hidden line; reconstructed */

      /* Emit the shader body:
       */
      while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
      {
         tgsi_parse_token( &parse );

         switch (parse.FullToken.Token.Type) {
         case TGSI_TOKEN_TYPE_IMMEDIATE:
            if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
               goto fail;      /* NOTE(review): hidden line; reconstructed */
            break;

         case TGSI_TOKEN_TYPE_INSTRUCTION:
            tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );

            if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
               goto fail;      /* NOTE(review): hidden line; reconstructed */
            break;
         }

         x87_assert_stack_empty(cp.func);
         cp.insn_counter++;    /* NOTE(review): hidden line; reconstructed */
      }

      /* Invalidate all xmm registers that don't hold live OUTPUT
       * values, so only outputs survive to the store phase.
       */
      {
         unsigned i;
         for (i = 0; i < 8; i++) {
            if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
               cp.xmm[i].file = TGSI_FILE_NULL;
               cp.xmm[i].dirty = 0;
            }
         }
      }

      if (cp.error)
         goto fail;            /* NOTE(review): hidden lines; reconstructed */

      if (cp.vaos->base.key.clip) {
         /* not really handling clipping, just do the rhw so we can
          * see the results...
          */
         emit_rhw_viewport(&cp);
      }
      else if (cp.vaos->base.key.viewport) {
         emit_viewport(&cp);   /* NOTE(review): hidden line; reconstructed */
      }

      /* Emit output...  TODO: do this eagerly after the last write to a
       * given output.
       */
      if (!aos_emit_outputs( &cp ))
         goto fail;            /* NOTE(review): hidden line; reconstructed */

      /* Advance the output pointer by one vertex stride.
       */
      x86_lea(cp.func,
              cp.outbuf_ECX,
              x86_make_disp(cp.outbuf_ECX,
                            cp.vaos->base.key.output_stride));

      /* Advance the index: increment for linear fetch, step the elt
       * pointer by sizeof(unsigned) for indexed fetch.
       */
      if (linear) {
         x86_inc(cp.func, cp.idx_EBX);
      }
      else {
         x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4));
      }
   }

   /* decr count, loop if not zero
    */
   x86_dec(cp.func, cp.count_ESI);
   x86_jcc(cp.func, cc_NZ, label);

   restore_fpu_state(&cp);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(cp.func, fixup);

   /* Exit mmx state if any was used.
    */
   if (cp.func->need_emms)
      mmx_emms(cp.func);      /* NOTE(review): hidden line; reconstructed */

   /* Restore callee-save registers and return. */
   x86_pop(cp.func, cp.temp_EBP);
   x86_pop(cp.func, cp.count_ESI);
   x86_pop(cp.func, cp.idx_EBX);

   x87_assert_stack_empty(cp.func);
   x86_ret(cp.func);          /* NOTE(review): hidden line; reconstructed */

   tgsi_parse_free( &parse );
   return !cp.error;          /* NOTE(review): hidden line; reconstructed */

 fail:
   tgsi_parse_free( &parse );
   return FALSE;              /* NOTE(review): hidden line; reconstructed */
}
/* draw_vs_varient::set_input callback: (re)bind the source pointer and
 * stride of every shader input element that reads from vertex buffer
 * 'buf'.
 *
 * NOTE(review): the parameter list lines were hidden in extraction;
 * (buf, ptr, stride) reconstructed from body usage -- confirm.
 */
static void vaos_set_buffer( struct draw_vs_varient *varient,
                             unsigned buf,
                             const void *ptr,
                             unsigned stride )
{
   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
   unsigned i;

   for (i = 0; i < vaos->base.key.nr_inputs; i++) {
      if (vaos->base.key.element[i].in.buffer == buf) {
         /* Bake the element's byte offset into the base pointer. */
         vaos->attrib[i].input_ptr = ((char *)ptr +
                                      vaos->base.key.element[i].in.offset);
         vaos->attrib[i].input_stride = stride;
      }
   }
}
/* draw_vs_varient::run_elts callback: refresh the per-draw state the
 * generated code reads through the aos_machine, then invoke the
 * generated indexed-fetch function.
 *
 * NOTE(review): the 'count' parameter and the call arguments were
 * hidden in extraction; reconstructed -- confirm.
 */
static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
                                      const unsigned *elts,
                                      unsigned count,
                                      void *output_buffer )
{
   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
   struct aos_machine *machine = vaos->draw->vs.aos_machine;

   /* State that can change between draws: */
   machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
   machine->constants = vaos->draw->vs.aligned_constants;
   machine->immediates = vaos->base.vs->immediates;
   machine->attrib = vaos->attrib;

   vaos->gen_run_elts( machine,
                       elts,
                       count,
                       output_buffer );
}
/* draw_vs_varient::run_linear callback: refresh the per-draw state the
 * generated code reads through the aos_machine, then invoke the
 * generated linear-fetch function.
 *
 * NOTE(review): the 'start'/'count' parameters and the call arguments
 * were hidden in extraction; reconstructed -- confirm.
 */
static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
                                        unsigned start,
                                        unsigned count,
                                        void *output_buffer )
{
   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
   struct aos_machine *machine = vaos->draw->vs.aos_machine;

   /* State that can change between draws: */
   machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
   machine->constants = vaos->draw->vs.aligned_constants;
   machine->immediates = vaos->base.vs->immediates;
   machine->attrib = vaos->attrib;

   vaos->gen_run_linear( machine,
                         start,
                         count,
                         output_buffer );
}
/* draw_vs_varient::destroy callback: release the attrib array, both
 * generated functions, and the varient itself.
 */
static void vaos_destroy( struct draw_vs_varient *varient )
{
   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;

   FREE( vaos->attrib );

   x86_release_func( &vaos->func[0] );
   x86_release_func( &vaos->func[1] );

   FREE( vaos );   /* NOTE(review): tail hidden in extraction -- presumably frees the varient; confirm */
}
/* Allocate a draw_vs_varient_aos_sse, wire up its callbacks, and
 * compile both the linear and indexed generated functions.  Returns
 * NULL on any failure, releasing everything acquired so far.
 *
 * NOTE(review): the allocation checks, goto-fail statements, return
 * and parts of the fail path were hidden in extraction and have been
 * reconstructed -- verify against upstream Mesa.
 */
static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
                                                const struct draw_vs_varient_key *key )
{
   struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);

   if (!vaos)
      goto fail;

   vaos->base.key = *key;

   vaos->base.set_input = vaos_set_buffer;
   vaos->base.destroy = vaos_destroy;
   vaos->base.run_linear = vaos_run_linear;
   vaos->base.run_elts = vaos_run_elts;

   vaos->draw = vs->draw;

   vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) );
   if (!vaos->attrib)
      goto fail;

   tgsi_dump(vs->state.tokens, 0);

   /* func[0] = linear fetch, func[1] = indexed fetch: */
   if (!build_vertex_program( vaos, TRUE ))
      goto fail;

   if (!build_vertex_program( vaos, FALSE ))
      goto fail;

   vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
   if (!vaos->gen_run_linear)
      goto fail;

   vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
   if (!vaos->gen_run_elts)
      goto fail;

   return &vaos->base;

 fail:
   if (vaos && vaos->attrib)
      FREE(vaos->attrib);

   if (vaos)
      x86_release_func( &vaos->func[0] );

   if (vaos)
      x86_release_func( &vaos->func[1] );

   FREE(vaos);
   return NULL;
}
2106 struct draw_vs_varient
*draw_vs_varient_aos_sse( struct draw_vertex_shader
*vs
,
2107 const struct draw_vs_varient_key
*key
)
2109 struct draw_vs_varient
*varient
= varient_aos_sse( vs
, key
);
2111 if (varient
== NULL
) {
2113 varient
= draw_vs_varient_generic( vs
, key
);