2 * Mesa 3-D graphics library
5 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
27 * using the rtasm runtime assembler. Based on the old
28 * t_vb_arb_program_sse.c
32 #include "pipe/p_util.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "tgsi/util/tgsi_parse.h"
35 #include "tgsi/util/tgsi_util.h"
36 #include "tgsi/exec/tgsi_exec.h"
37 #include "tgsi/util/tgsi_dump.h"
40 #include "draw_vs_aos.h"
42 #include "rtasm/rtasm_x86sse.h"
/* Debug names for the TGSI register files, indexed by TGSI_FILE_*.
 * Used only for debug_printf output (see spill()).
 * NOTE(review): the initializer list (original lines 48-59) is missing
 * from this extract — confirm against the full file.
 */
47 static const char *files
[] =
/* Compare two x86_reg descriptors for equality.
 * NOTE(review): only the .file comparison is visible here; the remaining
 * field comparisons and the closing of the expression (original lines
 * 61-68) are missing from this extract.
 */
60 static INLINE boolean
eq( struct x86_reg a
,
63 return (a
.file
== b
.file
&&
/* Return an x86 register caching a machine-struct pointer (immediates,
 * constants or attribs), loading it from machine_EDX+offset on first use.
 * which_reg selects which of the cached x86 regs to use ("quick hack").
 * NOTE(review): the switch skeleton around the three assert/offset pairs
 * and the final return (interior original lines) are missing from this
 * extract — do not assume control flow beyond what is shown.
 */
69 struct x86_reg
aos_get_x86( struct aos_compilation
*cp
,
70 unsigned which_reg
, /* quick hack */
80 if (cp
->x86_reg
[which_reg
] != value
) {
85 assert(which_reg
== 0);
86 offset
= Offset(struct aos_machine
, immediates
);
89 assert(which_reg
== 1);
90 offset
= Offset(struct aos_machine
, constants
);
93 assert(which_reg
== 0);
94 offset
= Offset(struct aos_machine
, attrib
);
102 x86_mov(cp
->func
, reg
,
103 x86_make_disp(cp
->machine_EDX
, offset
));
105 cp
->x86_reg
[which_reg
] = value
;
/* Build a base+displacement x86_reg pointing at the in-memory copy of a
 * shader register (input/output/temp/internal live in the aos_machine
 * struct addressed by machine_EDX; immediates and constants are reached
 * via pointers cached by aos_get_x86()).
 * NOTE(review): the `switch (file)` header line is missing from this
 * extract; the case labels below imply it.
 */
112 static struct x86_reg
get_reg_ptr(struct aos_compilation
*cp
,
116 struct x86_reg ptr
= cp
->machine_EDX
;
119 case TGSI_FILE_INPUT
:
120 return x86_make_disp(ptr
, Offset(struct aos_machine
, input
[idx
]));
122 case TGSI_FILE_OUTPUT
:
123 return x86_make_disp(ptr
, Offset(struct aos_machine
, output
[idx
]));
125 case TGSI_FILE_TEMPORARY
:
126 return x86_make_disp(ptr
, Offset(struct aos_machine
, temp
[idx
]));
128 case AOS_FILE_INTERNAL
:
129 return x86_make_disp(ptr
, Offset(struct aos_machine
, internal
[idx
]));
131 case TGSI_FILE_IMMEDIATE
:
132 return x86_make_disp(aos_get_x86(cp
, 0, X86_IMMEDIATES
), idx
* 4 * sizeof(float));
134 case TGSI_FILE_CONSTANT
:
135 return x86_make_disp(aos_get_x86(cp
, 1, X86_CONSTANTS
), idx
* 4 * sizeof(float));
138 ERROR(cp
, "unknown reg file");
139 return x86_make_reg(0,0);
/* x87 FPU control-word bit layout (see Intel SDM, FLDCW/FNSTCW):
 * bits 0-5 are exception masks, bits 8-9 select precision,
 * bits 10-11 select the rounding mode, bit 12 is the legacy
 * infinity-control bit.
 */
145 #define X87_CW_EXCEPTION_INV_OP (1<<0)
146 #define X87_CW_EXCEPTION_DENORM_OP (1<<1)
147 #define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2)
148 #define X87_CW_EXCEPTION_OVERFLOW (1<<3)
149 #define X87_CW_EXCEPTION_UNDERFLOW (1<<4)
150 #define X87_CW_EXCEPTION_PRECISION (1<<5)
151 #define X87_CW_PRECISION_SINGLE (0<<8)
152 #define X87_CW_PRECISION_RESERVED (1<<8)
153 #define X87_CW_PRECISION_DOUBLE (2<<8)
154 #define X87_CW_PRECISION_DOUBLE_EXT (3<<8)
155 #define X87_CW_PRECISION_MASK (3<<8)
156 #define X87_CW_ROUND_NEAREST (0<<10)
157 #define X87_CW_ROUND_DOWN (1<<10)
158 #define X87_CW_ROUND_UP (2<<10)
159 #define X87_CW_ROUND_ZERO (3<<10)
160 #define X87_CW_ROUND_MASK (3<<10)
161 #define X87_CW_INFINITY (1<<12)
/* Emit code writing the dirty xmm register `idx` back to its in-memory
 * shader-register slot, then mark it clean.  Only INPUT/OUTPUT/TEMPORARY
 * files are legal to spill (inputs are fetched into xmm & set dirty).
 * NOTE(review): the early-return/else structure after ERROR() and the
 * remaining debug_printf arguments are missing from this extract.
 */
166 static void spill( struct aos_compilation
*cp
, unsigned idx
)
168 if (!cp
->xmm
[idx
].dirty
||
169 (cp
->xmm
[idx
].file
!= TGSI_FILE_INPUT
&& /* inputs are fetched into xmm & set dirty */
170 cp
->xmm
[idx
].file
!= TGSI_FILE_OUTPUT
&&
171 cp
->xmm
[idx
].file
!= TGSI_FILE_TEMPORARY
)) {
172 ERROR(cp
, "invalid spill");
176 struct x86_reg oldval
= get_reg_ptr(cp
,
180 if (0) debug_printf("\nspill %s[%d]",
181 files
[cp
->xmm
[idx
].file
],
184 assert(cp
->xmm
[idx
].dirty
);
185 sse_movaps(cp
->func
, oldval
, x86_make_reg(file_XMM
, idx
));
186 cp
->xmm
[idx
].dirty
= 0;
/* Return an xmm register holding `reg` that is safe to clobber: if `reg`
 * is not already a free (TGSI_FILE_NULL) xmm reg, copy it into a freshly
 * allocated one.  Touches last_used so the reg survives this insn.
 * NOTE(review): both return statements (tmp / reg) are missing from this
 * extract.
 */
191 static struct x86_reg
get_xmm_writable( struct aos_compilation
*cp
,
194 if (reg
.file
!= file_XMM
||
195 cp
->xmm
[reg
.idx
].file
!= TGSI_FILE_NULL
)
197 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
198 sse_movaps(cp
->func
, tmp
, reg
);
202 cp
->xmm
[reg
.idx
].last_used
= cp
->insn_counter
;
/* Return an xmm register holding `reg`: if `reg` is a memory reference,
 * load it into a freshly allocated xmm reg (read-only use — contrast
 * with get_xmm_writable above).
 * NOTE(review): the return statements are missing from this extract.
 */
206 static struct x86_reg
get_xmm( struct aos_compilation
*cp
,
209 if (reg
.file
!= file_XMM
)
211 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
212 sse_movaps(cp
->func
, tmp
, reg
);
216 cp
->xmm
[reg
.idx
].last_used
= cp
->insn_counter
;
/* Allocate an empty xmm register, either as a temporary or later to
 * "adopt" as a shader reg.  First tries to find a free (TGSI_FILE_NULL)
 * reg not used by the current instruction; otherwise evicts the
 * least-recently-used reg, spilling it first if dirty.
 * NOTE(review): the `found` bookkeeping, the `oldest` initialization and
 * the spill call are missing from this extract (interior lines dropped).
 */
221 /* Allocate an empty xmm register, either as a temporary or later to
222 * "adopt" as a shader reg.
224 struct x86_reg
aos_get_xmm_reg( struct aos_compilation
*cp
)
228 boolean found
= FALSE
;
230 for (i
= 0; i
< 8; i
++)
231 if (cp
->xmm
[i
].last_used
!= cp
->insn_counter
&&
232 cp
->xmm
[i
].file
== TGSI_FILE_NULL
) {
238 for (i
= 0; i
< 8; i
++)
239 if (cp
->xmm
[i
].last_used
< cp
->xmm
[oldest
].last_used
)
243 /* Need to write out the old value?
245 if (cp
->xmm
[oldest
].dirty
)
248 assert(cp
->xmm
[oldest
].last_used
!= cp
->insn_counter
);
250 cp
->xmm
[oldest
].file
= TGSI_FILE_NULL
;
251 cp
->xmm
[oldest
].idx
= 0;
252 cp
->xmm
[oldest
].dirty
= 0;
253 cp
->xmm
[oldest
].last_used
= cp
->insn_counter
;
254 return x86_make_reg(file_XMM
, oldest
);
257 void aos_release_xmm_reg( struct aos_compilation
*cp
,
260 cp
->xmm
[idx
].file
= TGSI_FILE_NULL
;
261 cp
->xmm
[idx
].idx
= 0;
262 cp
->xmm
[idx
].dirty
= 0;
263 cp
->xmm
[idx
].last_used
= 0;
/* Mark an xmm reg as holding the current copy of shader reg (file,idx).
 * Any other xmm reg claiming the same shader reg is released first, its
 * dirty bit OR-ed into the new holder's so a pending writeback is not
 * lost.
 * NOTE(review): the remaining signature parameters (file, idx, dirty),
 * the error path after the `reg.file != file_XMM` check, and the loop's
 * brace/break structure are missing from this extract.
 */
269 /* Mark an xmm reg as holding the current copy of a shader reg.
271 void aos_adopt_xmm_reg( struct aos_compilation
*cp
,
279 if (reg
.file
!= file_XMM
) {
285 /* If any xmm reg thinks it holds this shader reg, break the
288 for (i
= 0; i
< 8; i
++) {
289 if (cp
->xmm
[i
].file
== file
&&
290 cp
->xmm
[i
].idx
== idx
)
292 /* If an xmm reg is already holding this shader reg, take into account its
295 dirty
|= cp
->xmm
[i
].dirty
;
296 aos_release_xmm_reg(cp
, i
);
300 cp
->xmm
[reg
.idx
].file
= file
;
301 cp
->xmm
[reg
.idx
].idx
= idx
;
302 cp
->xmm
[reg
.idx
].dirty
= dirty
;
303 cp
->xmm
[reg
.idx
].last_used
= cp
->insn_counter
;
/* Return a pointer to the in-memory copy of shader reg (file,idx),
 * making sure it is up to date — any xmm reg holding a dirty copy is
 * flushed first.
 * NOTE(review): the remaining signature parameters, the dirty check and
 * the spill() call inside the loop are missing from this extract.
 */
307 /* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
309 static struct x86_reg
aos_get_shader_reg_ptr( struct aos_compilation
*cp
,
315 /* Ensure the in-memory copy of this reg is up-to-date
317 for (i
= 0; i
< 8; i
++) {
318 if (cp
->xmm
[i
].file
== file
&&
319 cp
->xmm
[i
].idx
== idx
&&
325 return get_reg_ptr( cp
, file
, idx
);
/* Destination variant of aos_get_shader_reg_ptr: flush any dirty xmm
 * copy of the dst reg to memory, then RELEASE the xmm copy entirely
 * (it is about to be overwritten through the pointer), and return a
 * pointer to the in-memory slot.  May alias pointers from get_arg_ptr().
 * NOTE(review): the spill() call between the dirty check and the
 * release is missing from this extract.
 */
329 /* As above, but return a pointer. Note - this pointer may alias
330 * those returned by get_arg_ptr().
332 static struct x86_reg
get_dst_ptr( struct aos_compilation
*cp
,
333 const struct tgsi_full_dst_register
*dst
)
335 unsigned file
= dst
->DstRegister
.File
;
336 unsigned idx
= dst
->DstRegister
.Index
;
340 /* Ensure in-memory copy of this reg is up-to-date and invalidate
343 for (i
= 0; i
< 8; i
++) {
344 if (cp
->xmm
[i
].file
== file
&&
345 cp
->xmm
[i
].idx
== idx
)
347 if (cp
->xmm
[i
].dirty
)
350 aos_release_xmm_reg(cp
, i
);
354 return get_reg_ptr( cp
, file
, idx
);
361 /* Return an XMM reg if the argument is resident, otherwise return a
362 * base+offset pointer to the saved value.
364 struct x86_reg
aos_get_shader_reg( struct aos_compilation
*cp
,
370 for (i
= 0; i
< 8; i
++) {
371 if (cp
->xmm
[i
].file
== file
&&
372 cp
->xmm
[i
].idx
== idx
)
374 cp
->xmm
[i
].last_used
= cp
->insn_counter
;
375 return x86_make_reg(file_XMM
, i
);
379 /* If not found in the XMM register file, return an indirect
380 * reference to the in-memory copy:
382 return get_reg_ptr( cp
, file
, idx
);
387 static struct x86_reg
aos_get_shader_reg_xmm( struct aos_compilation
*cp
,
391 struct x86_reg reg
= get_xmm( cp
,
392 aos_get_shader_reg( cp
, file
, idx
) );
394 aos_adopt_xmm_reg( cp
,
405 struct x86_reg
aos_get_internal_xmm( struct aos_compilation
*cp
,
408 return aos_get_shader_reg_xmm( cp
, AOS_FILE_INTERNAL
, imm
);
412 struct x86_reg
aos_get_internal( struct aos_compilation
*cp
,
415 return aos_get_shader_reg( cp
, AOS_FILE_INTERNAL
, imm
);
/* Emulate the SSE2 pshufd instruction with plain SSE (movaps + shufps)
 * when SSE2 is unavailable.
 * NOTE(review): the branch structure (presumably `if (cp->have_sse2)` /
 * `else`, cf. emit_RCP below) is missing from this extract — only the
 * two alternative instruction sequences are visible.
 */
422 /* Emulate pshufd insn in regular SSE, if necessary:
424 static void emit_pshufd( struct aos_compilation
*cp
,
430 sse2_pshufd(cp
->func
, dst
, arg0
, shuf
);
434 sse_movaps(cp
->func
, dst
, arg0
);
436 sse_shufps(cp
->func
, dst
, dst
, shuf
);
/* Merge `result` into `dst` under a 4-bit writemask: build a per-channel
 * select mask from the IMM_SWZ internal constant via pshufd, then blend
 * with andps/andnps/orps.
 * NOTE(review): the parameter list (dst, mask), the two middle SHUF
 * arguments for mask bits 2 and 4, and the return are missing from this
 * extract.
 */
440 /* load masks (pack into negs??)
441 * pshufd - shuffle according to writemask
446 static boolean
mask_write( struct aos_compilation
*cp
,
448 struct x86_reg result
,
451 struct x86_reg imm_swz
= aos_get_internal_xmm(cp
, IMM_SWZ
);
452 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
454 emit_pshufd(cp
, tmp
, imm_swz
,
455 SHUF((mask
& 1) ? 2 : 3,
458 (mask
& 8) ? 2 : 3));
460 sse_andps(cp
->func
, dst
, tmp
);
461 sse_andnps(cp
->func
, tmp
, result
);
462 sse_orps(cp
->func
, dst
, tmp
);
464 aos_release_xmm_reg(cp
, tmp
.idx
);
/* Helper for writemask emulation: interleave channels of arg0 and arg1
 * into dst under shuffle `shuf` (shuffle both args, combine the halves
 * with shufps, shuffle back).
 * NOTE(review): the parameter list (dst, arg0, arg1, shuf — names taken
 * from the body and from the call sites in emit_DST) and the return are
 * missing from this extract.
 */
471 /* Helper for writemask:
473 static boolean
emit_shuf_copy2( struct aos_compilation
*cp
,
479 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
481 emit_pshufd(cp
, dst
, arg1
, shuf
);
482 emit_pshufd(cp
, tmp
, arg0
, shuf
);
483 sse_shufps(cp
->func
, dst
, tmp
, SHUF(X
, Y
, Z
, W
));
484 emit_pshufd(cp
, dst
, dst
, shuf
);
486 aos_release_xmm_reg(cp
, tmp
.idx
);
/* Identity shuffle encoding (x,y,z,w in place) — used by fetch_src to
 * detect when no pshufd is needed.
 */
492 #define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
495 /* Locate a source register and perform any required (simple) swizzle.
497 * Just fail on complex swizzles at this point.
499 static struct x86_reg
fetch_src( struct aos_compilation
*cp
,
500 const struct tgsi_full_src_register
*src
)
502 struct x86_reg arg0
= aos_get_shader_reg(cp
,
503 src
->SrcRegister
.File
,
504 src
->SrcRegister
.Index
);
510 for (i
= 0; i
< 4; i
++) {
511 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( src
, i
);
512 unsigned neg
= tgsi_util_get_full_src_register_sign_mode( src
, i
);
515 case TGSI_EXTSWIZZLE_ZERO
:
516 case TGSI_EXTSWIZZLE_ONE
:
517 ERROR(cp
, "not supporting full swizzles yet in tgsi_aos_sse2");
521 swz
|= (swizzle
& 0x3) << (i
* 2);
526 case TGSI_UTIL_SIGN_TOGGLE
:
530 case TGSI_UTIL_SIGN_KEEP
:
533 case TGSI_UTIL_SIGN_CLEAR
:
538 ERROR(cp
, "unsupported sign-mode");
543 if (swz
!= SSE_SWIZZLE_NOOP
|| negs
!= 0 || abs
!= 0) {
544 struct x86_reg dst
= aos_get_xmm_reg(cp
);
546 if (swz
!= SSE_SWIZZLE_NOOP
)
547 emit_pshufd(cp
, dst
, arg0
, swz
);
549 sse_movaps(cp
->func
, dst
, arg0
);
551 if (negs
&& negs
!= 0xf) {
552 struct x86_reg imm_swz
= aos_get_internal_xmm(cp
, IMM_SWZ
);
553 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
556 * Use neg as arg to pshufd
559 emit_pshufd(cp
, tmp
, imm_swz
,
560 SHUF((negs
& 1) ? 1 : 0,
563 (negs
& 8) ? 1 : 0));
564 sse_mulps(cp
->func
, dst
, tmp
);
566 aos_release_xmm_reg(cp
, tmp
.idx
);
569 struct x86_reg imm_negs
= aos_get_internal_xmm(cp
, IMM_NEGS
);
570 sse_mulps(cp
->func
, dst
, imm_negs
);
574 if (abs
&& abs
!= 0xf) {
575 ERROR(cp
, "unsupported partial abs");
578 struct x86_reg neg
= aos_get_internal(cp
, IMM_NEGS
);
579 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
581 sse_movaps(cp
->func
, tmp
, dst
);
582 sse_mulps(cp
->func
, tmp
, neg
);
583 sse_maxps(cp
->func
, dst
, tmp
);
585 aos_release_xmm_reg(cp
, tmp
.idx
);
/* Push one channel of a TGSI source operand onto the x87 stack, applying
 * the extended swizzle (ZERO/ONE or a plain channel select) and the sign
 * mode (negate via fchs, absolute via fabs, set-negative via fabs+fchs).
 * NOTE(review): the `channel` parameter, the two switch headers
 * (swizzle, sign-mode) and the break statements are missing from this
 * extract — only the case bodies are visible.
 */
594 static void x87_fld_src( struct aos_compilation
*cp
,
595 const struct tgsi_full_src_register
*src
,
598 struct x86_reg arg0
= aos_get_shader_reg_ptr(cp
,
599 src
->SrcRegister
.File
,
600 src
->SrcRegister
.Index
);
602 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( src
, channel
);
603 unsigned neg
= tgsi_util_get_full_src_register_sign_mode( src
, channel
);
606 case TGSI_EXTSWIZZLE_ZERO
:
607 x87_fldz( cp
->func
);
610 case TGSI_EXTSWIZZLE_ONE
:
611 x87_fld1( cp
->func
);
615 x87_fld( cp
->func
, x86_make_disp(arg0
, (swizzle
& 3) * sizeof(float)) );
621 case TGSI_UTIL_SIGN_TOGGLE
:
624 x87_fchs( cp
->func
);
627 case TGSI_UTIL_SIGN_KEEP
:
630 case TGSI_UTIL_SIGN_CLEAR
:
631 x87_fabs( cp
->func
);
634 case TGSI_UTIL_SIGN_SET
:
635 x87_fabs( cp
->func
);
636 x87_fchs( cp
->func
);
640 ERROR(cp
, "unsupported sign-mode");
650 /* Used to implement write masking. This and most of the other instructions
651 * here would be easier to implement if there had been a translation
652 * to a 2 argument format (dst/arg0, arg1) at the shader level before
653 * attempting to translate to x86/sse code.
655 static void store_dest( struct aos_compilation
*cp
,
656 const struct tgsi_full_dst_register
*reg
,
657 struct x86_reg result
)
661 switch (reg
->DstRegister
.WriteMask
) {
665 case TGSI_WRITEMASK_XYZW
:
666 aos_adopt_xmm_reg(cp
,
667 get_xmm_writable(cp
, result
),
668 reg
->DstRegister
.File
,
669 reg
->DstRegister
.Index
,
676 dst
= aos_get_shader_reg_xmm(cp
,
677 reg
->DstRegister
.File
,
678 reg
->DstRegister
.Index
);
680 switch (reg
->DstRegister
.WriteMask
) {
681 case TGSI_WRITEMASK_X
:
682 sse_movss(cp
->func
, dst
, get_xmm(cp
, result
));
685 case TGSI_WRITEMASK_ZW
:
686 sse_shufps(cp
->func
, dst
, get_xmm(cp
, result
), SHUF(X
, Y
, Z
, W
));
689 case TGSI_WRITEMASK_XY
:
690 result
= get_xmm_writable(cp
, result
);
691 sse_shufps(cp
->func
, result
, dst
, SHUF(X
, Y
, Z
, W
));
695 case TGSI_WRITEMASK_YZW
:
696 result
= get_xmm_writable(cp
, result
);
697 sse_movss(cp
->func
, result
, dst
);
702 mask_write(cp
, dst
, result
, reg
->DstRegister
.WriteMask
);
706 aos_adopt_xmm_reg(cp
,
708 reg
->DstRegister
.File
,
709 reg
->DstRegister
.Index
,
714 static void inject_scalar( struct aos_compilation
*cp
,
716 struct x86_reg result
,
719 sse_shufps(cp
->func
, dst
, dst
, swizzle
);
720 sse_movss(cp
->func
, dst
, result
);
721 sse_shufps(cp
->func
, dst
, dst
, swizzle
);
725 static void store_scalar_dest( struct aos_compilation
*cp
,
726 const struct tgsi_full_dst_register
*reg
,
727 struct x86_reg result
)
729 unsigned writemask
= reg
->DstRegister
.WriteMask
;
732 if (writemask
!= TGSI_WRITEMASK_X
&&
733 writemask
!= TGSI_WRITEMASK_Y
&&
734 writemask
!= TGSI_WRITEMASK_Z
&&
735 writemask
!= TGSI_WRITEMASK_W
&&
738 result
= get_xmm_writable(cp
, result
); /* already true, right? */
739 sse_shufps(cp
->func
, result
, result
, SHUF(X
,X
,X
,X
));
740 store_dest(cp
, reg
, result
);
744 result
= get_xmm(cp
, result
);
745 dst
= aos_get_shader_reg_xmm(cp
,
746 reg
->DstRegister
.File
,
747 reg
->DstRegister
.Index
);
751 switch (reg
->DstRegister
.WriteMask
) {
752 case TGSI_WRITEMASK_X
:
753 sse_movss(cp
->func
, dst
, result
);
756 case TGSI_WRITEMASK_Y
:
757 inject_scalar(cp
, dst
, result
, SHUF(Y
, X
, Z
, W
));
760 case TGSI_WRITEMASK_Z
:
761 inject_scalar(cp
, dst
, result
, SHUF(Z
, Y
, X
, W
));
764 case TGSI_WRITEMASK_W
:
765 inject_scalar(cp
, dst
, result
, SHUF(W
, Y
, Z
, X
));
772 aos_adopt_xmm_reg(cp
,
774 reg
->DstRegister
.File
,
775 reg
->DstRegister
.Index
,
781 static void x87_fst_or_nop( struct x86_function
*func
,
786 assert(ptr
.file
== file_REG32
);
787 if (writemask
& (1<<channel
))
788 x87_fst( func
, x86_make_disp(ptr
, channel
* sizeof(float)) );
791 static void x87_fstp_or_pop( struct x86_function
*func
,
796 assert(ptr
.file
== file_REG32
);
797 if (writemask
& (1<<channel
))
798 x87_fstp( func
, x86_make_disp(ptr
, channel
* sizeof(float)) );
800 x87_fstp( func
, x86_make_reg( file_x87
, 0 ));
807 static void x87_fstp_dest4( struct aos_compilation
*cp
,
808 const struct tgsi_full_dst_register
*dst
)
810 struct x86_reg ptr
= get_dst_ptr(cp
, dst
);
811 unsigned writemask
= dst
->DstRegister
.WriteMask
;
813 x87_fst_or_nop(cp
->func
, writemask
, 0, ptr
);
814 x87_fst_or_nop(cp
->func
, writemask
, 1, ptr
);
815 x87_fst_or_nop(cp
->func
, writemask
, 2, ptr
);
816 x87_fstp_or_pop(cp
->func
, writemask
, 3, ptr
);
819 /* Save current x87 state and put it into single precision mode.
821 static void save_fpu_state( struct aos_compilation
*cp
)
823 x87_fnstcw( cp
->func
, x86_make_disp(cp
->machine_EDX
,
824 Offset(struct aos_machine
, fpu_restore
)));
827 static void restore_fpu_state( struct aos_compilation
*cp
)
829 x87_fnclex(cp
->func
);
830 x87_fldcw( cp
->func
, x86_make_disp(cp
->machine_EDX
,
831 Offset(struct aos_machine
, fpu_restore
)));
834 static void set_fpu_round_neg_inf( struct aos_compilation
*cp
)
836 if (cp
->fpucntl
!= FPU_RND_NEG
) {
837 cp
->fpucntl
= FPU_RND_NEG
;
838 x87_fnclex(cp
->func
);
839 x87_fldcw( cp
->func
, x86_make_disp(cp
->machine_EDX
,
840 Offset(struct aos_machine
, fpu_rnd_neg_inf
)));
844 static void set_fpu_round_nearest( struct aos_compilation
*cp
)
846 if (cp
->fpucntl
!= FPU_RND_NEAREST
) {
847 cp
->fpucntl
= FPU_RND_NEAREST
;
848 x87_fnclex(cp
->func
);
849 x87_fldcw( cp
->func
, x86_make_disp(cp
->machine_EDX
,
850 Offset(struct aos_machine
, fpu_rnd_nearest
)));
855 static void x87_emit_ex2( struct aos_compilation
*cp
)
857 struct x86_reg st0
= x86_make_reg(file_x87
, 0);
858 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
859 int stack
= cp
->func
->x87_stack
;
861 // set_fpu_round_neg_inf( cp );
863 x87_fld(cp
->func
, st0
); /* a a */
864 x87_fprndint( cp
->func
); /* int(a) a*/
865 x87_fsubr(cp
->func
, st1
, st0
); /* int(a) frc(a) */
866 x87_fxch(cp
->func
, st1
); /* frc(a) int(a) */
867 x87_f2xm1(cp
->func
); /* (2^frc(a))-1 int(a) */
868 x87_fld1(cp
->func
); /* 1 (2^frc(a))-1 int(a) */
869 x87_faddp(cp
->func
, st1
); /* 2^frac(a) int(a) */
870 x87_fscale(cp
->func
); /* (2^frac(a)*2^int(int(a))) int(a) */
872 x87_fstp(cp
->func
, st1
); /* 2^a */
874 assert( stack
== cp
->func
->x87_stack
);
878 static void PIPE_CDECL
print_reg( const char *msg
,
881 debug_printf("%s: %f %f %f %f\n", msg
, reg
[0], reg
[1], reg
[2], reg
[3]);
884 static void emit_print( struct aos_compilation
*cp
,
885 const char *message
, /* must point to a static string! */
889 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
890 struct x86_reg arg
= aos_get_shader_reg_ptr( cp
, file
, idx
);
893 /* There shouldn't be anything on the x87 stack. Can add this
894 * capacity later if need be.
896 assert(cp
->func
->x87_stack
== 0);
898 /* For absolute correctness, need to spill/invalidate all XMM regs
899 * too. We're obviously not concerned about performance on this
900 * debug path, so here goes:
902 for (i
= 0; i
< 8; i
++) {
903 if (cp
->xmm
[i
].dirty
)
906 aos_release_xmm_reg(cp
, i
);
909 /* Push caller-save (ie scratch) regs.
911 x86_cdecl_caller_push_regs( cp
->func
);
914 /* Push the arguments:
916 x86_lea( cp
->func
, ecx
, arg
);
917 x86_push( cp
->func
, ecx
);
918 x86_push_imm32( cp
->func
, (int)message
);
920 /* Call the helper. Could call debug_printf directly, but
921 * print_reg is a nice place to put a breakpoint if need be.
923 x86_mov_reg_imm( cp
->func
, ecx
, (int)print_reg
);
924 x86_call( cp
->func
, ecx
);
925 x86_pop( cp
->func
, ecx
);
926 x86_pop( cp
->func
, ecx
);
928 /* Pop caller-save regs
930 x86_cdecl_caller_pop_regs( cp
->func
);
937 * The traditional instructions. All operate on internal registers
938 * and ignore write masks and swizzling issues.
941 static boolean
emit_ABS( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
943 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
944 struct x86_reg neg
= aos_get_internal(cp
, IMM_NEGS
);
945 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
947 sse_movaps(cp
->func
, tmp
, arg0
);
948 sse_mulps(cp
->func
, tmp
, neg
);
949 sse_maxps(cp
->func
, tmp
, arg0
);
951 store_dest(cp
, &op
->FullDstRegisters
[0], tmp
);
955 static boolean
emit_ADD( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
957 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
958 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
959 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
961 sse_addps(cp
->func
, dst
, arg1
);
963 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
967 static boolean
emit_COS( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
969 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0);
971 x87_fstp_dest4(cp
, &op
->FullDstRegisters
[0]);
975 /* The dotproduct instructions don't really do that well in sse:
976 * XXX: produces wrong results -- disabled.
978 static boolean
emit_DP3( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
980 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
981 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
982 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
983 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
985 sse_mulps(cp
->func
, dst
, arg1
);
986 /* Now the hard bit: sum the first 3 values:
988 sse_movhlps(cp
->func
, tmp
, dst
);
989 sse_addss(cp
->func
, dst
, tmp
); /* a*x+c*z, b*y, ?, ? */
990 emit_pshufd(cp
, tmp
, dst
, SHUF(Y
,X
,W
,Z
));
991 sse_addss(cp
->func
, dst
, tmp
);
993 aos_release_xmm_reg(cp
, tmp
.idx
);
994 store_scalar_dest(cp
, &op
->FullDstRegisters
[0], dst
);
998 static boolean
emit_DP4( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1000 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1001 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1002 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
1003 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1005 sse_mulps(cp
->func
, dst
, arg1
);
1007 /* Now the hard bit: sum the values:
1009 sse_movhlps(cp
->func
, tmp
, dst
);
1010 sse_addps(cp
->func
, dst
, tmp
); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
1011 emit_pshufd(cp
, tmp
, dst
, SHUF(Y
,X
,W
,Z
));
1012 sse_addss(cp
->func
, dst
, tmp
);
1014 aos_release_xmm_reg(cp
, tmp
.idx
);
1015 store_scalar_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1019 static boolean
emit_DPH( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1021 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1022 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1023 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
1024 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1026 sse_mulps(cp
->func
, dst
, arg1
);
1028 /* Now the hard bit: sum the values (from DP3):
1030 sse_movhlps(cp
->func
, tmp
, dst
);
1031 sse_addss(cp
->func
, dst
, tmp
); /* a*x+c*z, b*y, ?, ? */
1032 emit_pshufd(cp
, tmp
, dst
, SHUF(Y
,X
,W
,Z
));
1033 sse_addss(cp
->func
, dst
, tmp
);
1034 emit_pshufd(cp
, tmp
, arg1
, SHUF(W
,W
,W
,W
));
1035 sse_addss(cp
->func
, dst
, tmp
);
1037 aos_release_xmm_reg(cp
, tmp
.idx
);
1038 store_scalar_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1042 static boolean
emit_DST( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1044 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1045 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1046 struct x86_reg dst
= aos_get_xmm_reg(cp
);
1047 struct x86_reg tmp
= aos_get_xmm_reg(cp
);
1048 struct x86_reg ones
= aos_get_internal(cp
, IMM_ONES
);
1050 /* dst[0] = 1.0 * 1.0F; */
1051 /* dst[1] = arg0[1] * arg1[1]; */
1052 /* dst[2] = arg0[2] * 1.0; */
1053 /* dst[3] = 1.0 * arg1[3]; */
1055 emit_shuf_copy2(cp
, dst
, arg0
, ones
, SHUF(X
,W
,Z
,Y
));
1056 emit_shuf_copy2(cp
, tmp
, arg1
, ones
, SHUF(X
,Z
,Y
,W
));
1057 sse_mulps(cp
->func
, dst
, tmp
);
1059 aos_release_xmm_reg(cp
, tmp
.idx
);
1060 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1064 static boolean
emit_LG2( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1066 x87_fld1(cp
->func
); /* 1 */
1067 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0); /* a0 1 */
1068 x87_fyl2x(cp
->func
); /* log2(a0) */
1069 x87_fstp_dest4(cp
, &op
->FullDstRegisters
[0]);
1074 static boolean
emit_EX2( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1076 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0);
1078 x87_fstp_dest4(cp
, &op
->FullDstRegisters
[0]);
1083 static boolean
emit_FLR( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1085 struct x86_reg dst
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1086 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1089 set_fpu_round_neg_inf( cp
);
1091 /* Load all sources first to avoid aliasing
1093 for (i
= 3; i
>= 0; i
--) {
1094 if (writemask
& (1<<i
)) {
1095 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], i
);
1099 for (i
= 0; i
< 4; i
++) {
1100 if (writemask
& (1<<i
)) {
1101 x87_fprndint( cp
->func
);
1102 x87_fstp(cp
->func
, x86_make_disp(dst
, i
*4));
1110 static boolean
emit_RND( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1112 struct x86_reg dst
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1113 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1116 set_fpu_round_nearest( cp
);
1118 /* Load all sources first to avoid aliasing
1120 for (i
= 3; i
>= 0; i
--) {
1121 if (writemask
& (1<<i
)) {
1122 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], i
);
1126 for (i
= 0; i
< 4; i
++) {
1127 if (writemask
& (1<<i
)) {
1128 x87_fprndint( cp
->func
);
1129 x87_fstp(cp
->func
, x86_make_disp(dst
, i
*4));
1137 static boolean
emit_FRC( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1139 struct x86_reg dst
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1140 struct x86_reg st0
= x86_make_reg(file_x87
, 0);
1141 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
1142 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1145 set_fpu_round_neg_inf( cp
);
1147 /* suck all the source values onto the stack before writing out any
1148 * dst, which may alias...
1150 for (i
= 3; i
>= 0; i
--) {
1151 if (writemask
& (1<<i
)) {
1152 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], i
);
1156 for (i
= 0; i
< 4; i
++) {
1157 if (writemask
& (1<<i
)) {
1158 x87_fld(cp
->func
, st0
); /* a a */
1159 x87_fprndint( cp
->func
); /* flr(a) a */
1160 x87_fsubp(cp
->func
, st1
); /* frc(a) */
1161 x87_fstp(cp
->func
, x86_make_disp(dst
, i
*4));
1173 static boolean
emit_LIT( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1175 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
1176 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1177 unsigned lit_count
= cp
->lit_count
++;
1178 struct x86_reg result
, arg0
;
1182 /* For absolute correctness, need to spill/invalidate all XMM regs
1185 for (i
= 0; i
< 8; i
++) {
1186 if (cp
->xmm
[i
].dirty
)
1188 aos_release_xmm_reg(cp
, i
);
1192 if (writemask
!= TGSI_WRITEMASK_XYZW
)
1193 result
= x86_make_disp(cp
->machine_EDX
, Offset(struct aos_machine
, tmp
[0]));
1195 result
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1198 arg0
= fetch_src( cp
, &op
->FullSrcRegisters
[0] );
1199 if (arg0
.file
== file_XMM
) {
1200 struct x86_reg tmp
= x86_make_disp(cp
->machine_EDX
,
1201 Offset(struct aos_machine
, tmp
[1]));
1202 sse_movaps( cp
->func
, tmp
, arg0
);
1208 /* Push caller-save (ie scratch) regs.
1210 x86_cdecl_caller_push_regs( cp
->func
);
1212 /* Push the arguments:
1214 x86_push_imm32( cp
->func
, lit_count
);
1216 x86_lea( cp
->func
, ecx
, arg0
);
1217 x86_push( cp
->func
, ecx
);
1219 x86_lea( cp
->func
, ecx
, result
);
1220 x86_push( cp
->func
, ecx
);
1222 x86_push( cp
->func
, cp
->machine_EDX
);
1224 if (lit_count
< MAX_LIT_INFO
) {
1225 x86_mov( cp
->func
, ecx
, x86_make_disp( cp
->machine_EDX
,
1226 Offset(struct aos_machine
, lit_info
) +
1227 lit_count
* sizeof(struct lit_info
) +
1228 Offset(struct lit_info
, func
)));
1231 x86_mov_reg_imm( cp
->func
, ecx
, (int)aos_do_lit
);
1234 x86_call( cp
->func
, ecx
);
1236 x86_pop( cp
->func
, ecx
); /* fixme... */
1237 x86_pop( cp
->func
, ecx
);
1238 x86_pop( cp
->func
, ecx
);
1239 x86_pop( cp
->func
, ecx
);
1241 x86_cdecl_caller_pop_regs( cp
->func
);
1243 if (writemask
!= TGSI_WRITEMASK_XYZW
) {
1245 &op
->FullDstRegisters
[0],
1246 get_xmm_writable( cp
, result
) );
1253 static boolean
emit_inline_LIT( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1255 struct x86_reg dst
= get_dst_ptr(cp
, &op
->FullDstRegisters
[0]);
1256 unsigned writemask
= op
->FullDstRegisters
[0].DstRegister
.WriteMask
;
1258 if (writemask
& TGSI_WRITEMASK_YZ
) {
1259 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
1260 struct x86_reg st2
= x86_make_reg(file_x87
, 2);
1262 /* a1' = a1 <= 0 ? 1 : a1;
1264 x87_fldz(cp
->func
); /* 1 0 */
1266 x87_fld1(cp
->func
); /* 1 0 */
1268 /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
1270 x87_fldz(cp
->func
); /* 1 0 */
1272 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 1); /* a1 1 0 */
1273 x87_fcomi(cp
->func
, st2
); /* a1 1 0 */
1274 x87_fcmovb(cp
->func
, st1
); /* a1' 1 0 */
1275 x87_fstp(cp
->func
, st1
); /* a1' 0 */
1276 x87_fstp(cp
->func
, st1
); /* a1' */
1278 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 3); /* a3 a1' */
1279 x87_fxch(cp
->func
, st1
); /* a1' a3 */
1282 /* Compute pow(a1, a3)
1284 x87_fyl2x(cp
->func
); /* a3*log2(a1) */
1285 x87_emit_ex2( cp
); /* 2^(a3*log2(a1)) */
1288 /* a0' = max2(a0, 0):
1290 x87_fldz(cp
->func
); /* 0 r2 */
1291 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0); /* a0 0 r2 */
1292 x87_fcomi(cp
->func
, st1
);
1293 x87_fcmovb(cp
->func
, st1
); /* a0' 0 r2 */
1295 x87_fst_or_nop(cp
->func
, writemask
, 1, dst
); /* result[1] = a0' */
1297 x87_fcomi(cp
->func
, st1
); /* a0' 0 r2 */
1298 x87_fcmovnbe(cp
->func
, st2
); /* r2' 0' r2 */
1300 x87_fstp_or_pop(cp
->func
, writemask
, 2, dst
); /* 0 r2 */
1301 x87_fpop(cp
->func
); /* r2 */
1305 if (writemask
& TGSI_WRITEMASK_XW
) {
1307 x87_fst_or_nop(cp
->func
, writemask
, 0, dst
);
1308 x87_fstp_or_pop(cp
->func
, writemask
, 3, dst
);
1317 static boolean
emit_MAX( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1319 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1320 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1321 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1323 sse_maxps(cp
->func
, dst
, arg1
);
1325 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1330 static boolean
emit_MIN( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1332 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1333 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1334 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1336 sse_minps(cp
->func
, dst
, arg1
);
1338 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1342 static boolean
emit_MOV( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1344 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1345 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1347 /* potentially nothing to do */
1349 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1353 static boolean
emit_MUL( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1355 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1356 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1357 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1359 sse_mulps(cp
->func
, dst
, arg1
);
1361 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1366 static boolean
emit_MAD( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1368 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1369 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1370 struct x86_reg arg2
= fetch_src(cp
, &op
->FullSrcRegisters
[2]);
1372 /* If we can't clobber old contents of arg0, get a temporary & copy
1373 * it there, then clobber it...
1375 arg0
= get_xmm_writable(cp
, arg0
);
1377 sse_mulps(cp
->func
, arg0
, arg1
);
1378 sse_addps(cp
->func
, arg0
, arg2
);
1379 store_dest(cp
, &op
->FullDstRegisters
[0], arg0
);
/* A wrapper for powf().
 * Makes sure it is cdecl and operates on floats, so the generated
 * code can call it through a plain indirect call with a hand-built
 * float argument frame (see emit_POW below).
 */
static float PIPE_CDECL
_powerf( float x,
         float y )
{
   return powf( x, y );
}
/* Really not sufficient -- need to check for conditions that could
 * generate inf/nan values, which will slow things down hugely.
 */
static boolean emit_POW( struct aos_compilation *cp,
                         const struct tgsi_full_instruction *op )
{
#if 0
   /* Pure x87 path: dst = 2^(src1.x * log2(src0.x)).
    * NOTE(review): the preprocessor guard separating this from the
    * libm-call path below was lost in this copy of the file --
    * reconstructed as #if 0 / #else; confirm against upstream.
    */
   x87_fld_src(cp, &op->FullSrcRegisters[1], 0);  /* a1.x */
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);  /* a0.x a1.x */
   x87_fyl2x(cp->func);                           /* a1*log2(a0) */

   x87_emit_ex2( cp );                            /* 2^(a1*log2(a0)) */

   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
#else
   uint i;

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too, since the callee may clobber them.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
         spill(cp, i);          /* NOTE(review): spill call reconstructed from context */
      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Reserve 8 bytes on the stack for the two cdecl float args. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );

   /* Store y (exponent) at ESP+4, x (base) at ESP+0 -- _powerf(x, y). */
   x87_fld_src( cp, &op->FullSrcRegisters[1], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
   x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* tmp_EAX has been pushed & will be restored below */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
   x86_call( cp->func, cp->tmp_EAX );

   /* Release the argument area. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Note retval on x87 stack:
    */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->FullDstRegisters[0] );
#endif
   return TRUE;
}
/* TGSI RCP opcode: dst = 1 / src0.x (scalar, replicated by
 * store_scalar_dest).
 */
static boolean emit_RCP( struct aos_compilation *cp,
                         const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg dst = aos_get_xmm_reg(cp);

   if (cp->have_sse2) {
      /* Fast approximate reciprocal. */
      sse2_rcpss(cp->func, dst, arg0);
      /* extend precision here...
       */
   }
   else {
      /* No SSE2: compute a full-precision reciprocal as 1.0 / a
       * with a real divide.
       */
      struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
      sse_movss(cp->func, dst, ones);
      sse_divss(cp->func, dst, arg0);
   }

   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
   return TRUE;
}
/* Although rsqrtps() and rcpps() are low precision on some/all SSE
 * implementations, it is possible to improve its precision at
 * fairly low cost, using a newton/raphson step, as below:
 *
 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
 * or:
 * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
 *
 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static boolean emit_RSQ( struct aos_compilation *cp,
                         const struct tgsi_full_instruction *op )
{
   /* NOTE(review): the guard separating the two variants below was
    * lost in this copy; reconstructed as a disabled low-precision
    * path followed by the active Newton-Raphson path -- confirm
    * against upstream.
    */
   if (0) {
      /* Low-precision variant: raw RSQRTSS result. */
      struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
      struct x86_reg r = aos_get_xmm_reg(cp);
      sse_rsqrtss(cp->func, r, arg0);
      store_scalar_dest(cp, &op->FullDstRegisters[0], r);
      return TRUE;
   }
   else {
      struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
      struct x86_reg r = aos_get_xmm_reg(cp);

      /* IMM_RSQ holds the constants {-.5, 1.5}: neg_half at offset 0,
       * one_point_five 4 bytes further on.
       */
      struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
      struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
      struct x86_reg src = get_xmm_writable( cp, arg0 );

      sse_rsqrtss( cp->func, r, src );              /* rsqrtss(a) */
      sse_mulss(   cp->func, src, neg_half );       /* -.5 * a */
      sse_mulss(   cp->func, src, r );              /* -.5 * a * r */
      sse_mulss(   cp->func, src, r );              /* -.5 * a * r * r */
      sse_addss(   cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */
      sse_mulss(   cp->func, r, src );              /* r * (1.5 - .5 * a * r * r) */

      store_scalar_dest(cp, &op->FullDstRegisters[0], r);
      return TRUE;
   }
}
1509 static boolean
emit_SGE( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1511 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1512 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1513 struct x86_reg ones
= aos_get_internal(cp
, IMM_ONES
);
1514 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1516 sse_cmpps(cp
->func
, dst
, arg1
, cc_NotLessThan
);
1517 sse_andps(cp
->func
, dst
, ones
);
1519 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1523 static boolean
emit_SIN( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1525 x87_fld_src(cp
, &op
->FullSrcRegisters
[0], 0);
1527 x87_fstp_dest4(cp
, &op
->FullDstRegisters
[0]);
1533 static boolean
emit_SLT( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1535 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1536 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1537 struct x86_reg ones
= aos_get_internal(cp
, IMM_ONES
);
1538 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1540 sse_cmpps(cp
->func
, dst
, arg1
, cc_LessThan
);
1541 sse_andps(cp
->func
, dst
, ones
);
1543 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
1547 static boolean
emit_SUB( struct aos_compilation
*cp
, const struct tgsi_full_instruction
*op
)
1549 struct x86_reg arg0
= fetch_src(cp
, &op
->FullSrcRegisters
[0]);
1550 struct x86_reg arg1
= fetch_src(cp
, &op
->FullSrcRegisters
[1]);
1551 struct x86_reg dst
= get_xmm_writable(cp
, arg0
);
1553 sse_subps(cp
->func
, dst
, arg1
);
1555 store_dest(cp
, &op
->FullDstRegisters
[0], dst
);
/* TGSI XPD (cross product).  Compute the three products in rotated
 * order, subtract, then rotate the result back into position with a
 * final shuffle.  Instruction order matters: tmp0/tmp1 are live
 * across the whole sequence.
 */
static boolean emit_XPD( struct aos_compilation *cp,
                         const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp0 = aos_get_xmm_reg(cp);
   struct x86_reg tmp1 = aos_get_xmm_reg(cp);

   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));  /* tmp1 = arg1.yzxw */
   sse_mulps(cp->func, tmp1, arg0);                /* tmp1 = arg0 * arg1.yzxw */
   emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));  /* tmp0 = arg0.yzxw */
   sse_mulps(cp->func, tmp0, arg1);                /* tmp0 = arg0.yzxw * arg1 */
   sse_subps(cp->func, tmp1, tmp0);                /* rotated cross product */
   sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W)); /* rotate back */

   /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
   /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
   /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
   /* dst[3] is undef */

   aos_release_xmm_reg(cp, tmp0.idx);
   store_dest(cp, &op->FullDstRegisters[0], tmp1);
   return TRUE;
}
/* Translate one TGSI instruction into SSE/x87 code, dispatching on
 * opcode.  Returns FALSE for opcodes this backend does not implement
 * so the caller can fall back to another path.
 *
 * NOTE(review): the `return FALSE;` lines after the commented-out
 * EXP/LOG/LERP/CLAMP cases and the function's storage-class line were
 * lost in this copy and have been reconstructed -- confirm against
 * upstream.
 */
static boolean
emit_instruction( struct aos_compilation *cp,
                  struct tgsi_full_instruction *inst )
{
   /* Each instruction must leave the x87 stack balanced. */
   x87_assert_stack_empty(cp->func);

   switch( inst->Instruction.Opcode ) {
   case TGSI_OPCODE_MOV:
      return emit_MOV( cp, inst );

   case TGSI_OPCODE_LIT:
      return emit_LIT(cp, inst);

   case TGSI_OPCODE_RCP:
      return emit_RCP(cp, inst);

   case TGSI_OPCODE_RSQ:
      return emit_RSQ(cp, inst);

   case TGSI_OPCODE_EXP:
      /*return emit_EXP(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_LOG:
      /*return emit_LOG(cp, inst);*/
      return FALSE;

   case TGSI_OPCODE_MUL:
      return emit_MUL(cp, inst);

   case TGSI_OPCODE_ADD:
      return emit_ADD(cp, inst);

   case TGSI_OPCODE_DP3:
      return emit_DP3(cp, inst);

   case TGSI_OPCODE_DP4:
      return emit_DP4(cp, inst);

   case TGSI_OPCODE_DST:
      return emit_DST(cp, inst);

   case TGSI_OPCODE_MIN:
      return emit_MIN(cp, inst);

   case TGSI_OPCODE_MAX:
      return emit_MAX(cp, inst);

   case TGSI_OPCODE_SLT:
      return emit_SLT(cp, inst);

   case TGSI_OPCODE_SGE:
      return emit_SGE(cp, inst);

   case TGSI_OPCODE_MAD:
      return emit_MAD(cp, inst);

   case TGSI_OPCODE_SUB:
      return emit_SUB(cp, inst);

   case TGSI_OPCODE_LERP:
      // return emit_LERP(cp, inst);
      return FALSE;

   case TGSI_OPCODE_FRAC:
      return emit_FRC(cp, inst);

   case TGSI_OPCODE_CLAMP:
      // return emit_CLAMP(cp, inst);
      return FALSE;

   case TGSI_OPCODE_FLOOR:
      return emit_FLR(cp, inst);

   case TGSI_OPCODE_ROUND:
      return emit_RND(cp, inst);

   case TGSI_OPCODE_EXPBASE2:
      return emit_EX2(cp, inst);

   case TGSI_OPCODE_LOGBASE2:
      return emit_LG2(cp, inst);

   case TGSI_OPCODE_POWER:
      return emit_POW(cp, inst);

   case TGSI_OPCODE_CROSSPRODUCT:
      return emit_XPD(cp, inst);

   case TGSI_OPCODE_ABS:
      return emit_ABS(cp, inst);

   case TGSI_OPCODE_DPH:
      return emit_DPH(cp, inst);

   case TGSI_OPCODE_COS:
      return emit_COS(cp, inst);

   case TGSI_OPCODE_SIN:
      return emit_SIN(cp, inst);

   case TGSI_OPCODE_END:
      return TRUE;

   default:
      return FALSE;
   }
}
/* Apply the viewport transform to the shader's position output:
 * pos = pos * scale + translate, with scale/translate read from the
 * aos_machine struct addressed through machine_EDX.
 *
 * NOTE(review): the file/idx arguments of aos_get_shader_reg_xmm and
 * the trailing arguments of aos_adopt_xmm_reg were lost in this copy
 * and have been reconstructed (TGSI_FILE_OUTPUT, dirty=TRUE) --
 * confirm against upstream.
 */
static boolean emit_viewport( struct aos_compilation *cp )
{
   struct x86_reg pos = aos_get_shader_reg_xmm(cp,
                                               TGSI_FILE_OUTPUT,
                                               cp->vaos->draw->vs.position_output );

   struct x86_reg scale = x86_make_disp(cp->machine_EDX,
                                        Offset(struct aos_machine, scale));

   struct x86_reg translate = x86_make_disp(cp->machine_EDX,
                                            Offset(struct aos_machine, translate));

   sse_mulps(cp->func, pos, scale);
   sse_addps(cp->func, pos, translate);

   /* Hand the transformed value back to the register allocator as the
    * (dirty) position output.
    */
   aos_adopt_xmm_reg( cp,
                      pos,
                      TGSI_FILE_OUTPUT,
                      cp->vaos->draw->vs.position_output,
                      TRUE );
   return TRUE;
}
/* This is useful to be able to see the results on softpipe.  Doesn't
 * do proper clipping, just assumes the backend can do it during
 * rasterization -- for debug only...
 *
 * Performs the perspective divide (multiply by 1/w) combined with the
 * viewport transform, then forces pos.w back to the reciprocal.
 *
 * NOTE(review): as in emit_viewport, the reconstructed arguments of
 * aos_get_shader_reg_xmm / aos_adopt_xmm_reg should be confirmed
 * against upstream.
 */
static boolean emit_rhw_viewport( struct aos_compilation *cp )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg pos = aos_get_shader_reg_xmm(cp,
                                               TGSI_FILE_OUTPUT,
                                               cp->vaos->draw->vs.position_output);

   struct x86_reg scale = x86_make_disp(cp->machine_EDX,
                                        Offset(struct aos_machine, scale));

   struct x86_reg translate = x86_make_disp(cp->machine_EDX,
                                            Offset(struct aos_machine, translate));

   /* tmp = 1/pos.w, broadcast to all components. */
   emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
   sse2_rcpss(cp->func, tmp, tmp);
   sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));

   sse_mulps(cp->func, pos, scale);
   sse_mulps(cp->func, pos, tmp);      /* perspective divide */
   sse_addps(cp->func, pos, translate);

   /* Keep the reciprocal in the W component of the result. */
   mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);

   aos_adopt_xmm_reg( cp,
                      pos,
                      TGSI_FILE_OUTPUT,
                      cp->vaos->draw->vs.position_output,
                      TRUE );
   return TRUE;
}
1762 static boolean
note_immediate( struct aos_compilation
*cp
,
1763 struct tgsi_full_immediate
*imm
)
1765 unsigned pos
= cp
->num_immediates
++;
1768 for (j
= 0; j
< imm
->Immediate
.Size
; j
++) {
1769 cp
->vaos
->machine
->immediate
[pos
][j
] = imm
->u
.ImmediateFloat32
[j
].Float
;
/* Scan the whole TGSI token stream once and record, for every OUTPUT
 * register, the index of the last instruction that writes it
 * (cp->output_last_write[]).  Used to know when an output can be
 * emitted eagerly.
 */
static void find_last_write_outputs( struct aos_compilation *cp )
{
   struct tgsi_parse_context parse;
   unsigned this_instruction = 0;
   unsigned i;

   tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );

   while (!tgsi_parse_end_of_tokens( &parse )) {

      tgsi_parse_token( &parse );

      /* Only instruction tokens can write registers. */
      if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
         continue;

      for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
         if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File ==
             TGSI_FILE_OUTPUT)
         {
            unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index;
            cp->output_last_write[idx] = this_instruction;
         }
      }

      this_instruction++;
   }

   tgsi_parse_free( &parse );
}
/* 1-based positions of the generated function's arguments, for use
 * with x86_fn_arg().
 *
 * Fix: ARG_COUNT is referenced by build_vertex_program() but was
 * missing from this define block; added as 3 to fill the 1,2,_,4
 * sequence.
 */
#define ARG_MACHINE    1
#define ARG_START_ELTS 2
#define ARG_COUNT      3
#define ARG_OUTBUF     4
/* Translate the shader's TGSI token stream into one of the two
 * generated machine-code functions: the linear-fetch flavour
 * (func[0]) or the indexed/elts flavour (func[1]).
 *
 * Generated code layout: prologue (save regs, load args), early-out
 * if count == 0, then a per-vertex loop of fetch-inputs / shader
 * body / viewport / emit-outputs, then epilogue.
 *
 * NOTE(review): several structural lines (error-handling branches,
 * loop braces, epilogue ret, final returns) were lost in this copy
 * and have been reconstructed -- confirm the exact control flow
 * against upstream.
 */
static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
                                     boolean linear )
{
   struct tgsi_parse_context parse;
   struct aos_compilation cp;
   unsigned fixup, label;

   tgsi_parse_init( &parse, varient->base.vs->state.tokens );

   memset(&cp, 0, sizeof(cp));

   cp.insn_counter = 1;
   cp.vaos = varient;            /* NOTE(review): reconstructed */

   /* func[0] is the linear path, func[1] the indexed (elts) path. */
   cp.func = &varient->func[ linear ? 0 : 1 ];

   /* Fixed register roles used throughout the generated code: */
   cp.tmp_EAX     = x86_make_reg(file_REG32, reg_AX);
   cp.idx_EBX     = x86_make_reg(file_REG32, reg_BX);
   cp.outbuf_ECX  = x86_make_reg(file_REG32, reg_CX);
   cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
   cp.count_ESI   = x86_make_reg(file_REG32, reg_SI);
   cp.temp_EBP    = x86_make_reg(file_REG32, reg_BP);
   cp.stack_ESP   = x86_make_reg( file_REG32, reg_SP );

   x86_init_func(cp.func);

   find_last_write_outputs(&cp);

   /* Save callee-save registers the generated code clobbers. */
   x86_push(cp.func, cp.idx_EBX);
   x86_push(cp.func, cp.count_ESI);
   x86_push(cp.func, cp.temp_EBP);

   /* Load arguments into regs:
    */
   x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
   x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
   x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
   x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));

   /* Compare count to zero and possibly bail.
    */
   x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
   x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
   fixup = x86_jcc_forward(cp.func, cc_E);

   save_fpu_state( &cp );
   set_fpu_round_nearest( &cp );

   /* Note address for loop jump
    */
   label = x86_get_label(cp.func);
   {
      /* Fetch inputs... TODO: fetch lazily...
       */
      if (!aos_fetch_inputs( &cp, linear ))
         goto fail;

      /* Emit the shader body, one token at a time. */
      while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
      {
         tgsi_parse_token( &parse );

         switch (parse.FullToken.Token.Type) {
         case TGSI_TOKEN_TYPE_IMMEDIATE:
            if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
               goto fail;
            break;

         case TGSI_TOKEN_TYPE_INSTRUCTION:
            if (0)
               tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );

            if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
               goto fail;
            break;
         }

         x87_assert_stack_empty(cp.func);
         cp.insn_counter++;
      }

      if (cp.error)
         goto fail;

      /* Drop cached non-output values; only TGSI_FILE_OUTPUT
       * registers survive to the output stage.
       */
      {
         unsigned i;
         for (i = 0; i < 8; i++) {
            if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
               cp.xmm[i].file = TGSI_FILE_NULL;
               cp.xmm[i].dirty = 0;
            }
         }
      }

      if (cp.vaos->base.key.clip) {
         /* not really handling clipping, just do the rhw so we can
          * see the results...
          */
         emit_rhw_viewport(&cp);
      }
      else if (cp.vaos->base.key.viewport) {
         emit_viewport(&cp);
      }

      /* Emit output... TODO: do this eagerly after the last write to a
       * given output.
       */
      if (!aos_emit_outputs( &cp ))
         goto fail;

      /* Advance the output pointer by one vertex. */
      x86_lea(cp.func,
              cp.outbuf_ECX,
              x86_make_disp(cp.outbuf_ECX,
                            cp.vaos->base.key.output_stride));

      /* Advance the index: +1 for linear, +4 bytes (one element of
       * the elts array) for indexed runs.
       */
      if (linear)
         x86_inc(cp.func, cp.idx_EBX);
      else
         x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4));
   }

   /* decr count, loop if not zero
    */
   x86_dec(cp.func, cp.count_ESI);
   x86_jcc(cp.func, cc_NZ, label);

   restore_fpu_state(&cp);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(cp.func, fixup);

   /* Exit MMX state if any MMX instructions were emitted. */
   if (cp.func->need_emms)
      mmx_emms(cp.func);               /* NOTE(review): reconstructed */

   /* Restore callee-save registers and return. */
   x86_pop(cp.func, cp.temp_EBP);
   x86_pop(cp.func, cp.count_ESI);
   x86_pop(cp.func, cp.idx_EBX);

   x87_assert_stack_empty(cp.func);
   x86_ret(cp.func);                   /* NOTE(review): reconstructed */

   tgsi_parse_free( &parse );
   return !cp.error;

 fail:
   tgsi_parse_free( &parse );
   return FALSE;
}
/* draw_vs_varient::set_input callback: (re)bind the vertex data
 * pointer and stride for every input element that sources from
 * buffer 'buf'.
 *
 * NOTE(review): the parameter list (buf/ptr/stride) was lost in this
 * copy and has been reconstructed from the body -- confirm against
 * the draw_vs_varient vtable declaration.
 */
static void vaos_set_buffer( struct draw_vs_varient *varient,
                             unsigned buf,
                             const void *ptr,
                             unsigned stride )
{
   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
   unsigned i;

   for (i = 0; i < vaos->base.key.nr_inputs; i++) {
      if (vaos->base.key.element[i].in.buffer == buf) {
         /* Per-element base pointer includes the element's offset
          * within the vertex.
          */
         vaos->attrib[i].input_ptr = ((char *)ptr +
                                      vaos->base.key.element[i].in.offset);
         vaos->attrib[i].input_stride = stride;
      }
   }
}
/* draw_vs_varient::run_elts callback: refresh the per-run pointers
 * in the shared aos_machine (point size, constants, immediates,
 * attrib bindings), then jump into the generated indexed-fetch code.
 *
 * NOTE(review): the 'count' parameter was lost in this copy and has
 * been reconstructed -- confirm against the vtable declaration.
 */
static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
                                      const unsigned *elts,
                                      unsigned count,
                                      void *output_buffer )
{
   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
   struct aos_machine *machine = vaos->draw->vs.aos_machine;

   machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
   machine->constants = vaos->draw->vs.aligned_constants;
   machine->immediates = vaos->base.vs->immediates;
   machine->attrib = vaos->attrib;

   vaos->gen_run_elts( machine,
                       elts,
                       count,
                       output_buffer );
}
/* draw_vs_varient::run_linear callback: refresh the per-run pointers
 * in the shared aos_machine, then jump into the generated
 * linear-fetch code.
 *
 * NOTE(review): the 'start'/'count' parameters were lost in this copy
 * and have been reconstructed -- confirm against the vtable
 * declaration.
 */
static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
                                        unsigned start,
                                        unsigned count,
                                        void *output_buffer )
{
   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
   struct aos_machine *machine = vaos->draw->vs.aos_machine;

   machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
   machine->constants = vaos->draw->vs.aligned_constants;
   machine->immediates = vaos->base.vs->immediates;
   machine->attrib = vaos->attrib;

   vaos->gen_run_linear( machine,
                         start,
                         count,
                         output_buffer );
}
2046 static void vaos_destroy( struct draw_vs_varient
*varient
)
2048 struct draw_vs_varient_aos_sse
*vaos
= (struct draw_vs_varient_aos_sse
*)varient
;
2050 FREE( vaos
->attrib
);
2052 x86_release_func( &vaos
->func
[0] );
2053 x86_release_func( &vaos
->func
[1] );
/* Create an AOS/SSE-compiled varient of the given vertex shader.
 * Builds both the linear and indexed generated functions and resolves
 * their entry points.  Returns NULL on any failure (allocation or
 * code generation) after releasing whatever was built.
 *
 * NOTE(review): some fail-path structure and null checks were lost in
 * this copy and have been reconstructed -- confirm against upstream.
 */
static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
                                                const struct draw_vs_varient_key *key )
{
   struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);

   if (!vaos)
      goto fail;

   vaos->base.key = *key;
   vaos->base.vs = vs;               /* NOTE(review): reconstructed */
   vaos->base.set_input = vaos_set_buffer;
   vaos->base.destroy = vaos_destroy;
   vaos->base.run_linear = vaos_run_linear;
   vaos->base.run_elts = vaos_run_elts;

   vaos->draw = vs->draw;

   vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) );
   if (!vaos->attrib)
      goto fail;

   if (0)
      tgsi_dump(vs->state.tokens, 0);

   /* Generate both flavours: func[0] linear, func[1] indexed. */
   if (!build_vertex_program( vaos, TRUE ))
      goto fail;

   if (!build_vertex_program( vaos, FALSE ))
      goto fail;

   vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
   if (!vaos->gen_run_linear)
      goto fail;

   vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
   if (!vaos->gen_run_elts)
      goto fail;

   return &vaos->base;

 fail:
   if (vaos && vaos->attrib)
      FREE(vaos->attrib);

   if (vaos) {
      x86_release_func( &vaos->func[0] );
      x86_release_func( &vaos->func[1] );
      FREE(vaos);
   }

   return NULL;
}
2117 struct draw_vs_varient
*draw_vs_varient_aos_sse( struct draw_vertex_shader
*vs
,
2118 const struct draw_vs_varient_key
*key
)
2120 struct draw_vs_varient
*varient
= varient_aos_sse( vs
, key
);
2122 if (varient
== NULL
) {
2123 varient
= draw_vs_varient_generic( vs
, key
);