2 * Mesa 3-D graphics library
5 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 * \file t_vb_arb_program_sse.c
28 * Translate simplified vertex_program representation to
29 * x86/x87/SSE/SSE2 machine code using mesa's rtasm runtime assembler.
31 * This is very much a first attempt - build something that works.
32 * There are probably better approaches for applying SSE to vertex
33 * programs, and the whole thing is crying out for static analysis of
34 * the programs to avoid redundant operations.
36 * \author Keith Whitwell
44 #include "arbprogparse.h"
46 #include "program_instruction.h"
47 #include "math/m_matrix.h"
48 #include "math/m_translate.h"
49 #include "t_context.h"
50 #include "t_vb_arbprogram.h"
52 #if defined(USE_SSE_ASM)
54 #include "x86/rtasm/x86sse.h"
55 #include "x86/common_x86_asm.h"
65 * EBX - point to 'm->File[0]'
66 * ECX - point to 'm->File[3]'
77 _mesa_printf("x86 translation failed in %s\n", __FUNCTION__); \
82 struct x86_function func
;
83 struct tnl_compiled_program
*p
;
101 static INLINE GLboolean
eq( struct x86_reg a
,
104 return (a
.file
== b
.file
&&
110 static GLint
get_offset( const void *a
, const void *b
)
112 return (const char *)b
- (const char *)a
;
116 static struct x86_reg
get_reg_ptr(GLuint file
,
123 reg
= x86_make_reg(file_REG32
, reg_BX
);
124 assert(idx
!= REG_UNDEF
);
126 case FILE_STATE_PARAM
:
127 reg
= x86_make_reg(file_REG32
, reg_CX
);
133 return x86_make_disp(reg
, 16 * idx
);
137 static void spill( struct compilation
*cp
, GLuint idx
)
139 struct x86_reg oldval
= get_reg_ptr(cp
->xmm
[idx
].file
,
142 assert(cp
->xmm
[idx
].dirty
);
143 sse_movups(&cp
->func
, oldval
, x86_make_reg(file_XMM
, idx
));
144 cp
->xmm
[idx
].dirty
= 0;
147 static struct x86_reg
get_xmm_reg( struct compilation
*cp
)
152 for (i
= 0; i
< 8; i
++)
153 if (cp
->xmm
[i
].last_used
< cp
->xmm
[oldest
].last_used
)
156 /* Need to write out the old value?
158 if (cp
->xmm
[oldest
].dirty
)
161 assert(cp
->xmm
[oldest
].last_used
!= cp
->insn_counter
);
163 cp
->xmm
[oldest
].file
= FILE_REG
;
164 cp
->xmm
[oldest
].idx
= REG_UNDEF
;
165 cp
->xmm
[oldest
].last_used
= cp
->insn_counter
;
166 return x86_make_reg(file_XMM
, oldest
);
169 static void invalidate_xmm( struct compilation
*cp
,
170 GLuint file
, GLuint idx
)
174 /* Invalidate any old copy of this register in XMM0-7.
176 for (i
= 0; i
< 8; i
++) {
177 if (cp
->xmm
[i
].file
== file
&& cp
->xmm
[i
].idx
== idx
) {
178 cp
->xmm
[i
].file
= FILE_REG
;
179 cp
->xmm
[i
].idx
= REG_UNDEF
;
180 cp
->xmm
[i
].dirty
= 0;
187 /* Return an XMM reg to receive the results of an operation.
189 static struct x86_reg
get_dst_xmm_reg( struct compilation
*cp
,
190 GLuint file
, GLuint idx
)
194 /* Invalidate any old copy of this register in XMM0-7. Don't reuse
195 * as this may be one of the arguments.
197 invalidate_xmm( cp
, file
, idx
);
199 reg
= get_xmm_reg( cp
);
200 cp
->xmm
[reg
.idx
].file
= file
;
201 cp
->xmm
[reg
.idx
].idx
= idx
;
202 cp
->xmm
[reg
.idx
].dirty
= 1;
206 /* As above, but return a pointer. Note - this pointer may alias
207 * those returned by get_arg_ptr().
209 static struct x86_reg
get_dst_ptr( struct compilation
*cp
,
210 GLuint file
, GLuint idx
)
212 /* Invalidate any old copy of this register in XMM0-7. Don't reuse
213 * as this may be one of the arguments.
215 invalidate_xmm( cp
, file
, idx
);
217 return get_reg_ptr(file
, idx
);
222 /* Return an XMM reg if the argument is resident, otherwise return a
223 * base+offset pointer to the saved value.
225 static struct x86_reg
get_arg( struct compilation
*cp
, GLuint file
, GLuint idx
)
229 for (i
= 0; i
< 8; i
++) {
230 if (cp
->xmm
[i
].file
== file
&&
231 cp
->xmm
[i
].idx
== idx
) {
232 cp
->xmm
[i
].last_used
= cp
->insn_counter
;
233 return x86_make_reg(file_XMM
, i
);
237 return get_reg_ptr(file
, idx
);
240 /* As above, but always return a pointer:
242 static struct x86_reg
get_arg_ptr( struct compilation
*cp
, GLuint file
, GLuint idx
)
246 /* If there is a modified version of this register in one of the
247 * XMM regs, write it out to memory.
249 for (i
= 0; i
< 8; i
++) {
250 if (cp
->xmm
[i
].file
== file
&&
251 cp
->xmm
[i
].idx
== idx
&&
256 return get_reg_ptr(file
, idx
);
259 /* Emulate pshufd insn in regular SSE, if necessary:
261 static void emit_pshufd( struct compilation
*cp
,
267 sse2_pshufd(&cp
->func
, dst
, arg0
, shuf
);
272 sse_movups(&cp
->func
, dst
, arg0
);
274 sse_shufps(&cp
->func
, dst
, dst
, shuf
);
278 static void set_fpu_round_neg_inf( struct compilation
*cp
)
280 if (cp
->fpucntl
!= RND_NEG_FPU
) {
281 struct x86_reg regEDX
= x86_make_reg(file_REG32
, reg_DX
);
282 struct arb_vp_machine
*m
= NULL
;
284 cp
->fpucntl
= RND_NEG_FPU
;
285 x87_fnclex(&cp
->func
);
286 x87_fldcw(&cp
->func
, x86_make_disp(regEDX
, get_offset(m
, &m
->fpucntl_rnd_neg
)));
291 /* Perform a reduced swizzle.
293 static GLboolean
emit_RSW( struct compilation
*cp
, union instruction op
)
295 struct x86_reg arg0
= get_arg(cp
, op
.rsw
.file0
, op
.rsw
.idx0
);
296 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.rsw
.dst
);
297 GLuint swz
= op
.rsw
.swz
;
298 GLuint neg
= op
.rsw
.neg
;
300 emit_pshufd(cp
, dst
, arg0
, swz
);
303 struct x86_reg negs
= get_arg(cp
, FILE_REG
, REG_SWZ
);
304 struct x86_reg tmp
= get_xmm_reg(cp
);
306 * Use neg as arg to pshufd
309 emit_pshufd(cp
, tmp
, negs
,
310 SHUF((neg
& 1) ? 1 : 0,
314 sse_mulps(&cp
->func
, dst
, tmp
);
320 /* Helper for writemask:
322 static GLboolean
emit_shuf_copy1( struct compilation
*cp
,
328 struct x86_reg tmp
= get_xmm_reg(cp
);
329 sse_movups(&cp
->func
, dst
, arg1
);
330 emit_pshufd(cp
, dst
, dst
, shuf
);
331 emit_pshufd(cp
, tmp
, arg0
, shuf
);
333 sse_movss(&cp
->func
, dst
, tmp
);
335 emit_pshufd(cp
, dst
, dst
, shuf
);
340 /* Helper for writemask:
342 static GLboolean
emit_shuf_copy2( struct compilation
*cp
,
348 struct x86_reg tmp
= get_xmm_reg(cp
);
349 emit_pshufd(cp
, dst
, arg1
, shuf
);
350 emit_pshufd(cp
, tmp
, arg0
, shuf
);
352 sse_shufps(&cp
->func
, dst
, tmp
, SHUF(X
, Y
, Z
, W
));
354 emit_pshufd(cp
, dst
, dst
, shuf
);
359 static void emit_x87_ex2( struct compilation
*cp
)
361 struct x86_reg st0
= x86_make_reg(file_x87
, 0);
362 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
363 struct x86_reg st3
= x86_make_reg(file_x87
, 3);
365 set_fpu_round_neg_inf( cp
);
367 x87_fld(&cp
->func
, st0
); /* a a */
368 x87_fprndint( &cp
->func
); /* int(a) a */
369 x87_fld(&cp
->func
, st0
); /* int(a) int(a) a */
370 x87_fstp(&cp
->func
, st3
); /* int(a) a int(a)*/
371 x87_fsubp(&cp
->func
, st1
); /* frac(a) int(a) */
372 x87_f2xm1(&cp
->func
); /* (2^frac(a))-1 int(a)*/
373 x87_fld1(&cp
->func
); /* 1 (2^frac(a))-1 int(a)*/
374 x87_faddp(&cp
->func
, st1
); /* 2^frac(a) int(a) */
375 x87_fscale(&cp
->func
); /* 2^a */
379 static GLboolean
emit_MSK2( struct compilation
*cp
, union instruction op
)
381 struct x86_reg arg0
= get_arg(cp
, op
.msk
.file
, op
.msk
.arg
);
382 struct x86_reg arg1
= get_arg(cp
, FILE_REG
, op
.msk
.dst
); /* NOTE! */
383 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.msk
.dst
);
385 /* make full width bitmask in tmp
391 emit_pshufd(cp
, tmp
, get_arg(cp
, FILE_REG
, REG_NEGS
),
392 SHUF((op
.msk
.mask
& 1) ? 2 : 0,
393 (op
.msk
.mask
& 2) ? 2 : 0,
394 (op
.msk
.mask
& 4) ? 2 : 0,
395 (op
.msk
.mask
& 8) ? 2 : 0));
396 sse2_pnot(&cp
->func
, dst
, tmp
);
397 sse2_pand(&cp
->func
, arg0
, tmp
);
398 sse2_pand(&cp
->func
, arg1
, dst
);
399 sse2_por(&cp
->func
, tmp
, dst
);
405 /* Used to implement write masking. This and most of the other instructions
406 * here would be easier to implement if there had been a translation
407 * to a 2 argument format (dst/arg0, arg1) at the shader level before
408 * attempting to translate to x86/sse code.
410 static GLboolean
emit_MSK( struct compilation
*cp
, union instruction op
)
412 struct x86_reg arg
= get_arg(cp
, op
.msk
.file
, op
.msk
.idx
);
413 struct x86_reg dst0
= get_arg(cp
, FILE_REG
, op
.msk
.dst
); /* NOTE! */
414 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.msk
.dst
);
416 /* Note that dst and dst0 refer to the same program variable, but
417 * will definitely be different XMM registers. We're effectively
418 * treating this as a 2 argument SEL now, just one of which happens
419 * always to be the same register as the destination.
422 switch (op
.msk
.mask
) {
424 sse_movups(&cp
->func
, dst
, dst0
);
428 if (arg
.file
== file_XMM
) {
429 sse_movups(&cp
->func
, dst
, dst0
);
430 sse_movss(&cp
->func
, dst
, arg
);
433 struct x86_reg tmp
= get_xmm_reg(cp
);
434 sse_movups(&cp
->func
, dst
, dst0
);
435 sse_movss(&cp
->func
, tmp
, arg
);
436 sse_movss(&cp
->func
, dst
, tmp
);
441 sse_movups(&cp
->func
, dst
, dst0
);
442 sse_shufps(&cp
->func
, dst
, arg
, SHUF(X
, Y
, Z
, W
));
446 sse_movups(&cp
->func
, dst
, arg
);
447 sse_shufps(&cp
->func
, dst
, dst0
, SHUF(X
, Y
, Z
, W
));
451 if (dst0
.file
== file_XMM
) {
452 sse_movups(&cp
->func
, dst
, arg
);
453 sse_movss(&cp
->func
, dst
, dst0
);
456 struct x86_reg tmp
= get_xmm_reg(cp
);
457 sse_movups(&cp
->func
, dst
, arg
);
458 sse_movss(&cp
->func
, tmp
, dst0
);
459 sse_movss(&cp
->func
, dst
, tmp
);
464 emit_shuf_copy1(cp
, dst
, arg
, dst0
, SHUF(Y
,X
,Z
,W
));
468 emit_shuf_copy1(cp
, dst
, arg
, dst0
, SHUF(Z
,Y
,X
,W
));
472 emit_shuf_copy1(cp
, dst
, arg
, dst0
, SHUF(W
,Y
,Z
,X
));
476 emit_shuf_copy2(cp
, dst
, arg
, dst0
, SHUF(X
,Z
,Y
,W
));
480 emit_shuf_copy2(cp
, dst
, arg
, dst0
, SHUF(X
,W
,Z
,Y
));
483 emit_shuf_copy2(cp
, dst
, arg
, dst0
, SHUF(Z
,Y
,X
,W
));
487 emit_shuf_copy2(cp
, dst
, arg
, dst0
, SHUF(W
,Y
,Z
,X
));
491 emit_shuf_copy1(cp
, dst
, dst0
, arg
, SHUF(Y
,X
,Z
,W
));
495 emit_shuf_copy1(cp
, dst
, dst0
, arg
, SHUF(Z
,Y
,X
,W
));
499 emit_shuf_copy1(cp
, dst
, dst0
, arg
, SHUF(W
,Y
,Z
,X
));
503 sse_movups(&cp
->func
, dst
, arg
);
514 static GLboolean
emit_PRT( struct compilation
*cp
, union instruction op
)
521 * The traditional instructions. All operate on internal registers
522 * and ignore write masks and swizzling issues.
525 static GLboolean
emit_ABS( struct compilation
*cp
, union instruction op
)
527 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
528 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
529 struct x86_reg neg
= get_reg_ptr(FILE_REG
, REG_NEG
);
531 sse_movups(&cp
->func
, dst
, arg0
);
532 sse_mulps(&cp
->func
, dst
, neg
);
533 sse_maxps(&cp
->func
, dst
, arg0
);
537 static GLboolean
emit_ADD( struct compilation
*cp
, union instruction op
)
539 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
540 struct x86_reg arg1
= get_arg(cp
, op
.alu
.file1
, op
.alu
.idx1
);
541 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
543 sse_movups(&cp
->func
, dst
, arg0
);
544 sse_addps(&cp
->func
, dst
, arg1
);
549 /* The dotproduct instructions don't really do that well in sse:
551 static GLboolean
emit_DP3( struct compilation
*cp
, union instruction op
)
553 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
554 struct x86_reg arg1
= get_arg(cp
, op
.alu
.file1
, op
.alu
.idx1
);
555 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
556 struct x86_reg tmp
= get_xmm_reg(cp
);
558 sse_movups(&cp
->func
, dst
, arg0
);
559 sse_mulps(&cp
->func
, dst
, arg1
);
561 /* Now the hard bit: sum the first 3 values:
563 sse_movhlps(&cp
->func
, tmp
, dst
);
564 sse_addss(&cp
->func
, dst
, tmp
); /* a*x+c*z, b*y, ?, ? */
565 emit_pshufd(cp
, tmp
, dst
, SHUF(Y
,X
,W
,Z
));
566 sse_addss(&cp
->func
, dst
, tmp
);
567 sse_shufps(&cp
->func
, dst
, dst
, SHUF(X
, X
, X
, X
));
573 static GLboolean
emit_DP4( struct compilation
*cp
, union instruction op
)
575 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
576 struct x86_reg arg1
= get_arg(cp
, op
.alu
.file1
, op
.alu
.idx1
);
577 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
578 struct x86_reg tmp
= get_xmm_reg(cp
);
580 sse_movups(&cp
->func
, dst
, arg0
);
581 sse_mulps(&cp
->func
, dst
, arg1
);
583 /* Now the hard bit: sum the values:
585 sse_movhlps(&cp
->func
, tmp
, dst
);
586 sse_addps(&cp
->func
, dst
, tmp
); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
587 emit_pshufd(cp
, tmp
, dst
, SHUF(Y
,X
,W
,Z
));
588 sse_addss(&cp
->func
, dst
, tmp
);
589 sse_shufps(&cp
->func
, dst
, dst
, SHUF(X
, X
, X
, X
));
593 static GLboolean
emit_DPH( struct compilation
*cp
, union instruction op
)
595 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
596 struct x86_reg arg1
= get_arg(cp
, op
.alu
.file1
, op
.alu
.idx1
);
597 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
598 struct x86_reg ones
= get_reg_ptr(FILE_REG
, REG_ONES
);
599 struct x86_reg tmp
= get_xmm_reg(cp
);
601 emit_pshufd(cp
, dst
, arg0
, SHUF(W
,X
,Y
,Z
));
602 sse_movss(&cp
->func
, dst
, ones
);
603 emit_pshufd(cp
, dst
, dst
, SHUF(W
,X
,Y
,Z
));
604 sse_mulps(&cp
->func
, dst
, arg1
);
606 /* Now the hard bit: sum the values (from DP4):
608 sse_movhlps(&cp
->func
, tmp
, dst
);
609 sse_addps(&cp
->func
, dst
, tmp
); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
610 emit_pshufd(cp
, tmp
, dst
, SHUF(Y
,X
,W
,Z
));
611 sse_addss(&cp
->func
, dst
, tmp
);
612 sse_shufps(&cp
->func
, dst
, dst
, SHUF(X
, X
, X
, X
));
617 static GLboolean
emit_DST( struct compilation
*cp
, union instruction op
)
619 struct x86_reg arg0
= get_arg_ptr(cp
, op
.alu
.file0
, op
.alu
.idx0
);
620 struct x86_reg arg1
= get_arg_ptr(cp
, op
.alu
.file1
, op
.alu
.idx1
);
621 struct x86_reg dst
= get_dst_ptr(cp
, FILE_REG
, op
.alu
.dst
);
623 /* dst[0] = 1.0 * 1.0F; */
624 /* dst[1] = arg0[1] * arg1[1]; */
625 /* dst[2] = arg0[2] * 1.0; */
626 /* dst[3] = 1.0 * arg1[3]; */
628 /* Would rather do some of this with integer regs, but:
629 * 1) No proper support for immediate values yet
630 * 2) I'd need to push/pop somewhere to get a free reg.
633 x87_fstp(&cp
->func
, dst
); /* would rather do an immediate store... */
634 x87_fld(&cp
->func
, x86_make_disp(arg0
, 4));
635 x87_fmul(&cp
->func
, x86_make_disp(arg1
, 4));
636 x87_fstp(&cp
->func
, x86_make_disp(dst
, 4));
638 if (!eq(arg0
, dst
)) {
639 x86_fld(&cp
->func
, x86_make_disp(arg0
, 8));
640 x86_stp(&cp
->func
, x86_make_disp(dst
, 8));
643 if (!eq(arg1
, dst
)) {
644 x86_fld(&cp
->func
, x86_make_disp(arg0
, 12));
645 x86_stp(&cp
->func
, x86_make_disp(dst
, 12));
651 static GLboolean
emit_DST( struct compilation
*cp
, union instruction op
)
653 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
654 struct x86_reg arg1
= get_arg(cp
, op
.alu
.file1
, op
.alu
.idx1
);
655 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
656 struct x86_reg tmp
= get_xmm_reg(cp
);
657 struct x86_reg ones
= get_reg_ptr(FILE_REG
, REG_ONES
);
659 emit_shuf_copy2(cp
, dst
, arg0
, ones
, SHUF(X
,W
,Z
,Y
));
660 emit_shuf_copy2(cp
, tmp
, arg1
, ones
, SHUF(X
,Z
,Y
,W
));
661 sse_mulps(&cp
->func
, dst
, tmp
);
663 /* dst[0] = 1.0 * 1.0F; */
664 /* dst[1] = arg0[1] * arg1[1]; */
665 /* dst[2] = arg0[2] * 1.0; */
666 /* dst[3] = 1.0 * arg1[3]; */
672 static GLboolean
emit_LG2( struct compilation
*cp
, union instruction op
)
674 struct x86_reg arg0
= get_arg_ptr(cp
, op
.alu
.file0
, op
.alu
.idx0
);
675 struct x86_reg dst
= get_dst_ptr(cp
, FILE_REG
, op
.alu
.dst
);
677 x87_fld1(&cp
->func
); /* 1 */
678 x87_fld(&cp
->func
, arg0
); /* a0 1 */
679 x87_fyl2x(&cp
->func
); /* log2(a0) */
680 x87_fst(&cp
->func
, x86_make_disp(dst
, 0));
681 x87_fst(&cp
->func
, x86_make_disp(dst
, 4));
682 x87_fst(&cp
->func
, x86_make_disp(dst
, 8));
683 x87_fstp(&cp
->func
, x86_make_disp(dst
, 12));
689 static GLboolean
emit_EX2( struct compilation
*cp
, union instruction op
)
691 struct x86_reg arg0
= get_arg_ptr(cp
, op
.alu
.file0
, op
.alu
.idx0
);
692 struct x86_reg dst
= get_dst_ptr(cp
, FILE_REG
, op
.alu
.dst
);
694 /* CAUTION: dst may alias arg0!
696 x87_fld(&cp
->func
, arg0
);
700 x87_fst(&cp
->func
, x86_make_disp(dst
, 0));
701 x87_fst(&cp
->func
, x86_make_disp(dst
, 4));
702 x87_fst(&cp
->func
, x86_make_disp(dst
, 8));
703 x87_fst(&cp
->func
, x86_make_disp(dst
, 12));
707 static GLboolean
emit_EXP( struct compilation
*cp
, union instruction op
)
709 struct x86_reg arg0
= get_arg_ptr(cp
, op
.alu
.file0
, op
.alu
.idx0
);
710 struct x86_reg dst
= get_dst_ptr(cp
, FILE_REG
, op
.alu
.dst
);
711 struct x86_reg st0
= x86_make_reg(file_x87
, 0);
712 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
713 struct x86_reg st3
= x86_make_reg(file_x87
, 3);
715 /* CAUTION: dst may alias arg0!
717 x87_fld(&cp
->func
, arg0
); /* arg0.x */
718 x87_fld(&cp
->func
, st0
); /* arg arg */
720 /* by default, fpu is setup to round-to-nearest. We want to
721 * change this now, and track the state through to the end of the
722 * generated function so that it isn't repeated unnecessarily.
723 * Alternately, could subtract .5 to get round to -inf behaviour.
725 set_fpu_round_neg_inf( cp
);
726 x87_fprndint( &cp
->func
); /* flr(a) a */
727 x87_fld(&cp
->func
, st0
); /* flr(a) flr(a) a */
728 x87_fld1(&cp
->func
); /* 1 floor(a) floor(a) a */
729 x87_fst(&cp
->func
, x86_make_disp(dst
, 12)); /* stack unchanged */
730 x87_fscale(&cp
->func
); /* 2^floor(a) floor(a) a */
731 x87_fst(&cp
->func
, st3
); /* 2^floor(a) floor(a) a 2^floor(a)*/
732 x87_fstp(&cp
->func
, x86_make_disp(dst
, 0)); /* flr(a) a 2^flr(a) */
733 x87_fsubrp(&cp
->func
, st1
); /* frac(a) 2^flr(a) */
734 x87_fst(&cp
->func
, x86_make_disp(dst
, 4)); /* frac(a) 2^flr(a) */
735 x87_f2xm1(&cp
->func
); /* (2^frac(a))-1 2^flr(a)*/
736 x87_fld1(&cp
->func
); /* 1 (2^frac(a))-1 2^flr(a)*/
737 x87_faddp(&cp
->func
, st1
); /* 2^frac(a) 2^flr(a) */
738 x87_fmulp(&cp
->func
, st1
); /* 2^a */
739 x87_fst(&cp
->func
, x86_make_disp(dst
, 8));
743 /* dst[0] = 2^floor(tmp); */
744 /* dst[1] = frac(tmp); */
745 /* dst[2] = 2^floor(tmp) * 2^frac(tmp); */
750 static GLboolean
emit_LOG( struct compilation
*cp
, union instruction op
)
752 struct x86_reg arg0
= get_arg_ptr(cp
, op
.alu
.file0
, op
.alu
.idx0
);
753 struct x86_reg dst
= get_dst_ptr(cp
, FILE_REG
, op
.alu
.dst
);
754 struct x86_reg st0
= x86_make_reg(file_x87
, 0);
755 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
756 struct x86_reg st2
= x86_make_reg(file_x87
, 2);
758 /* CAUTION: dst may alias arg0!
760 x87_fld(&cp
->func
, arg0
); /* arg0.x */
761 x87_fabs(&cp
->func
); /* |arg0.x| */
762 x87_fxtract(&cp
->func
); /* mantissa(arg0.x), exponent(arg0.x) */
763 x87_fst(&cp
->func
, st2
); /* mantissa, exponent, mantissa */
764 x87_fld1(&cp
->func
); /* 1, mantissa, exponent, mantissa */
765 x87_fyl2x(&cp
->func
); /* log2(mantissa), exponent, mantissa */
766 x87_fadd(&cp
->func
, st0
, st1
); /* e+l2(m), e, m */
767 x87_fstp(&cp
->func
, x86_make_disp(dst
, 8)); /* e, m */
769 x87_fld1(&cp
->func
); /* 1, e, m */
770 x87_fsub(&cp
->func
, st1
, st0
); /* 1, e-1, m */
771 x87_fstp(&cp
->func
, x86_make_disp(dst
, 12)); /* e-1,m */
772 x87_fstp(&cp
->func
, dst
); /* m */
774 x87_fadd(&cp
->func
, st0
, st0
); /* 2m */
775 x87_fstp(&cp
->func
, x86_make_disp(dst
, 4));
780 static GLboolean
emit_FLR( struct compilation
*cp
, union instruction op
)
782 struct x86_reg arg0
= get_arg_ptr(cp
, op
.alu
.file0
, op
.alu
.idx0
);
783 struct x86_reg dst
= get_dst_ptr(cp
, FILE_REG
, op
.alu
.dst
);
786 set_fpu_round_neg_inf( cp
);
788 for (i
= 0; i
< 4; i
++) {
789 x87_fld(&cp
->func
, x86_make_disp(arg0
, i
*4));
790 x87_fprndint( &cp
->func
);
791 x87_fstp(&cp
->func
, x86_make_disp(dst
, i
*4));
798 static GLboolean
emit_FRC( struct compilation
*cp
, union instruction op
)
800 struct x86_reg arg0
= get_arg_ptr(cp
, op
.alu
.file0
, op
.alu
.idx0
);
801 struct x86_reg dst
= get_dst_ptr(cp
, FILE_REG
, op
.alu
.dst
);
802 struct x86_reg st0
= x86_make_reg(file_x87
, 0);
803 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
806 set_fpu_round_neg_inf( cp
);
808 /* Knowing liveness info or even just writemask would be useful
811 for (i
= 0; i
< 4; i
++) {
812 x87_fld(&cp
->func
, x86_make_disp(arg0
, i
*4));
813 x87_fld(&cp
->func
, st0
); /* a a */
814 x87_fprndint( &cp
->func
); /* flr(a) a */
815 x87_fsubrp(&cp
->func
, st1
); /* frc(a) */
816 x87_fstp(&cp
->func
, x86_make_disp(dst
, i
*4));
824 static GLboolean
emit_LIT( struct compilation
*cp
, union instruction op
)
827 struct x86_reg arg0
= get_arg_ptr(cp
, op
.alu
.file0
, op
.alu
.idx0
);
828 struct x86_reg dst
= get_dst_ptr(cp
, FILE_REG
, op
.alu
.dst
);
829 struct x86_reg lit
= get_arg(cp
, FILE_REG
, REG_LIT
);
830 struct x86_reg tmp
= get_xmm_reg(cp
);
831 struct x86_reg st1
= x86_make_reg(file_x87
, 1);
832 struct x86_reg regEAX
= x86_make_reg(file_REG32
, reg_AX
);
833 GLubyte
*fixup1
, *fixup2
;
836 /* Load the interesting parts of arg0:
838 x87_fld(&cp
->func
, x86_make_disp(arg0
, 12)); /* a3 */
839 x87_fld(&cp
->func
, x86_make_disp(arg0
, 4)); /* a1 a3 */
840 x87_fld(&cp
->func
, x86_make_disp(arg0
, 0)); /* a0 a1 a3 */
844 sse_movaps(&cp
->func
, tmp
, lit
);
845 sse_movaps(&cp
->func
, dst
, tmp
);
849 x87_fldz(&cp
->func
); /* 0 a0 a1 a3 */
850 x87_fucomp(&cp
->func
, st1
); /* a0 a1 a3 */
851 x87_fnstsw(&cp
->func
, regEAX
);
853 fixup1
= x86_jcc_forward(&cp
->func
, cc_AE
);
855 x87_fstp(&cp
->func
, x86_make_disp(dst
, 4)); /* a1 a3 */
859 x87_fldz(&cp
->func
); /* 0 a1 a3 */
860 x87_fucomp(&cp
->func
, st1
); /* a1 a3 */
861 x87_fnstsw(&cp
->func
, regEAX
);
863 fixup2
= x86_jcc_forward(&cp
->func
, cc_AE
);
865 /* Compute pow(a1, a3)
867 x87_fyl2x(&cp
->func
); /* a3*log2(a1) */
869 emit_x87_ex2( cp
); /* 2^(a3*log2(a1)) */
871 x87_fstp(&cp
->func
, x86_make_disp(dst
, 8));
875 x86_fixup_fwd_jump(&cp
->func
, fixup1
);
876 x86_fixup_fwd_jump(&cp
->func
, fixup2
);
878 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
879 struct x86_reg ones
= get_reg_ptr(FILE_REG
, REG_LIT
);
880 sse_movups(&cp
->func
, dst
, ones
);
887 static GLboolean
emit_MAX( struct compilation
*cp
, union instruction op
)
889 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
890 struct x86_reg arg1
= get_arg(cp
, op
.alu
.file1
, op
.alu
.idx1
);
891 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
893 sse_movups(&cp
->func
, dst
, arg0
);
894 sse_maxps(&cp
->func
, dst
, arg1
);
899 static GLboolean
emit_MIN( struct compilation
*cp
, union instruction op
)
901 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
902 struct x86_reg arg1
= get_arg(cp
, op
.alu
.file1
, op
.alu
.idx1
);
903 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
905 sse_movups(&cp
->func
, dst
, arg0
);
906 sse_minps(&cp
->func
, dst
, arg1
);
910 static GLboolean
emit_MOV( struct compilation
*cp
, union instruction op
)
912 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
913 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
915 sse_movups(&cp
->func
, dst
, arg0
);
919 static GLboolean
emit_MUL( struct compilation
*cp
, union instruction op
)
921 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
922 struct x86_reg arg1
= get_arg(cp
, op
.alu
.file1
, op
.alu
.idx1
);
923 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
925 sse_movups(&cp
->func
, dst
, arg0
);
926 sse_mulps(&cp
->func
, dst
, arg1
);
931 static GLboolean
emit_POW( struct compilation
*cp
, union instruction op
)
933 struct x86_reg arg0
= get_arg_ptr(cp
, op
.alu
.file0
, op
.alu
.idx0
);
934 struct x86_reg arg1
= get_arg_ptr(cp
, op
.alu
.file1
, op
.alu
.idx1
);
935 struct x86_reg dst
= get_dst_ptr(cp
, FILE_REG
, op
.alu
.dst
);
937 x87_fld(&cp
->func
, arg1
); /* a1 */
938 x87_fld(&cp
->func
, arg0
); /* a0 a1 */
939 x87_fyl2x(&cp
->func
); /* a1*log2(a0) */
941 emit_x87_ex2( cp
); /* 2^(a1*log2(a0)) */
943 x87_fst(&cp
->func
, x86_make_disp(dst
, 0));
944 x87_fst(&cp
->func
, x86_make_disp(dst
, 4));
945 x87_fst(&cp
->func
, x86_make_disp(dst
, 8));
946 x87_fstp(&cp
->func
, x86_make_disp(dst
, 12));
951 static GLboolean
emit_REL( struct compilation
*cp
, union instruction op
)
953 /* GLuint idx = (op.alu.idx0 + (GLint)cp->File[0][REG_ADDR][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS-1); */
954 /* GLuint idx = 0; */
955 /* struct x86_reg arg0 = get_arg(cp, op.alu.file0, idx); */
956 /* struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); */
958 /* dst[0] = arg0[0]; */
959 /* dst[1] = arg0[1]; */
960 /* dst[2] = arg0[2]; */
961 /* dst[3] = arg0[3]; */
966 static GLboolean
emit_RCP( struct compilation
*cp
, union instruction op
)
968 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
969 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
972 sse2_rcpss(&cp
->func
, dst
, arg0
);
975 struct x86_reg ones
= get_reg_ptr(FILE_REG
, REG_ONES
);
976 sse_movss(&cp
->func
, dst
, ones
);
977 sse_divss(&cp
->func
, dst
, arg0
);
980 sse_shufps(&cp
->func
, dst
, dst
, SHUF(X
, X
, X
, X
));
984 static GLboolean
emit_RSQ( struct compilation
*cp
, union instruction op
)
986 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
987 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
989 /* TODO: Calculate absolute value
992 sse_movss(&cp
->func
, dst
, arg0
);
993 sse_mulss(&cp
->func
, dst
, neg
);
994 sse_maxss(&cp
->func
, dst
, arg0
);
997 sse_rsqrtss(&cp
->func
, dst
, arg0
);
998 sse_shufps(&cp
->func
, dst
, dst
, SHUF(X
, X
, X
, X
));
1003 static GLboolean
emit_SGE( struct compilation
*cp
, union instruction op
)
1005 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
1006 struct x86_reg arg1
= get_arg(cp
, op
.alu
.file1
, op
.alu
.idx1
);
1007 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
1008 struct x86_reg ones
= get_reg_ptr(FILE_REG
, REG_ONES
);
1010 sse_movups(&cp
->func
, dst
, arg0
);
1011 sse_cmpps(&cp
->func
, dst
, arg1
, cc_NotLessThan
);
1012 sse_andps(&cp
->func
, dst
, ones
);
1017 static GLboolean
emit_SLT( struct compilation
*cp
, union instruction op
)
1019 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
1020 struct x86_reg arg1
= get_arg(cp
, op
.alu
.file1
, op
.alu
.idx1
);
1021 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
1022 struct x86_reg ones
= get_reg_ptr(FILE_REG
, REG_ONES
);
1024 sse_movups(&cp
->func
, dst
, arg0
);
1025 sse_cmpps(&cp
->func
, dst
, arg1
, cc_LessThan
);
1026 sse_andps(&cp
->func
, dst
, ones
);
1030 static GLboolean
emit_SUB( struct compilation
*cp
, union instruction op
)
1032 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
1033 struct x86_reg arg1
= get_arg(cp
, op
.alu
.file1
, op
.alu
.idx1
);
1034 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
1036 sse_movups(&cp
->func
, dst
, arg0
);
1037 sse_subps(&cp
->func
, dst
, arg1
);
1042 static GLboolean
emit_XPD( struct compilation
*cp
, union instruction op
)
1044 struct x86_reg arg0
= get_arg(cp
, op
.alu
.file0
, op
.alu
.idx0
);
1045 struct x86_reg arg1
= get_arg(cp
, op
.alu
.file1
, op
.alu
.idx1
);
1046 struct x86_reg dst
= get_dst_xmm_reg(cp
, FILE_REG
, op
.alu
.dst
);
1047 struct x86_reg tmp0
= get_xmm_reg(cp
);
1048 struct x86_reg tmp1
= get_xmm_reg(cp
);
1050 /* Could avoid tmp0, tmp1 if we overwrote arg0, arg1. Need a way
1051 * to invalidate registers. This will come with better analysis
1052 * (liveness analysis) of the incoming program.
1054 emit_pshufd(cp
, dst
, arg0
, SHUF(Y
, Z
, X
, W
));
1055 emit_pshufd(cp
, tmp1
, arg1
, SHUF(Z
, X
, Y
, W
));
1056 sse_mulps(&cp
->func
, dst
, tmp1
);
1057 emit_pshufd(cp
, tmp0
, arg0
, SHUF(Z
, X
, Y
, W
));
1058 emit_pshufd(cp
, tmp1
, arg1
, SHUF(Y
, Z
, X
, W
));
1059 sse_mulps(&cp
->func
, tmp0
, tmp1
);
1060 sse_subps(&cp
->func
, dst
, tmp0
);
1062 /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
1063 /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
1064 /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
1065 /* dst[3] is undef */
1070 static GLboolean
emit_NOP( struct compilation
*cp
, union instruction op
)
1076 static GLboolean (* const emit_func
[])(struct compilation
*, union instruction
) =
1082 emit_NOP
, /* ARL_NV */
1100 emit_NOP
, /* KIL_NV */
1110 emit_NOP
, /* PK2H */
1111 emit_NOP
, /* PK2US */
1112 emit_NOP
, /* PK4B */
1113 emit_NOP
, /* PK4UB */
1115 emit_NOP
, /* POPA */
1117 emit_NOP
, /* PUSHA */
1141 emit_NOP
, /* TXP_NV */
1142 emit_NOP
, /* UP2H */
1143 emit_NOP
, /* UP2US */
1144 emit_NOP
, /* UP4B */
1145 emit_NOP
, /* UP4UB */
1155 static GLboolean
build_vertex_program( struct compilation
*cp
)
1157 struct arb_vp_machine
*m
= NULL
;
1160 struct x86_reg regEBX
= x86_make_reg(file_REG32
, reg_BX
);
1161 struct x86_reg regECX
= x86_make_reg(file_REG32
, reg_CX
);
1162 struct x86_reg regEDX
= x86_make_reg(file_REG32
, reg_DX
);
1164 x86_push(&cp
->func
, regEBX
);
1166 x86_mov(&cp
->func
, regEDX
, x86_fn_arg(&cp
->func
, 1));
1167 x86_mov(&cp
->func
, regEBX
, x86_make_disp(regEDX
, get_offset(m
, m
->File
+ FILE_REG
)));
1168 x86_mov(&cp
->func
, regECX
, x86_make_disp(regEDX
, get_offset(m
, m
->File
+ FILE_STATE_PARAM
)));
1170 for (j
= 0; j
< cp
->p
->nr_instructions
; j
++) {
1171 union instruction inst
= cp
->p
->instructions
[j
];
1172 cp
->insn_counter
= j
+1; /* avoid zero */
1175 _mesa_printf("%p: ", cp
->func
.csr
);
1176 _tnl_disassem_vba_insn( inst
);
1180 if (!emit_func
[inst
.alu
.opcode
]( cp
, inst
)) {
1185 /* TODO: only for outputs:
1187 for (j
= 0; j
< 8; j
++) {
1188 if (cp
->xmm
[j
].dirty
)
1195 if (cp
->func
.need_emms
)
1196 mmx_emms(&cp
->func
);
1198 /* Restore FPU control word?
1200 if (cp
->fpucntl
!= RESTORE_FPU
) {
1201 x87_fnclex(&cp
->func
);
1202 x87_fldcw(&cp
->func
, x86_make_disp(regEDX
, get_offset(m
, &m
->fpucntl_restore
)));
1205 x86_pop(&cp
->func
, regEBX
);
1212 * Execute the given vertex program.
1214 * TODO: Integrate the t_vertex.c code here, to build machine vertices
1215 * directly at this point.
1217 * TODO: Eliminate the VB struct entirely and just use
1218 * struct arb_vertex_machine.
1221 _tnl_sse_codegen_vertex_program(struct tnl_compiled_program
*p
)
1223 struct compilation cp
;
1226 assert(emit_func
[OPCODE_ABS
] == emit_ABS
);
1227 assert(emit_func
[OPCODE_MUL
] == emit_MUL
);
1228 assert(emit_func
[OPCODE_XPD
] == emit_XPD
);
1230 _mesa_memset(&cp
, 0, sizeof(cp
));
1234 if (p
->compiled_func
) {
1235 _mesa_free((void *)p
->compiled_func
);
1236 p
->compiled_func
= NULL
;
1239 x86_init_func(&cp
.func
);
1241 cp
.fpucntl
= RESTORE_FPU
;
1244 /* Note ctx state is not referenced in building the function, so it
1245 * depends only on the list of instructions:
1247 if (!build_vertex_program(&cp
)) {
1248 x86_release_func( &cp
.func
);
1253 p
->compiled_func
= (void (*)(struct arb_vp_machine
*))x86_get_func( &cp
.func
);
1262 _tnl_sse_codegen_vertex_program(struct tnl_compiled_program
*p
)
1264 /* Dummy version for when USE_SSE_ASM not defined */