d4eefdb6fc936c0b81fb7e00226c1fa6953394da
2 * Copyright 2003 Tungsten Graphics, inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 * Keith Whitwell <keithw@tungstengraphics.com>
31 #include "t_context.h"
33 #include "simple_list.h"
36 #if defined(USE_X86_ASM)
48 GLuint mod
:2; /* mod_REG if this is just a register */
49 GLint disp
:24; /* only +/- 23bits of offset - should be enough... */
60 GLboolean inputs_safe
;
61 GLboolean outputs_safe
;
65 struct x86_reg identity
;
73 /* There are more but these are all we'll use:
81 /* Values for mod field of modr/m byte
104 cc_NO
, /* not overflow */
105 cc_NAE
, /* not above or equal / carry */
106 cc_AE
, /* above or equal / not carry */
107 cc_E
, /* equal / zero */
108 cc_NE
/* not equal / not zero */
115 /* Create and manipulate registers and regmem values:
117 static struct x86_reg
make_reg( GLuint file
,
130 static struct x86_reg
make_disp( struct x86_reg reg
,
133 assert(reg
.file
== file_REG32
);
135 if (reg
.mod
== mod_REG
)
141 reg
.mod
= mod_INDIRECT
;
142 else if (reg
.disp
<= 127 && reg
.disp
>= -128)
145 reg
.mod
= mod_DISP32
;
150 static struct x86_reg
deref( struct x86_reg reg
)
152 return make_disp(reg
, 0);
155 static struct x86_reg
get_base_reg( struct x86_reg reg
)
157 return make_reg( reg
.file
, reg
.idx
);
161 /* Retrieve a reference to one of the function arguments, taking into
162 * account any push/pop activity:
164 static struct x86_reg
make_fn_arg( struct x86_program
*p
,
167 return make_disp(make_reg(file_REG32
, reg_SP
),
168 p
->stack_offset
+ arg
* 4); /* ??? */
171 static struct x86_reg
get_identity( struct x86_program
*p
)
177 /* Emit bytes to the instruction stream:
179 static void emit_1b( struct x86_program
*p
, GLbyte b0
)
181 *(GLbyte
*)(p
->csr
++) = b0
;
184 static void emit_1i( struct x86_program
*p
, GLint i0
)
186 *(GLint
*)(p
->csr
) = i0
;
190 static void disassem( struct x86_program
*p
, const char *fn
)
193 static const char *last_fn
;
194 if (fn
&& fn
!= last_fn
) {
195 _mesa_printf("0x%x: %s\n", p
->csr
, fn
);
201 static void emit_1ub_fn( struct x86_program
*p
, GLubyte b0
, const char *fn
)
207 static void emit_2ub_fn( struct x86_program
*p
, GLubyte b0
, GLubyte b1
, const char *fn
)
214 static void emit_3ub_fn( struct x86_program
*p
, GLubyte b0
, GLubyte b1
, GLubyte b2
, const char *fn
)
222 #define emit_1ub(p, b0) emit_1ub_fn(p, b0, __FUNCTION__)
223 #define emit_2ub(p, b0, b1) emit_2ub_fn(p, b0, b1, __FUNCTION__)
224 #define emit_3ub(p, b0, b1, b2) emit_3ub_fn(p, b0, b1, b2, __FUNCTION__)
227 /* Labels, jumps and fixup:
229 static GLubyte
*get_label( struct x86_program
*p
)
234 static void x86_jcc( struct x86_program
*p
,
238 GLint offset
= label
- (get_label(p
) + 2);
240 if (offset
<= 127 && offset
>= -128) {
241 emit_1ub(p
, 0x70 + cc
);
242 emit_1b(p
, (GLbyte
) offset
);
245 offset
= label
- (get_label(p
) + 6);
246 emit_2ub(p
, 0x0f, 0x80 + cc
);
251 /* Always use a 32bit offset for forward jumps:
253 static GLubyte
*x86_jcc_forward( struct x86_program
*p
,
256 emit_2ub(p
, 0x0f, 0x80 + cc
);
261 /* Fixup offset from forward jump:
263 static void do_fixup( struct x86_program
*p
,
266 *(int *)(fixup
- 4) = get_label(p
) - fixup
;
269 static void x86_push( struct x86_program
*p
,
272 assert(reg
.mod
== mod_REG
);
273 emit_1ub(p
, 0x50 + reg
.idx
);
274 p
->stack_offset
+= 4;
277 static void x86_pop( struct x86_program
*p
,
280 assert(reg
.mod
== mod_REG
);
281 emit_1ub(p
, 0x58 + reg
.idx
);
282 p
->stack_offset
-= 4;
285 static void x86_inc( struct x86_program
*p
,
288 assert(reg
.mod
== mod_REG
);
289 emit_1ub(p
, 0x40 + reg
.idx
);
292 static void x86_dec( struct x86_program
*p
,
295 assert(reg
.mod
== mod_REG
);
296 emit_1ub(p
, 0x48 + reg
.idx
);
299 static void x86_ret( struct x86_program
*p
)
304 static void mmx_emms( struct x86_program
*p
)
306 assert(p
->need_emms
);
307 emit_2ub(p
, 0x0f, 0x77);
314 /* Build a modRM byte + possible displacement. No treatment of SIB
315 * indexing. Limitation: there is no way to encode an absolute address.
317 static void emit_modrm( struct x86_program
*p
,
319 struct x86_reg regmem
)
323 assert(reg
.mod
== mod_REG
);
325 val
|= regmem
.mod
<< 6; /* mod field */
326 val
|= reg
.idx
<< 3; /* reg field */
327 val
|= regmem
.idx
; /* r/m field */
329 emit_1ub_fn(p
, val
, 0);
331 /* Oh-oh we've stumbled into the SIB thing.
333 if (regmem
.idx
== reg_SP
) {
334 emit_1ub_fn(p
, 0x24, 0); /* simplistic! */
337 switch (regmem
.mod
) {
342 emit_1b(p
, regmem
.disp
);
345 emit_1i(p
, regmem
.disp
);
348 _mesa_printf("unknown regmem.mod %d\n", regmem
.mod
);
354 /* Many x86 instructions have two opcodes to cope with the situations
355 * where the destination is a register or memory reference
356 * respectively. This function selects the correct opcode based on
357 * the arguments presented.
359 static void emit_op_modrm( struct x86_program
*p
,
360 GLubyte op_dst_is_reg
,
361 GLubyte op_dst_is_mem
,
367 emit_1ub_fn(p
, op_dst_is_reg
, 0);
368 emit_modrm(p
, dst
, src
);
373 assert(src
.mod
== mod_REG
);
374 emit_1ub_fn(p
, op_dst_is_mem
, 0);
375 emit_modrm(p
, src
, dst
);
378 _mesa_printf("unknown dst.mod %d\n", dst
.mod
);
384 static void x86_mov( struct x86_program
*p
,
388 emit_op_modrm( p
, 0x8b, 0x89, dst
, src
);
391 static void x86_xor( struct x86_program
*p
,
395 emit_op_modrm( p
, 0x33, 0x31, dst
, src
);
398 static void x86_cmp( struct x86_program
*p
,
402 emit_op_modrm( p
, 0x3b, 0x39, dst
, src
);
405 static void sse2_movd( struct x86_program
*p
,
409 assert(p
->have_sse2
);
410 emit_2ub(p
, 0x66, X86_TWOB
);
411 emit_op_modrm( p
, 0x6e, 0x7e, dst
, src
);
414 static void mmx_movd( struct x86_program
*p
,
419 emit_1ub(p
, X86_TWOB
);
420 emit_op_modrm( p
, 0x6e, 0x7e, dst
, src
);
423 static void mmx_movq( struct x86_program
*p
,
428 emit_1ub(p
, X86_TWOB
);
429 emit_op_modrm( p
, 0x6f, 0x7f, dst
, src
);
433 static void sse_movss( struct x86_program
*p
,
437 emit_2ub(p
, 0xF3, X86_TWOB
);
438 emit_op_modrm( p
, 0x10, 0x11, dst
, src
);
441 static void sse_movaps( struct x86_program
*p
,
445 emit_1ub(p
, X86_TWOB
);
446 emit_op_modrm( p
, 0x28, 0x29, dst
, src
);
449 static void sse_movups( struct x86_program
*p
,
453 emit_1ub(p
, X86_TWOB
);
454 emit_op_modrm( p
, 0x10, 0x11, dst
, src
);
457 static void sse_movhps( struct x86_program
*p
,
461 assert(dst
.mod
!= mod_REG
|| src
.mod
!= mod_REG
);
462 emit_1ub(p
, X86_TWOB
);
463 emit_op_modrm( p
, 0x16, 0x17, dst
, src
); /* cf movlhps */
466 static void sse_movlps( struct x86_program
*p
,
470 assert(dst
.mod
!= mod_REG
|| src
.mod
!= mod_REG
);
471 emit_1ub(p
, X86_TWOB
);
472 emit_op_modrm( p
, 0x12, 0x13, dst
, src
); /* cf movhlps */
475 /* SSE operations often only have one format, with dest constrained to
478 static void sse_mulps( struct x86_program
*p
,
482 emit_2ub(p
, X86_TWOB
, 0x59);
483 emit_modrm( p
, dst
, src
);
486 static void sse_addps( struct x86_program
*p
,
490 emit_2ub(p
, X86_TWOB
, 0x58);
491 emit_modrm( p
, dst
, src
);
494 static void sse_movhlps( struct x86_program
*p
,
498 assert(dst
.mod
== mod_REG
&& src
.mod
== mod_REG
);
499 emit_2ub(p
, X86_TWOB
, 0x12);
500 emit_modrm( p
, dst
, src
);
503 static void sse_movlhps( struct x86_program
*p
,
507 assert(dst
.mod
== mod_REG
&& src
.mod
== mod_REG
);
508 emit_2ub(p
, X86_TWOB
, 0x16);
509 emit_modrm( p
, dst
, src
);
512 static void sse2_cvtps2dq( struct x86_program
*p
,
516 assert(p
->have_sse2
);
517 emit_3ub(p
, 0x66, X86_TWOB
, 0x5B);
518 emit_modrm( p
, dst
, src
);
521 static void sse2_packssdw( struct x86_program
*p
,
525 assert(p
->have_sse2
);
526 emit_3ub(p
, 0x66, X86_TWOB
, 0x6B);
527 emit_modrm( p
, dst
, src
);
530 static void sse2_packsswb( struct x86_program
*p
,
534 assert(p
->have_sse2
);
535 emit_3ub(p
, 0x66, X86_TWOB
, 0x63);
536 emit_modrm( p
, dst
, src
);
539 static void sse2_packuswb( struct x86_program
*p
,
543 assert(p
->have_sse2
);
544 emit_3ub(p
, 0x66, X86_TWOB
, 0x67);
545 emit_modrm( p
, dst
, src
);
548 static void sse_cvtps2pi( struct x86_program
*p
,
552 assert(dst
.file
== file_MMX
&&
553 (src
.file
== file_XMM
|| src
.mod
!= mod_REG
));
557 emit_2ub(p
, X86_TWOB
, 0x2d);
558 emit_modrm( p
, dst
, src
);
561 static void mmx_packssdw( struct x86_program
*p
,
565 assert(dst
.file
== file_MMX
&&
566 (src
.file
== file_MMX
|| src
.mod
!= mod_REG
));
570 emit_2ub(p
, X86_TWOB
, 0x6b);
571 emit_modrm( p
, dst
, src
);
574 static void mmx_packuswb( struct x86_program
*p
,
578 assert(dst
.file
== file_MMX
&&
579 (src
.file
== file_MMX
|| src
.mod
!= mod_REG
));
583 emit_2ub(p
, X86_TWOB
, 0x67);
584 emit_modrm( p
, dst
, src
);
588 /* Load effective address:
590 static void x86_lea( struct x86_program
*p
,
595 emit_modrm( p
, dst
, src
);
598 static void x86_test( struct x86_program
*p
,
603 emit_modrm( p
, dst
, src
);
610 * Perform a reduced swizzle:
612 static void sse2_pshufd( struct x86_program
*p
,
620 assert(p
->have_sse2
);
621 emit_3ub(p
, 0x66, X86_TWOB
, 0x70);
622 emit_modrm(p
, dest
, arg0
);
623 emit_1ub(p
, (x
|(y
<<2)|(z
<<4)|w
<<6));
627 /* Shufps can also be used to implement a reduced swizzle when dest ==
630 static void sse_shufps( struct x86_program
*p
,
638 emit_2ub(p
, X86_TWOB
, 0xC6);
639 emit_modrm(p
, dest
, arg0
);
640 emit_1ub(p
, (x
|(y
<<2)|(z
<<4)|w
<<6));
644 static void emit_load4f_4( struct x86_program
*p
,
646 struct x86_reg arg0
)
648 sse_movups(p
, dest
, arg0
);
651 static void emit_load4f_3( struct x86_program
*p
,
653 struct x86_reg arg0
)
655 /* Have to jump through some hoops:
662 sse_movss(p
, dest
, make_disp(arg0
, 8));
663 sse_shufps(p
, dest
, get_identity(p
), X
,Y
,Z
,W
);
664 sse_shufps(p
, dest
, dest
, Y
,Z
,X
,W
);
665 sse_movlps(p
, dest
, arg0
);
668 static void emit_load4f_2( struct x86_program
*p
,
670 struct x86_reg arg0
)
672 /* Initialize from identity, then pull in low two words:
674 sse_movups(p
, dest
, get_identity(p
));
675 sse_movlps(p
, dest
, arg0
);
678 static void emit_load4f_1( struct x86_program
*p
,
680 struct x86_reg arg0
)
682 /* Pull in low word, then swizzle in identity */
683 sse_movss(p
, dest
, arg0
);
684 sse_shufps(p
, dest
, get_identity(p
), X
,Y
,Z
,W
);
689 static void emit_load3f_3( struct x86_program
*p
,
691 struct x86_reg arg0
)
693 /* Over-reads by 1 dword - potential SEGV if input is a vertex
696 if (p
->inputs_safe
) {
697 sse_movups(p
, dest
, arg0
);
704 sse_movss(p
, dest
, make_disp(arg0
, 8));
705 sse_shufps(p
, dest
, dest
, X
,X
,X
,X
);
706 sse_movlps(p
, dest
, arg0
);
710 static void emit_load3f_2( struct x86_program
*p
,
712 struct x86_reg arg0
)
714 emit_load4f_2(p
, dest
, arg0
);
717 static void emit_load3f_1( struct x86_program
*p
,
719 struct x86_reg arg0
)
721 emit_load4f_1(p
, dest
, arg0
);
724 static void emit_load2f_2( struct x86_program
*p
,
726 struct x86_reg arg0
)
728 sse_movlps(p
, dest
, arg0
);
731 static void emit_load2f_1( struct x86_program
*p
,
733 struct x86_reg arg0
)
735 emit_load4f_1(p
, dest
, arg0
);
738 static void emit_load1f_1( struct x86_program
*p
,
740 struct x86_reg arg0
)
742 sse_movss(p
, dest
, arg0
);
745 static void (*load
[4][4])( struct x86_program
*p
,
747 struct x86_reg arg0
) = {
769 static void emit_load( struct x86_program
*p
,
776 _mesa_printf("load %d/%d\n", sz
, src_sz
);
778 load
[sz
-1][src_sz
-1](p
, dest
, src
);
781 static void emit_store4f( struct x86_program
*p
,
783 struct x86_reg arg0
)
785 sse_movups(p
, dest
, arg0
);
788 static void emit_store3f( struct x86_program
*p
,
790 struct x86_reg arg0
)
792 if (p
->outputs_safe
) {
793 /* Emit the extra dword anyway. This may hurt writecombining,
794 * may cause other problems.
796 sse_movups(p
, dest
, arg0
);
799 /* Alternate strategy - emit two, shuffle, emit one.
801 sse_movlps(p
, dest
, arg0
);
802 sse_shufps(p
, arg0
, arg0
, Z
, Z
, Z
, Z
); /* NOTE! destructive */
803 sse_movss(p
, make_disp(dest
,8), arg0
);
807 static void emit_store2f( struct x86_program
*p
,
809 struct x86_reg arg0
)
811 sse_movlps(p
, dest
, arg0
);
814 static void emit_store1f( struct x86_program
*p
,
816 struct x86_reg arg0
)
818 sse_movss(p
, dest
, arg0
);
822 static void (*store
[4])( struct x86_program
*p
,
824 struct x86_reg arg0
) =
832 static void emit_store( struct x86_program
*p
,
835 struct x86_reg temp
)
839 _mesa_printf("store %d\n", sz
);
840 store
[sz
-1](p
, dest
, temp
);
843 static void emit_pack_store_4ub( struct x86_program
*p
,
845 struct x86_reg temp
)
849 sse_mulps(p
, temp
, p
->chan0
);
852 sse2_cvtps2dq(p
, temp
, temp
);
853 sse2_packssdw(p
, temp
, temp
);
854 sse2_packuswb(p
, temp
, temp
);
855 sse_movss(p
, dest
, temp
);
858 struct x86_reg mmx0
= make_reg(file_MMX
, 0);
859 struct x86_reg mmx1
= make_reg(file_MMX
, 1);
860 sse_cvtps2pi(p
, mmx0
, temp
);
861 sse_movhlps(p
, temp
, temp
);
862 sse_cvtps2pi(p
, mmx1
, temp
);
863 mmx_packssdw(p
, mmx0
, mmx1
);
864 mmx_packuswb(p
, mmx0
, mmx0
);
865 mmx_movd(p
, dest
, mmx0
);
869 static GLint
get_offset( const void *a
, const void *b
)
871 return (const char *)b
- (const char *)a
;
874 /* Not much happens here. Eventually use this function to try and
875 * avoid saving/reloading the source pointers each vertex (if some of
876 * them can fit in registers).
878 static void get_src_ptr( struct x86_program
*p
,
879 struct x86_reg srcREG
,
880 struct x86_reg vtxREG
,
881 struct tnl_clipspace_attr
*a
)
883 struct tnl_clipspace
*vtx
= GET_VERTEX_STATE(p
->ctx
);
884 struct x86_reg ptr_to_src
= make_disp(vtxREG
, get_offset(vtx
, &a
->inputptr
));
886 /* Load current a[j].inputptr
888 x86_mov(p
, srcREG
, ptr_to_src
);
891 static void update_src_ptr( struct x86_program
*p
,
892 struct x86_reg srcREG
,
893 struct x86_reg vtxREG
,
894 struct tnl_clipspace_attr
*a
)
896 if (a
->inputstride
) {
897 struct tnl_clipspace
*vtx
= GET_VERTEX_STATE(p
->ctx
);
898 struct x86_reg ptr_to_src
= make_disp(vtxREG
, get_offset(vtx
, &a
->inputptr
));
900 /* add a[j].inputstride (hardcoded value - could just as easily
901 * pull the stride value from memory each time).
903 x86_lea(p
, srcREG
, make_disp(srcREG
, a
->inputstride
));
905 /* save new value of a[j].inputptr
907 x86_mov(p
, ptr_to_src
, srcREG
);
912 /* Lots of hardcoding
914 * EAX -- pointer to current output vertex
915 * ECX -- pointer to current attribute
918 static GLboolean
build_vertex_emit( struct x86_program
*p
)
920 GLcontext
*ctx
= p
->ctx
;
921 TNLcontext
*tnl
= TNL_CONTEXT(ctx
);
922 struct tnl_clipspace
*vtx
= GET_VERTEX_STATE(ctx
);
925 struct x86_reg vertexEAX
= make_reg(file_REG32
, reg_AX
);
926 struct x86_reg srcECX
= make_reg(file_REG32
, reg_CX
);
927 struct x86_reg countEBP
= make_reg(file_REG32
, reg_BP
);
928 struct x86_reg vtxESI
= make_reg(file_REG32
, reg_SI
);
929 struct x86_reg temp
= make_reg(file_XMM
, 0);
930 struct x86_reg vp0
= make_reg(file_XMM
, 1);
931 struct x86_reg vp1
= make_reg(file_XMM
, 2);
932 GLubyte
*fixup
, *label
;
938 /* x86_push(p, srcECX); */
939 x86_push(p
, countEBP
);
943 /* Get vertex count, compare to zero
945 x86_xor(p
, srcECX
, srcECX
);
946 x86_mov(p
, countEBP
, make_fn_arg(p
, 2));
947 x86_cmp(p
, countEBP
, srcECX
);
948 fixup
= x86_jcc_forward(p
, cc_E
);
950 /* Initialize destination register.
952 x86_mov(p
, vertexEAX
, make_fn_arg(p
, 3));
954 /* Dereference ctx to get tnl, then vtx:
956 x86_mov(p
, vtxESI
, make_fn_arg(p
, 1));
957 x86_mov(p
, vtxESI
, make_disp(vtxESI
, get_offset(ctx
, &ctx
->swtnl_context
)));
958 vtxESI
= make_disp(vtxESI
, get_offset(tnl
, &tnl
->clipspace
));
961 /* Possibly load vp0, vp1 for viewport calcs:
963 if (vtx
->need_viewport
) {
964 sse_movups(p
, vp0
, make_disp(vtxESI
, get_offset(vtx
, &vtx
->vp_scale
[0])));
965 sse_movups(p
, vp1
, make_disp(vtxESI
, get_offset(vtx
, &vtx
->vp_xlate
[0])));
968 /* always load, needed or not:
970 sse_movups(p
, p
->chan0
, make_disp(vtxESI
, get_offset(vtx
, &vtx
->chan_scale
[0])));
971 sse_movups(p
, p
->identity
, make_disp(vtxESI
, get_offset(vtx
, &vtx
->identity
[0])));
973 /* Note address for loop jump */
974 label
= get_label(p
);
976 /* Emit code for each of the attributes. Currently routes
977 * everything through SSE registers, even when it might be more
978 * efficient to stick with regular old x86. No optimization or
979 * other tricks - enough new ground to cover here just getting
982 while (j
< vtx
->attr_count
) {
983 struct tnl_clipspace_attr
*a
= &vtx
->attr
[j
];
984 struct x86_reg dest
= make_disp(vertexEAX
, a
->vertoffset
);
986 /* Now, load an XMM reg from src, perhaps transform, then save.
987 * Could be shortcircuited in specific cases:
991 get_src_ptr(p
, srcECX
, vtxESI
, a
);
992 emit_load(p
, temp
, 1, deref(srcECX
), a
->inputsize
);
993 emit_store(p
, dest
, 1, temp
);
994 update_src_ptr(p
, srcECX
, vtxESI
, a
);
997 get_src_ptr(p
, srcECX
, vtxESI
, a
);
998 emit_load(p
, temp
, 2, deref(srcECX
), a
->inputsize
);
999 emit_store(p
, dest
, 2, temp
);
1000 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1003 /* Potentially the worst case - hardcode 2+1 copying:
1006 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1007 emit_load(p
, temp
, 3, deref(srcECX
), a
->inputsize
);
1008 emit_store(p
, dest
, 3, temp
);
1009 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1012 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1013 emit_load(p
, temp
, 2, deref(srcECX
), a
->inputsize
);
1014 emit_store(p
, dest
, 2, temp
);
1015 if (a
->inputsize
> 2) {
1016 emit_load(p
, temp
, 1, make_disp(srcECX
, 8), 1);
1017 emit_store(p
, make_disp(dest
,8), 1, temp
);
1020 sse_movss(p
, make_disp(dest
,8), get_identity(p
));
1022 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1026 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1027 emit_load(p
, temp
, 4, deref(srcECX
), a
->inputsize
);
1028 emit_store(p
, dest
, 4, temp
);
1029 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1031 case EMIT_2F_VIEWPORT
:
1032 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1033 emit_load(p
, temp
, 2, deref(srcECX
), a
->inputsize
);
1034 sse_mulps(p
, temp
, vp0
);
1035 sse_addps(p
, temp
, vp1
);
1036 emit_store(p
, dest
, 2, temp
);
1037 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1039 case EMIT_3F_VIEWPORT
:
1040 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1041 emit_load(p
, temp
, 3, deref(srcECX
), a
->inputsize
);
1042 sse_mulps(p
, temp
, vp0
);
1043 sse_addps(p
, temp
, vp1
);
1044 emit_store(p
, dest
, 3, temp
);
1045 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1047 case EMIT_4F_VIEWPORT
:
1048 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1049 emit_load(p
, temp
, 4, deref(srcECX
), a
->inputsize
);
1050 sse_mulps(p
, temp
, vp0
);
1051 sse_addps(p
, temp
, vp1
);
1052 emit_store(p
, dest
, 4, temp
);
1053 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1056 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1057 emit_load(p
, temp
, 4, deref(srcECX
), a
->inputsize
);
1058 sse_shufps(p
, temp
, temp
, X
, Y
, W
, Z
);
1059 emit_store(p
, dest
, 3, temp
);
1060 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1064 /* Test for PAD3 + 1UB:
1067 a
[-1].vertoffset
+ a
[-1].vertattrsize
<= a
->vertoffset
- 3)
1069 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1070 emit_load(p
, temp
, 1, deref(srcECX
), a
->inputsize
);
1071 sse_shufps(p
, temp
, temp
, X
, X
, X
, X
);
1072 emit_pack_store_4ub(p
, make_disp(dest
, -3), temp
); /* overkill! */
1073 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1076 _mesa_printf("Can't emit 1ub %x %x %d\n", a
->vertoffset
, a
[-1].vertoffset
, a
[-1].vertattrsize
);
1080 case EMIT_3UB_3F_RGB
:
1081 case EMIT_3UB_3F_BGR
:
1082 /* Test for 3UB + PAD1:
1084 if (j
== vtx
->attr_count
- 1 ||
1085 a
[1].vertoffset
>= a
->vertoffset
+ 4) {
1086 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1087 emit_load(p
, temp
, 3, deref(srcECX
), a
->inputsize
);
1088 if (a
->format
== EMIT_3UB_3F_BGR
)
1089 sse_shufps(p
, temp
, temp
, Z
, Y
, X
, W
);
1090 emit_pack_store_4ub(p
, dest
, temp
);
1091 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1093 /* Test for 3UB + 1UB:
1095 else if (j
< vtx
->attr_count
- 1 &&
1096 a
[1].format
== EMIT_1UB_1F
&&
1097 a
[1].vertoffset
== a
->vertoffset
+ 3) {
1098 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1099 emit_load(p
, temp
, 3, deref(srcECX
), a
->inputsize
);
1100 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1102 /* Make room for incoming value:
1104 sse_shufps(p
, temp
, temp
, W
, X
, Y
, Z
);
1106 get_src_ptr(p
, srcECX
, vtxESI
, &a
[1]);
1107 emit_load(p
, temp
, 1, deref(srcECX
), a
[1].inputsize
);
1108 update_src_ptr(p
, srcECX
, vtxESI
, &a
[1]);
1110 /* Rearrange and possibly do BGR conversion:
1112 if (a
->format
== EMIT_3UB_3F_BGR
)
1113 sse_shufps(p
, temp
, temp
, W
, Z
, Y
, X
);
1115 sse_shufps(p
, temp
, temp
, Y
, Z
, W
, X
);
1117 emit_pack_store_4ub(p
, dest
, temp
);
1118 j
++; /* NOTE: two attrs consumed */
1121 _mesa_printf("Can't emit 3ub\n");
1123 return GL_FALSE
; /* add this later */
1126 case EMIT_4UB_4F_RGBA
:
1127 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1128 emit_load(p
, temp
, 4, deref(srcECX
), a
->inputsize
);
1129 emit_pack_store_4ub(p
, dest
, temp
);
1130 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1132 case EMIT_4UB_4F_BGRA
:
1133 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1134 emit_load(p
, temp
, 4, deref(srcECX
), a
->inputsize
);
1135 sse_shufps(p
, temp
, temp
, Z
, Y
, X
, W
);
1136 emit_pack_store_4ub(p
, dest
, temp
);
1137 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1139 case EMIT_4UB_4F_ARGB
:
1140 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1141 emit_load(p
, temp
, 4, deref(srcECX
), a
->inputsize
);
1142 sse_shufps(p
, temp
, temp
, W
, X
, Y
, Z
);
1143 emit_pack_store_4ub(p
, dest
, temp
);
1144 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1146 case EMIT_4UB_4F_ABGR
:
1147 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1148 emit_load(p
, temp
, 4, deref(srcECX
), a
->inputsize
);
1149 sse_shufps(p
, temp
, temp
, W
, Z
, Y
, X
);
1150 emit_pack_store_4ub(p
, dest
, temp
);
1151 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1153 case EMIT_4CHAN_4F_RGBA
:
1154 switch (CHAN_TYPE
) {
1155 case GL_UNSIGNED_BYTE
:
1156 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1157 emit_load(p
, temp
, 4, deref(srcECX
), a
->inputsize
);
1158 emit_pack_store_4ub(p
, dest
, temp
);
1159 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1162 get_src_ptr(p
, srcECX
, vtxESI
, a
);
1163 emit_load(p
, temp
, 4, deref(srcECX
), a
->inputsize
);
1164 emit_store(p
, dest
, 4, temp
);
1165 update_src_ptr(p
, srcECX
, vtxESI
, a
);
1167 case GL_UNSIGNED_SHORT
:
1169 _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE
));
1174 _mesa_printf("unknown a[%d].format %d\n", j
, a
->format
);
1175 return GL_FALSE
; /* catch any new opcodes */
1178 /* Increment j by at least 1 - may have been incremented above also:
1185 x86_lea(p
, vertexEAX
, make_disp(vertexEAX
, vtx
->vertex_size
));
1187 /* decr count, loop if not zero
1189 x86_dec(p
, countEBP
);
1190 x86_test(p
, countEBP
, countEBP
);
1191 x86_jcc(p
, cc_NZ
, label
);
1198 /* Land forward jump here:
1202 /* Pop regs and return
1204 x86_pop(p
, get_base_reg(vtxESI
));
1205 x86_pop(p
, countEBP
);
1206 /* x86_pop(p, srcECX); */
1209 vtx
->emit
= (tnl_emit_func
)p
->store
;
1213 #include "x86/common_x86_asm.h"
1216 void _tnl_generate_sse_emit( GLcontext
*ctx
)
1218 struct tnl_clipspace
*vtx
= GET_VERTEX_STATE(ctx
);
1219 struct x86_program p
;
1222 vtx
->codegen_emit
= NULL
;
1226 memset(&p
, 0, sizeof(p
));
1228 p
.store
= MALLOC(1024);
1230 p
.inputs_safe
= 0; /* for now */
1231 p
.outputs_safe
= 1; /* for now */
1232 p
.have_sse2
= cpu_has_xmm2
;
1233 p
.identity
= make_reg(file_XMM
, 6);
1234 p
.chan0
= make_reg(file_XMM
, 7);
1236 if (build_vertex_emit(&p
)) {
1237 _tnl_register_fastpath( vtx
, GL_TRUE
);
1239 _mesa_printf("disassemble 0x%x 0x%x\n", p
.store
, p
.csr
);
1242 /* Note the failure so that we don't keep trying to codegen an
1245 _tnl_register_fastpath( vtx
, GL_FALSE
);
1256 (void)sse2_packsswb
;
1262 void _tnl_generate_sse_emit( GLcontext
*ctx
)
1264 /* Dummy version for when USE_X86_ASM is not defined */