2 * Copyright 2003 Tungsten Graphics, inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 * Keith Whitwell <keithw@tungstengraphics.com>
28 #include "main/glheader.h"
29 #include "main/colormac.h"
30 #include "main/simple_list.h"
31 #include "main/enums.h"
35 #if defined(USE_SSE_ASM)
37 #include "x86/rtasm/x86sse.h"
38 #include "x86/common_x86_asm.h"
48 struct x86_function func
;
50 struct vertex_fetch
*vf
;
51 GLboolean inputs_safe
;
52 GLboolean outputs_safe
;
55 struct x86_reg identity
;
60 static struct x86_reg
get_identity( struct x86_program
*p
)
65 static void emit_load4f_4( struct x86_program
*p
,
69 sse_movups(&p
->func
, dest
, arg0
);
72 static void emit_load4f_3( struct x86_program
*p
,
76 /* Have to jump through some hoops:
83 sse_movss(&p
->func
, dest
, x86_make_disp(arg0
, 8));
84 sse_shufps(&p
->func
, dest
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
85 sse_shufps(&p
->func
, dest
, dest
, SHUF(Y
,Z
,X
,W
) );
86 sse_movlps(&p
->func
, dest
, arg0
);
89 static void emit_load4f_2( struct x86_program
*p
,
93 /* Initialize from identity, then pull in low two words:
95 sse_movups(&p
->func
, dest
, get_identity(p
));
96 sse_movlps(&p
->func
, dest
, arg0
);
99 static void emit_load4f_1( struct x86_program
*p
,
101 struct x86_reg arg0
)
103 /* Pull in low word, then swizzle in identity */
104 sse_movss(&p
->func
, dest
, arg0
);
105 sse_shufps(&p
->func
, dest
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
110 static void emit_load3f_3( struct x86_program
*p
,
112 struct x86_reg arg0
)
114 /* Over-reads by 1 dword - potential SEGV if input is a vertex
117 if (p
->inputs_safe
) {
118 sse_movups(&p
->func
, dest
, arg0
);
125 sse_movss(&p
->func
, dest
, x86_make_disp(arg0
, 8));
126 sse_shufps(&p
->func
, dest
, dest
, SHUF(X
,X
,X
,X
));
127 sse_movlps(&p
->func
, dest
, arg0
);
131 static void emit_load3f_2( struct x86_program
*p
,
133 struct x86_reg arg0
)
135 emit_load4f_2(p
, dest
, arg0
);
138 static void emit_load3f_1( struct x86_program
*p
,
140 struct x86_reg arg0
)
142 emit_load4f_1(p
, dest
, arg0
);
145 static void emit_load2f_2( struct x86_program
*p
,
147 struct x86_reg arg0
)
149 sse_movlps(&p
->func
, dest
, arg0
);
152 static void emit_load2f_1( struct x86_program
*p
,
154 struct x86_reg arg0
)
156 emit_load4f_1(p
, dest
, arg0
);
159 static void emit_load1f_1( struct x86_program
*p
,
161 struct x86_reg arg0
)
163 sse_movss(&p
->func
, dest
, arg0
);
166 static void (*load
[4][4])( struct x86_program
*p
,
168 struct x86_reg arg0
) = {
190 static void emit_load( struct x86_program
*p
,
196 load
[sz
-1][src_sz
-1](p
, dest
, src
);
199 static void emit_store4f( struct x86_program
*p
,
201 struct x86_reg arg0
)
203 sse_movups(&p
->func
, dest
, arg0
);
206 static void emit_store3f( struct x86_program
*p
,
208 struct x86_reg arg0
)
210 if (p
->outputs_safe
) {
211 /* Emit the extra dword anyway. This may hurt writecombining,
212 * may cause other problems.
214 sse_movups(&p
->func
, dest
, arg0
);
217 /* Alternate strategy - emit two, shuffle, emit one.
219 sse_movlps(&p
->func
, dest
, arg0
);
220 sse_shufps(&p
->func
, arg0
, arg0
, SHUF(Z
,Z
,Z
,Z
) ); /* NOTE! destructive */
221 sse_movss(&p
->func
, x86_make_disp(dest
,8), arg0
);
225 static void emit_store2f( struct x86_program
*p
,
227 struct x86_reg arg0
)
229 sse_movlps(&p
->func
, dest
, arg0
);
232 static void emit_store1f( struct x86_program
*p
,
234 struct x86_reg arg0
)
236 sse_movss(&p
->func
, dest
, arg0
);
240 static void (*store
[4])( struct x86_program
*p
,
242 struct x86_reg arg0
) =
250 static void emit_store( struct x86_program
*p
,
253 struct x86_reg temp
)
256 store
[sz
-1](p
, dest
, temp
);
259 static void emit_pack_store_4ub( struct x86_program
*p
,
261 struct x86_reg temp
)
265 sse_mulps(&p
->func
, temp
, p
->chan0
);
268 sse2_cvtps2dq(&p
->func
, temp
, temp
);
269 sse2_packssdw(&p
->func
, temp
, temp
);
270 sse2_packuswb(&p
->func
, temp
, temp
);
271 sse_movss(&p
->func
, dest
, temp
);
274 struct x86_reg mmx0
= x86_make_reg(file_MMX
, 0);
275 struct x86_reg mmx1
= x86_make_reg(file_MMX
, 1);
276 sse_cvtps2pi(&p
->func
, mmx0
, temp
);
277 sse_movhlps(&p
->func
, temp
, temp
);
278 sse_cvtps2pi(&p
->func
, mmx1
, temp
);
279 mmx_packssdw(&p
->func
, mmx0
, mmx1
);
280 mmx_packuswb(&p
->func
, mmx0
, mmx0
);
281 mmx_movd(&p
->func
, dest
, mmx0
);
285 static GLint
get_offset( const void *a
, const void *b
)
287 return (const char *)b
- (const char *)a
;
290 /* Not much happens here. Eventually use this function to try and
291 * avoid saving/reloading the source pointers each vertex (if some of
292 * them can fit in registers).
294 static void get_src_ptr( struct x86_program
*p
,
295 struct x86_reg srcREG
,
296 struct x86_reg vfREG
,
299 struct vertex_fetch
*vf
= p
->vf
;
300 struct x86_reg ptr_to_src
= x86_make_disp(vfREG
, get_offset(vf
, &a
->inputptr
));
302 /* Load current a[j].inputptr
304 x86_mov(&p
->func
, srcREG
, ptr_to_src
);
307 static void update_src_ptr( struct x86_program
*p
,
308 struct x86_reg srcREG
,
309 struct x86_reg vfREG
,
312 if (a
->inputstride
) {
313 struct vertex_fetch
*vf
= p
->vf
;
314 struct x86_reg ptr_to_src
= x86_make_disp(vfREG
, get_offset(vf
, &a
->inputptr
));
316 /* add a[j].inputstride (hardcoded value - could just as easily
317 * pull the stride value from memory each time).
319 x86_lea(&p
->func
, srcREG
, x86_make_disp(srcREG
, a
->inputstride
));
321 /* save new value of a[j].inputptr
323 x86_mov(&p
->func
, ptr_to_src
, srcREG
);
328 /* Lots of hardcoding
330 * EAX -- pointer to current output vertex
331 * ECX -- pointer to current attribute
334 static GLboolean
build_vertex_emit( struct x86_program
*p
)
336 struct vertex_fetch
*vf
= p
->vf
;
339 struct x86_reg vertexEAX
= x86_make_reg(file_REG32
, reg_AX
);
340 struct x86_reg srcECX
= x86_make_reg(file_REG32
, reg_CX
);
341 struct x86_reg countEBP
= x86_make_reg(file_REG32
, reg_BP
);
342 struct x86_reg vfESI
= x86_make_reg(file_REG32
, reg_SI
);
343 struct x86_reg temp
= x86_make_reg(file_XMM
, 0);
344 struct x86_reg vp0
= x86_make_reg(file_XMM
, 1);
345 struct x86_reg vp1
= x86_make_reg(file_XMM
, 2);
346 GLubyte
*fixup
, *label
;
350 x86_push(&p
->func
, countEBP
);
351 x86_push(&p
->func
, vfESI
);
354 /* Get vertex count, compare to zero
356 x86_xor(&p
->func
, srcECX
, srcECX
);
357 x86_mov(&p
->func
, countEBP
, x86_fn_arg(&p
->func
, 2));
358 x86_cmp(&p
->func
, countEBP
, srcECX
);
359 fixup
= x86_jcc_forward(&p
->func
, cc_E
);
361 /* Initialize destination register.
363 x86_mov(&p
->func
, vertexEAX
, x86_fn_arg(&p
->func
, 3));
365 /* Move argument 1 (vf) into a reg:
367 x86_mov(&p
->func
, vfESI
, x86_fn_arg(&p
->func
, 1));
370 /* Possibly load vp0, vp1 for viewport calcs:
372 if (vf
->allow_viewport_emits
) {
373 sse_movups(&p
->func
, vp0
, x86_make_disp(vfESI
, get_offset(vf
, &vf
->vp
[0])));
374 sse_movups(&p
->func
, vp1
, x86_make_disp(vfESI
, get_offset(vf
, &vf
->vp
[4])));
377 /* always load, needed or not:
379 sse_movups(&p
->func
, p
->chan0
, x86_make_disp(vfESI
, get_offset(vf
, &vf
->chan_scale
[0])));
380 sse_movups(&p
->func
, p
->identity
, x86_make_disp(vfESI
, get_offset(vf
, &vf
->identity
[0])));
382 /* Note address for loop jump */
383 label
= x86_get_label(&p
->func
);
385 /* Emit code for each of the attributes. Currently routes
386 * everything through SSE registers, even when it might be more
387 * efficient to stick with regular old x86. No optimization or
388 * other tricks - enough new ground to cover here just getting
391 while (j
< vf
->attr_count
) {
392 struct vf_attr
*a
= &vf
->attr
[j
];
393 struct x86_reg dest
= x86_make_disp(vertexEAX
, a
->vertoffset
);
395 /* Now, load an XMM reg from src, perhaps transform, then save.
396 * Could be shortcircuited in specific cases:
400 get_src_ptr(p
, srcECX
, vfESI
, a
);
401 emit_load(p
, temp
, 1, x86_deref(srcECX
), a
->inputsize
);
402 emit_store(p
, dest
, 1, temp
);
403 update_src_ptr(p
, srcECX
, vfESI
, a
);
406 get_src_ptr(p
, srcECX
, vfESI
, a
);
407 emit_load(p
, temp
, 2, x86_deref(srcECX
), a
->inputsize
);
408 emit_store(p
, dest
, 2, temp
);
409 update_src_ptr(p
, srcECX
, vfESI
, a
);
412 /* Potentially the worst case - hardcode 2+1 copying:
415 get_src_ptr(p
, srcECX
, vfESI
, a
);
416 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
417 emit_store(p
, dest
, 3, temp
);
418 update_src_ptr(p
, srcECX
, vfESI
, a
);
421 get_src_ptr(p
, srcECX
, vfESI
, a
);
422 emit_load(p
, temp
, 2, x86_deref(srcECX
), a
->inputsize
);
423 emit_store(p
, dest
, 2, temp
);
424 if (a
->inputsize
> 2) {
425 emit_load(p
, temp
, 1, x86_make_disp(srcECX
, 8), 1);
426 emit_store(p
, x86_make_disp(dest
,8), 1, temp
);
429 sse_movss(&p
->func
, x86_make_disp(dest
,8), get_identity(p
));
431 update_src_ptr(p
, srcECX
, vfESI
, a
);
435 get_src_ptr(p
, srcECX
, vfESI
, a
);
436 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
437 emit_store(p
, dest
, 4, temp
);
438 update_src_ptr(p
, srcECX
, vfESI
, a
);
440 case EMIT_2F_VIEWPORT
:
441 get_src_ptr(p
, srcECX
, vfESI
, a
);
442 emit_load(p
, temp
, 2, x86_deref(srcECX
), a
->inputsize
);
443 sse_mulps(&p
->func
, temp
, vp0
);
444 sse_addps(&p
->func
, temp
, vp1
);
445 emit_store(p
, dest
, 2, temp
);
446 update_src_ptr(p
, srcECX
, vfESI
, a
);
448 case EMIT_3F_VIEWPORT
:
449 get_src_ptr(p
, srcECX
, vfESI
, a
);
450 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
451 sse_mulps(&p
->func
, temp
, vp0
);
452 sse_addps(&p
->func
, temp
, vp1
);
453 emit_store(p
, dest
, 3, temp
);
454 update_src_ptr(p
, srcECX
, vfESI
, a
);
456 case EMIT_4F_VIEWPORT
:
457 get_src_ptr(p
, srcECX
, vfESI
, a
);
458 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
459 sse_mulps(&p
->func
, temp
, vp0
);
460 sse_addps(&p
->func
, temp
, vp1
);
461 emit_store(p
, dest
, 4, temp
);
462 update_src_ptr(p
, srcECX
, vfESI
, a
);
465 get_src_ptr(p
, srcECX
, vfESI
, a
);
466 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
467 sse_shufps(&p
->func
, temp
, temp
, SHUF(X
,Y
,W
,Z
));
468 emit_store(p
, dest
, 3, temp
);
469 update_src_ptr(p
, srcECX
, vfESI
, a
);
473 /* Test for PAD3 + 1UB:
476 a
[-1].vertoffset
+ a
[-1].vertattrsize
<= a
->vertoffset
- 3)
478 get_src_ptr(p
, srcECX
, vfESI
, a
);
479 emit_load(p
, temp
, 1, x86_deref(srcECX
), a
->inputsize
);
480 sse_shufps(&p
->func
, temp
, temp
, SHUF(X
,X
,X
,X
));
481 emit_pack_store_4ub(p
, x86_make_disp(dest
, -3), temp
); /* overkill! */
482 update_src_ptr(p
, srcECX
, vfESI
, a
);
485 _mesa_printf("Can't emit 1ub %x %x %d\n", a
->vertoffset
, a
[-1].vertoffset
, a
[-1].vertattrsize
);
489 case EMIT_3UB_3F_RGB
:
490 case EMIT_3UB_3F_BGR
:
491 /* Test for 3UB + PAD1:
493 if (j
== vf
->attr_count
- 1 ||
494 a
[1].vertoffset
>= a
->vertoffset
+ 4) {
495 get_src_ptr(p
, srcECX
, vfESI
, a
);
496 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
497 if (a
->format
== EMIT_3UB_3F_BGR
)
498 sse_shufps(&p
->func
, temp
, temp
, SHUF(Z
,Y
,X
,W
));
499 emit_pack_store_4ub(p
, dest
, temp
);
500 update_src_ptr(p
, srcECX
, vfESI
, a
);
502 /* Test for 3UB + 1UB:
504 else if (j
< vf
->attr_count
- 1 &&
505 a
[1].format
== EMIT_1UB_1F
&&
506 a
[1].vertoffset
== a
->vertoffset
+ 3) {
507 get_src_ptr(p
, srcECX
, vfESI
, a
);
508 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
509 update_src_ptr(p
, srcECX
, vfESI
, a
);
511 /* Make room for incoming value:
513 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,X
,Y
,Z
));
515 get_src_ptr(p
, srcECX
, vfESI
, &a
[1]);
516 emit_load(p
, temp
, 1, x86_deref(srcECX
), a
[1].inputsize
);
517 update_src_ptr(p
, srcECX
, vfESI
, &a
[1]);
519 /* Rearrange and possibly do BGR conversion:
521 if (a
->format
== EMIT_3UB_3F_BGR
)
522 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,Z
,Y
,X
));
524 sse_shufps(&p
->func
, temp
, temp
, SHUF(Y
,Z
,W
,X
));
526 emit_pack_store_4ub(p
, dest
, temp
);
527 j
++; /* NOTE: two attrs consumed */
530 _mesa_printf("Can't emit 3ub\n");
532 return GL_FALSE
; /* add this later */
535 case EMIT_4UB_4F_RGBA
:
536 get_src_ptr(p
, srcECX
, vfESI
, a
);
537 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
538 emit_pack_store_4ub(p
, dest
, temp
);
539 update_src_ptr(p
, srcECX
, vfESI
, a
);
541 case EMIT_4UB_4F_BGRA
:
542 get_src_ptr(p
, srcECX
, vfESI
, a
);
543 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
544 sse_shufps(&p
->func
, temp
, temp
, SHUF(Z
,Y
,X
,W
));
545 emit_pack_store_4ub(p
, dest
, temp
);
546 update_src_ptr(p
, srcECX
, vfESI
, a
);
548 case EMIT_4UB_4F_ARGB
:
549 get_src_ptr(p
, srcECX
, vfESI
, a
);
550 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
551 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,X
,Y
,Z
));
552 emit_pack_store_4ub(p
, dest
, temp
);
553 update_src_ptr(p
, srcECX
, vfESI
, a
);
555 case EMIT_4UB_4F_ABGR
:
556 get_src_ptr(p
, srcECX
, vfESI
, a
);
557 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
558 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,Z
,Y
,X
));
559 emit_pack_store_4ub(p
, dest
, temp
);
560 update_src_ptr(p
, srcECX
, vfESI
, a
);
562 case EMIT_4CHAN_4F_RGBA
:
564 case GL_UNSIGNED_BYTE
:
565 get_src_ptr(p
, srcECX
, vfESI
, a
);
566 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
567 emit_pack_store_4ub(p
, dest
, temp
);
568 update_src_ptr(p
, srcECX
, vfESI
, a
);
571 get_src_ptr(p
, srcECX
, vfESI
, a
);
572 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
573 emit_store(p
, dest
, 4, temp
);
574 update_src_ptr(p
, srcECX
, vfESI
, a
);
576 case GL_UNSIGNED_SHORT
:
578 _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE
));
583 _mesa_printf("unknown a[%d].format %d\n", j
, a
->format
);
584 return GL_FALSE
; /* catch any new opcodes */
587 /* Increment j by at least 1 - may have been incremented above also:
594 x86_lea(&p
->func
, vertexEAX
, x86_make_disp(vertexEAX
, vf
->vertex_stride
));
596 /* decr count, loop if not zero
598 x86_dec(&p
->func
, countEBP
);
599 x86_test(&p
->func
, countEBP
, countEBP
);
600 x86_jcc(&p
->func
, cc_NZ
, label
);
604 if (p
->func
.need_emms
)
607 /* Land forward jump here:
609 x86_fixup_fwd_jump(&p
->func
, fixup
);
611 /* Pop regs and return
613 x86_pop(&p
->func
, x86_get_base_reg(vfESI
));
614 x86_pop(&p
->func
, countEBP
);
617 vf
->emit
= (vf_emit_func
)x86_get_func(&p
->func
);
623 void vf_generate_sse_emit( struct vertex_fetch
*vf
)
625 struct x86_program p
;
628 vf
->codegen_emit
= NULL
;
632 _mesa_memset(&p
, 0, sizeof(p
));
635 p
.inputs_safe
= 0; /* for now */
636 p
.outputs_safe
= 0; /* for now */
637 p
.have_sse2
= cpu_has_xmm2
;
638 p
.identity
= x86_make_reg(file_XMM
, 6);
639 p
.chan0
= x86_make_reg(file_XMM
, 7);
641 x86_init_func(&p
.func
);
643 if (build_vertex_emit(&p
)) {
644 vf_register_fastpath( vf
, GL_TRUE
);
647 /* Note the failure so that we don't keep trying to codegen an
650 vf_register_fastpath( vf
, GL_FALSE
);
651 x86_release_func(&p
.func
);
657 void vf_generate_sse_emit( struct vertex_fetch
*vf
)
659 /* Dummy version for when USE_SSE_ASM not defined */