2 * Copyright 2003 Tungsten Graphics, inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 * Keith Whitwell <keithw@tungstengraphics.com>
28 #include "main/glheader.h"
29 #include "main/context.h"
30 #include "main/colormac.h"
31 #include "main/simple_list.h"
32 #include "main/enums.h"
33 #include "swrast/s_chan.h"
34 #include "t_context.h"
37 #if defined(USE_SSE_ASM)
39 #include "x86/rtasm/x86sse.h"
40 #include "x86/common_x86_asm.h"
44 * Number of bytes to allocate for generated SSE functions
46 #define MAX_SSE_CODE_SIZE 1024
/* NOTE(review): fragment of the codegen-state struct (presumably
 * "struct x86_program"; its opening line is missing from this extract).
 * Extraction has dropped several members (original lines 57, 61-62, 64+),
 * e.g. have_sse2 and chan0, which are referenced later in this file.
 */
/* Buffer/state for the code being generated, and the GL context it is
 * generated for.
 */
56 struct x86_function func
;
58 struct gl_context
*ctx
;
/* Safety flags: whether generated code may over-read past the end of the
 * input array / over-write past the end of the output vertex (see
 * emit_load3f_3 and emit_store3f below).
 */
59 GLboolean inputs_safe
;
60 GLboolean outputs_safe
;
/* XMM register caching the per-attribute identity vector (loaded from
 * vtx->identity[0] in build_vertex_emit).
 */
63 struct x86_reg identity
;
/* Return the XMM register holding the cached identity vector.
 * (Function body is missing from this extract -- presumably just
 * "return p->identity;" -- TODO confirm against the original file.)
 */
68 static struct x86_reg
get_identity( struct x86_program
*p
)
/* Load 4 floats into XMM reg 'dest' from a 4-float source: one movups.
 * (Parameter lines for 'dest'/'arg0' and braces are missing from this
 * extract.)
 */
73 static void emit_load4f_4( struct x86_program
*p
,
77 sse_movups(&p
->func
, dest
, arg0
);
/* Load 4 floats from a 3-float source without reading past the input:
 * movss pulls element [2] into lane X; the two shufps merge in the
 * identity's W lane and rotate; the final movlps overwrites the low two
 * lanes from memory.  Result: {x, y, z, identity.w}.
 */
80 static void emit_load4f_3( struct x86_program
*p
,
84 /* Have to jump through some hoops:
91 sse_movss(&p
->func
, dest
, x86_make_disp(arg0
, 8));
92 sse_shufps(&p
->func
, dest
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
93 sse_shufps(&p
->func
, dest
, dest
, SHUF(Y
,Z
,X
,W
) );
94 sse_movlps(&p
->func
, dest
, arg0
);
/* Load 4 floats from a 2-float source: start from the identity vector,
 * then movlps overwrites the low two lanes from memory.
 */
97 static void emit_load4f_2( struct x86_program
*p
,
101 /* Initialize from identity, then pull in low two words:
103 sse_movups(&p
->func
, dest
, get_identity(p
));
104 sse_movlps(&p
->func
, dest
, arg0
);
/* Load 4 floats from a 1-float source: movss the scalar into lane X,
 * then shufps fills lanes Y/Z/W from the identity vector.
 */
107 static void emit_load4f_1( struct x86_program
*p
,
109 struct x86_reg arg0
)
111 /* Pull in low word, then swizzle in identity */
112 sse_movss(&p
->func
, dest
, arg0
);
113 sse_shufps(&p
->func
, dest
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
/* Load 3 floats.  If inputs_safe, a single movups that over-reads one
 * dword past the source is used; otherwise build the value safely from a
 * scalar load of element [2] (broadcast) plus movlps of elements [0..1].
 * (The else-branch brace lines are missing from this extract.)
 */
118 static void emit_load3f_3( struct x86_program
*p
,
120 struct x86_reg arg0
)
122 /* Over-reads by 1 dword - potential SEGV if input is a vertex
125 if (p
->inputs_safe
) {
126 sse_movups(&p
->func
, dest
, arg0
);
133 sse_movss(&p
->func
, dest
, x86_make_disp(arg0
, 8));
134 sse_shufps(&p
->func
, dest
, dest
, SHUF(X
,X
,X
,X
));
135 sse_movlps(&p
->func
, dest
, arg0
);
/* Load 3 floats from a 2-float source: delegate to emit_load4f_2 (the
 * identity fill of the upper lanes is harmless for a 3f destination).
 */
139 static void emit_load3f_2( struct x86_program
*p
,
141 struct x86_reg arg0
)
143 emit_load4f_2(p
, dest
, arg0
);
/* Load 3 floats from a 1-float source: a memory-operand movss, which
 * zeroes the upper three lanes.
 */
146 static void emit_load3f_1( struct x86_program
*p
,
148 struct x86_reg arg0
)
150 /* Loading from memory erases the upper bits. */
151 sse_movss(&p
->func
, dest
, arg0
);
/* Load 2 floats from a 2-float source: movlps fills the low two lanes. */
154 static void emit_load2f_2( struct x86_program
*p
,
156 struct x86_reg arg0
)
158 sse_movlps(&p
->func
, dest
, arg0
);
/* Load 2 floats from a 1-float source: memory-operand movss (upper lanes
 * zeroed; lane Y is not initialized from identity here).
 */
161 static void emit_load2f_1( struct x86_program
*p
,
163 struct x86_reg arg0
)
165 /* Loading from memory erases the upper bits. */
166 sse_movss(&p
->func
, dest
, arg0
);
/* Load 1 float: a single movss. */
169 static void emit_load1f_1( struct x86_program
*p
,
171 struct x86_reg arg0
)
173 sse_movss(&p
->func
, dest
, arg0
);
/* Dispatch table: load[dest_size-1][src_size-1] selects one of the
 * emit_load*_* helpers above.  (The initializer rows are missing from
 * this extract.)
 */
176 static void (*load
[4][4])( struct x86_program
*p
,
178 struct x86_reg arg0
) = {
/* Emit a load of 'src_sz' floats widened/narrowed to 'sz' lanes via the
 * load[][] dispatch table.  Sizes are 1-based; table is 0-based, hence
 * the -1.  (Parameter lines are missing from this extract.)
 */
200 static void emit_load( struct x86_program
*p
,
206 load
[sz
-1][src_sz
-1](p
, dest
, src
);
/* Store 4 floats to (possibly unaligned) memory: one movups. */
209 static void emit_store4f( struct x86_program
*p
,
211 struct x86_reg arg0
)
213 sse_movups(&p
->func
, dest
, arg0
);
/* Store 3 floats.  If outputs_safe, write all 4 dwords (over-writes one
 * dword past the destination).  Otherwise: movlps stores the low two,
 * then a destructive shufps broadcasts lane Z so movss can store the
 * third dword.  NOTE: the fallback clobbers arg0.
 * (else-branch brace lines are missing from this extract.)
 */
216 static void emit_store3f( struct x86_program
*p
,
218 struct x86_reg arg0
)
220 if (p
->outputs_safe
) {
221 /* Emit the extra dword anyway. This may hurt writecombining,
222 * may cause other problems.
224 sse_movups(&p
->func
, dest
, arg0
);
227 /* Alternate strategy - emit two, shuffle, emit one.
229 sse_movlps(&p
->func
, dest
, arg0
);
230 sse_shufps(&p
->func
, arg0
, arg0
, SHUF(Z
,Z
,Z
,Z
) ); /* NOTE! destructive */
231 sse_movss(&p
->func
, x86_make_disp(dest
,8), arg0
);
/* Store 2 floats: movlps writes exactly 8 bytes. */
235 static void emit_store2f( struct x86_program
*p
,
237 struct x86_reg arg0
)
239 sse_movlps(&p
->func
, dest
, arg0
);
/* Store 1 float: movss writes exactly 4 bytes. */
242 static void emit_store1f( struct x86_program
*p
,
244 struct x86_reg arg0
)
246 sse_movss(&p
->func
, dest
, arg0
);
/* Dispatch table: store[size-1] selects one of the emit_store*f helpers
 * above.  (The initializer list is missing from this extract.)
 */
250 static void (*store
[4])( struct x86_program
*p
,
252 struct x86_reg arg0
) =
/* Emit a store of 'sz' floats via the store[] dispatch table (sz is
 * 1-based; table is 0-based, hence the -1).  Parameter lines are missing
 * from this extract.
 */
260 static void emit_store( struct x86_program
*p
,
263 struct x86_reg temp
)
266 store
[sz
-1](p
, dest
, temp
);
/* Scale 4 floats in 'temp' by p->chan0 (the channel scale loaded from
 * vtx->chan_scale) and pack them to 4 unsigned bytes at 'dest'.
 * Two paths: SSE2 (cvtps2dq + packssdw + packuswb, all in XMM) and an
 * MMX fallback (two cvtps2pi + pack ops).  The "if (p->have_sse2)" /
 * else lines separating the paths are missing from this extract --
 * TODO confirm against the original file.  NOTE: the MMX path leaves
 * MMX state dirty; build_vertex_emit checks func.need_emms later.
 */
269 static void emit_pack_store_4ub( struct x86_program
*p
,
271 struct x86_reg temp
)
275 sse_mulps(&p
->func
, temp
, p
->chan0
);
/* SSE2 path: convert, then saturating-pack dwords->words->bytes. */
278 sse2_cvtps2dq(&p
->func
, temp
, temp
);
279 sse2_packssdw(&p
->func
, temp
, temp
);
280 sse2_packuswb(&p
->func
, temp
, temp
);
281 sse_movss(&p
->func
, dest
, temp
);
/* MMX fallback: convert low and high float pairs separately, pack in
 * MMX registers, store 4 bytes with movd.
 */
284 struct x86_reg mmx0
= x86_make_reg(file_MMX
, 0);
285 struct x86_reg mmx1
= x86_make_reg(file_MMX
, 1);
286 sse_cvtps2pi(&p
->func
, mmx0
, temp
);
287 sse_movhlps(&p
->func
, temp
, temp
);
288 sse_cvtps2pi(&p
->func
, mmx1
, temp
);
289 mmx_packssdw(&p
->func
, mmx0
, mmx1
);
290 mmx_packuswb(&p
->func
, mmx0
, mmx0
);
291 mmx_movd(&p
->func
, dest
, mmx0
);
/* Byte offset of 'b' relative to base pointer 'a'; used to address
 * struct members from a runtime base register in generated code.
 */
295 static GLint
get_offset( const void *a
, const void *b
)
297 return (const char *)b
- (const char *)a
;
/* Emit code that loads the current attribute's input pointer
 * (a->inputptr, addressed relative to vtxREG) into srcREG.
 */
300 /* Not much happens here. Eventually use this function to try and
301 * avoid saving/reloading the source pointers each vertex (if some of
302 * them can fit in registers).
304 static void get_src_ptr( struct x86_program
*p
,
305 struct x86_reg srcREG
,
306 struct x86_reg vtxREG
,
307 struct tnl_clipspace_attr
*a
)
309 struct tnl_clipspace
*vtx
= GET_VERTEX_STATE(p
->ctx
);
/* Address of a->inputptr as an offset from the vertex-state base reg. */
310 struct x86_reg ptr_to_src
= x86_make_disp(vtxREG
, get_offset(vtx
, &a
->inputptr
));
312 /* Load current a[j].inputptr
314 x86_mov(&p
->func
, srcREG
, ptr_to_src
);
/* Emit code that advances the attribute's input pointer by its stride
 * and writes the new value back to a->inputptr.  Skipped entirely for
 * zero-stride (constant) attributes.
 */
317 static void update_src_ptr( struct x86_program
*p
,
318 struct x86_reg srcREG
,
319 struct x86_reg vtxREG
,
320 struct tnl_clipspace_attr
*a
)
322 if (a
->inputstride
) {
323 struct tnl_clipspace
*vtx
= GET_VERTEX_STATE(p
->ctx
);
324 struct x86_reg ptr_to_src
= x86_make_disp(vtxREG
, get_offset(vtx
, &a
->inputptr
));
326 /* add a[j].inputstride (hardcoded value - could just as easily
327 * pull the stride value from memory each time).
329 x86_lea(&p
->func
, srcREG
, x86_make_disp(srcREG
, a
->inputstride
));
331 /* save new value of a[j].inputptr
333 x86_mov(&p
->func
, ptr_to_src
, srcREG
);
/* Compile an x86/SSE vertex-emit function for the current vertex layout:
 * for each attribute in vtx->attr[], emit load / optional viewport
 * transform / pack / store code, looping once per vertex.
 * Returns GL_TRUE on success, GL_FALSE for unsupported formats.
 * NOTE(review): this extract is missing many lines of the original
 * (the switch(a->format) statement, several case labels, braces, and
 * loop-increment code) -- the fragments below are kept verbatim.
 */
338 /* Lots of hardcoding
340 * EAX -- pointer to current output vertex
341 * ECX -- pointer to current attribute
344 static GLboolean
build_vertex_emit( struct x86_program
*p
)
346 struct gl_context
*ctx
= p
->ctx
;
347 TNLcontext
*tnl
= TNL_CONTEXT(ctx
);
348 struct tnl_clipspace
*vtx
= GET_VERTEX_STATE(ctx
);
/* Fixed register assignments for the generated code (see comment above):
 * EAX = output vertex, ECX = attribute source, EBP = count, ESI = vtx.
 */
351 struct x86_reg vertexEAX
= x86_make_reg(file_REG32
, reg_AX
);
352 struct x86_reg srcECX
= x86_make_reg(file_REG32
, reg_CX
);
353 struct x86_reg countEBP
= x86_make_reg(file_REG32
, reg_BP
);
354 struct x86_reg vtxESI
= x86_make_reg(file_REG32
, reg_SI
);
355 struct x86_reg temp
= x86_make_reg(file_XMM
, 0);
356 struct x86_reg vp0
= x86_make_reg(file_XMM
, 1);
357 struct x86_reg vp1
= x86_make_reg(file_XMM
, 2);
358 struct x86_reg temp2
= x86_make_reg(file_XMM
, 3);
359 GLubyte
*fixup
, *label
;
/* Prologue: save the callee-saved registers the generated code uses. */
363 x86_push(&p
->func
, countEBP
);
364 x86_push(&p
->func
, vtxESI
);
367 /* Get vertex count, compare to zero
369 x86_xor(&p
->func
, srcECX
, srcECX
);
370 x86_mov(&p
->func
, countEBP
, x86_fn_arg(&p
->func
, 2));
371 x86_cmp(&p
->func
, countEBP
, srcECX
);
372 fixup
= x86_jcc_forward(&p
->func
, cc_E
);
374 /* Initialize destination register.
376 x86_mov(&p
->func
, vertexEAX
, x86_fn_arg(&p
->func
, 3));
378 /* Dereference ctx to get tnl, then vtx:
380 x86_mov(&p
->func
, vtxESI
, x86_fn_arg(&p
->func
, 1));
381 x86_mov(&p
->func
, vtxESI
, x86_make_disp(vtxESI
, get_offset(ctx
, &ctx
->swtnl_context
)));
382 vtxESI
= x86_make_disp(vtxESI
, get_offset(tnl
, &tnl
->clipspace
));
385 /* Possibly load vp0, vp1 for viewport calcs:
387 if (vtx
->need_viewport
) {
388 sse_movups(&p
->func
, vp0
, x86_make_disp(vtxESI
, get_offset(vtx
, &vtx
->vp_scale
[0])));
389 sse_movups(&p
->func
, vp1
, x86_make_disp(vtxESI
, get_offset(vtx
, &vtx
->vp_xlate
[0])));
392 /* always load, needed or not:
394 sse_movups(&p
->func
, p
->chan0
, x86_make_disp(vtxESI
, get_offset(vtx
, &vtx
->chan_scale
[0])));
395 sse_movups(&p
->func
, p
->identity
, x86_make_disp(vtxESI
, get_offset(vtx
, &vtx
->identity
[0])));
397 /* Note address for loop jump */
398 label
= x86_get_label(&p
->func
);
400 /* Emit code for each of the attributes. Currently routes
401 * everything through SSE registers, even when it might be more
402 * efficient to stick with regular old x86. No optimization or
403 * other tricks - enough new ground to cover here just getting
/* Per-attribute emit loop; j indexes vtx->attr[].  The switch on
 * a->format and most of its case labels are missing from this extract;
 * the per-format code fragments follow.
 */
406 while (j
< vtx
->attr_count
) {
407 struct tnl_clipspace_attr
*a
= &vtx
->attr
[j
];
408 struct x86_reg dest
= x86_make_disp(vertexEAX
, a
->vertoffset
);
410 /* Now, load an XMM reg from src, perhaps transform, then save.
411 * Could be shortcircuited in specific cases:
415 get_src_ptr(p
, srcECX
, vtxESI
, a
);
416 emit_load(p
, temp
, 1, x86_deref(srcECX
), a
->inputsize
);
417 emit_store(p
, dest
, 1, temp
);
418 update_src_ptr(p
, srcECX
, vtxESI
, a
);
421 get_src_ptr(p
, srcECX
, vtxESI
, a
);
422 emit_load(p
, temp
, 2, x86_deref(srcECX
), a
->inputsize
);
423 emit_store(p
, dest
, 2, temp
);
424 update_src_ptr(p
, srcECX
, vtxESI
, a
);
427 /* Potentially the worst case - hardcode 2+1 copying:
430 get_src_ptr(p
, srcECX
, vtxESI
, a
);
431 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
432 emit_store(p
, dest
, 3, temp
);
433 update_src_ptr(p
, srcECX
, vtxESI
, a
);
436 get_src_ptr(p
, srcECX
, vtxESI
, a
);
437 emit_load(p
, temp
, 2, x86_deref(srcECX
), a
->inputsize
);
438 emit_store(p
, dest
, 2, temp
);
439 if (a
->inputsize
> 2) {
440 emit_load(p
, temp
, 1, x86_make_disp(srcECX
, 8), 1);
441 emit_store(p
, x86_make_disp(dest
,8), 1, temp
);
444 sse_movss(&p
->func
, x86_make_disp(dest
,8), get_identity(p
));
446 update_src_ptr(p
, srcECX
, vtxESI
, a
);
450 get_src_ptr(p
, srcECX
, vtxESI
, a
);
451 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
452 emit_store(p
, dest
, 4, temp
);
453 update_src_ptr(p
, srcECX
, vtxESI
, a
);
455 case EMIT_2F_VIEWPORT
:
456 get_src_ptr(p
, srcECX
, vtxESI
, a
);
457 emit_load(p
, temp
, 2, x86_deref(srcECX
), a
->inputsize
);
458 sse_mulps(&p
->func
, temp
, vp0
);
459 sse_addps(&p
->func
, temp
, vp1
);
460 emit_store(p
, dest
, 2, temp
);
461 update_src_ptr(p
, srcECX
, vtxESI
, a
);
463 case EMIT_3F_VIEWPORT
:
464 get_src_ptr(p
, srcECX
, vtxESI
, a
);
465 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
466 sse_mulps(&p
->func
, temp
, vp0
);
467 sse_addps(&p
->func
, temp
, vp1
);
468 emit_store(p
, dest
, 3, temp
);
469 update_src_ptr(p
, srcECX
, vtxESI
, a
);
471 case EMIT_4F_VIEWPORT
:
472 get_src_ptr(p
, srcECX
, vtxESI
, a
);
473 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
474 sse_mulps(&p
->func
, temp
, vp0
);
475 sse_addps(&p
->func
, temp
, vp1
);
476 emit_store(p
, dest
, 4, temp
);
477 update_src_ptr(p
, srcECX
, vtxESI
, a
);
480 get_src_ptr(p
, srcECX
, vtxESI
, a
);
481 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
482 sse_shufps(&p
->func
, temp
, temp
, SHUF(X
,Y
,W
,Z
));
483 emit_store(p
, dest
, 3, temp
);
484 update_src_ptr(p
, srcECX
, vtxESI
, a
);
488 /* Test for PAD3 + 1UB:
491 a
[-1].vertoffset
+ a
[-1].vertattrsize
<= a
->vertoffset
- 3)
493 get_src_ptr(p
, srcECX
, vtxESI
, a
);
494 emit_load(p
, temp
, 1, x86_deref(srcECX
), a
->inputsize
);
495 sse_shufps(&p
->func
, temp
, temp
, SHUF(X
,X
,X
,X
));
496 emit_pack_store_4ub(p
, x86_make_disp(dest
, -3), temp
); /* overkill! */
497 update_src_ptr(p
, srcECX
, vtxESI
, a
);
500 printf("Can't emit 1ub %x %x %d\n", a
->vertoffset
, a
[-1].vertoffset
, a
[-1].vertattrsize
);
504 case EMIT_3UB_3F_RGB
:
505 case EMIT_3UB_3F_BGR
:
506 /* Test for 3UB + PAD1:
508 if (j
== vtx
->attr_count
- 1 ||
509 a
[1].vertoffset
>= a
->vertoffset
+ 4) {
510 get_src_ptr(p
, srcECX
, vtxESI
, a
);
511 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
512 if (a
->format
== EMIT_3UB_3F_BGR
)
513 sse_shufps(&p
->func
, temp
, temp
, SHUF(Z
,Y
,X
,W
));
514 emit_pack_store_4ub(p
, dest
, temp
);
515 update_src_ptr(p
, srcECX
, vtxESI
, a
);
517 /* Test for 3UB + 1UB:
519 else if (j
< vtx
->attr_count
- 1 &&
520 a
[1].format
== EMIT_1UB_1F
&&
521 a
[1].vertoffset
== a
->vertoffset
+ 3) {
522 get_src_ptr(p
, srcECX
, vtxESI
, a
);
523 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
524 update_src_ptr(p
, srcECX
, vtxESI
, a
);
526 /* Make room for incoming value:
528 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,X
,Y
,Z
));
530 get_src_ptr(p
, srcECX
, vtxESI
, &a
[1]);
531 emit_load(p
, temp2
, 1, x86_deref(srcECX
), a
[1].inputsize
);
532 sse_movss(&p
->func
, temp
, temp2
);
533 update_src_ptr(p
, srcECX
, vtxESI
, &a
[1]);
535 /* Rearrange and possibly do BGR conversion:
537 if (a
->format
== EMIT_3UB_3F_BGR
)
538 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,Z
,Y
,X
));
540 sse_shufps(&p
->func
, temp
, temp
, SHUF(Y
,Z
,W
,X
));
542 emit_pack_store_4ub(p
, dest
, temp
);
543 j
++; /* NOTE: two attrs consumed */
546 printf("Can't emit 3ub\n");
547 return GL_FALSE
; /* add this later */
551 case EMIT_4UB_4F_RGBA
:
552 get_src_ptr(p
, srcECX
, vtxESI
, a
);
553 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
554 emit_pack_store_4ub(p
, dest
, temp
);
555 update_src_ptr(p
, srcECX
, vtxESI
, a
);
557 case EMIT_4UB_4F_BGRA
:
558 get_src_ptr(p
, srcECX
, vtxESI
, a
);
559 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
560 sse_shufps(&p
->func
, temp
, temp
, SHUF(Z
,Y
,X
,W
));
561 emit_pack_store_4ub(p
, dest
, temp
);
562 update_src_ptr(p
, srcECX
, vtxESI
, a
);
564 case EMIT_4UB_4F_ARGB
:
565 get_src_ptr(p
, srcECX
, vtxESI
, a
);
566 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
567 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,X
,Y
,Z
));
568 emit_pack_store_4ub(p
, dest
, temp
);
569 update_src_ptr(p
, srcECX
, vtxESI
, a
);
571 case EMIT_4UB_4F_ABGR
:
572 get_src_ptr(p
, srcECX
, vtxESI
, a
);
573 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
574 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,Z
,Y
,X
));
575 emit_pack_store_4ub(p
, dest
, temp
);
576 update_src_ptr(p
, srcECX
, vtxESI
, a
);
578 case EMIT_4CHAN_4F_RGBA
:
580 case GL_UNSIGNED_BYTE
:
581 get_src_ptr(p
, srcECX
, vtxESI
, a
);
582 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
583 emit_pack_store_4ub(p
, dest
, temp
);
584 update_src_ptr(p
, srcECX
, vtxESI
, a
);
587 get_src_ptr(p
, srcECX
, vtxESI
, a
);
588 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
589 emit_store(p
, dest
, 4, temp
);
590 update_src_ptr(p
, srcECX
, vtxESI
, a
);
592 case GL_UNSIGNED_SHORT
:
594 printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE
));
599 printf("unknown a[%d].format %d\n", j
, a
->format
);
600 return GL_FALSE
; /* catch any new opcodes */
603 /* Increment j by at least 1 - may have been incremented above also:
/* End of per-vertex loop body: advance output pointer by one vertex. */
610 x86_lea(&p
->func
, vertexEAX
, x86_make_disp(vertexEAX
, vtx
->vertex_size
));
612 /* decr count, loop if not zero
614 x86_dec(&p
->func
, countEBP
);
615 x86_test(&p
->func
, countEBP
, countEBP
);
616 x86_jcc(&p
->func
, cc_NZ
, label
);
620 if (p
->func
.need_emms
)
623 /* Land forward jump here:
625 x86_fixup_fwd_jump(&p
->func
, fixup
);
627 /* Pop regs and return
629 x86_pop(&p
->func
, x86_get_base_reg(vtxESI
));
630 x86_pop(&p
->func
, countEBP
);
/* Publish the generated function and sanity-check the code buffer was
 * not overrun.
 */
634 vtx
->emit
= (tnl_emit_func
)x86_get_func(&p
->func
);
636 assert( (char *) p
->func
.csr
- (char *) p
->func
.store
<= MAX_SSE_CODE_SIZE
);
/* Entry point (USE_SSE_ASM build): set up an x86_program descriptor,
 * allocate a MAX_SSE_CODE_SIZE code buffer, and attempt to codegen the
 * vertex-emit fastpath.  Registers success or failure with the TNL
 * fastpath cache so failed layouts are not retried.
 * (Several lines -- CPU-feature early-out, error-path braces -- are
 * missing from this extract.)
 */
642 void _tnl_generate_sse_emit( struct gl_context
*ctx
)
644 struct tnl_clipspace
*vtx
= GET_VERTEX_STATE(ctx
);
645 struct x86_program p
;
648 vtx
->codegen_emit
= NULL
;
652 memset(&p
, 0, sizeof(p
));
/* Conservative defaults: never over-read inputs or over-write outputs. */
655 p
.inputs_safe
= 0; /* for now */
656 p
.outputs_safe
= 0; /* for now */
657 p
.have_sse2
= cpu_has_xmm2
;
/* Reserve XMM6/XMM7 for the identity and channel-scale constants. */
658 p
.identity
= x86_make_reg(file_XMM
, 6);
659 p
.chan0
= x86_make_reg(file_XMM
, 7);
661 if (!x86_init_func_size(&p
.func
, MAX_SSE_CODE_SIZE
)) {
666 if (build_vertex_emit(&p
)) {
667 _tnl_register_fastpath( vtx
, GL_TRUE
);
670 /* Note the failure so that we don't keep trying to codegen an
/* Failure path: remember the miss and free the code buffer. */
673 _tnl_register_fastpath( vtx
, GL_FALSE
);
674 x86_release_func(&p
.func
);
680 void _tnl_generate_sse_emit( struct gl_context
*ctx
)
682 /* Dummy version for when USE_SSE_ASM not defined */