2 * Copyright 2003 Tungsten Graphics, inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 * Keith Whitwell <keithw@tungstengraphics.com>
29 #include "simple_list.h"
31 #include "pipe/p_compiler.h"
36 #if defined(USE_SSE_ASM)
38 #include "x86/rtasm/x86sse.h"
39 #include "x86/common_x86_asm.h"
49 struct x86_function func
;
51 struct draw_vertex_fetch
*vf
;
56 struct x86_reg identity
;
61 static struct x86_reg
get_identity( struct x86_program
*p
)
66 static void emit_load4f_4( struct x86_program
*p
,
70 sse_movups(&p
->func
, dest
, arg0
);
73 static void emit_load4f_3( struct x86_program
*p
,
77 /* Have to jump through some hoops:
84 sse_movss(&p
->func
, dest
, x86_make_disp(arg0
, 8));
85 sse_shufps(&p
->func
, dest
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
86 sse_shufps(&p
->func
, dest
, dest
, SHUF(Y
,Z
,X
,W
) );
87 sse_movlps(&p
->func
, dest
, arg0
);
90 static void emit_load4f_2( struct x86_program
*p
,
94 /* Initialize from identity, then pull in low two words:
96 sse_movups(&p
->func
, dest
, get_identity(p
));
97 sse_movlps(&p
->func
, dest
, arg0
);
100 static void emit_load4f_1( struct x86_program
*p
,
102 struct x86_reg arg0
)
104 /* Pull in low word, then swizzle in identity */
105 sse_movss(&p
->func
, dest
, arg0
);
106 sse_shufps(&p
->func
, dest
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
111 static void emit_load3f_3( struct x86_program
*p
,
113 struct x86_reg arg0
)
115 /* Over-reads by 1 dword - potential SEGV if input is a vertex
118 if (p
->inputs_safe
) {
119 sse_movups(&p
->func
, dest
, arg0
);
126 sse_movss(&p
->func
, dest
, x86_make_disp(arg0
, 8));
127 sse_shufps(&p
->func
, dest
, dest
, SHUF(X
,X
,X
,X
));
128 sse_movlps(&p
->func
, dest
, arg0
);
132 static void emit_load3f_2( struct x86_program
*p
,
134 struct x86_reg arg0
)
136 emit_load4f_2(p
, dest
, arg0
);
139 static void emit_load3f_1( struct x86_program
*p
,
141 struct x86_reg arg0
)
143 emit_load4f_1(p
, dest
, arg0
);
146 static void emit_load2f_2( struct x86_program
*p
,
148 struct x86_reg arg0
)
150 sse_movlps(&p
->func
, dest
, arg0
);
153 static void emit_load2f_1( struct x86_program
*p
,
155 struct x86_reg arg0
)
157 emit_load4f_1(p
, dest
, arg0
);
160 static void emit_load1f_1( struct x86_program
*p
,
162 struct x86_reg arg0
)
164 sse_movss(&p
->func
, dest
, arg0
);
167 static void (*load
[4][4])( struct x86_program
*p
,
169 struct x86_reg arg0
) = {
191 static void emit_load( struct x86_program
*p
,
197 load
[sz
-1][src_sz
-1](p
, dest
, src
);
200 static void emit_store4f( struct x86_program
*p
,
202 struct x86_reg arg0
)
204 sse_movups(&p
->func
, dest
, arg0
);
207 static void emit_store3f( struct x86_program
*p
,
209 struct x86_reg arg0
)
211 if (p
->outputs_safe
) {
212 /* Emit the extra dword anyway. This may hurt writecombining,
213 * may cause other problems.
215 sse_movups(&p
->func
, dest
, arg0
);
218 /* Alternate strategy - emit two, shuffle, emit one.
220 sse_movlps(&p
->func
, dest
, arg0
);
221 sse_shufps(&p
->func
, arg0
, arg0
, SHUF(Z
,Z
,Z
,Z
) ); /* NOTE! destructive */
222 sse_movss(&p
->func
, x86_make_disp(dest
,8), arg0
);
226 static void emit_store2f( struct x86_program
*p
,
228 struct x86_reg arg0
)
230 sse_movlps(&p
->func
, dest
, arg0
);
233 static void emit_store1f( struct x86_program
*p
,
235 struct x86_reg arg0
)
237 sse_movss(&p
->func
, dest
, arg0
);
241 static void (*store
[4])( struct x86_program
*p
,
243 struct x86_reg arg0
) =
251 static void emit_store( struct x86_program
*p
,
254 struct x86_reg temp
)
257 store
[sz
-1](p
, dest
, temp
);
260 static void emit_pack_store_4ub( struct x86_program
*p
,
262 struct x86_reg temp
)
266 sse_mulps(&p
->func
, temp
, p
->chan0
);
269 sse2_cvtps2dq(&p
->func
, temp
, temp
);
270 sse2_packssdw(&p
->func
, temp
, temp
);
271 sse2_packuswb(&p
->func
, temp
, temp
);
272 sse_movss(&p
->func
, dest
, temp
);
275 struct x86_reg mmx0
= x86_make_reg(file_MMX
, 0);
276 struct x86_reg mmx1
= x86_make_reg(file_MMX
, 1);
277 sse_cvtps2pi(&p
->func
, mmx0
, temp
);
278 sse_movhlps(&p
->func
, temp
, temp
);
279 sse_cvtps2pi(&p
->func
, mmx1
, temp
);
280 mmx_packssdw(&p
->func
, mmx0
, mmx1
);
281 mmx_packuswb(&p
->func
, mmx0
, mmx0
);
282 mmx_movd(&p
->func
, dest
, mmx0
);
/* Byte offset of 'b' relative to 'a'.  Used to address fields of the
 * draw_vertex_fetch struct through the register holding its pointer.
 */
static int get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}
291 /* Not much happens here. Eventually use this function to try and
292 * avoid saving/reloading the source pointers each vertex (if some of
293 * them can fit in registers).
295 static void get_src_ptr( struct x86_program
*p
,
296 struct x86_reg srcREG
,
297 struct x86_reg vfREG
,
298 struct draw_vf_attr
*a
)
300 struct draw_vertex_fetch
*vf
= p
->vf
;
301 struct x86_reg ptr_to_src
= x86_make_disp(vfREG
, get_offset(vf
, &a
->inputptr
));
303 /* Load current a[j].inputptr
305 x86_mov(&p
->func
, srcREG
, ptr_to_src
);
308 static void update_src_ptr( struct x86_program
*p
,
309 struct x86_reg srcREG
,
310 struct x86_reg vfREG
,
311 struct draw_vf_attr
*a
)
313 if (a
->inputstride
) {
314 struct draw_vertex_fetch
*vf
= p
->vf
;
315 struct x86_reg ptr_to_src
= x86_make_disp(vfREG
, get_offset(vf
, &a
->inputptr
));
317 /* add a[j].inputstride (hardcoded value - could just as easily
318 * pull the stride value from memory each time).
320 x86_lea(&p
->func
, srcREG
, x86_make_disp(srcREG
, a
->inputstride
));
322 /* save new value of a[j].inputptr
324 x86_mov(&p
->func
, ptr_to_src
, srcREG
);
329 /* Lots of hardcoding
331 * EAX -- pointer to current output vertex
332 * ECX -- pointer to current attribute
335 static boolean
build_vertex_emit( struct x86_program
*p
)
337 struct draw_vertex_fetch
*vf
= p
->vf
;
340 struct x86_reg vertexEAX
= x86_make_reg(file_REG32
, reg_AX
);
341 struct x86_reg srcECX
= x86_make_reg(file_REG32
, reg_CX
);
342 struct x86_reg countEBP
= x86_make_reg(file_REG32
, reg_BP
);
343 struct x86_reg vfESI
= x86_make_reg(file_REG32
, reg_SI
);
344 struct x86_reg temp
= x86_make_reg(file_XMM
, 0);
345 uint8_t *fixup
, *label
;
349 x86_push(&p
->func
, countEBP
);
350 x86_push(&p
->func
, vfESI
);
353 /* Get vertex count, compare to zero
355 x86_xor(&p
->func
, srcECX
, srcECX
);
356 x86_mov(&p
->func
, countEBP
, x86_fn_arg(&p
->func
, 2));
357 x86_cmp(&p
->func
, countEBP
, srcECX
);
358 fixup
= x86_jcc_forward(&p
->func
, cc_E
);
360 /* Initialize destination register.
362 x86_mov(&p
->func
, vertexEAX
, x86_fn_arg(&p
->func
, 3));
364 /* Move argument 1 (vf) into a reg:
366 x86_mov(&p
->func
, vfESI
, x86_fn_arg(&p
->func
, 1));
369 /* always load, needed or not:
371 sse_movups(&p
->func
, p
->identity
, x86_make_disp(vfESI
, get_offset(vf
, &vf
->identity
[0])));
373 /* Note address for loop jump */
374 label
= x86_get_label(&p
->func
);
376 /* Emit code for each of the attributes. Currently routes
377 * everything through SSE registers, even when it might be more
378 * efficient to stick with regular old x86. No optimization or
379 * other tricks - enough new ground to cover here just getting
382 while (j
< vf
->attr_count
) {
383 struct draw_vf_attr
*a
= &vf
->attr
[j
];
384 struct x86_reg dest
= x86_make_disp(vertexEAX
, a
->vertoffset
);
386 /* Now, load an XMM reg from src, perhaps transform, then save.
387 * Could be shortcircuited in specific cases:
391 case DRAW_EMIT_1F_CONST
:
392 get_src_ptr(p
, srcECX
, vfESI
, a
);
393 emit_load(p
, temp
, 1, x86_deref(srcECX
), a
->inputsize
);
394 emit_store(p
, dest
, 1, temp
);
395 update_src_ptr(p
, srcECX
, vfESI
, a
);
398 case DRAW_EMIT_2F_CONST
:
399 get_src_ptr(p
, srcECX
, vfESI
, a
);
400 emit_load(p
, temp
, 2, x86_deref(srcECX
), a
->inputsize
);
401 emit_store(p
, dest
, 2, temp
);
402 update_src_ptr(p
, srcECX
, vfESI
, a
);
405 case DRAW_EMIT_3F_CONST
:
406 /* Potentially the worst case - hardcode 2+1 copying:
409 get_src_ptr(p
, srcECX
, vfESI
, a
);
410 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
411 emit_store(p
, dest
, 3, temp
);
412 update_src_ptr(p
, srcECX
, vfESI
, a
);
415 get_src_ptr(p
, srcECX
, vfESI
, a
);
416 emit_load(p
, temp
, 2, x86_deref(srcECX
), a
->inputsize
);
417 emit_store(p
, dest
, 2, temp
);
418 if (a
->inputsize
> 2) {
419 emit_load(p
, temp
, 1, x86_make_disp(srcECX
, 8), 1);
420 emit_store(p
, x86_make_disp(dest
,8), 1, temp
);
423 sse_movss(&p
->func
, x86_make_disp(dest
,8), get_identity(p
));
425 update_src_ptr(p
, srcECX
, vfESI
, a
);
429 case DRAW_EMIT_4F_CONST
:
430 get_src_ptr(p
, srcECX
, vfESI
, a
);
431 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
432 emit_store(p
, dest
, 4, temp
);
433 update_src_ptr(p
, srcECX
, vfESI
, a
);
435 case DRAW_EMIT_3F_XYW
:
436 get_src_ptr(p
, srcECX
, vfESI
, a
);
437 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
438 sse_shufps(&p
->func
, temp
, temp
, SHUF(X
,Y
,W
,Z
));
439 emit_store(p
, dest
, 3, temp
);
440 update_src_ptr(p
, srcECX
, vfESI
, a
);
443 case DRAW_EMIT_1UB_1F
:
444 /* Test for PAD3 + 1UB:
447 a
[-1].vertoffset
+ a
[-1].vertattrsize
<= a
->vertoffset
- 3)
449 get_src_ptr(p
, srcECX
, vfESI
, a
);
450 emit_load(p
, temp
, 1, x86_deref(srcECX
), a
->inputsize
);
451 sse_shufps(&p
->func
, temp
, temp
, SHUF(X
,X
,X
,X
));
452 emit_pack_store_4ub(p
, x86_make_disp(dest
, -3), temp
); /* overkill! */
453 update_src_ptr(p
, srcECX
, vfESI
, a
);
456 debug_printf("Can't emit 1ub %x %x %d\n",
457 a
->vertoffset
, a
[-1].vertoffset
, a
[-1].vertattrsize
);
461 case DRAW_EMIT_3UB_3F_RGB
:
462 case DRAW_EMIT_3UB_3F_BGR
:
463 /* Test for 3UB + PAD1:
465 if (j
== vf
->attr_count
- 1 ||
466 a
[1].vertoffset
>= a
->vertoffset
+ 4) {
467 get_src_ptr(p
, srcECX
, vfESI
, a
);
468 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
469 if (a
->format
== DRAW_EMIT_3UB_3F_BGR
)
470 sse_shufps(&p
->func
, temp
, temp
, SHUF(Z
,Y
,X
,W
));
471 emit_pack_store_4ub(p
, dest
, temp
);
472 update_src_ptr(p
, srcECX
, vfESI
, a
);
474 /* Test for 3UB + 1UB:
476 else if (j
< vf
->attr_count
- 1 &&
477 a
[1].format
== DRAW_EMIT_1UB_1F
&&
478 a
[1].vertoffset
== a
->vertoffset
+ 3) {
479 get_src_ptr(p
, srcECX
, vfESI
, a
);
480 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
481 update_src_ptr(p
, srcECX
, vfESI
, a
);
483 /* Make room for incoming value:
485 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,X
,Y
,Z
));
487 get_src_ptr(p
, srcECX
, vfESI
, &a
[1]);
488 emit_load(p
, temp
, 1, x86_deref(srcECX
), a
[1].inputsize
);
489 update_src_ptr(p
, srcECX
, vfESI
, &a
[1]);
491 /* Rearrange and possibly do BGR conversion:
493 if (a
->format
== DRAW_EMIT_3UB_3F_BGR
)
494 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,Z
,Y
,X
));
496 sse_shufps(&p
->func
, temp
, temp
, SHUF(Y
,Z
,W
,X
));
498 emit_pack_store_4ub(p
, dest
, temp
);
499 j
++; /* NOTE: two attrs consumed */
502 debug_printf("Can't emit 3ub\n");
504 return FALSE
; /* add this later */
507 case DRAW_EMIT_4UB_4F_RGBA
:
508 get_src_ptr(p
, srcECX
, vfESI
, a
);
509 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
510 emit_pack_store_4ub(p
, dest
, temp
);
511 update_src_ptr(p
, srcECX
, vfESI
, a
);
513 case DRAW_EMIT_4UB_4F_BGRA
:
514 get_src_ptr(p
, srcECX
, vfESI
, a
);
515 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
516 sse_shufps(&p
->func
, temp
, temp
, SHUF(Z
,Y
,X
,W
));
517 emit_pack_store_4ub(p
, dest
, temp
);
518 update_src_ptr(p
, srcECX
, vfESI
, a
);
520 case DRAW_EMIT_4UB_4F_ARGB
:
521 get_src_ptr(p
, srcECX
, vfESI
, a
);
522 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
523 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,X
,Y
,Z
));
524 emit_pack_store_4ub(p
, dest
, temp
);
525 update_src_ptr(p
, srcECX
, vfESI
, a
);
527 case DRAW_EMIT_4UB_4F_ABGR
:
528 get_src_ptr(p
, srcECX
, vfESI
, a
);
529 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
530 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,Z
,Y
,X
));
531 emit_pack_store_4ub(p
, dest
, temp
);
532 update_src_ptr(p
, srcECX
, vfESI
, a
);
535 debug_printf("unknown a[%d].format %d\n", j
, a
->format
);
536 return FALSE
; /* catch any new opcodes */
539 /* Increment j by at least 1 - may have been incremented above also:
546 x86_lea(&p
->func
, vertexEAX
, x86_make_disp(vertexEAX
, vf
->vertex_stride
));
548 /* decr count, loop if not zero
550 x86_dec(&p
->func
, countEBP
);
551 x86_test(&p
->func
, countEBP
, countEBP
);
552 x86_jcc(&p
->func
, cc_NZ
, label
);
556 if (p
->func
.need_emms
)
559 /* Land forward jump here:
561 x86_fixup_fwd_jump(&p
->func
, fixup
);
563 /* Pop regs and return
565 x86_pop(&p
->func
, x86_get_base_reg(vfESI
));
566 x86_pop(&p
->func
, countEBP
);
569 vf
->emit
= (draw_vf_emit_func
)x86_get_func(&p
->func
);
575 void draw_vf_generate_sse_emit( struct draw_vertex_fetch
*vf
)
577 struct x86_program p
;
580 vf
->codegen_emit
= NULL
;
584 memset(&p
, 0, sizeof(p
));
587 p
.inputs_safe
= 0; /* for now */
588 p
.outputs_safe
= 1; /* for now */
589 p
.have_sse2
= cpu_has_xmm2
;
590 p
.identity
= x86_make_reg(file_XMM
, 6);
591 p
.chan0
= x86_make_reg(file_XMM
, 7);
593 x86_init_func(&p
.func
);
595 if (build_vertex_emit(&p
)) {
596 draw_vf_register_fastpath( vf
, TRUE
);
599 /* Note the failure so that we don't keep trying to codegen an
602 draw_vf_register_fastpath( vf
, FALSE
);
603 x86_release_func(&p
.func
);
609 void draw_vf_generate_sse_emit( struct draw_vertex_fetch
*vf
)
611 /* Dummy version for when USE_SSE_ASM not defined */