/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */
29 #include "pipe/p_compiler.h"
30 #include "util/u_simple_list.h"
35 #if defined(USE_SSE_ASM)
37 #include "rtasm/rtasm_cpu.h"
38 #include "rtasm/rtasm_x86sse.h"
48 struct x86_function func
;
50 struct draw_vertex_fetch
*vf
;
55 struct x86_reg identity
;
60 static struct x86_reg
get_identity( struct x86_program
*p
)
65 static void emit_load4f_4( struct x86_program
*p
,
69 sse_movups(&p
->func
, dest
, arg0
);
72 static void emit_load4f_3( struct x86_program
*p
,
76 /* Have to jump through some hoops:
83 sse_movss(&p
->func
, dest
, x86_make_disp(arg0
, 8));
84 sse_shufps(&p
->func
, dest
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
85 sse_shufps(&p
->func
, dest
, dest
, SHUF(Y
,Z
,X
,W
) );
86 sse_movlps(&p
->func
, dest
, arg0
);
89 static void emit_load4f_2( struct x86_program
*p
,
93 /* Initialize from identity, then pull in low two words:
95 sse_movups(&p
->func
, dest
, get_identity(p
));
96 sse_movlps(&p
->func
, dest
, arg0
);
99 static void emit_load4f_1( struct x86_program
*p
,
101 struct x86_reg arg0
)
103 /* Pull in low word, then swizzle in identity */
104 sse_movss(&p
->func
, dest
, arg0
);
105 sse_shufps(&p
->func
, dest
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
110 static void emit_load3f_3( struct x86_program
*p
,
112 struct x86_reg arg0
)
114 /* Over-reads by 1 dword - potential SEGV if input is a vertex
117 if (p
->inputs_safe
) {
118 sse_movups(&p
->func
, dest
, arg0
);
125 sse_movss(&p
->func
, dest
, x86_make_disp(arg0
, 8));
126 sse_shufps(&p
->func
, dest
, dest
, SHUF(X
,X
,X
,X
));
127 sse_movlps(&p
->func
, dest
, arg0
);
131 static void emit_load3f_2( struct x86_program
*p
,
133 struct x86_reg arg0
)
135 emit_load4f_2(p
, dest
, arg0
);
138 static void emit_load3f_1( struct x86_program
*p
,
140 struct x86_reg arg0
)
142 emit_load4f_1(p
, dest
, arg0
);
145 static void emit_load2f_2( struct x86_program
*p
,
147 struct x86_reg arg0
)
149 sse_movlps(&p
->func
, dest
, arg0
);
152 static void emit_load2f_1( struct x86_program
*p
,
154 struct x86_reg arg0
)
156 emit_load4f_1(p
, dest
, arg0
);
159 static void emit_load1f_1( struct x86_program
*p
,
161 struct x86_reg arg0
)
163 sse_movss(&p
->func
, dest
, arg0
);
166 static void (*load
[4][4])( struct x86_program
*p
,
168 struct x86_reg arg0
) = {
190 static void emit_load( struct x86_program
*p
,
196 load
[sz
-1][src_sz
-1](p
, dest
, src
);
199 static void emit_store4f( struct x86_program
*p
,
201 struct x86_reg arg0
)
203 sse_movups(&p
->func
, dest
, arg0
);
206 static void emit_store3f( struct x86_program
*p
,
208 struct x86_reg arg0
)
210 if (p
->outputs_safe
) {
211 /* Emit the extra dword anyway. This may hurt writecombining,
212 * may cause other problems.
214 sse_movups(&p
->func
, dest
, arg0
);
217 /* Alternate strategy - emit two, shuffle, emit one.
219 sse_movlps(&p
->func
, dest
, arg0
);
220 sse_shufps(&p
->func
, arg0
, arg0
, SHUF(Z
,Z
,Z
,Z
) ); /* NOTE! destructive */
221 sse_movss(&p
->func
, x86_make_disp(dest
,8), arg0
);
225 static void emit_store2f( struct x86_program
*p
,
227 struct x86_reg arg0
)
229 sse_movlps(&p
->func
, dest
, arg0
);
232 static void emit_store1f( struct x86_program
*p
,
234 struct x86_reg arg0
)
236 sse_movss(&p
->func
, dest
, arg0
);
240 static void (*store
[4])( struct x86_program
*p
,
242 struct x86_reg arg0
) =
250 static void emit_store( struct x86_program
*p
,
253 struct x86_reg temp
)
256 store
[sz
-1](p
, dest
, temp
);
259 static void emit_pack_store_4ub( struct x86_program
*p
,
261 struct x86_reg temp
)
265 sse_mulps(&p
->func
, temp
, p
->chan0
);
268 sse2_cvtps2dq(&p
->func
, temp
, temp
);
269 sse2_packssdw(&p
->func
, temp
, temp
);
270 sse2_packuswb(&p
->func
, temp
, temp
);
271 sse_movss(&p
->func
, dest
, temp
);
274 struct x86_reg mmx0
= x86_make_reg(file_MMX
, 0);
275 struct x86_reg mmx1
= x86_make_reg(file_MMX
, 1);
276 sse_cvtps2pi(&p
->func
, mmx0
, temp
);
277 sse_movhlps(&p
->func
, temp
, temp
);
278 sse_cvtps2pi(&p
->func
, mmx1
, temp
);
279 mmx_packssdw(&p
->func
, mmx0
, mmx1
);
280 mmx_packuswb(&p
->func
, mmx0
, mmx0
);
281 mmx_movd(&p
->func
, dest
, mmx0
);
/* Byte offset of address b relative to address a (b must point into
 * the object starting at a).
 */
static int get_offset( const void *a, const void *b )
{
   return (const char *) b - (const char *) a;
}
290 /* Not much happens here. Eventually use this function to try and
291 * avoid saving/reloading the source pointers each vertex (if some of
292 * them can fit in registers).
294 static void get_src_ptr( struct x86_program
*p
,
295 struct x86_reg srcREG
,
296 struct x86_reg vfREG
,
297 struct draw_vf_attr
*a
)
299 struct draw_vertex_fetch
*vf
= p
->vf
;
300 struct x86_reg ptr_to_src
= x86_make_disp(vfREG
, get_offset(vf
, &a
->inputptr
));
302 /* Load current a[j].inputptr
304 x86_mov(&p
->func
, srcREG
, ptr_to_src
);
307 static void update_src_ptr( struct x86_program
*p
,
308 struct x86_reg srcREG
,
309 struct x86_reg vfREG
,
310 struct draw_vf_attr
*a
)
312 if (a
->inputstride
) {
313 struct draw_vertex_fetch
*vf
= p
->vf
;
314 struct x86_reg ptr_to_src
= x86_make_disp(vfREG
, get_offset(vf
, &a
->inputptr
));
316 /* add a[j].inputstride (hardcoded value - could just as easily
317 * pull the stride value from memory each time).
319 x86_lea(&p
->func
, srcREG
, x86_make_disp(srcREG
, a
->inputstride
));
321 /* save new value of a[j].inputptr
323 x86_mov(&p
->func
, ptr_to_src
, srcREG
);
328 /* Lots of hardcoding
330 * EAX -- pointer to current output vertex
331 * ECX -- pointer to current attribute
334 static boolean
build_vertex_emit( struct x86_program
*p
)
336 struct draw_vertex_fetch
*vf
= p
->vf
;
339 struct x86_reg vertexEAX
= x86_make_reg(file_REG32
, reg_AX
);
340 struct x86_reg srcECX
= x86_make_reg(file_REG32
, reg_CX
);
341 struct x86_reg countEBP
= x86_make_reg(file_REG32
, reg_BP
);
342 struct x86_reg vfESI
= x86_make_reg(file_REG32
, reg_SI
);
343 struct x86_reg temp
= x86_make_reg(file_XMM
, 0);
344 uint8_t *fixup
, *label
;
348 x86_push(&p
->func
, countEBP
);
349 x86_push(&p
->func
, vfESI
);
352 /* Get vertex count, compare to zero
354 x86_xor(&p
->func
, srcECX
, srcECX
);
355 x86_mov(&p
->func
, countEBP
, x86_fn_arg(&p
->func
, 2));
356 x86_cmp(&p
->func
, countEBP
, srcECX
);
357 fixup
= x86_jcc_forward(&p
->func
, cc_E
);
359 /* Initialize destination register.
361 x86_mov(&p
->func
, vertexEAX
, x86_fn_arg(&p
->func
, 3));
363 /* Move argument 1 (vf) into a reg:
365 x86_mov(&p
->func
, vfESI
, x86_fn_arg(&p
->func
, 1));
368 /* always load, needed or not:
370 sse_movups(&p
->func
, p
->identity
, x86_make_disp(vfESI
, get_offset(vf
, &vf
->identity
[0])));
372 /* Note address for loop jump */
373 label
= x86_get_label(&p
->func
);
375 /* Emit code for each of the attributes. Currently routes
376 * everything through SSE registers, even when it might be more
377 * efficient to stick with regular old x86. No optimization or
378 * other tricks - enough new ground to cover here just getting
381 while (j
< vf
->attr_count
) {
382 struct draw_vf_attr
*a
= &vf
->attr
[j
];
383 struct x86_reg dest
= x86_make_disp(vertexEAX
, a
->vertoffset
);
385 /* Now, load an XMM reg from src, perhaps transform, then save.
386 * Could be shortcircuited in specific cases:
390 case DRAW_EMIT_1F_CONST
:
391 get_src_ptr(p
, srcECX
, vfESI
, a
);
392 emit_load(p
, temp
, 1, x86_deref(srcECX
), a
->inputsize
);
393 emit_store(p
, dest
, 1, temp
);
394 update_src_ptr(p
, srcECX
, vfESI
, a
);
397 case DRAW_EMIT_2F_CONST
:
398 get_src_ptr(p
, srcECX
, vfESI
, a
);
399 emit_load(p
, temp
, 2, x86_deref(srcECX
), a
->inputsize
);
400 emit_store(p
, dest
, 2, temp
);
401 update_src_ptr(p
, srcECX
, vfESI
, a
);
404 case DRAW_EMIT_3F_CONST
:
405 /* Potentially the worst case - hardcode 2+1 copying:
408 get_src_ptr(p
, srcECX
, vfESI
, a
);
409 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
410 emit_store(p
, dest
, 3, temp
);
411 update_src_ptr(p
, srcECX
, vfESI
, a
);
414 get_src_ptr(p
, srcECX
, vfESI
, a
);
415 emit_load(p
, temp
, 2, x86_deref(srcECX
), a
->inputsize
);
416 emit_store(p
, dest
, 2, temp
);
417 if (a
->inputsize
> 2) {
418 emit_load(p
, temp
, 1, x86_make_disp(srcECX
, 8), 1);
419 emit_store(p
, x86_make_disp(dest
,8), 1, temp
);
422 sse_movss(&p
->func
, x86_make_disp(dest
,8), get_identity(p
));
424 update_src_ptr(p
, srcECX
, vfESI
, a
);
428 case DRAW_EMIT_4F_CONST
:
429 get_src_ptr(p
, srcECX
, vfESI
, a
);
430 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
431 emit_store(p
, dest
, 4, temp
);
432 update_src_ptr(p
, srcECX
, vfESI
, a
);
434 case DRAW_EMIT_3F_XYW
:
435 get_src_ptr(p
, srcECX
, vfESI
, a
);
436 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
437 sse_shufps(&p
->func
, temp
, temp
, SHUF(X
,Y
,W
,Z
));
438 emit_store(p
, dest
, 3, temp
);
439 update_src_ptr(p
, srcECX
, vfESI
, a
);
442 case DRAW_EMIT_1UB_1F
:
443 /* Test for PAD3 + 1UB:
446 a
[-1].vertoffset
+ a
[-1].vertattrsize
<= a
->vertoffset
- 3)
448 get_src_ptr(p
, srcECX
, vfESI
, a
);
449 emit_load(p
, temp
, 1, x86_deref(srcECX
), a
->inputsize
);
450 sse_shufps(&p
->func
, temp
, temp
, SHUF(X
,X
,X
,X
));
451 emit_pack_store_4ub(p
, x86_make_disp(dest
, -3), temp
); /* overkill! */
452 update_src_ptr(p
, srcECX
, vfESI
, a
);
455 debug_printf("Can't emit 1ub %x %x %d\n",
456 a
->vertoffset
, a
[-1].vertoffset
, a
[-1].vertattrsize
);
460 case DRAW_EMIT_3UB_3F_RGB
:
461 case DRAW_EMIT_3UB_3F_BGR
:
462 /* Test for 3UB + PAD1:
464 if (j
== vf
->attr_count
- 1 ||
465 a
[1].vertoffset
>= a
->vertoffset
+ 4) {
466 get_src_ptr(p
, srcECX
, vfESI
, a
);
467 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
468 if (a
->format
== DRAW_EMIT_3UB_3F_BGR
)
469 sse_shufps(&p
->func
, temp
, temp
, SHUF(Z
,Y
,X
,W
));
470 emit_pack_store_4ub(p
, dest
, temp
);
471 update_src_ptr(p
, srcECX
, vfESI
, a
);
473 /* Test for 3UB + 1UB:
475 else if (j
< vf
->attr_count
- 1 &&
476 a
[1].format
== DRAW_EMIT_1UB_1F
&&
477 a
[1].vertoffset
== a
->vertoffset
+ 3) {
478 get_src_ptr(p
, srcECX
, vfESI
, a
);
479 emit_load(p
, temp
, 3, x86_deref(srcECX
), a
->inputsize
);
480 update_src_ptr(p
, srcECX
, vfESI
, a
);
482 /* Make room for incoming value:
484 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,X
,Y
,Z
));
486 get_src_ptr(p
, srcECX
, vfESI
, &a
[1]);
487 emit_load(p
, temp
, 1, x86_deref(srcECX
), a
[1].inputsize
);
488 update_src_ptr(p
, srcECX
, vfESI
, &a
[1]);
490 /* Rearrange and possibly do BGR conversion:
492 if (a
->format
== DRAW_EMIT_3UB_3F_BGR
)
493 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,Z
,Y
,X
));
495 sse_shufps(&p
->func
, temp
, temp
, SHUF(Y
,Z
,W
,X
));
497 emit_pack_store_4ub(p
, dest
, temp
);
498 j
++; /* NOTE: two attrs consumed */
501 debug_printf("Can't emit 3ub\n");
503 return FALSE
; /* add this later */
506 case DRAW_EMIT_4UB_4F_RGBA
:
507 get_src_ptr(p
, srcECX
, vfESI
, a
);
508 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
509 emit_pack_store_4ub(p
, dest
, temp
);
510 update_src_ptr(p
, srcECX
, vfESI
, a
);
512 case DRAW_EMIT_4UB_4F_BGRA
:
513 get_src_ptr(p
, srcECX
, vfESI
, a
);
514 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
515 sse_shufps(&p
->func
, temp
, temp
, SHUF(Z
,Y
,X
,W
));
516 emit_pack_store_4ub(p
, dest
, temp
);
517 update_src_ptr(p
, srcECX
, vfESI
, a
);
519 case DRAW_EMIT_4UB_4F_ARGB
:
520 get_src_ptr(p
, srcECX
, vfESI
, a
);
521 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
522 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,X
,Y
,Z
));
523 emit_pack_store_4ub(p
, dest
, temp
);
524 update_src_ptr(p
, srcECX
, vfESI
, a
);
526 case DRAW_EMIT_4UB_4F_ABGR
:
527 get_src_ptr(p
, srcECX
, vfESI
, a
);
528 emit_load(p
, temp
, 4, x86_deref(srcECX
), a
->inputsize
);
529 sse_shufps(&p
->func
, temp
, temp
, SHUF(W
,Z
,Y
,X
));
530 emit_pack_store_4ub(p
, dest
, temp
);
531 update_src_ptr(p
, srcECX
, vfESI
, a
);
534 debug_printf("unknown a[%d].format %d\n", j
, a
->format
);
535 return FALSE
; /* catch any new opcodes */
538 /* Increment j by at least 1 - may have been incremented above also:
545 x86_lea(&p
->func
, vertexEAX
, x86_make_disp(vertexEAX
, vf
->vertex_stride
));
547 /* decr count, loop if not zero
549 x86_dec(&p
->func
, countEBP
);
550 x86_test(&p
->func
, countEBP
, countEBP
);
551 x86_jcc(&p
->func
, cc_NZ
, label
);
555 if (p
->func
.need_emms
)
558 /* Land forward jump here:
560 x86_fixup_fwd_jump(&p
->func
, fixup
);
562 /* Pop regs and return
564 x86_pop(&p
->func
, x86_get_base_reg(vfESI
));
565 x86_pop(&p
->func
, countEBP
);
568 vf
->emit
= (draw_vf_emit_func
)x86_get_func(&p
->func
);
574 void draw_vf_generate_sse_emit( struct draw_vertex_fetch
*vf
)
576 struct x86_program p
;
578 if (!rtasm_cpu_has_sse()) {
579 vf
->codegen_emit
= NULL
;
583 memset(&p
, 0, sizeof(p
));
586 p
.inputs_safe
= 0; /* for now */
587 p
.outputs_safe
= 1; /* for now */
588 p
.have_sse2
= rtasm_cpu_has_sse2();
589 p
.identity
= x86_make_reg(file_XMM
, 6);
590 p
.chan0
= x86_make_reg(file_XMM
, 7);
592 x86_init_func(&p
.func
);
594 if (build_vertex_emit(&p
)) {
595 draw_vf_register_fastpath( vf
, TRUE
);
598 /* Note the failure so that we don't keep trying to codegen an
601 draw_vf_register_fastpath( vf
, FALSE
);
602 x86_release_func(&p
.func
);
608 void draw_vf_generate_sse_emit( struct draw_vertex_fetch
*vf
)
610 /* Dummy version for when USE_SSE_ASM not defined */