2 * Copyright 2003 Tungsten Graphics, inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 * Keith Whitwell <keithw@tungstengraphics.com>
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_math.h"
34 #include "translate.h"
37 #if defined(PIPE_ARCH_X86)
39 #include "rtasm/rtasm_cpu.h"
40 #include "rtasm/rtasm_x86sse.h"
49 typedef void (PIPE_CDECL
*run_func
)( struct translate
*translate
,
52 void *output_buffer
);
54 typedef void (PIPE_CDECL
*run_elts_func
)( struct translate
*translate
,
57 void *output_buffer
);
/* Per-vertex-buffer state read by the generated code.
 *
 * base_ptr and stride are supplied through the set_buffer() hook;
 * ptr is scratch storage that the generated code rewrites while it
 * steps through the vertices.
 */
struct translate_buffer {
   const void *base_ptr;        /* start of the buffer */
   unsigned stride;             /* bytes between consecutive vertices */
   void *ptr;                   /* updated per vertex */
};
66 struct translate_sse
{
67 struct translate translate
;
69 struct x86_function linear_func
;
70 struct x86_function elt_func
;
71 struct x86_function
*func
;
73 boolean loaded_identity
;
75 boolean loaded_inv_255
;
81 struct translate_buffer buffer
[PIPE_MAX_ATTRIBS
];
85 run_elts_func gen_run_elts
;
87 /* these are actually known values, but putting them in a struct
88 * like this is helpful to keep them in sync across the file.
90 struct x86_reg tmp_EAX
;
91 struct x86_reg idx_EBX
; /* either start+i or &elt[i] */
92 struct x86_reg outbuf_ECX
;
93 struct x86_reg machine_EDX
;
94 struct x86_reg count_ESI
; /* decrements to zero */
/* Byte distance from address a to address b (negative when b < a).
 *
 * Used to turn "address of a field" into a displacement relative to
 * the start of the object that the generated code holds in a register.
 * The result is narrowed to int explicitly; callers only pass
 * addresses that lie inside one object.
 */
static int get_offset( const void *a, const void *b )
{
   return (int)((const char *)b - (const char *)a);
}
104 static struct x86_reg
get_identity( struct translate_sse
*p
)
106 struct x86_reg reg
= x86_make_reg(file_XMM
, 6);
108 if (!p
->loaded_identity
) {
109 p
->loaded_identity
= TRUE
;
115 sse_movups(p
->func
, reg
,
116 x86_make_disp(p
->machine_EDX
,
117 get_offset(p
, &p
->identity
[0])));
123 static struct x86_reg
get_255( struct translate_sse
*p
)
125 struct x86_reg reg
= x86_make_reg(file_XMM
, 7);
127 if (!p
->loaded_255
) {
128 p
->loaded_255
= TRUE
;
132 p
->float_255
[3] = 255.0f
;
134 sse_movups(p
->func
, reg
,
135 x86_make_disp(p
->machine_EDX
,
136 get_offset(p
, &p
->float_255
[0])));
142 static struct x86_reg
get_inv_255( struct translate_sse
*p
)
144 struct x86_reg reg
= x86_make_reg(file_XMM
, 5);
146 if (!p
->loaded_inv_255
) {
147 p
->loaded_inv_255
= TRUE
;
151 p
->inv_255
[3] = 1.0f
/ 255.0f
;
153 sse_movups(p
->func
, reg
,
154 x86_make_disp(p
->machine_EDX
,
155 get_offset(p
, &p
->inv_255
[0])));
162 static void emit_load_R32G32B32A32( struct translate_sse
*p
,
164 struct x86_reg arg0
)
166 sse_movups(p
->func
, data
, arg0
);
169 static void emit_load_R32G32B32( struct translate_sse
*p
,
171 struct x86_reg arg0
)
173 /* Have to jump through some hoops:
180 sse_movss(p
->func
, data
, x86_make_disp(arg0
, 8));
181 sse_shufps(p
->func
, data
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
182 sse_shufps(p
->func
, data
, data
, SHUF(Y
,Z
,X
,W
) );
183 sse_movlps(p
->func
, data
, arg0
);
186 static void emit_load_R32G32( struct translate_sse
*p
,
188 struct x86_reg arg0
)
193 sse_movups(p
->func
, data
, get_identity(p
) );
194 sse_movlps(p
->func
, data
, arg0
);
198 static void emit_load_R32( struct translate_sse
*p
,
200 struct x86_reg arg0
)
205 sse_movss(p
->func
, data
, arg0
);
206 sse_orps(p
->func
, data
, get_identity(p
) );
210 static void emit_load_R8G8B8A8_UNORM( struct translate_sse
*p
,
215 /* Load and unpack twice:
217 sse_movss(p
->func
, data
, src
);
218 sse2_punpcklbw(p
->func
, data
, get_identity(p
));
219 sse2_punpcklbw(p
->func
, data
, get_identity(p
));
223 sse2_cvtdq2ps(p
->func
, data
, data
);
228 sse_mulps(p
->func
, data
, get_inv_255(p
));
234 static void emit_store_R32G32B32A32( struct translate_sse
*p
,
236 struct x86_reg dataXMM
)
238 sse_movups(p
->func
, dest
, dataXMM
);
241 static void emit_store_R32G32B32( struct translate_sse
*p
,
243 struct x86_reg dataXMM
)
245 /* Emit two, shuffle, emit one.
247 sse_movlps(p
->func
, dest
, dataXMM
);
248 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(Z
,Z
,Z
,Z
) ); /* NOTE! destructive */
249 sse_movss(p
->func
, x86_make_disp(dest
,8), dataXMM
);
252 static void emit_store_R32G32( struct translate_sse
*p
,
254 struct x86_reg dataXMM
)
256 sse_movlps(p
->func
, dest
, dataXMM
);
259 static void emit_store_R32( struct translate_sse
*p
,
261 struct x86_reg dataXMM
)
263 sse_movss(p
->func
, dest
, dataXMM
);
268 static void emit_store_R8G8B8A8_UNORM( struct translate_sse
*p
,
270 struct x86_reg dataXMM
)
274 sse_mulps(p
->func
, dataXMM
, get_255(p
));
278 sse2_cvtps2dq(p
->func
, dataXMM
, dataXMM
);
279 sse2_packssdw(p
->func
, dataXMM
, dataXMM
);
280 sse2_packuswb(p
->func
, dataXMM
, dataXMM
);
281 sse_movss(p
->func
, dest
, dataXMM
);
288 /* Extended swizzles? Maybe later.
290 static void emit_swizzle( struct translate_sse
*p
,
293 unsigned char shuffle
)
295 sse_shufps(p
->func
, dest
, src
, shuffle
);
299 static boolean
translate_attr( struct translate_sse
*p
,
300 const struct translate_element
*a
,
301 struct x86_reg srcECX
,
302 struct x86_reg dstEAX
)
304 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
306 switch (a
->input_format
) {
307 case PIPE_FORMAT_R32_FLOAT
:
308 emit_load_R32(p
, dataXMM
, srcECX
);
310 case PIPE_FORMAT_R32G32_FLOAT
:
311 emit_load_R32G32(p
, dataXMM
, srcECX
);
313 case PIPE_FORMAT_R32G32B32_FLOAT
:
314 emit_load_R32G32B32(p
, dataXMM
, srcECX
);
316 case PIPE_FORMAT_R32G32B32A32_FLOAT
:
317 emit_load_R32G32B32A32(p
, dataXMM
, srcECX
);
319 case PIPE_FORMAT_B8G8R8A8_UNORM
:
320 emit_load_R8G8B8A8_UNORM(p
, dataXMM
, srcECX
);
321 emit_swizzle(p
, dataXMM
, dataXMM
, SHUF(Z
,Y
,X
,W
));
323 case PIPE_FORMAT_R8G8B8A8_UNORM
:
324 emit_load_R8G8B8A8_UNORM(p
, dataXMM
, srcECX
);
330 switch (a
->output_format
) {
331 case PIPE_FORMAT_R32_FLOAT
:
332 emit_store_R32(p
, dstEAX
, dataXMM
);
334 case PIPE_FORMAT_R32G32_FLOAT
:
335 emit_store_R32G32(p
, dstEAX
, dataXMM
);
337 case PIPE_FORMAT_R32G32B32_FLOAT
:
338 emit_store_R32G32B32(p
, dstEAX
, dataXMM
);
340 case PIPE_FORMAT_R32G32B32A32_FLOAT
:
341 emit_store_R32G32B32A32(p
, dstEAX
, dataXMM
);
343 case PIPE_FORMAT_B8G8R8A8_UNORM
:
344 emit_swizzle(p
, dataXMM
, dataXMM
, SHUF(Z
,Y
,X
,W
));
345 emit_store_R8G8B8A8_UNORM(p
, dstEAX
, dataXMM
);
347 case PIPE_FORMAT_R8G8B8A8_UNORM
:
348 emit_store_R8G8B8A8_UNORM(p
, dstEAX
, dataXMM
);
358 static boolean
init_inputs( struct translate_sse
*p
,
363 for (i
= 0; i
< p
->nr_buffers
; i
++) {
364 struct x86_reg buf_stride
= x86_make_disp(p
->machine_EDX
,
365 get_offset(p
, &p
->buffer
[i
].stride
));
366 struct x86_reg buf_ptr
= x86_make_disp(p
->machine_EDX
,
367 get_offset(p
, &p
->buffer
[i
].ptr
));
368 struct x86_reg buf_base_ptr
= x86_make_disp(p
->machine_EDX
,
369 get_offset(p
, &p
->buffer
[i
].base_ptr
));
370 struct x86_reg elt
= p
->idx_EBX
;
371 struct x86_reg tmp
= p
->tmp_EAX
;
374 /* Calculate pointer to first attrib:
376 x86_mov(p
->func
, tmp
, buf_stride
);
377 x86_imul(p
->func
, tmp
, elt
);
378 x86_add(p
->func
, tmp
, buf_base_ptr
);
381 /* In the linear case, keep the buffer pointer instead of the
384 if (p
->nr_buffers
== 1)
385 x86_mov( p
->func
, elt
, tmp
);
387 x86_mov( p
->func
, buf_ptr
, tmp
);
395 static struct x86_reg
get_buffer_ptr( struct translate_sse
*p
,
400 if (linear
&& p
->nr_buffers
== 1) {
404 struct x86_reg ptr
= p
->tmp_EAX
;
405 struct x86_reg buf_ptr
=
406 x86_make_disp(p
->machine_EDX
,
407 get_offset(p
, &p
->buffer
[buf_idx
].ptr
));
409 x86_mov(p
->func
, ptr
, buf_ptr
);
413 struct x86_reg ptr
= p
->tmp_EAX
;
415 struct x86_reg buf_stride
=
416 x86_make_disp(p
->machine_EDX
,
417 get_offset(p
, &p
->buffer
[buf_idx
].stride
));
419 struct x86_reg buf_base_ptr
=
420 x86_make_disp(p
->machine_EDX
,
421 get_offset(p
, &p
->buffer
[buf_idx
].base_ptr
));
425 /* Calculate pointer to current attrib:
427 x86_mov(p
->func
, ptr
, buf_stride
);
428 x86_imul(p
->func
, ptr
, elt
);
429 x86_add(p
->func
, ptr
, buf_base_ptr
);
436 static boolean
incr_inputs( struct translate_sse
*p
,
439 if (linear
&& p
->nr_buffers
== 1) {
440 struct x86_reg stride
= x86_make_disp(p
->machine_EDX
,
441 get_offset(p
, &p
->buffer
[0].stride
));
443 x86_add(p
->func
, p
->idx_EBX
, stride
);
444 sse_prefetchnta(p
->func
, x86_make_disp(p
->idx_EBX
, 192));
449 /* Is this worthwhile??
451 for (i
= 0; i
< p
->nr_buffers
; i
++) {
452 struct x86_reg buf_ptr
= x86_make_disp(p
->machine_EDX
,
453 get_offset(p
, &p
->buffer
[i
].ptr
));
454 struct x86_reg buf_stride
= x86_make_disp(p
->machine_EDX
,
455 get_offset(p
, &p
->buffer
[i
].stride
));
457 x86_mov(p
->func
, p
->tmp_EAX
, buf_ptr
);
458 x86_add(p
->func
, p
->tmp_EAX
, buf_stride
);
459 if (i
== 0) sse_prefetchnta(p
->func
, x86_make_disp(p
->tmp_EAX
, 192));
460 x86_mov(p
->func
, buf_ptr
, p
->tmp_EAX
);
464 x86_lea(p
->func
, p
->idx_EBX
, x86_make_disp(p
->idx_EBX
, 4));
471 /* Build run( struct translate *machine,
474 * void *output_buffer )
476 * run_elts( struct translate *machine,
479 * void *output_buffer )
483 * EAX -- pointer to current output vertex
484 * ECX -- pointer to current attribute
487 static boolean
build_vertex_emit( struct translate_sse
*p
,
488 struct x86_function
*func
,
494 p
->tmp_EAX
= x86_make_reg(file_REG32
, reg_AX
);
495 p
->idx_EBX
= x86_make_reg(file_REG32
, reg_BX
);
496 p
->outbuf_ECX
= x86_make_reg(file_REG32
, reg_CX
);
497 p
->machine_EDX
= x86_make_reg(file_REG32
, reg_DX
);
498 p
->count_ESI
= x86_make_reg(file_REG32
, reg_SI
);
501 p
->loaded_inv_255
= FALSE
;
502 p
->loaded_255
= FALSE
;
503 p
->loaded_identity
= FALSE
;
505 x86_init_func(p
->func
);
509 x86_push(p
->func
, p
->idx_EBX
);
510 x86_push(p
->func
, p
->count_ESI
);
512 /* Load arguments into regs:
514 x86_mov(p
->func
, p
->machine_EDX
, x86_fn_arg(p
->func
, 1));
515 x86_mov(p
->func
, p
->idx_EBX
, x86_fn_arg(p
->func
, 2));
516 x86_mov(p
->func
, p
->count_ESI
, x86_fn_arg(p
->func
, 3));
517 x86_mov(p
->func
, p
->outbuf_ECX
, x86_fn_arg(p
->func
, 4));
519 /* Get vertex count, compare to zero
521 x86_xor(p
->func
, p
->tmp_EAX
, p
->tmp_EAX
);
522 x86_cmp(p
->func
, p
->count_ESI
, p
->tmp_EAX
);
523 fixup
= x86_jcc_forward(p
->func
, cc_E
);
525 /* always load, needed or not:
527 init_inputs(p
, linear
);
529 /* Note address for loop jump
531 label
= x86_get_label(p
->func
);
533 struct x86_reg elt
= linear
? p
->idx_EBX
: x86_deref(p
->idx_EBX
);
537 for (j
= 0; j
< p
->translate
.key
.nr_elements
; j
++) {
538 const struct translate_element
*a
= &p
->translate
.key
.element
[j
];
540 /* Figure out source pointer address:
542 if (a
->input_buffer
!= last_vb
) {
543 last_vb
= a
->input_buffer
;
544 vb
= get_buffer_ptr(p
, linear
, a
->input_buffer
, elt
);
547 if (!translate_attr( p
, a
,
548 x86_make_disp(vb
, a
->input_offset
),
549 x86_make_disp(p
->outbuf_ECX
, a
->output_offset
)))
553 /* Next output vertex:
557 x86_make_disp(p
->outbuf_ECX
,
558 p
->translate
.key
.output_stride
));
562 incr_inputs( p
, linear
);
565 /* decr count, loop if not zero
567 x86_dec(p
->func
, p
->count_ESI
);
568 x86_jcc(p
->func
, cc_NZ
, label
);
572 if (p
->func
->need_emms
)
575 /* Land forward jump here:
577 x86_fixup_fwd_jump(p
->func
, fixup
);
579 /* Pop regs and return
582 x86_pop(p
->func
, p
->count_ESI
);
583 x86_pop(p
->func
, p
->idx_EBX
);
595 static void translate_sse_set_buffer( struct translate
*translate
,
600 struct translate_sse
*p
= (struct translate_sse
*)translate
;
602 if (buf
< p
->nr_buffers
) {
603 p
->buffer
[buf
].base_ptr
= (char *)ptr
;
604 p
->buffer
[buf
].stride
= stride
;
607 if (0) debug_printf("%s %d/%d: %p %d\n",
614 static void translate_sse_release( struct translate
*translate
)
616 struct translate_sse
*p
= (struct translate_sse
*)translate
;
618 x86_release_func( &p
->linear_func
);
619 x86_release_func( &p
->elt_func
);
624 static void PIPE_CDECL
translate_sse_run_elts( struct translate
*translate
,
625 const unsigned *elts
,
627 void *output_buffer
)
629 struct translate_sse
*p
= (struct translate_sse
*)translate
;
631 p
->gen_run_elts( translate
,
637 static void PIPE_CDECL
translate_sse_run( struct translate
*translate
,
640 void *output_buffer
)
642 struct translate_sse
*p
= (struct translate_sse
*)translate
;
644 p
->gen_run( translate
,
651 struct translate
*translate_sse2_create( const struct translate_key
*key
)
653 struct translate_sse
*p
= NULL
;
656 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
659 p
= CALLOC_STRUCT( translate_sse
);
663 p
->translate
.key
= *key
;
664 p
->translate
.release
= translate_sse_release
;
665 p
->translate
.set_buffer
= translate_sse_set_buffer
;
666 p
->translate
.run_elts
= translate_sse_run_elts
;
667 p
->translate
.run
= translate_sse_run
;
669 for (i
= 0; i
< key
->nr_elements
; i
++)
670 p
->nr_buffers
= MAX2( p
->nr_buffers
, key
->element
[i
].input_buffer
+ 1 );
672 if (0) debug_printf("nr_buffers: %d\n", p
->nr_buffers
);
674 if (!build_vertex_emit(p
, &p
->linear_func
, TRUE
))
677 if (!build_vertex_emit(p
, &p
->elt_func
, FALSE
))
680 p
->gen_run
= (run_func
)x86_get_func(&p
->linear_func
);
681 if (p
->gen_run
== NULL
)
684 p
->gen_run_elts
= (run_elts_func
)x86_get_func(&p
->elt_func
);
685 if (p
->gen_run_elts
== NULL
)
688 return &p
->translate
;
692 translate_sse_release( &p
->translate
);
701 struct translate
*translate_sse2_create( const struct translate_key
*key
)