2 * Copyright 2003 Tungsten Graphics, inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 * Keith Whitwell <keithw@tungstengraphics.com>
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_math.h"
34 #include "translate.h"
37 #if defined(PIPE_ARCH_X86)
39 #include "rtasm/rtasm_cpu.h"
40 #include "rtasm/rtasm_x86sse.h"
/* Per-vertex-buffer state (base pointer, stride, max_index per later uses
 * of buffer->base_ptr / buffer->stride; field lines elided in this view).
 */
49 struct translate_buffer
{
/* One (buffer, instance divisor) combination; several vertex elements may
 * share a single varient, and several varients may reference one buffer.
 */
55 struct translate_buffer_varient
{
56 unsigned buffer_index
;
57 unsigned instance_divisor
;
58 void *ptr
; /* updated either per vertex or per instance */
/* Sentinel varient index meaning "source is the instance ID, not a buffer". */
62 #define ELEMENT_BUFFER_INSTANCE_ID 1001
/* Code-generator state: the public translate vtable, one x86_function per
 * generated entry point (linear / 32-, 16-, 8-bit element paths), lazy-load
 * flags for constant XMM registers, and the buffer/varient tables.
 */
65 struct translate_sse
{
66 struct translate translate
;
68 struct x86_function linear_func
;
69 struct x86_function elt_func
;
70 struct x86_function elt16_func
;
71 struct x86_function elt8_func
;
/* Points at whichever of the four functions is currently being emitted. */
72 struct x86_function
*func
;
74 boolean loaded_identity
;
76 boolean loaded_inv_255
;
82 struct translate_buffer buffer
[PIPE_MAX_ATTRIBS
];
85 /* Multiple buffer varients can map to a single buffer. */
86 struct translate_buffer_varient buffer_varient
[PIPE_MAX_ATTRIBS
];
87 unsigned nr_buffer_varients
;
89 /* Multiple elements can map to a single buffer varient. */
90 unsigned element_to_buffer_varient
[PIPE_MAX_ATTRIBS
];
92 boolean use_instancing
;
95 /* these are actually known values, but putting them in a struct
96 * like this is helpful to keep them in sync across the file.
98 struct x86_reg tmp_EAX
;
99 struct x86_reg idx_EBX
; /* either start+i or &elt[i] */
100 struct x86_reg outbuf_ECX
;
101 struct x86_reg machine_EDX
;
102 struct x86_reg count_ESI
; /* decrements to zero */
/* Byte offset of b relative to a; used to address translate_sse fields
 * through the machine pointer held in EDX at runtime.
 */
105 static int get_offset( const void *a
, const void *b
)
107 return (const char *)b
- (const char *)a
;
/* Return XMM6 holding p->identity, emitting the movups load only the first
 * time it is needed in the current function (loaded_identity caches this).
 */
112 static struct x86_reg
get_identity( struct translate_sse
*p
)
114 struct x86_reg reg
= x86_make_reg(file_XMM
, 6);
116 if (!p
->loaded_identity
) {
117 p
->loaded_identity
= TRUE
;
123 sse_movups(p
->func
, reg
,
124 x86_make_disp(p
->machine_EDX
,
125 get_offset(p
, &p
->identity
[0])));
/* Return XMM7 holding the float_255 constant vector (W set to 255.0f here;
 * other lanes presumably set elsewhere — elided from this view), loading it
 * lazily on first use per generated function.
 */
131 static struct x86_reg
get_255( struct translate_sse
*p
)
133 struct x86_reg reg
= x86_make_reg(file_XMM
, 7);
135 if (!p
->loaded_255
) {
136 p
->loaded_255
= TRUE
;
140 p
->float_255
[3] = 255.0f
;
142 sse_movups(p
->func
, reg
,
143 x86_make_disp(p
->machine_EDX
,
144 get_offset(p
, &p
->float_255
[0])));
/* Return XMM5 holding 1/255 (W lane written here), lazily loaded once per
 * generated function; used to normalize unpacked UNORM bytes to floats.
 */
150 static struct x86_reg
get_inv_255( struct translate_sse
*p
)
152 struct x86_reg reg
= x86_make_reg(file_XMM
, 5);
154 if (!p
->loaded_inv_255
) {
155 p
->loaded_inv_255
= TRUE
;
159 p
->inv_255
[3] = 1.0f
/ 255.0f
;
161 sse_movups(p
->func
, reg
,
162 x86_make_disp(p
->machine_EDX
,
163 get_offset(p
, &p
->inv_255
[0])));
/* Emit an unaligned 4-float load from arg0 into `data` (the data parameter
 * line is elided from this view).
 */
170 static void emit_load_R32G32B32A32( struct translate_sse
*p
,
172 struct x86_reg arg0
)
174 sse_movups(p
->func
, data
, arg0
);
/* Emit a 3-float load: load Z as a scalar, merge with the identity vector
 * (so W becomes 1.0), rotate lanes, then movlps the XY pair — avoiding any
 * read past the 12-byte source.
 */
177 static void emit_load_R32G32B32( struct translate_sse
*p
,
179 struct x86_reg arg0
)
181 /* Have to jump through some hoops:
188 sse_movss(p
->func
, data
, x86_make_disp(arg0
, 8));
189 sse_shufps(p
->func
, data
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
190 sse_shufps(p
->func
, data
, data
, SHUF(Y
,Z
,X
,W
) );
191 sse_movlps(p
->func
, data
, arg0
);
/* Emit a 2-float load: start from the identity vector (ZW defaults) and
 * overwrite the low two lanes from memory with movlps.
 */
194 static void emit_load_R32G32( struct translate_sse
*p
,
196 struct x86_reg arg0
)
201 sse_movups(p
->func
, data
, get_identity(p
) );
202 sse_movlps(p
->func
, data
, arg0
);
/* Emit a 1-float load: movss zeroes the upper lanes, then OR in the
 * identity vector to supply the default YZW values.
 */
206 static void emit_load_R32( struct translate_sse
*p
,
208 struct x86_reg arg0
)
213 sse_movss(p
->func
, data
, arg0
);
214 sse_orps(p
->func
, data
, get_identity(p
) );
/* Emit code turning 4 packed UNORM bytes into 4 floats in [0,1]: widen
 * bytes to dwords via two punpcklbw (identity provides zero-ish high
 * halves — see original comment, partially elided), convert to float,
 * then scale by 1/255.
 */
218 static void emit_load_R8G8B8A8_UNORM( struct translate_sse
*p
,
223 /* Load and unpack twice:
225 sse_movss(p
->func
, data
, src
);
226 sse2_punpcklbw(p
->func
, data
, get_identity(p
));
227 sse2_punpcklbw(p
->func
, data
, get_identity(p
));
231 sse2_cvtdq2ps(p
->func
, data
, data
);
236 sse_mulps(p
->func
, data
, get_inv_255(p
));
/* Emit an unaligned 4-float store of dataXMM to dest. */
242 static void emit_store_R32G32B32A32( struct translate_sse
*p
,
244 struct x86_reg dataXMM
)
246 sse_movups(p
->func
, dest
, dataXMM
);
/* Emit a 3-float store without touching byte 12..15 of dest: store XY,
 * broadcast Z across the register (clobbering dataXMM), store Z scalar.
 */
249 static void emit_store_R32G32B32( struct translate_sse
*p
,
251 struct x86_reg dataXMM
)
253 /* Emit two, shuffle, emit one.
255 sse_movlps(p
->func
, dest
, dataXMM
);
256 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(Z
,Z
,Z
,Z
) ); /* NOTE! destructive */
257 sse_movss(p
->func
, x86_make_disp(dest
,8), dataXMM
);
/* Emit a 2-float store (low 64 bits of dataXMM) to dest. */
260 static void emit_store_R32G32( struct translate_sse
*p
,
262 struct x86_reg dataXMM
)
264 sse_movlps(p
->func
, dest
, dataXMM
);
/* Emit a single-float store (low 32 bits of dataXMM) to dest. */
267 static void emit_store_R32( struct translate_sse
*p
,
269 struct x86_reg dataXMM
)
271 sse_movss(p
->func
, dest
, dataXMM
);
/* Emit code packing 4 floats in [0,1] to 4 UNORM bytes: scale by 255,
 * convert to dwords, narrow dword->word->byte with saturation, then store
 * the packed 32 bits. Clobbers dataXMM.
 */
276 static void emit_store_R8G8B8A8_UNORM( struct translate_sse
*p
,
278 struct x86_reg dataXMM
)
282 sse_mulps(p
->func
, dataXMM
, get_255(p
));
286 sse2_cvtps2dq(p
->func
, dataXMM
, dataXMM
);
287 sse2_packssdw(p
->func
, dataXMM
, dataXMM
);
288 sse2_packuswb(p
->func
, dataXMM
, dataXMM
);
289 sse_movss(p
->func
, dest
, dataXMM
);
/* Emit a lane permutation of src into dest via shufps with the given
 * immediate shuffle mask (built with SHUF()).
 */
296 /* Extended swizzles? Maybe later.
298 static void emit_swizzle( struct translate_sse
*p
,
301 unsigned char shuffle
)
303 sse_shufps(p
->func
, dest
, src
, shuffle
);
/* Emit load/convert/store code for one vertex element: dispatch on the
 * input format to load+unpack into XMM0, then on the output format to
 * pack+store. BGRA paths add a Z/X lane swap around the RGBA helpers.
 * Returns a boolean (FALSE presumably for unsupported formats — the
 * default cases are elided from this view; `break` lines likewise elided).
 */
307 static boolean
translate_attr( struct translate_sse
*p
,
308 const struct translate_element
*a
,
309 struct x86_reg srcECX
,
310 struct x86_reg dstEAX
)
312 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
314 switch (a
->input_format
) {
315 case PIPE_FORMAT_R32_FLOAT
:
316 emit_load_R32(p
, dataXMM
, srcECX
);
318 case PIPE_FORMAT_R32G32_FLOAT
:
319 emit_load_R32G32(p
, dataXMM
, srcECX
);
321 case PIPE_FORMAT_R32G32B32_FLOAT
:
322 emit_load_R32G32B32(p
, dataXMM
, srcECX
);
324 case PIPE_FORMAT_R32G32B32A32_FLOAT
:
325 emit_load_R32G32B32A32(p
, dataXMM
, srcECX
);
327 case PIPE_FORMAT_B8G8R8A8_UNORM
:
328 emit_load_R8G8B8A8_UNORM(p
, dataXMM
, srcECX
);
329 emit_swizzle(p
, dataXMM
, dataXMM
, SHUF(Z
,Y
,X
,W
));
331 case PIPE_FORMAT_R8G8B8A8_UNORM
:
332 emit_load_R8G8B8A8_UNORM(p
, dataXMM
, srcECX
);
/* Second dispatch: pack XMM0 out to the destination format. */
338 switch (a
->output_format
) {
339 case PIPE_FORMAT_R32_FLOAT
:
340 emit_store_R32(p
, dstEAX
, dataXMM
);
342 case PIPE_FORMAT_R32G32_FLOAT
:
343 emit_store_R32G32(p
, dstEAX
, dataXMM
);
345 case PIPE_FORMAT_R32G32B32_FLOAT
:
346 emit_store_R32G32B32(p
, dstEAX
, dataXMM
);
348 case PIPE_FORMAT_R32G32B32A32_FLOAT
:
349 emit_store_R32G32B32A32(p
, dstEAX
, dataXMM
);
351 case PIPE_FORMAT_B8G8R8A8_UNORM
:
352 emit_swizzle(p
, dataXMM
, dataXMM
, SHUF(Z
,Y
,X
,W
));
353 emit_store_R8G8B8A8_UNORM(p
, dstEAX
, dataXMM
);
355 case PIPE_FORMAT_R8G8B8A8_UNORM
:
356 emit_store_R8G8B8A8_UNORM(p
, dstEAX
, dataXMM
);
/* Emit prologue code that initializes each buffer varient's runtime
 * pointer (varient->ptr) for the linear and instanced paths:
 * ptr = base_ptr + stride * index, where index is the start index (EBX)
 * or instance_id / instance_divisor for instanced varients.
 */
366 static boolean
init_inputs( struct translate_sse
*p
,
367 unsigned index_size
)
370 struct x86_reg instance_id
= x86_make_disp(p
->machine_EDX
,
371 get_offset(p
, &p
->instance_id
));
373 for (i
= 0; i
< p
->nr_buffer_varients
; i
++) {
374 struct translate_buffer_varient
*varient
= &p
->buffer_varient
[i
];
375 struct translate_buffer
*buffer
= &p
->buffer
[varient
->buffer_index
];
/* Elt-indexed, non-instanced varients compute their pointer per vertex
 * in get_buffer_ptr() instead, so they need no initialization here. */
377 if (!index_size
|| varient
->instance_divisor
) {
378 struct x86_reg buf_stride
= x86_make_disp(p
->machine_EDX
,
379 get_offset(p
, &buffer
->stride
));
380 struct x86_reg buf_ptr
= x86_make_disp(p
->machine_EDX
,
381 get_offset(p
, &varient
->ptr
));
382 struct x86_reg buf_base_ptr
= x86_make_disp(p
->machine_EDX
,
383 get_offset(p
, &buffer
->base_ptr
));
384 struct x86_reg elt
= p
->idx_EBX
;
385 struct x86_reg tmp_EAX
= p
->tmp_EAX
;
387 /* Calculate pointer to first attrib:
388 * base_ptr + stride * index, where index depends on instance divisor
390 if (varient
->instance_divisor
) {
391 /* Our index is instance ID divided by instance divisor.
393 x86_mov(p
->func
, tmp_EAX
, instance_id
);
395 if (varient
->instance_divisor
!= 1) {
/* DIV clobbers EDX and we use ECX as the divisor register, so save
 * and restore both around the unsigned divide. */
396 struct x86_reg tmp_EDX
= p
->machine_EDX
;
397 struct x86_reg tmp_ECX
= p
->outbuf_ECX
;
399 /* TODO: Add x86_shr() to rtasm and use it whenever
400 * instance divisor is power of two.
403 x86_push(p
->func
, tmp_EDX
);
404 x86_push(p
->func
, tmp_ECX
);
405 x86_xor(p
->func
, tmp_EDX
, tmp_EDX
);
406 x86_mov_reg_imm(p
->func
, tmp_ECX
, varient
->instance_divisor
);
407 x86_div(p
->func
, tmp_ECX
); /* EAX = EDX:EAX / ECX */
408 x86_pop(p
->func
, tmp_ECX
);
409 x86_pop(p
->func
, tmp_EDX
);
412 x86_mov(p
->func
, tmp_EAX
, elt
);
416 * TODO: Respect translate_buffer::max_index.
419 x86_imul(p
->func
, tmp_EAX
, buf_stride
);
420 x86_add(p
->func
, tmp_EAX
, buf_base_ptr
);
423 /* In the linear case, keep the buffer pointer instead of the
426 if (!index_size
&& p
->nr_buffer_varients
== 1)
427 x86_mov(p
->func
, elt
, tmp_EAX
);
429 x86_mov(p
->func
, buf_ptr
, tmp_EAX
);
/* Emit code yielding the source pointer for one buffer varient and return
 * the x86_reg describing it: the instance_id slot for the sentinel index,
 * EBX directly in the single-buffer linear case, the precomputed
 * varient->ptr for linear/instanced varients, or base_ptr + stride * elt
 * computed into EAX for the indexed path (8/16/32-bit element widths —
 * the switch framing for the movzx variants is elided from this view).
 */
437 static struct x86_reg
get_buffer_ptr( struct translate_sse
*p
,
442 if (var_idx
== ELEMENT_BUFFER_INSTANCE_ID
) {
443 return x86_make_disp(p
->machine_EDX
,
444 get_offset(p
, &p
->instance_id
));
446 if (!index_size
&& p
->nr_buffer_varients
== 1) {
449 else if (!index_size
|| p
->buffer_varient
[var_idx
].instance_divisor
) {
450 struct x86_reg ptr
= p
->tmp_EAX
;
451 struct x86_reg buf_ptr
=
452 x86_make_disp(p
->machine_EDX
,
453 get_offset(p
, &p
->buffer_varient
[var_idx
].ptr
));
455 x86_mov(p
->func
, ptr
, buf_ptr
);
459 struct x86_reg ptr
= p
->tmp_EAX
;
460 const struct translate_buffer_varient
*varient
= &p
->buffer_varient
[var_idx
];
462 struct x86_reg buf_stride
=
463 x86_make_disp(p
->machine_EDX
,
464 get_offset(p
, &p
->buffer
[varient
->buffer_index
].stride
));
466 struct x86_reg buf_base_ptr
=
467 x86_make_disp(p
->machine_EDX
,
468 get_offset(p
, &p
->buffer
[varient
->buffer_index
].base_ptr
));
472 /* Calculate pointer to current attrib:
/* Zero-extend the element index according to index_size (1/2/4 bytes). */
477 x86_movzx8(p
->func
, ptr
, elt
);
480 x86_movzx16(p
->func
, ptr
, elt
);
483 x86_mov(p
->func
, ptr
, elt
);
486 x86_imul(p
->func
, ptr
, buf_stride
);
487 x86_add(p
->func
, ptr
, buf_base_ptr
);
/* Emit per-iteration input advance: in the single-buffer linear case bump
 * EBX by the stride (with a prefetch ahead); in the multi-buffer linear
 * case advance each non-instanced varient's stored pointer; in the
 * indexed case just step EBX to the next element (index_size bytes).
 */
494 static boolean
incr_inputs( struct translate_sse
*p
,
495 unsigned index_size
)
497 if (!index_size
&& p
->nr_buffer_varients
== 1) {
498 struct x86_reg stride
= x86_make_disp(p
->machine_EDX
,
499 get_offset(p
, &p
->buffer
[0].stride
));
501 if (p
->buffer_varient
[0].instance_divisor
== 0) {
502 x86_add(p
->func
, p
->idx_EBX
, stride
);
503 sse_prefetchnta(p
->func
, x86_make_disp(p
->idx_EBX
, 192));
506 else if (!index_size
) {
509 /* Is this worthwhile??
511 for (i
= 0; i
< p
->nr_buffer_varients
; i
++) {
512 struct translate_buffer_varient
*varient
= &p
->buffer_varient
[i
];
513 struct x86_reg buf_ptr
= x86_make_disp(p
->machine_EDX
,
514 get_offset(p
, &varient
->ptr
));
515 struct x86_reg buf_stride
= x86_make_disp(p
->machine_EDX
,
516 get_offset(p
, &p
->buffer
[varient
->buffer_index
].stride
));
/* Instanced varients stay fixed for the whole draw; skip them. */
518 if (varient
->instance_divisor
== 0) {
519 x86_mov(p
->func
, p
->tmp_EAX
, buf_ptr
);
520 x86_add(p
->func
, p
->tmp_EAX
, buf_stride
);
521 if (i
== 0) sse_prefetchnta(p
->func
, x86_make_disp(p
->tmp_EAX
, 192));
522 x86_mov(p
->func
, buf_ptr
, p
->tmp_EAX
);
527 x86_lea(p
->func
, p
->idx_EBX
, x86_make_disp(p
->idx_EBX
, index_size
));
/* Generate one complete translate entry point (run / run_elts / run_elts16
 * / run_elts8, selected by index_size: 0 = linear, else element width in
 * bytes) into *func. Register roles: EDX = machine pointer, EBX = start
 * index or element pointer, ESI = count, ECX = output pointer, EAX = temp.
 * Emits prologue (save regs, load args), an early-out for count == 0, the
 * per-vertex loop over all elements, and the epilogue. Returns a boolean
 * (failure paths from translate_attr — some framing elided in this view).
 */
534 /* Build run( struct translate *machine,
537 * void *output_buffer )
539 * run_elts( struct translate *machine,
542 * void *output_buffer )
546 * EAX -- pointer to current output vertex
547 * ECX -- pointer to current attribute
550 static boolean
build_vertex_emit( struct translate_sse
*p
,
551 struct x86_function
*func
,
552 unsigned index_size
)
557 p
->tmp_EAX
= x86_make_reg(file_REG32
, reg_AX
);
558 p
->idx_EBX
= x86_make_reg(file_REG32
, reg_BX
);
559 p
->outbuf_ECX
= x86_make_reg(file_REG32
, reg_CX
);
560 p
->machine_EDX
= x86_make_reg(file_REG32
, reg_DX
);
561 p
->count_ESI
= x86_make_reg(file_REG32
, reg_SI
);
/* Reset the lazy constant-load flags for this fresh function. */
564 p
->loaded_inv_255
= FALSE
;
565 p
->loaded_255
= FALSE
;
566 p
->loaded_identity
= FALSE
;
568 x86_init_func(p
->func
);
/* Callee-saved registers we use must be preserved. */
572 x86_push(p
->func
, p
->idx_EBX
);
573 x86_push(p
->func
, p
->count_ESI
);
575 /* Load arguments into regs:
577 x86_mov(p
->func
, p
->machine_EDX
, x86_fn_arg(p
->func
, 1));
578 x86_mov(p
->func
, p
->idx_EBX
, x86_fn_arg(p
->func
, 2));
579 x86_mov(p
->func
, p
->count_ESI
, x86_fn_arg(p
->func
, 3));
580 x86_mov(p
->func
, p
->outbuf_ECX
, x86_fn_arg(p
->func
, 5));
/* Stash the instance ID argument into the machine struct so instanced
 * varients can read it at runtime. */
584 if (p
->use_instancing
) {
587 x86_fn_arg(p
->func
, 4));
589 x86_make_disp(p
->machine_EDX
, get_offset(p
, &p
->instance_id
)),
593 /* Get vertex count, compare to zero
595 x86_xor(p
->func
, p
->tmp_EAX
, p
->tmp_EAX
);
596 x86_cmp(p
->func
, p
->count_ESI
, p
->tmp_EAX
);
597 fixup
= x86_jcc_forward(p
->func
, cc_E
);
599 /* always load, needed or not:
601 init_inputs(p
, index_size
);
603 /* Note address for loop jump
605 label
= x86_get_label(p
->func
);
607 struct x86_reg elt
= !index_size
? p
->idx_EBX
: x86_deref(p
->idx_EBX
);
608 int last_varient
= -1;
611 for (j
= 0; j
< p
->translate
.key
.nr_elements
; j
++) {
612 const struct translate_element
*a
= &p
->translate
.key
.element
[j
];
613 unsigned varient
= p
->element_to_buffer_varient
[j
];
615 /* Figure out source pointer address:
/* Recompute the source base only when the varient changes between
 * consecutive elements (elements are expected grouped by varient). */
617 if (varient
!= last_varient
) {
618 last_varient
= varient
;
619 vb
= get_buffer_ptr(p
, index_size
, varient
, elt
);
622 if (!translate_attr( p
, a
,
623 x86_make_disp(vb
, a
->input_offset
),
624 x86_make_disp(p
->outbuf_ECX
, a
->output_offset
)))
628 /* Next output vertex:
632 x86_make_disp(p
->outbuf_ECX
,
633 p
->translate
.key
.output_stride
));
637 incr_inputs( p
, index_size
);
640 /* decr count, loop if not zero
642 x86_dec(p
->func
, p
->count_ESI
);
643 x86_jcc(p
->func
, cc_NZ
, label
);
/* Presumably emits EMMS here when MMX was used — body elided; verify. */
647 if (p
->func
->need_emms
)
650 /* Land forward jump here:
652 x86_fixup_fwd_jump(p
->func
, fixup
);
654 /* Pop regs and return
657 x86_pop(p
->func
, p
->count_ESI
);
658 x86_pop(p
->func
, p
->idx_EBX
);
/* translate::set_buffer vtable entry: record base pointer, stride and
 * max_index for buffer slot `buf` (silently ignored when buf is out of
 * range). The generated code reads these through the machine pointer.
 */
670 static void translate_sse_set_buffer( struct translate
*translate
,
676 struct translate_sse
*p
= (struct translate_sse
*)translate
;
678 if (buf
< p
->nr_buffers
) {
679 p
->buffer
[buf
].base_ptr
= (char *)ptr
;
680 p
->buffer
[buf
].stride
= stride
;
681 p
->buffer
[buf
].max_index
= max_index
;
/* Disabled debug trace; flip the 0 to enable. */
684 if (0) debug_printf("%s %d/%d: %p %d\n",
/* translate::release vtable entry: free the generated machine code
 * (elt16/elt8 releases and the FREE(p) are elided from this view —
 * NOTE(review): confirm they are released too, else this leaks).
 */
691 static void translate_sse_release( struct translate
*translate
)
693 struct translate_sse
*p
= (struct translate_sse
*)translate
;
695 x86_release_func( &p
->linear_func
);
696 x86_release_func( &p
->elt_func
);
/* Factory: build an SSE2-accelerated translate object for `key`, or fail
 * (returns through an elided error path that releases p) when SSE/SSE2 is
 * unavailable, allocation fails, or any code generation step fails.
 * Builds the buffer-varient tables from the key, generates the four entry
 * points (linear + 4/2/1-byte indexed), and wires up the vtable.
 */
702 struct translate
*translate_sse2_create( const struct translate_key
*key
)
704 struct translate_sse
*p
= NULL
;
707 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
710 p
= CALLOC_STRUCT( translate_sse
);
714 p
->translate
.key
= *key
;
715 p
->translate
.release
= translate_sse_release
;
716 p
->translate
.set_buffer
= translate_sse_set_buffer
;
718 for (i
= 0; i
< key
->nr_elements
; i
++) {
719 if (key
->element
[i
].type
== TRANSLATE_ELEMENT_NORMAL
) {
722 p
->nr_buffers
= MAX2(p
->nr_buffers
, key
->element
[i
].input_buffer
+ 1);
724 if (key
->element
[i
].instance_divisor
) {
725 p
->use_instancing
= TRUE
;
729 * Map vertex element to vertex buffer varient.
/* Reuse an existing (buffer, divisor) varient when one matches... */
731 for (j
= 0; j
< p
->nr_buffer_varients
; j
++) {
732 if (p
->buffer_varient
[j
].buffer_index
== key
->element
[i
].input_buffer
&&
733 p
->buffer_varient
[j
].instance_divisor
== key
->element
[i
].instance_divisor
) {
/* ...otherwise append a new varient at index j. */
737 if (j
== p
->nr_buffer_varients
) {
738 p
->buffer_varient
[j
].buffer_index
= key
->element
[i
].input_buffer
;
739 p
->buffer_varient
[j
].instance_divisor
= key
->element
[i
].instance_divisor
;
740 p
->nr_buffer_varients
++;
742 p
->element_to_buffer_varient
[i
] = j
;
/* Non-NORMAL elements must be instance-ID elements; they use the
 * sentinel varient index instead of a buffer. */
744 assert(key
->element
[i
].type
== TRANSLATE_ELEMENT_INSTANCE_ID
);
746 p
->element_to_buffer_varient
[i
] = ELEMENT_BUFFER_INSTANCE_ID
;
750 if (0) debug_printf("nr_buffers: %d\n", p
->nr_buffers
);
/* Generate the four entry points; any failure takes the elided error
 * path below. index_size: 0 = linear, 4/2/1 = element byte width. */
752 if (!build_vertex_emit(p
, &p
->linear_func
, 0))
755 if (!build_vertex_emit(p
, &p
->elt_func
, 4))
758 if (!build_vertex_emit(p
, &p
->elt16_func
, 2))
761 if (!build_vertex_emit(p
, &p
->elt8_func
, 1))
764 p
->translate
.run
= (void*)x86_get_func(&p
->linear_func
);
765 if (p
->translate
.run
== NULL
)
768 p
->translate
.run_elts
= (void*)x86_get_func(&p
->elt_func
);
769 if (p
->translate
.run_elts
== NULL
)
772 p
->translate
.run_elts16
= (void*)x86_get_func(&p
->elt16_func
);
773 if (p
->translate
.run_elts16
== NULL
)
776 p
->translate
.run_elts8
= (void*)x86_get_func(&p
->elt8_func
);
777 if (p
->translate
.run_elts8
== NULL
)
780 return &p
->translate
;
/* Error path: tear down whatever was generated. */
784 translate_sse_release( &p
->translate
);
793 struct translate
*translate_sse2_create( const struct translate_key
*key
)