2 * Copyright 2003 Tungsten Graphics, inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 * Keith Whitwell <keithw@tungstengraphics.com>
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_math.h"
34 #include "translate.h"
37 #if defined(PIPE_ARCH_X86)
39 #include "rtasm/rtasm_cpu.h"
40 #include "rtasm/rtasm_x86sse.h"
49 typedef void (PIPE_CDECL
*run_func
)( struct translate
*translate
,
53 void *output_buffer
);
55 typedef void (PIPE_CDECL
*run_elts_func
)( struct translate
*translate
,
58 void *output_buffer
);
/* Per-vertex-buffer state: base pointer of the buffer and the byte
 * distance between consecutive vertices.  (Fields grounded by the
 * buffer->base_ptr / buffer->stride uses later in this file.)
 */
struct translate_buffer {
   const void *base_ptr;
   unsigned stride;
};

/* One (buffer, instance divisor) combination.  Several vertex elements
 * may share a single varient; "varient" is the file's historical
 * spelling and is kept for interface compatibility.
 */
struct translate_buffer_varient {
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                   /* updated either per vertex or per instance */
};
72 struct translate_sse
{
73 struct translate translate
;
75 struct x86_function linear_func
;
76 struct x86_function elt_func
;
77 struct x86_function
*func
;
79 boolean loaded_identity
;
81 boolean loaded_inv_255
;
87 struct translate_buffer buffer
[PIPE_MAX_ATTRIBS
];
90 /* Multiple buffer varients can map to a single buffer. */
91 struct translate_buffer_varient buffer_varient
[PIPE_MAX_ATTRIBS
];
92 unsigned nr_buffer_varients
;
94 /* Multiple elements can map to a single buffer varient. */
95 unsigned element_to_buffer_varient
[PIPE_MAX_ATTRIBS
];
97 boolean use_instancing
;
101 run_elts_func gen_run_elts
;
103 /* these are actually known values, but putting them in a struct
104 * like this is helpful to keep them in sync across the file.
106 struct x86_reg tmp_EAX
;
107 struct x86_reg idx_EBX
; /* either start+i or &elt[i] */
108 struct x86_reg outbuf_ECX
;
109 struct x86_reg machine_EDX
;
110 struct x86_reg count_ESI
; /* decrements to zero */
/* Byte offset of member b relative to base pointer a.  Used to address
 * struct translate_sse members via the machine pointer held in EDX in
 * the generated code.
 */
static int get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}
120 static struct x86_reg
get_identity( struct translate_sse
*p
)
122 struct x86_reg reg
= x86_make_reg(file_XMM
, 6);
124 if (!p
->loaded_identity
) {
125 p
->loaded_identity
= TRUE
;
131 sse_movups(p
->func
, reg
,
132 x86_make_disp(p
->machine_EDX
,
133 get_offset(p
, &p
->identity
[0])));
139 static struct x86_reg
get_255( struct translate_sse
*p
)
141 struct x86_reg reg
= x86_make_reg(file_XMM
, 7);
143 if (!p
->loaded_255
) {
144 p
->loaded_255
= TRUE
;
148 p
->float_255
[3] = 255.0f
;
150 sse_movups(p
->func
, reg
,
151 x86_make_disp(p
->machine_EDX
,
152 get_offset(p
, &p
->float_255
[0])));
158 static struct x86_reg
get_inv_255( struct translate_sse
*p
)
160 struct x86_reg reg
= x86_make_reg(file_XMM
, 5);
162 if (!p
->loaded_inv_255
) {
163 p
->loaded_inv_255
= TRUE
;
167 p
->inv_255
[3] = 1.0f
/ 255.0f
;
169 sse_movups(p
->func
, reg
,
170 x86_make_disp(p
->machine_EDX
,
171 get_offset(p
, &p
->inv_255
[0])));
178 static void emit_load_R32G32B32A32( struct translate_sse
*p
,
180 struct x86_reg arg0
)
182 sse_movups(p
->func
, data
, arg0
);
185 static void emit_load_R32G32B32( struct translate_sse
*p
,
187 struct x86_reg arg0
)
189 /* Have to jump through some hoops:
196 sse_movss(p
->func
, data
, x86_make_disp(arg0
, 8));
197 sse_shufps(p
->func
, data
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
198 sse_shufps(p
->func
, data
, data
, SHUF(Y
,Z
,X
,W
) );
199 sse_movlps(p
->func
, data
, arg0
);
202 static void emit_load_R32G32( struct translate_sse
*p
,
204 struct x86_reg arg0
)
209 sse_movups(p
->func
, data
, get_identity(p
) );
210 sse_movlps(p
->func
, data
, arg0
);
214 static void emit_load_R32( struct translate_sse
*p
,
216 struct x86_reg arg0
)
221 sse_movss(p
->func
, data
, arg0
);
222 sse_orps(p
->func
, data
, get_identity(p
) );
226 static void emit_load_R8G8B8A8_UNORM( struct translate_sse
*p
,
231 /* Load and unpack twice:
233 sse_movss(p
->func
, data
, src
);
234 sse2_punpcklbw(p
->func
, data
, get_identity(p
));
235 sse2_punpcklbw(p
->func
, data
, get_identity(p
));
239 sse2_cvtdq2ps(p
->func
, data
, data
);
244 sse_mulps(p
->func
, data
, get_inv_255(p
));
250 static void emit_store_R32G32B32A32( struct translate_sse
*p
,
252 struct x86_reg dataXMM
)
254 sse_movups(p
->func
, dest
, dataXMM
);
257 static void emit_store_R32G32B32( struct translate_sse
*p
,
259 struct x86_reg dataXMM
)
261 /* Emit two, shuffle, emit one.
263 sse_movlps(p
->func
, dest
, dataXMM
);
264 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(Z
,Z
,Z
,Z
) ); /* NOTE! destructive */
265 sse_movss(p
->func
, x86_make_disp(dest
,8), dataXMM
);
268 static void emit_store_R32G32( struct translate_sse
*p
,
270 struct x86_reg dataXMM
)
272 sse_movlps(p
->func
, dest
, dataXMM
);
275 static void emit_store_R32( struct translate_sse
*p
,
277 struct x86_reg dataXMM
)
279 sse_movss(p
->func
, dest
, dataXMM
);
284 static void emit_store_R8G8B8A8_UNORM( struct translate_sse
*p
,
286 struct x86_reg dataXMM
)
290 sse_mulps(p
->func
, dataXMM
, get_255(p
));
294 sse2_cvtps2dq(p
->func
, dataXMM
, dataXMM
);
295 sse2_packssdw(p
->func
, dataXMM
, dataXMM
);
296 sse2_packuswb(p
->func
, dataXMM
, dataXMM
);
297 sse_movss(p
->func
, dest
, dataXMM
);
304 /* Extended swizzles? Maybe later.
306 static void emit_swizzle( struct translate_sse
*p
,
309 unsigned char shuffle
)
311 sse_shufps(p
->func
, dest
, src
, shuffle
);
315 static boolean
translate_attr( struct translate_sse
*p
,
316 const struct translate_element
*a
,
317 struct x86_reg srcECX
,
318 struct x86_reg dstEAX
)
320 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
322 switch (a
->input_format
) {
323 case PIPE_FORMAT_R32_FLOAT
:
324 emit_load_R32(p
, dataXMM
, srcECX
);
326 case PIPE_FORMAT_R32G32_FLOAT
:
327 emit_load_R32G32(p
, dataXMM
, srcECX
);
329 case PIPE_FORMAT_R32G32B32_FLOAT
:
330 emit_load_R32G32B32(p
, dataXMM
, srcECX
);
332 case PIPE_FORMAT_R32G32B32A32_FLOAT
:
333 emit_load_R32G32B32A32(p
, dataXMM
, srcECX
);
335 case PIPE_FORMAT_B8G8R8A8_UNORM
:
336 emit_load_R8G8B8A8_UNORM(p
, dataXMM
, srcECX
);
337 emit_swizzle(p
, dataXMM
, dataXMM
, SHUF(Z
,Y
,X
,W
));
339 case PIPE_FORMAT_R8G8B8A8_UNORM
:
340 emit_load_R8G8B8A8_UNORM(p
, dataXMM
, srcECX
);
346 switch (a
->output_format
) {
347 case PIPE_FORMAT_R32_FLOAT
:
348 emit_store_R32(p
, dstEAX
, dataXMM
);
350 case PIPE_FORMAT_R32G32_FLOAT
:
351 emit_store_R32G32(p
, dstEAX
, dataXMM
);
353 case PIPE_FORMAT_R32G32B32_FLOAT
:
354 emit_store_R32G32B32(p
, dstEAX
, dataXMM
);
356 case PIPE_FORMAT_R32G32B32A32_FLOAT
:
357 emit_store_R32G32B32A32(p
, dstEAX
, dataXMM
);
359 case PIPE_FORMAT_B8G8R8A8_UNORM
:
360 emit_swizzle(p
, dataXMM
, dataXMM
, SHUF(Z
,Y
,X
,W
));
361 emit_store_R8G8B8A8_UNORM(p
, dstEAX
, dataXMM
);
363 case PIPE_FORMAT_R8G8B8A8_UNORM
:
364 emit_store_R8G8B8A8_UNORM(p
, dstEAX
, dataXMM
);
374 static boolean
init_inputs( struct translate_sse
*p
,
379 struct x86_reg instance_id
= x86_make_disp(p
->machine_EDX
,
380 get_offset(p
, &p
->instance_id
));
382 for (i
= 0; i
< p
->nr_buffer_varients
; i
++) {
383 struct translate_buffer_varient
*varient
= &p
->buffer_varient
[i
];
384 struct translate_buffer
*buffer
= &p
->buffer
[varient
->buffer_index
];
385 struct x86_reg buf_stride
= x86_make_disp(p
->machine_EDX
,
386 get_offset(p
, &buffer
->stride
));
387 struct x86_reg buf_ptr
= x86_make_disp(p
->machine_EDX
,
388 get_offset(p
, &varient
->ptr
));
389 struct x86_reg buf_base_ptr
= x86_make_disp(p
->machine_EDX
,
390 get_offset(p
, &buffer
->base_ptr
));
391 struct x86_reg elt
= p
->idx_EBX
;
392 struct x86_reg tmp_EAX
= p
->tmp_EAX
;
394 /* Calculate pointer to first attrib:
395 * base_ptr + stride * index, where index depends on instance divisor
397 if (varient
->instance_divisor
) {
398 /* Our index is instance ID divided by instance divisor.
400 x86_mov(p
->func
, tmp_EAX
, instance_id
);
402 if (varient
->instance_divisor
!= 1) {
403 struct x86_reg tmp_EDX
= p
->machine_EDX
;
404 struct x86_reg tmp_ECX
= p
->outbuf_ECX
;
406 /* TODO: Add x86_shr() to rtasm and use it whenever
407 * instance divisor is power of two.
410 x86_push(p
->func
, tmp_EDX
);
411 x86_push(p
->func
, tmp_ECX
);
412 x86_xor(p
->func
, tmp_EDX
, tmp_EDX
);
413 x86_mov_reg_imm(p
->func
, tmp_ECX
, varient
->instance_divisor
);
414 x86_div(p
->func
, tmp_ECX
); /* EAX = EDX:EAX / ECX */
415 x86_pop(p
->func
, tmp_ECX
);
416 x86_pop(p
->func
, tmp_EDX
);
419 x86_mov(p
->func
, tmp_EAX
, elt
);
421 x86_imul(p
->func
, tmp_EAX
, buf_stride
);
422 x86_add(p
->func
, tmp_EAX
, buf_base_ptr
);
425 /* In the linear case, keep the buffer pointer instead of the
428 if (p
->nr_buffer_varients
== 1)
429 x86_mov(p
->func
, elt
, tmp_EAX
);
431 x86_mov(p
->func
, buf_ptr
, tmp_EAX
);
439 static struct x86_reg
get_buffer_ptr( struct translate_sse
*p
,
444 if (linear
&& p
->nr_buffer_varients
== 1) {
448 struct x86_reg ptr
= p
->tmp_EAX
;
449 struct x86_reg buf_ptr
=
450 x86_make_disp(p
->machine_EDX
,
451 get_offset(p
, &p
->buffer_varient
[var_idx
].ptr
));
453 x86_mov(p
->func
, ptr
, buf_ptr
);
457 struct x86_reg ptr
= p
->tmp_EAX
;
458 const struct translate_buffer_varient
*varient
= &p
->buffer_varient
[var_idx
];
460 struct x86_reg buf_stride
=
461 x86_make_disp(p
->machine_EDX
,
462 get_offset(p
, &p
->buffer
[varient
->buffer_index
].stride
));
464 struct x86_reg buf_base_ptr
=
465 x86_make_disp(p
->machine_EDX
,
466 get_offset(p
, &p
->buffer
[varient
->buffer_index
].base_ptr
));
470 /* Calculate pointer to current attrib:
472 x86_mov(p
->func
, ptr
, buf_stride
);
473 x86_imul(p
->func
, ptr
, elt
);
474 x86_add(p
->func
, ptr
, buf_base_ptr
);
481 static boolean
incr_inputs( struct translate_sse
*p
,
484 if (linear
&& p
->nr_buffer_varients
== 1) {
485 struct x86_reg stride
= x86_make_disp(p
->machine_EDX
,
486 get_offset(p
, &p
->buffer
[0].stride
));
488 if (p
->buffer_varient
[0].instance_divisor
== 0) {
489 x86_add(p
->func
, p
->idx_EBX
, stride
);
490 sse_prefetchnta(p
->func
, x86_make_disp(p
->idx_EBX
, 192));
496 /* Is this worthwhile??
498 for (i
= 0; i
< p
->nr_buffer_varients
; i
++) {
499 struct translate_buffer_varient
*varient
= &p
->buffer_varient
[i
];
500 struct x86_reg buf_ptr
= x86_make_disp(p
->machine_EDX
,
501 get_offset(p
, &varient
->ptr
));
502 struct x86_reg buf_stride
= x86_make_disp(p
->machine_EDX
,
503 get_offset(p
, &p
->buffer
[varient
->buffer_index
].stride
));
505 if (varient
->instance_divisor
== 0) {
506 x86_mov(p
->func
, p
->tmp_EAX
, buf_ptr
);
507 x86_add(p
->func
, p
->tmp_EAX
, buf_stride
);
508 if (i
== 0) sse_prefetchnta(p
->func
, x86_make_disp(p
->tmp_EAX
, 192));
509 x86_mov(p
->func
, buf_ptr
, p
->tmp_EAX
);
514 x86_lea(p
->func
, p
->idx_EBX
, x86_make_disp(p
->idx_EBX
, 4));
521 /* Build run( struct translate *machine,
524 * void *output_buffer )
526 * run_elts( struct translate *machine,
529 * void *output_buffer )
533 * EAX -- pointer to current output vertex
534 * ECX -- pointer to current attribute
537 static boolean
build_vertex_emit( struct translate_sse
*p
,
538 struct x86_function
*func
,
544 p
->tmp_EAX
= x86_make_reg(file_REG32
, reg_AX
);
545 p
->idx_EBX
= x86_make_reg(file_REG32
, reg_BX
);
546 p
->outbuf_ECX
= x86_make_reg(file_REG32
, reg_CX
);
547 p
->machine_EDX
= x86_make_reg(file_REG32
, reg_DX
);
548 p
->count_ESI
= x86_make_reg(file_REG32
, reg_SI
);
551 p
->loaded_inv_255
= FALSE
;
552 p
->loaded_255
= FALSE
;
553 p
->loaded_identity
= FALSE
;
555 x86_init_func(p
->func
);
559 x86_push(p
->func
, p
->idx_EBX
);
560 x86_push(p
->func
, p
->count_ESI
);
562 /* Load arguments into regs:
564 x86_mov(p
->func
, p
->machine_EDX
, x86_fn_arg(p
->func
, 1));
565 x86_mov(p
->func
, p
->idx_EBX
, x86_fn_arg(p
->func
, 2));
566 x86_mov(p
->func
, p
->count_ESI
, x86_fn_arg(p
->func
, 3));
567 x86_mov(p
->func
, p
->outbuf_ECX
, x86_fn_arg(p
->func
, 5));
571 if (p
->use_instancing
) {
574 x86_fn_arg(p
->func
, 4));
576 x86_make_disp(p
->machine_EDX
, get_offset(p
, &p
->instance_id
)),
580 /* Get vertex count, compare to zero
582 x86_xor(p
->func
, p
->tmp_EAX
, p
->tmp_EAX
);
583 x86_cmp(p
->func
, p
->count_ESI
, p
->tmp_EAX
);
584 fixup
= x86_jcc_forward(p
->func
, cc_E
);
586 /* always load, needed or not:
588 init_inputs(p
, linear
);
590 /* Note address for loop jump
592 label
= x86_get_label(p
->func
);
594 struct x86_reg elt
= linear
? p
->idx_EBX
: x86_deref(p
->idx_EBX
);
595 int last_varient
= -1;
598 for (j
= 0; j
< p
->translate
.key
.nr_elements
; j
++) {
599 const struct translate_element
*a
= &p
->translate
.key
.element
[j
];
600 unsigned varient
= p
->element_to_buffer_varient
[j
];
602 /* Figure out source pointer address:
604 if (varient
!= last_varient
) {
605 last_varient
= varient
;
606 vb
= get_buffer_ptr(p
, linear
, varient
, elt
);
609 if (!translate_attr( p
, a
,
610 x86_make_disp(vb
, a
->input_offset
),
611 x86_make_disp(p
->outbuf_ECX
, a
->output_offset
)))
615 /* Next output vertex:
619 x86_make_disp(p
->outbuf_ECX
,
620 p
->translate
.key
.output_stride
));
624 incr_inputs( p
, linear
);
627 /* decr count, loop if not zero
629 x86_dec(p
->func
, p
->count_ESI
);
630 x86_jcc(p
->func
, cc_NZ
, label
);
634 if (p
->func
->need_emms
)
637 /* Land forward jump here:
639 x86_fixup_fwd_jump(p
->func
, fixup
);
641 /* Pop regs and return
644 x86_pop(p
->func
, p
->count_ESI
);
645 x86_pop(p
->func
, p
->idx_EBX
);
657 static void translate_sse_set_buffer( struct translate
*translate
,
662 struct translate_sse
*p
= (struct translate_sse
*)translate
;
664 if (buf
< p
->nr_buffers
) {
665 p
->buffer
[buf
].base_ptr
= (char *)ptr
;
666 p
->buffer
[buf
].stride
= stride
;
669 if (0) debug_printf("%s %d/%d: %p %d\n",
676 static void translate_sse_release( struct translate
*translate
)
678 struct translate_sse
*p
= (struct translate_sse
*)translate
;
680 x86_release_func( &p
->linear_func
);
681 x86_release_func( &p
->elt_func
);
686 static void PIPE_CDECL
translate_sse_run_elts( struct translate
*translate
,
687 const unsigned *elts
,
689 void *output_buffer
)
691 struct translate_sse
*p
= (struct translate_sse
*)translate
;
693 p
->gen_run_elts( translate
,
699 static void PIPE_CDECL
translate_sse_run( struct translate
*translate
,
702 unsigned instance_id
,
703 void *output_buffer
)
705 struct translate_sse
*p
= (struct translate_sse
*)translate
;
707 p
->gen_run( translate
,
715 struct translate
*translate_sse2_create( const struct translate_key
*key
)
717 struct translate_sse
*p
= NULL
;
720 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
723 p
= CALLOC_STRUCT( translate_sse
);
727 p
->translate
.key
= *key
;
728 p
->translate
.release
= translate_sse_release
;
729 p
->translate
.set_buffer
= translate_sse_set_buffer
;
730 p
->translate
.run_elts
= translate_sse_run_elts
;
731 p
->translate
.run
= translate_sse_run
;
733 for (i
= 0; i
< key
->nr_elements
; i
++) {
736 p
->nr_buffers
= MAX2( p
->nr_buffers
, key
->element
[i
].input_buffer
+ 1 );
738 if (key
->element
[i
].instance_divisor
) {
739 p
->use_instancing
= TRUE
;
743 * Map vertex element to vertex buffer varient.
745 for (j
= 0; j
< p
->nr_buffer_varients
; j
++) {
746 if (p
->buffer_varient
[j
].buffer_index
== key
->element
[i
].input_buffer
&&
747 p
->buffer_varient
[j
].instance_divisor
== key
->element
[i
].instance_divisor
) {
751 if (j
== p
->nr_buffer_varients
) {
752 p
->buffer_varient
[j
].buffer_index
= key
->element
[i
].input_buffer
;
753 p
->buffer_varient
[j
].instance_divisor
= key
->element
[i
].instance_divisor
;
754 p
->nr_buffer_varients
++;
756 p
->element_to_buffer_varient
[i
] = j
;
759 if (0) debug_printf("nr_buffers: %d\n", p
->nr_buffers
);
761 if (!build_vertex_emit(p
, &p
->linear_func
, TRUE
))
764 if (!build_vertex_emit(p
, &p
->elt_func
, FALSE
))
767 p
->gen_run
= (run_func
)x86_get_func(&p
->linear_func
);
768 if (p
->gen_run
== NULL
)
771 p
->gen_run_elts
= (run_elts_func
)x86_get_func(&p
->elt_func
);
772 if (p
->gen_run_elts
== NULL
)
775 return &p
->translate
;
779 translate_sse_release( &p
->translate
);
788 struct translate
*translate_sse2_create( const struct translate_key
*key
)