/*
 * Copyright 2003 Tungsten Graphics, inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Keith Whitwell <keithw@tungstengraphics.com>
 */
29 #include "pipe/p_compiler.h"
30 #include "pipe/p_util.h"
31 #include "util/u_simple_list.h"
33 #include "translate.h"
36 #if defined(__i386__) || defined(__386__) || defined(i386)
38 #include "rtasm/rtasm_cpu.h"
39 #include "rtasm/rtasm_x86sse.h"
/* Pointer type for the generated "linear" run routine; the result of
 * x86_get_func(&p->linear_func) is cast to this type in
 * translate_sse2_create.
 *
 * NOTE(review): this listing skips the lines between the first and the
 * last parameter.  build_vertex_emit reads x86_fn_arg 1 through 4, so two
 * parameters appear to be missing here -- confirm against the original
 * file.  The token after "void (" may also be a truncated
 * calling-convention macro -- confirm.
 */
54 typedef void (RTASM
*run_func
)( struct translate
*translate
,
57 void *output_buffer
);
/* Pointer type for the generated indexed ("elts") run routine; the result
 * of x86_get_func(&p->elt_func) is cast to this type in
 * translate_sse2_create.
 *
 * NOTE(review): as with run_func, the parameters between translate and
 * output_buffer are not visible in this listing -- confirm against the
 * original file.
 */
59 typedef void (RTASM
*run_elts_func
)( struct translate
*translate
,
62 void *output_buffer
);
/* State of one SSE translate object: the generic translate base, the two
 * generated x86 functions, and bookkeeping used while code is emitted.
 *
 * NOTE(review): this listing omits several lines of the definition.
 * Members that other functions in this file reference (identity,
 * float_255, inv_255, loaded_255, attrib[].input_ptr, gen_run), the
 * header of the nested per-attribute struct and the closing brace are
 * not visible here -- confirm against the original file.
 */
66 struct translate_sse
{
/* Base object.  Other functions in this file cast a struct translate
 * pointer back to struct translate_sse, so this has to stay the first
 * member.
 */
67 struct translate translate
;
/* Code buffers for the linear run and the indexed run; both are built in
 * translate_sse2_create and released in translate_sse_release.
 */
69 struct x86_function linear_func
;
70 struct x86_function elt_func
;
/* Function the emit helpers write into (they all pass p->func). */
71 struct x86_function
*func
;
/* Flags recording that a constant has already been loaded into its XMM
 * register; they are cleared at the start of build_vertex_emit.
 */
73 boolean loaded_identity
;
75 boolean loaded_inv_255
;
/* Per-attribute source stride, written by translate_sse_set_buffer and
 * read by the generated code through get_src_ptr.
 */
83 unsigned input_stride
;
84 } attrib
[PIPE_MAX_ATTRIBS
];
/* Callable entry point of elt_func, set in translate_sse2_create. */
87 run_elts_func gen_run_elts
;
/**
 * Byte distance from address \p a to address \p b.
 *
 * Used to turn the address of a member of struct translate_sse into a
 * displacement relative to the start of the structure.
 *
 * \param a  base address
 * \param b  address whose offset from \p a is wanted
 * \return   (char *)b - (char *)a, narrowed to int
 */
static int get_offset( const void *a, const void *b )
{
   return (int)((const char *)b - (const char *)a);
}
98 static struct x86_reg
get_identity( struct translate_sse
*p
)
100 struct x86_reg reg
= x86_make_reg(file_XMM
, 6);
102 if (!p
->loaded_identity
) {
105 struct x86_reg translateESI
= x86_make_reg(file_REG32
, reg_SI
);
107 p
->loaded_identity
= TRUE
;
113 sse_movups(p
->func
, reg
,
114 x86_make_disp(translateESI
,
115 get_offset(p
, &p
->identity
[0])));
121 static struct x86_reg
get_255( struct translate_sse
*p
)
123 struct x86_reg reg
= x86_make_reg(file_XMM
, 6);
125 if (!p
->loaded_255
) {
126 struct x86_reg translateESI
= x86_make_reg(file_REG32
, reg_SI
);
128 p
->loaded_255
= TRUE
;
132 p
->float_255
[3] = 255.0f
;
134 sse_movups(p
->func
, reg
,
135 x86_make_disp(translateESI
,
136 get_offset(p
, &p
->float_255
[0])));
140 return x86_make_reg(file_XMM
, 7);
143 static struct x86_reg
get_inv_255( struct translate_sse
*p
)
145 struct x86_reg reg
= x86_make_reg(file_XMM
, 5);
147 if (!p
->loaded_inv_255
) {
148 struct x86_reg translateESI
= x86_make_reg(file_REG32
, reg_SI
);
150 p
->loaded_inv_255
= TRUE
;
154 p
->inv_255
[3] = 1.0f
/ 255.0f
;
156 sse_movups(p
->func
, reg
,
157 x86_make_disp(translateESI
,
158 get_offset(p
, &p
->inv_255
[0])));
165 static void emit_load_R32G32B32A32( struct translate_sse
*p
,
167 struct x86_reg arg0
)
169 sse_movups(p
->func
, data
, arg0
);
172 static void emit_load_R32G32B32( struct translate_sse
*p
,
174 struct x86_reg arg0
)
176 /* Have to jump through some hoops:
183 sse_movss(p
->func
, data
, x86_make_disp(arg0
, 8));
184 sse_shufps(p
->func
, data
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
185 sse_shufps(p
->func
, data
, data
, SHUF(Y
,Z
,X
,W
) );
186 sse_movlps(p
->func
, data
, arg0
);
189 static void emit_load_R32G32( struct translate_sse
*p
,
191 struct x86_reg arg0
)
196 sse_movups(p
->func
, data
, get_identity(p
) );
197 sse_movlps(p
->func
, data
, arg0
);
201 static void emit_load_R32( struct translate_sse
*p
,
203 struct x86_reg arg0
)
208 sse_movss(p
->func
, data
, arg0
);
209 sse_orps(p
->func
, data
, get_identity(p
) );
213 static void emit_load_R8G8B8A8_UNORM( struct translate_sse
*p
,
218 /* Load and unpack twice:
220 sse_movss(p
->func
, data
, src
);
221 sse2_punpcklbw(p
->func
, data
, get_identity(p
));
222 sse2_punpcklbw(p
->func
, data
, get_identity(p
));
226 sse2_cvtdq2ps(p
->func
, data
, data
);
231 sse_mulps(p
->func
, data
, get_inv_255(p
));
237 static void emit_store_R32G32B32A32( struct translate_sse
*p
,
239 struct x86_reg dataXMM
)
241 sse_movups(p
->func
, dest
, dataXMM
);
244 static void emit_store_R32G32B32( struct translate_sse
*p
,
246 struct x86_reg dataXMM
)
248 /* Emit two, shuffle, emit one.
250 sse_movlps(p
->func
, dest
, dataXMM
);
251 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(Z
,Z
,Z
,Z
) ); /* NOTE! destructive */
252 sse_movss(p
->func
, x86_make_disp(dest
,8), dataXMM
);
255 static void emit_store_R32G32( struct translate_sse
*p
,
257 struct x86_reg dataXMM
)
259 sse_movlps(p
->func
, dest
, dataXMM
);
262 static void emit_store_R32( struct translate_sse
*p
,
264 struct x86_reg dataXMM
)
266 sse_movss(p
->func
, dest
, dataXMM
);
271 static void emit_store_R8G8B8A8_UNORM( struct translate_sse
*p
,
273 struct x86_reg dataXMM
)
277 sse_mulps(p
->func
, dataXMM
, get_255(p
));
281 sse2_cvtps2dq(p
->func
, dataXMM
, dataXMM
);
282 sse2_packssdw(p
->func
, dataXMM
, dataXMM
);
283 sse2_packuswb(p
->func
, dataXMM
, dataXMM
);
284 sse_movss(p
->func
, dest
, dataXMM
);
291 static void get_src_ptr( struct translate_sse
*p
,
292 struct x86_reg srcEAX
,
293 struct x86_reg translateREG
,
294 struct x86_reg eltREG
,
297 struct x86_reg input_ptr
=
298 x86_make_disp(translateREG
,
299 get_offset(p
, &p
->attrib
[a
].input_ptr
));
301 struct x86_reg input_stride
=
302 x86_make_disp(translateREG
,
303 get_offset(p
, &p
->attrib
[a
].input_stride
));
305 /* Calculate pointer to current attrib:
307 x86_mov(p
->func
, srcEAX
, input_stride
);
308 x86_imul(p
->func
, srcEAX
, eltREG
);
309 x86_add(p
->func
, srcEAX
, input_ptr
);
313 /* Extended swizzles? Maybe later.
315 static void emit_swizzle( struct translate_sse
*p
,
320 sse_shufps(p
->func
, dest
, src
, shuffle
);
324 static boolean
translate_attr( struct translate_sse
*p
,
325 const struct translate_element
*a
,
326 struct x86_reg srcECX
,
327 struct x86_reg dstEAX
)
329 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
331 switch (a
->input_format
) {
332 case PIPE_FORMAT_R32_FLOAT
:
333 emit_load_R32(p
, dataXMM
, srcECX
);
335 case PIPE_FORMAT_R32G32_FLOAT
:
336 emit_load_R32G32(p
, dataXMM
, srcECX
);
338 case PIPE_FORMAT_R32G32B32_FLOAT
:
339 emit_load_R32G32B32(p
, dataXMM
, srcECX
);
341 case PIPE_FORMAT_R32G32B32A32_FLOAT
:
342 emit_load_R32G32B32A32(p
, dataXMM
, srcECX
);
344 case PIPE_FORMAT_B8G8R8A8_UNORM
:
345 emit_load_R8G8B8A8_UNORM(p
, dataXMM
, srcECX
);
346 emit_swizzle(p
, dataXMM
, dataXMM
, SHUF(Z
,Y
,X
,W
));
348 case PIPE_FORMAT_R8G8B8A8_UNORM
:
349 emit_load_R8G8B8A8_UNORM(p
, dataXMM
, srcECX
);
355 switch (a
->output_format
) {
356 case PIPE_FORMAT_R32_FLOAT
:
357 emit_store_R32(p
, dstEAX
, dataXMM
);
359 case PIPE_FORMAT_R32G32_FLOAT
:
360 emit_store_R32G32(p
, dstEAX
, dataXMM
);
362 case PIPE_FORMAT_R32G32B32_FLOAT
:
363 emit_store_R32G32B32(p
, dstEAX
, dataXMM
);
365 case PIPE_FORMAT_R32G32B32A32_FLOAT
:
366 emit_store_R32G32B32A32(p
, dstEAX
, dataXMM
);
368 case PIPE_FORMAT_B8G8R8A8_UNORM
:
369 emit_swizzle(p
, dataXMM
, dataXMM
, SHUF(Z
,Y
,X
,W
));
370 emit_store_R8G8B8A8_UNORM(p
, dstEAX
, dataXMM
);
372 case PIPE_FORMAT_R8G8B8A8_UNORM
:
373 emit_store_R8G8B8A8_UNORM(p
, dstEAX
, dataXMM
);
// build_vertex_emit: emits, through the rtasm x86 helpers, the machine
// code of one run routine.  translate_sse2_create calls it twice, once
// for linear_func with TRUE and once for elt_func with FALSE as the third
// argument, and treats a false return value as failure.
//
// NOTE(review): this listing omits a number of lines of the function:
// the declaration of the third parameter, the declaration of the loop
// index j, the conditions selecting between the paired alternatives
// below, the second argument of the x86_make_disp call that builds
// destEAX, and all return statements are not visible.  Confirm against
// the original file before changing anything here.
382 /* Build run( struct translate *translate,
385 * void *output_buffer )
387 * run_elts( struct translate *translate,
390 * void *output_buffer )
394 * EAX -- pointer to current output vertex
395 * ECX -- pointer to current attribute
398 static boolean
build_vertex_emit( struct translate_sse
*p
,
399 struct x86_function
*func
,
// NOTE(review): the variable names and the registers disagree below:
// vertexECX is created from reg_AX and srcEAX from reg_CX.  The register
// choice matches the EAX and ECX roles listed in the header comment; only
// the names are swapped.
402 struct x86_reg vertexECX
= x86_make_reg(file_REG32
, reg_AX
);
403 struct x86_reg idxEBX
= x86_make_reg(file_REG32
, reg_BX
);
404 struct x86_reg srcEAX
= x86_make_reg(file_REG32
, reg_CX
);
405 struct x86_reg countEBP
= x86_make_reg(file_REG32
, reg_BP
);
406 struct x86_reg translateESI
= x86_make_reg(file_REG32
, reg_SI
);
407 uint8_t *fixup
, *label
;
// Forget any constants cached by a previous build so that get_identity,
// get_255 and get_inv_255 emit their loads again for this function.
411 p
->loaded_inv_255
= FALSE
;
412 p
->loaded_255
= FALSE
;
413 p
->loaded_identity
= FALSE
;
415 x86_init_func(p
->func
);
// Prologue of the generated code: EBP, ESI and EBX are pushed here and
// popped in the opposite order at the end.
419 x86_push(p
->func
, countEBP
);
420 x86_push(p
->func
, translateESI
);
421 x86_push(p
->func
, idxEBX
);
423 /* Get vertex count, compare to zero
425 x86_xor(p
->func
, idxEBX
, idxEBX
);
426 x86_mov(p
->func
, countEBP
, x86_fn_arg(p
->func
, 3));
427 x86_cmp(p
->func
, countEBP
, idxEBX
);
428 fixup
= x86_jcc_forward(p
->func
, cc_E
);
430 /* If linear, idx is the current element, otherwise it is a pointer
431 * to the current element.
433 x86_mov(p
->func
, idxEBX
, x86_fn_arg(p
->func
, 2));
435 /* Initialize destination register.
437 x86_mov(p
->func
, vertexECX
, x86_fn_arg(p
->func
, 4));
439 /* Move argument 1 (translate_sse pointer) into a reg:
441 x86_mov(p
->func
, translateESI
, x86_fn_arg(p
->func
, 1));
444 /* always load, needed or not:
447 /* Note address for loop jump */
448 label
= x86_get_label(p
->func
);
451 for (j
= 0; j
< p
->translate
.key
.nr_elements
; j
++) {
452 const struct translate_element
*a
= &p
->translate
.key
.element
[j
];
454 struct x86_reg destEAX
= x86_make_disp(vertexECX
,
457 /* Figure out source pointer address:
// Two alternatives follow: the element index is taken directly from
// idxEBX, or loaded through x86_deref(idxEBX).  The condition choosing
// between them is not visible in this listing.
460 get_src_ptr(p
, srcEAX
, translateESI
, idxEBX
, j
);
463 get_src_ptr(p
, srcEAX
, translateESI
, x86_deref(idxEBX
), j
);
466 if (!translate_attr( p
, a
, x86_deref(srcEAX
), destEAX
))
472 x86_lea(p
->func
, vertexECX
, x86_make_disp(vertexECX
, p
->translate
.key
.output_stride
));
475 */ /* Emit code for each of the attributes. Currently routes
476 * everything through SSE registers, even when it might be more
477 * efficient to stick with regular old x86. No optimization or
478 * other tricks - enough new ground to cover here just getting
// Two alternatives again: step idxEBX by one, or by 4 bytes with lea.
// The selecting condition is not visible in this listing.
483 x86_inc(p
->func
, idxEBX
);
486 x86_lea(p
->func
, idxEBX
, x86_make_disp(idxEBX
, 4));
489 /* decr count, loop if not zero
491 x86_dec(p
->func
, countEBP
);
492 x86_test(p
->func
, countEBP
, countEBP
);
493 x86_jcc(p
->func
, cc_NZ
, label
);
// NOTE(review): the statement controlled by this condition is not visible
// in this listing.
497 if (p
->func
->need_emms
)
500 /* Land forward jump here:
502 x86_fixup_fwd_jump(p
->func
, fixup
);
504 /* Pop regs and return
// Epilogue: restores EBX, ESI and EBP in reverse push order.
507 x86_pop(p
->func
, idxEBX
);
508 x86_pop(p
->func
, translateESI
);
509 x86_pop(p
->func
, countEBP
);
521 static void translate_sse_set_buffer( struct translate
*translate
,
526 struct translate_sse
*p
= (struct translate_sse
*)translate
;
529 for (i
= 0; i
< p
->translate
.key
.nr_elements
; i
++) {
530 if (p
->translate
.key
.element
[i
].input_buffer
== buf
) {
531 p
->attrib
[i
].input_ptr
= ((char *)ptr
+
532 p
->translate
.key
.element
[i
].input_offset
);
533 p
->attrib
[i
].input_stride
= stride
;
539 static void translate_sse_release( struct translate
*translate
)
541 struct translate_sse
*p
= (struct translate_sse
*)translate
;
543 x86_release_func( &p
->linear_func
);
544 x86_release_func( &p
->elt_func
);
549 static void translate_sse_run_elts( struct translate
*translate
,
550 const unsigned *elts
,
552 void *output_buffer
)
554 struct translate_sse
*p
= (struct translate_sse
*)translate
;
556 p
->gen_run_elts( translate
,
562 static void translate_sse_run( struct translate
*translate
,
565 void *output_buffer
)
567 struct translate_sse
*p
= (struct translate_sse
*)translate
;
569 p
->gen_run( translate
,
576 struct translate
*translate_sse2_create( const struct translate_key
*key
)
578 struct translate_sse
*p
= NULL
;
580 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
583 p
= CALLOC_STRUCT( translate_sse
);
587 p
->translate
.key
= *key
;
588 p
->translate
.release
= translate_sse_release
;
589 p
->translate
.set_buffer
= translate_sse_set_buffer
;
590 p
->translate
.run_elts
= translate_sse_run_elts
;
591 p
->translate
.run
= translate_sse_run
;
593 if (!build_vertex_emit(p
, &p
->linear_func
, TRUE
))
596 if (!build_vertex_emit(p
, &p
->elt_func
, FALSE
))
599 p
->gen_run
= (run_func
)x86_get_func(&p
->linear_func
);
600 p
->gen_run_elts
= (run_elts_func
)x86_get_func(&p
->elt_func
);
602 return &p
->translate
;
606 translate_sse_release( &p
->translate
);
615 void translate_create_sse( const struct translate_key
*key
)