2 * Copyright 2003 Tungsten Graphics, inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 * Keith Whitwell <keithw@tungstengraphics.com>
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_simple_list.h"
34 #include "translate.h"
37 #if defined(PIPE_ARCH_X86)
39 #include "rtasm/rtasm_cpu.h"
40 #include "rtasm/rtasm_x86sse.h"
49 typedef void (PIPE_CDECL
*run_func
)( struct translate
*translate
,
52 void *output_buffer
);
54 typedef void (PIPE_CDECL
*run_elts_func
)( struct translate
*translate
,
57 void *output_buffer
);
61 struct translate_sse
{
62 struct translate translate
;
64 struct x86_function linear_func
;
65 struct x86_function elt_func
;
66 struct x86_function
*func
;
68 boolean loaded_identity
;
70 boolean loaded_inv_255
;
78 unsigned input_stride
;
79 } attrib
[PIPE_MAX_ATTRIBS
];
82 run_elts_func gen_run_elts
;
/**
 * Byte distance from \p a to \p b.
 *
 * Used to turn the address of a member of an object into a displacement
 * from the start of that object.  Both pointers must point into the same
 * object.  The result is negative when \p b precedes \p a.
 */
static int get_offset( const void *a, const void *b )
{
   /* The pointer difference is a ptrdiff_t; narrow it explicitly. */
   return (int)((const char *)b - (const char *)a);
}
93 static struct x86_reg
get_identity( struct translate_sse
*p
)
95 struct x86_reg reg
= x86_make_reg(file_XMM
, 6);
97 if (!p
->loaded_identity
) {
100 struct x86_reg translateESI
= x86_make_reg(file_REG32
, reg_SI
);
102 p
->loaded_identity
= TRUE
;
108 sse_movups(p
->func
, reg
,
109 x86_make_disp(translateESI
,
110 get_offset(p
, &p
->identity
[0])));
116 static struct x86_reg
get_255( struct translate_sse
*p
)
118 struct x86_reg reg
= x86_make_reg(file_XMM
, 6);
120 if (!p
->loaded_255
) {
121 struct x86_reg translateESI
= x86_make_reg(file_REG32
, reg_SI
);
123 p
->loaded_255
= TRUE
;
127 p
->float_255
[3] = 255.0f
;
129 sse_movups(p
->func
, reg
,
130 x86_make_disp(translateESI
,
131 get_offset(p
, &p
->float_255
[0])));
135 return x86_make_reg(file_XMM
, 7);
138 static struct x86_reg
get_inv_255( struct translate_sse
*p
)
140 struct x86_reg reg
= x86_make_reg(file_XMM
, 5);
142 if (!p
->loaded_inv_255
) {
143 struct x86_reg translateESI
= x86_make_reg(file_REG32
, reg_SI
);
145 p
->loaded_inv_255
= TRUE
;
149 p
->inv_255
[3] = 1.0f
/ 255.0f
;
151 sse_movups(p
->func
, reg
,
152 x86_make_disp(translateESI
,
153 get_offset(p
, &p
->inv_255
[0])));
160 static void emit_load_R32G32B32A32( struct translate_sse
*p
,
162 struct x86_reg arg0
)
164 sse_movups(p
->func
, data
, arg0
);
167 static void emit_load_R32G32B32( struct translate_sse
*p
,
169 struct x86_reg arg0
)
171 /* Have to jump through some hoops:
178 sse_movss(p
->func
, data
, x86_make_disp(arg0
, 8));
179 sse_shufps(p
->func
, data
, get_identity(p
), SHUF(X
,Y
,Z
,W
) );
180 sse_shufps(p
->func
, data
, data
, SHUF(Y
,Z
,X
,W
) );
181 sse_movlps(p
->func
, data
, arg0
);
184 static void emit_load_R32G32( struct translate_sse
*p
,
186 struct x86_reg arg0
)
191 sse_movups(p
->func
, data
, get_identity(p
) );
192 sse_movlps(p
->func
, data
, arg0
);
196 static void emit_load_R32( struct translate_sse
*p
,
198 struct x86_reg arg0
)
203 sse_movss(p
->func
, data
, arg0
);
204 sse_orps(p
->func
, data
, get_identity(p
) );
208 static void emit_load_R8G8B8A8_UNORM( struct translate_sse
*p
,
213 /* Load and unpack twice:
215 sse_movss(p
->func
, data
, src
);
216 sse2_punpcklbw(p
->func
, data
, get_identity(p
));
217 sse2_punpcklbw(p
->func
, data
, get_identity(p
));
221 sse2_cvtdq2ps(p
->func
, data
, data
);
226 sse_mulps(p
->func
, data
, get_inv_255(p
));
232 static void emit_store_R32G32B32A32( struct translate_sse
*p
,
234 struct x86_reg dataXMM
)
236 sse_movups(p
->func
, dest
, dataXMM
);
239 static void emit_store_R32G32B32( struct translate_sse
*p
,
241 struct x86_reg dataXMM
)
243 /* Emit two, shuffle, emit one.
245 sse_movlps(p
->func
, dest
, dataXMM
);
246 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(Z
,Z
,Z
,Z
) ); /* NOTE! destructive */
247 sse_movss(p
->func
, x86_make_disp(dest
,8), dataXMM
);
250 static void emit_store_R32G32( struct translate_sse
*p
,
252 struct x86_reg dataXMM
)
254 sse_movlps(p
->func
, dest
, dataXMM
);
257 static void emit_store_R32( struct translate_sse
*p
,
259 struct x86_reg dataXMM
)
261 sse_movss(p
->func
, dest
, dataXMM
);
266 static void emit_store_R8G8B8A8_UNORM( struct translate_sse
*p
,
268 struct x86_reg dataXMM
)
272 sse_mulps(p
->func
, dataXMM
, get_255(p
));
276 sse2_cvtps2dq(p
->func
, dataXMM
, dataXMM
);
277 sse2_packssdw(p
->func
, dataXMM
, dataXMM
);
278 sse2_packuswb(p
->func
, dataXMM
, dataXMM
);
279 sse_movss(p
->func
, dest
, dataXMM
);
286 static void get_src_ptr( struct translate_sse
*p
,
287 struct x86_reg srcEAX
,
288 struct x86_reg translateREG
,
289 struct x86_reg eltREG
,
292 struct x86_reg input_ptr
=
293 x86_make_disp(translateREG
,
294 get_offset(p
, &p
->attrib
[a
].input_ptr
));
296 struct x86_reg input_stride
=
297 x86_make_disp(translateREG
,
298 get_offset(p
, &p
->attrib
[a
].input_stride
));
300 /* Calculate pointer to current attrib:
302 x86_mov(p
->func
, srcEAX
, input_stride
);
303 x86_imul(p
->func
, srcEAX
, eltREG
);
304 x86_add(p
->func
, srcEAX
, input_ptr
);
308 /* Extended swizzles? Maybe later.
310 static void emit_swizzle( struct translate_sse
*p
,
313 unsigned char shuffle
)
315 sse_shufps(p
->func
, dest
, src
, shuffle
);
319 static boolean
translate_attr( struct translate_sse
*p
,
320 const struct translate_element
*a
,
321 struct x86_reg srcECX
,
322 struct x86_reg dstEAX
)
324 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
326 switch (a
->input_format
) {
327 case PIPE_FORMAT_R32_FLOAT
:
328 emit_load_R32(p
, dataXMM
, srcECX
);
330 case PIPE_FORMAT_R32G32_FLOAT
:
331 emit_load_R32G32(p
, dataXMM
, srcECX
);
333 case PIPE_FORMAT_R32G32B32_FLOAT
:
334 emit_load_R32G32B32(p
, dataXMM
, srcECX
);
336 case PIPE_FORMAT_R32G32B32A32_FLOAT
:
337 emit_load_R32G32B32A32(p
, dataXMM
, srcECX
);
339 case PIPE_FORMAT_B8G8R8A8_UNORM
:
340 emit_load_R8G8B8A8_UNORM(p
, dataXMM
, srcECX
);
341 emit_swizzle(p
, dataXMM
, dataXMM
, SHUF(Z
,Y
,X
,W
));
343 case PIPE_FORMAT_R8G8B8A8_UNORM
:
344 emit_load_R8G8B8A8_UNORM(p
, dataXMM
, srcECX
);
350 switch (a
->output_format
) {
351 case PIPE_FORMAT_R32_FLOAT
:
352 emit_store_R32(p
, dstEAX
, dataXMM
);
354 case PIPE_FORMAT_R32G32_FLOAT
:
355 emit_store_R32G32(p
, dstEAX
, dataXMM
);
357 case PIPE_FORMAT_R32G32B32_FLOAT
:
358 emit_store_R32G32B32(p
, dstEAX
, dataXMM
);
360 case PIPE_FORMAT_R32G32B32A32_FLOAT
:
361 emit_store_R32G32B32A32(p
, dstEAX
, dataXMM
);
363 case PIPE_FORMAT_B8G8R8A8_UNORM
:
364 emit_swizzle(p
, dataXMM
, dataXMM
, SHUF(Z
,Y
,X
,W
));
365 emit_store_R8G8B8A8_UNORM(p
, dstEAX
, dataXMM
);
367 case PIPE_FORMAT_R8G8B8A8_UNORM
:
368 emit_store_R8G8B8A8_UNORM(p
, dstEAX
, dataXMM
);
377 /* Build run( struct translate *translate,
380 * void *output_buffer )
382 * run_elts( struct translate *translate,
385 * void *output_buffer )
389 * EAX -- pointer to current output vertex
390 * ECX -- pointer to current attribute
393 static boolean
build_vertex_emit( struct translate_sse
*p
,
394 struct x86_function
*func
,
397 struct x86_reg vertexECX
= x86_make_reg(file_REG32
, reg_AX
);
398 struct x86_reg idxEBX
= x86_make_reg(file_REG32
, reg_BX
);
399 struct x86_reg srcEAX
= x86_make_reg(file_REG32
, reg_CX
);
400 struct x86_reg countEBP
= x86_make_reg(file_REG32
, reg_BP
);
401 struct x86_reg translateESI
= x86_make_reg(file_REG32
, reg_SI
);
406 p
->loaded_inv_255
= FALSE
;
407 p
->loaded_255
= FALSE
;
408 p
->loaded_identity
= FALSE
;
410 x86_init_func(p
->func
);
414 x86_push(p
->func
, countEBP
);
415 x86_push(p
->func
, translateESI
);
416 x86_push(p
->func
, idxEBX
);
418 /* Get vertex count, compare to zero
420 x86_xor(p
->func
, idxEBX
, idxEBX
);
421 x86_mov(p
->func
, countEBP
, x86_fn_arg(p
->func
, 3));
422 x86_cmp(p
->func
, countEBP
, idxEBX
);
423 fixup
= x86_jcc_forward(p
->func
, cc_E
);
425 /* If linear, idx is the current element, otherwise it is a pointer
426 * to the current element.
428 x86_mov(p
->func
, idxEBX
, x86_fn_arg(p
->func
, 2));
430 /* Initialize destination register.
432 x86_mov(p
->func
, vertexECX
, x86_fn_arg(p
->func
, 4));
434 /* Move argument 1 (translate_sse pointer) into a reg:
436 x86_mov(p
->func
, translateESI
, x86_fn_arg(p
->func
, 1));
439 /* always load, needed or not:
442 /* Note address for loop jump */
443 label
= x86_get_label(p
->func
);
446 for (j
= 0; j
< p
->translate
.key
.nr_elements
; j
++) {
447 const struct translate_element
*a
= &p
->translate
.key
.element
[j
];
449 struct x86_reg destEAX
= x86_make_disp(vertexECX
,
452 /* Figure out source pointer address:
455 get_src_ptr(p
, srcEAX
, translateESI
, idxEBX
, j
);
458 get_src_ptr(p
, srcEAX
, translateESI
, x86_deref(idxEBX
), j
);
461 if (!translate_attr( p
, a
, x86_deref(srcEAX
), destEAX
))
467 x86_lea(p
->func
, vertexECX
, x86_make_disp(vertexECX
, p
->translate
.key
.output_stride
));
472 x86_inc(p
->func
, idxEBX
);
475 x86_lea(p
->func
, idxEBX
, x86_make_disp(idxEBX
, 4));
478 /* decr count, loop if not zero
480 x86_dec(p
->func
, countEBP
);
481 x86_test(p
->func
, countEBP
, countEBP
);
482 x86_jcc(p
->func
, cc_NZ
, label
);
486 if (p
->func
->need_emms
)
489 /* Land forward jump here:
491 x86_fixup_fwd_jump(p
->func
, fixup
);
493 /* Pop regs and return
496 x86_pop(p
->func
, idxEBX
);
497 x86_pop(p
->func
, translateESI
);
498 x86_pop(p
->func
, countEBP
);
510 static void translate_sse_set_buffer( struct translate
*translate
,
515 struct translate_sse
*p
= (struct translate_sse
*)translate
;
518 for (i
= 0; i
< p
->translate
.key
.nr_elements
; i
++) {
519 if (p
->translate
.key
.element
[i
].input_buffer
== buf
) {
520 p
->attrib
[i
].input_ptr
= ((char *)ptr
+
521 p
->translate
.key
.element
[i
].input_offset
);
522 p
->attrib
[i
].input_stride
= stride
;
528 static void translate_sse_release( struct translate
*translate
)
530 struct translate_sse
*p
= (struct translate_sse
*)translate
;
532 x86_release_func( &p
->linear_func
);
533 x86_release_func( &p
->elt_func
);
538 static void PIPE_CDECL
translate_sse_run_elts( struct translate
*translate
,
539 const unsigned *elts
,
541 void *output_buffer
)
543 struct translate_sse
*p
= (struct translate_sse
*)translate
;
545 p
->gen_run_elts( translate
,
551 static void PIPE_CDECL
translate_sse_run( struct translate
*translate
,
554 void *output_buffer
)
556 struct translate_sse
*p
= (struct translate_sse
*)translate
;
558 p
->gen_run( translate
,
565 struct translate
*translate_sse2_create( const struct translate_key
*key
)
567 struct translate_sse
*p
= NULL
;
569 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
572 p
= CALLOC_STRUCT( translate_sse
);
576 p
->translate
.key
= *key
;
577 p
->translate
.release
= translate_sse_release
;
578 p
->translate
.set_buffer
= translate_sse_set_buffer
;
579 p
->translate
.run_elts
= translate_sse_run_elts
;
580 p
->translate
.run
= translate_sse_run
;
582 if (!build_vertex_emit(p
, &p
->linear_func
, TRUE
))
585 if (!build_vertex_emit(p
, &p
->elt_func
, FALSE
))
588 p
->gen_run
= (run_func
)x86_get_func(&p
->linear_func
);
589 if (p
->gen_run
== NULL
)
592 p
->gen_run_elts
= (run_elts_func
)x86_get_func(&p
->elt_func
);
593 if (p
->gen_run_elts
== NULL
)
596 return &p
->translate
;
600 translate_sse_release( &p
->translate
);
609 struct translate
*translate_sse2_create( const struct translate_key
*key
)