/*
 * Copyright 2003 Tungsten Graphics, inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *   Keith Whitwell <keithw@tungstengraphics.com>
 */
#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"

#include "translate.h"


#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)

#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"

struct translate_buffer
{
   const void *base_ptr;
   unsigned stride;
   unsigned max_index;
};

struct translate_buffer_varient
{
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                    /* updated either per vertex or per instance */
};


#define ELEMENT_BUFFER_INSTANCE_ID 1001

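/* ELEMENT_BUFFER_INSTANCE_ID above is a sentinel buffer-varient index:
 * elements mapped to it read the current instance ID from the machine
 * state instead of fetching from a vertex buffer (see get_buffer_ptr()).
 */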
struct translate_sse
{
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;

   boolean loaded_identity;
   boolean loaded_const[5];

   float identity[4];
   float const_value[5][4];

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer varients can map to a single buffer. */
   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_varients;

   /* Multiple elements can map to a single buffer varient. */
   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg tmp3_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};

static int get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}

static struct x86_reg get_identity( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 7);

   if (!p->loaded_identity) {
      p->loaded_identity = TRUE;
      p->identity[0] = 0;
      p->identity[1] = 0;
      p->identity[2] = 0;
      p->identity[3] = 1;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDI,
                               get_offset(p, &p->identity[0])));
   }

   return reg;
}

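/* get_const() works like get_identity(): each of the five cached constants
 * lives in XMM register 2 + i, and its value is also written into
 * p->const_value[i] so the generated code can load it from the machine
 * struct through machine_EDI.
 */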
static struct x86_reg get_const( struct translate_sse *p, unsigned i, float v )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 2 + i);

   if (!p->loaded_const[i]) {
      p->loaded_const[i] = TRUE;
      p->const_value[i][0] =
      p->const_value[i][1] =
      p->const_value[i][2] =
      p->const_value[i][3] = v;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDI,
                               get_offset(p, &p->const_value[i][0])));
   }

   return reg;
}

static struct x86_reg get_inv_127( struct translate_sse *p )
{
   return get_const(p, 0, 1.0f / 127.0f);
}

static struct x86_reg get_inv_255( struct translate_sse *p )
{
   return get_const(p, 1, 1.0f / 255.0f);
}

static struct x86_reg get_inv_32767( struct translate_sse *p )
{
   return get_const(p, 2, 1.0f / 32767.0f);
}

static struct x86_reg get_inv_65535( struct translate_sse *p )
{
   return get_const(p, 3, 1.0f / 65535.0f);
}

static struct x86_reg get_inv_2147483647( struct translate_sse *p )
{
   return get_const(p, 4, 1.0f / 2147483647.0f);
}

/* load the data in a SSE2 register, padding with zeros */
static boolean emit_load_sse2( struct translate_sse *p,
                               struct x86_reg data,
                               struct x86_reg src,
                               unsigned size )
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   struct x86_reg tmp = p->tmp_EAX;

   switch (size)
   {
   case 1:
      x86_movzx8(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 2:
      x86_movzx16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 3:
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
      x86_shl_imm(p->func, tmp, 16);
      x86_mov16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 4:
      sse2_movd(p->func, data, src);
      break;
   case 6:
      sse2_movd(p->func, data, src);
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
      sse2_movd(p->func, tmpXMM, tmp);
      sse2_punpckldq(p->func, data, tmpXMM);
      break;
   case 8:
      sse2_movq(p->func, data, src);
      break;
   case 12:
      sse2_movq(p->func, data, src);
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
      sse2_punpcklqdq(p->func, data, tmpXMM);
      break;
   case 16:
      sse2_movdqu(p->func, data, src);
      break;
   default:
      return FALSE;
   }
   return TRUE;
}

/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5

/* this function will load #chans float values, and will
 * pad the register with zeroes at least up to out_chans.
 *
 * If out_chans is set to CHANNELS_0001, then the fourth
 * value will be padded with 1. Only pass this value if
 * chans < 4 or results are undefined.
 */
static void emit_load_float32( struct translate_sse *p,
                               struct x86_reg data,
                               struct x86_reg arg0,
                               unsigned out_chans,
                               unsigned chans )
{
   switch (chans)
   {
   case 1:
      sse_movss(p->func, data, arg0);
      if(out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_identity(p) );
      break;
   case 2:
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
      else if(out_chans > 2)
         sse_movlhps(p->func, data, get_identity(p) );
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       * c 0 0 1 if out_chans == CHANNELS_0001
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
      sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      sse_movups(p->func, data, arg0);
      break;
   }
}

/* this function behaves like emit_load_float32, but loads
 * 64-bit floating point numbers, converting them to 32-bit ones.
 */
static void emit_load_float64to32( struct translate_sse *p,
                                   struct x86_reg data,
                                   struct x86_reg arg0,
                                   unsigned out_chans,
                                   unsigned chans )
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);

   switch (chans)
   {
   case 1:
      sse2_movsd(p->func, data, arg0);
      if(out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);
      else
         sse2_cvtsd2ss(p->func, data, data);
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
      else if(out_chans > 2)
         sse_movlhps(p->func, data, get_identity(p) );
      break;
   case 3:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if(out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      if(out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_identity(p) );
      break;
   case 4:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}

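/* Copy 8 bytes between the given operand pairs: on x86-64 a single 64-bit
 * GPR move is used, on 32-bit targets an XMM move (SSE2 movq when
 * available, otherwise movlps).
 */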
static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   if(x86_target(p->func) != X86_32)
      x64_mov64(p->func, dst_gpr, src_gpr);
   else
   {
      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
      if(x86_target_caps(p->func) & X86_SSE2)
         sse2_movq(p->func, dst_xmm, src_xmm);
      else
         sse_movlps(p->func, dst_xmm, src_xmm);
   }
}

static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
{
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
}

static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
}

static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
{
   if(x86_target_caps(p->func) & X86_SSE2)
      sse2_movdqu(p->func, dst, src);
   else
      sse_movups(p->func, dst, src);
}

/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 * but may or may not be good on older processors
 * TODO: may perhaps want to use non-temporal stores here if possible
 */
static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
   struct x86_reg dataGPR = p->tmp_EAX;
   struct x86_reg dataGPR2 = p->tmp2_EDX;

   if(size < 8)
   {
      switch(size)
      {
      case 1:
         x86_mov8(p->func, dataGPR, src);
         x86_mov8(p->func, dst, dataGPR);
         break;
      case 2:
         x86_mov16(p->func, dataGPR, src);
         x86_mov16(p->func, dst, dataGPR);
         break;
      case 3:
         x86_mov16(p->func, dataGPR, src);
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
         x86_mov16(p->func, dst, dataGPR);
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
         break;
      case 4:
         x86_mov(p->func, dataGPR, src);
         x86_mov(p->func, dst, dataGPR);
         break;
      case 6:
         x86_mov(p->func, dataGPR, src);
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
         x86_mov(p->func, dst, dataGPR);
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
         break;
      }
   }
   else if(!(x86_target_caps(p->func) & X86_SSE))
   {
      unsigned i;
      assert((size & 3) == 0);
      for(i = 0; i < size; i += 4)
      {
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
      }
   }
   else
   {
      switch(size)
      {
      case 8:
         emit_load64(p, dataGPR, dataXMM, src);
         emit_store64(p, dst, dataGPR, dataXMM);
         break;
      case 12:
         emit_load64(p, dataGPR2, dataXMM, src);
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
         emit_store64(p, dst, dataGPR2, dataXMM);
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
         break;
      case 16:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dst, dataXMM);
         break;
      case 24:
         emit_mov128(p, dataXMM, src);
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
         break;
      case 32:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
         break;
      }
   }
}

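/* Emit code converting one attribute from a->input_format (read at src)
 * to a->output_format (written to dst).  Three fast paths are tried in
 * order: an SSE path for float32 outputs, an SSE2 path widening 8-bit
 * channels to 16 bits, and a GPR-based swizzle/copy when the input and
 * output channel layouts already match.  Returns FALSE if none applies.
 */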
static boolean translate_attr_convert( struct translate_sse *p,
                                       const struct translate_element *a,
                                       struct x86_reg src,
                                       struct x86_reg dst)
{
   const struct util_format_description* input_desc = util_format_description(a->input_format);
   const struct util_format_description* output_desc = util_format_description(a->output_format);
   unsigned i;
   boolean id_swizzle = TRUE;
   unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
   unsigned needed_chans = 0;
   unsigned imms[2] = {0, 0x3f800000};

   if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
      return FALSE;

   if(input_desc->channel[0].size & 7)
      return FALSE;

   if(input_desc->colorspace != output_desc->colorspace)
      return FALSE;

   for(i = 1; i < input_desc->nr_channels; ++i)
   {
      if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
         return FALSE;
   }

   for(i = 1; i < output_desc->nr_channels; ++i)
   {
      if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
         return FALSE;
   }

   for(i = 0; i < output_desc->nr_channels; ++i)
   {
      if(output_desc->swizzle[i] < 4)
         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
   }
   if((x86_target_caps(p->func) & X86_SSE) && (0
      || a->output_format == PIPE_FORMAT_R32_FLOAT
      || a->output_format == PIPE_FORMAT_R32G32_FLOAT
      || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
      || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
   {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      switch(input_desc->channel[0].type)
      {
      case UTIL_FORMAT_TYPE_UNSIGNED:
         if(!(x86_target_caps(p->func) & X86_SSE2))
            return FALSE;
         emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);

         /* TODO: add support for SSE4.1 pmovzx */
         switch(input_desc->channel[0].size)
         {
         case 8:
            /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
            sse2_punpcklbw(p->func, dataXMM, get_identity(p));
            sse2_punpcklbw(p->func, dataXMM, get_identity(p));
            break;
         case 16:
            sse2_punpcklwd(p->func, dataXMM, get_identity(p));
            break;
         case 32: /* we lose precision here */
            sse2_psrld_imm(p->func, dataXMM, 1);
            break;
         }

         sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
         if(input_desc->channel[0].normalized)
         {
            struct x86_reg factor;
            switch(input_desc->channel[0].size)
            {
            case 8:
               factor = get_inv_255(p);
               break;
            case 16:
               factor = get_inv_65535(p);
               break;
            case 32:
               factor = get_inv_2147483647(p);
               break;
            }
            sse_mulps(p->func, dataXMM, factor);
         }
         else if(input_desc->channel[0].size == 32)
            sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
         break;

      case UTIL_FORMAT_TYPE_SIGNED:
         if(!(x86_target_caps(p->func) & X86_SSE2))
            return FALSE;
         emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);

         /* TODO: add support for SSE4.1 pmovsx */
         switch(input_desc->channel[0].size)
         {
         case 8:
            sse2_punpcklbw(p->func, dataXMM, dataXMM);
            sse2_punpcklbw(p->func, dataXMM, dataXMM);
            sse2_psrad_imm(p->func, dataXMM, 24);
            break;
         case 16:
            sse2_punpcklwd(p->func, dataXMM, dataXMM);
            sse2_psrad_imm(p->func, dataXMM, 16);
            break;
         case 32: /* we lose precision here */
            break;
         }

         sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
         if(input_desc->channel[0].normalized)
         {
            struct x86_reg factor;
            switch(input_desc->channel[0].size)
            {
            case 8:
               factor = get_inv_127(p);
               break;
            case 16:
               factor = get_inv_32767(p);
               break;
            case 32:
               factor = get_inv_2147483647(p);
               break;
            }
            sse_mulps(p->func, dataXMM, factor);
         }
         break;

      case UTIL_FORMAT_TYPE_FLOAT:
         if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
            return FALSE;
         if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
         {
            swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
            needed_chans = CHANNELS_0001;
         }
         switch(input_desc->channel[0].size)
         {
         case 32:
            emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
            break;
         case 64: /* we lose precision here */
            if(!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
            break;
         }
         break;

      default:
         return FALSE;
      }

      if(!id_swizzle)
         sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );

      if(output_desc->nr_channels >= 4
            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
            )
         sse_movups(p->func, dst, dataXMM);
      else
      {
         if(output_desc->nr_channels >= 2
               && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
               && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
            sse_movlps(p->func, dst, dataXMM);
         else
         {
            if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
               sse_movss(p->func, dst, dataXMM);
            else
               x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);

            if(output_desc->nr_channels >= 2)
            {
               if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
               {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else
                  x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
            }
         }

         if(output_desc->nr_channels >= 3)
         {
            if(output_desc->nr_channels >= 4
                  && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
                  && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
            else
            {
               if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
               {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
               }
               else
                  x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);

               if(output_desc->nr_channels >= 4)
               {
                  if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
                  {
                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
                  }
                  else
                     x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
               }
            }
         }
      }

      return TRUE;
   }
   else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
         && output_desc->channel[0].normalized == input_desc->channel[0].normalized
         && (0
            || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
            || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
            || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
         ))
   {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
      struct x86_reg tmp = p->tmp_EAX;
      unsigned imms[2] = {0, 1};

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);

      switch(input_desc->channel[0].type)
      {
      case UTIL_FORMAT_TYPE_UNSIGNED:
         if(input_desc->channel[0].normalized)
         {
            sse2_punpcklbw(p->func, dataXMM, dataXMM);
            if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
               sse2_psrlw_imm(p->func, dataXMM, 1);
         }
         else
            sse2_punpcklbw(p->func, dataXMM, get_identity(p));
         break;
      case UTIL_FORMAT_TYPE_SIGNED:
         if(input_desc->channel[0].normalized)
         {
            sse2_movq(p->func, tmpXMM, get_identity(p));
            sse2_punpcklbw(p->func, tmpXMM, dataXMM);
            sse2_psllw_imm(p->func, dataXMM, 9);
            sse2_psrlw_imm(p->func, dataXMM, 8);
            sse2_por(p->func, tmpXMM, dataXMM);
            sse2_psrlw_imm(p->func, dataXMM, 7);
            sse2_por(p->func, tmpXMM, dataXMM);
            {
               struct x86_reg t = dataXMM;
               dataXMM = tmpXMM;
               tmpXMM = t;
            }
         }
         else
         {
            sse2_punpcklbw(p->func, dataXMM, dataXMM);
            sse2_psraw_imm(p->func, dataXMM, 8);
         }
         break;
      }

      if(output_desc->channel[0].normalized)
         imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff;

      if(!id_swizzle)
         sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));

      if(output_desc->nr_channels >= 4
            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
            )
         sse2_movq(p->func, dst, dataXMM);
      else
      {
         if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
         {
            if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
               sse2_movd(p->func, dst, dataXMM);
            else
            {
               sse2_movd(p->func, tmp, dataXMM);
               x86_mov16(p->func, dst, tmp);
               if(output_desc->nr_channels >= 2)
                  x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
            }
         }
         else
         {
            if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
               x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
            else
            {
               x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
               if(output_desc->nr_channels >= 2)
               {
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_shr_imm(p->func, tmp, 16);
                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
               }
            }
         }

         if(output_desc->nr_channels >= 3)
         {
            if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
            {
               if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
               {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else
               {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
                  if(output_desc->nr_channels >= 4)
                  {
                     x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
                  }
               }
            }
            else
            {
               if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
                  x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
               else
               {
                  x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);

                  if(output_desc->nr_channels >= 4)
                  {
                     sse2_psrlq_imm(p->func, dataXMM, 48);
                     sse2_movd(p->func, tmp, dataXMM);
                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
                  }
               }
            }
         }
      }

      return TRUE;
   }
   else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
   {
      struct x86_reg tmp = p->tmp_EAX;

      if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
            && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
            && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
            && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
            && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
      {
         /* TODO: support movbe */
         x86_mov(p->func, tmp, src);
         x86_bswap(p->func, tmp);
         x86_mov(p->func, dst, tmp);
         return TRUE;
      }

      for(unsigned i = 0; i < output_desc->nr_channels; ++i)
      {
         switch(output_desc->channel[0].size)
         {
         case 8:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned v = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[0].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[0].normalized ? 0xff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[0].normalized ? 0x7f : 1;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
            }
            else
            {
               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
            }
            break;
         case 16:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned v = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[1].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     return FALSE;
                  default:
                     return FALSE;
                  }
               }
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
            }
            else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
            else
            {
               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
            }
            break;
         case 32:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned v = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[1].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3f800000; /* 1.0f */
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
            }
            else
            {
               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
            }
            break;
         case 64:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned l = 0;
               unsigned h = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[1].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     h = 0x3ff00000; /* 1.0 as a double: high word */
                     l = 0;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
            }
            else
            {
               if(x86_target_caps(p->func) & X86_SSE)
               {
                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
                  emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
               }
               else
               {
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
               }
            }
            break;
         default:
            return FALSE;
         }
      }

      return TRUE;
   }

   return FALSE;
}

static boolean translate_attr( struct translate_sse *p,
                               const struct translate_element *a,
                               struct x86_reg src,
                               struct x86_reg dst)
{
   if(a->input_format == a->output_format)
   {
      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
      return TRUE;
   }

   return translate_attr_convert(p, a, src, dst);
}

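/* Set up the source pointers that stay fixed across the main vertex loop:
 * for linear (non-indexed) runs and for instanced varients the pointer
 * base_ptr + stride * index is computed once here; per-vertex indexed
 * fetches compute their pointer inside the loop via get_buffer_ptr().
 */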
static boolean init_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   unsigned i;
   struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
                                              get_offset(p, &p->instance_id));

   for (i = 0; i < p->nr_buffer_varients; i++) {
      struct translate_buffer_varient *varient = &p->buffer_varient[i];
      struct translate_buffer *buffer = &p->buffer[varient->buffer_index];

      if (!index_size || varient->instance_divisor) {
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                   get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (varient->instance_divisor) {
            /* Our index is instance ID divided by instance divisor.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (varient->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->tmp2_EDX;
               struct x86_reg tmp_ECX = p->tmp3_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
            }
         } else {
            x86_mov(p->func, tmp_EAX, elt);
         }

         /*
          * TODO: Respect translate_buffer::max_index.
          */

         x86_imul(p->func, tmp_EAX, buf_stride);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_varients == 1)
            x86_mov(p->func, elt, tmp_EAX);
         else
            x86_mov(p->func, buf_ptr, tmp_EAX);
      }
   }

   return TRUE;
}

static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      unsigned index_size,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDI,
                           get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_varients == 1) {
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_varient[var_idx].instance_divisor) {
      struct x86_reg ptr = p->tmp_EAX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_varient[var_idx].ptr));

      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->tmp_EAX;
      const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[varient->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[varient->buffer_index].base_ptr));

      /* Calculate pointer to current attrib:
       */
      switch (index_size) {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }
      x86_imul(p->func, ptr, buf_stride);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}

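/* Advance the per-vertex source pointers at the bottom of the loop.  In
 * the single linear buffer case idx_ESI itself is the pointer and is just
 * bumped by the stride; otherwise each non-instanced varient's cached
 * pointer is advanced, and indexed runs simply step to the next element.
 */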
static boolean incr_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   if (!index_size && p->nr_buffer_varients == 1) {
      struct x86_reg stride = x86_make_disp(p->machine_EDI,
                                            get_offset(p, &p->buffer[0].stride));

      if (p->buffer_varient[0].instance_divisor == 0) {
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_varients; i++) {
         struct translate_buffer_varient *varient = &p->buffer_varient[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                   get_offset(p, &p->buffer[varient->buffer_index].stride));

         if (varient->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return TRUE;
}

/* Build run( struct translate *machine,
 *            ...
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            ...
 *            void *output_buffer )
 *
 * Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 */
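/* Argument/register assignment used below: arg 1 is the translate machine
 * (kept in machine_EDI), arg 2 the start index or element pointer
 * (idx_ESI), arg 3 the vertex count (count_EBP), arg 4 the instance ID
 * (stored into the machine struct when instancing is used), and arg 5 the
 * output buffer pointer (outbuf_EBX).
 */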
static boolean build_vertex_emit( struct translate_sse *p,
                                  struct x86_function *func,
                                  unsigned index_size )
{
   int fixup;
   int label;
   unsigned j;

   p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI       = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX    = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI   = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP     = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX      = x86_make_reg(file_REG32, reg_DX);
   p->tmp3_ECX      = x86_make_reg(file_REG32, reg_CX);

   p->func = func;
   memset(&p->loaded_const, 0, sizeof(p->loaded_const));
   p->loaded_identity = FALSE;

   x86_init_func(p->func);

   if(x86_target(p->func) == X86_64_WIN64_ABI)
   {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if(x86_target(p->func) != X86_64_STD_ABI)
   {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
      x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
   }

   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   if(x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_varient = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned varient = p->element_to_buffer_varient[j];

         /* Figure out source pointer address:
          */
         if (varient != last_varient) {
            last_varient = varient;
            vb = get_buffer_ptr(p, index_size, varient, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x86_lea(p->func,
              p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, index_size );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */
   if(x86_target(p->func) != X86_64_STD_ABI)
   {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if(x86_target(p->func) == X86_64_WIN64_ABI)
   {
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }

   x86_ret(p->func);

   return TRUE;
}

static void translate_sse_set_buffer( struct translate *translate,
                                      unsigned buf,
                                      const void *ptr,
                                      unsigned stride,
                                      unsigned max_index )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *)ptr;
      p->buffer[buf].stride = stride;
      p->buffer[buf].max_index = max_index;
   }

   if (0) debug_printf("%s %d/%d: %p %d\n",
                       __FUNCTION__,
                       buf, p->nr_buffers,
                       ptr, stride);
}

static void translate_sse_release( struct translate *translate )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   x86_release_func( &p->linear_func );
   x86_release_func( &p->elt_func );
   x86_release_func( &p->elt16_func );
   x86_release_func( &p->elt8_func );

   FREE(p);
}

struct translate *translate_sse2_create( const struct translate_key *key )
{
   struct translate_sse *p = NULL;
   unsigned i, j;

   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
   if (!rtasm_cpu_has_sse())
      goto fail;

   p = CALLOC_STRUCT( translate_sse );
   if (!p)
      goto fail;

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer varient.
          */
         for (j = 0; j < p->nr_buffer_varients; j++) {
            if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
                p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_varients) {
            p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
            p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
            p->nr_buffer_varients++;
         }

         p->element_to_buffer_varient[i] = j;
      } else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (void*)x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (void*)x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release( &p->translate );

   return NULL;
}

#else

struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}

#endif