/*
 * Copyright 2003 VMware, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@vmware.com>
 */
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_math.h"
33 #include "util/u_format.h"
35 #include "translate.h"
38 #if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(PIPE_SUBSYSTEM_EMBEDDED)
40 #include "rtasm/rtasm_cpu.h"
41 #include "rtasm/rtasm_x86sse.h"
50 struct translate_buffer
/* One view ("variant") of a vertex buffer.  The same underlying buffer may
 * be referenced by several variants, each with a different instance divisor.
 */
struct translate_buffer_variant
{
   unsigned buffer_index;       /* which underlying buffer this variant reads */
   unsigned instance_divisor;   /* 0 = advance per vertex, N = once per N instances */
   void *ptr;                   /* updated either per vertex or per instance */
};
/* Sentinel buffer-variant index meaning the element sources the instance ID
 * rather than data fetched from a vertex buffer.
 */
#define ELEMENT_BUFFER_INSTANCE_ID 1001
80 #define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
81 static float consts
[NUM_CONSTS
][4] = {
87 C(1.0 / 2147483647.0),
95 struct translate translate
;
97 struct x86_function linear_func
;
98 struct x86_function elt_func
;
99 struct x86_function elt16_func
;
100 struct x86_function elt8_func
;
101 struct x86_function
*func
;
103 PIPE_ALIGN_VAR(16) float consts
[NUM_CONSTS
][4];
104 int8_t reg_to_const
[16];
105 int8_t const_to_reg
[NUM_CONSTS
];
107 struct translate_buffer buffer
[TRANSLATE_MAX_ATTRIBS
];
110 /* Multiple buffer variants can map to a single buffer. */
111 struct translate_buffer_variant buffer_variant
[TRANSLATE_MAX_ATTRIBS
];
112 unsigned nr_buffer_variants
;
114 /* Multiple elements can map to a single buffer variant. */
115 unsigned element_to_buffer_variant
[TRANSLATE_MAX_ATTRIBS
];
117 boolean use_instancing
;
118 unsigned instance_id
;
119 unsigned start_instance
;
121 /* these are actually known values, but putting them in a struct
122 * like this is helpful to keep them in sync across the file.
124 struct x86_reg tmp_EAX
;
125 struct x86_reg tmp2_EDX
;
126 struct x86_reg src_ECX
;
127 struct x86_reg idx_ESI
; /* either start+i or &elt[i] */
128 struct x86_reg machine_EDI
;
129 struct x86_reg outbuf_EBX
;
130 struct x86_reg count_EBP
; /* decrements to zero */
/**
 * Byte offset of \p b relative to \p a.
 *
 * Used throughout this file to compute displacements of struct
 * translate_sse fields relative to the machine pointer held in EDI.
 */
static ptrdiff_t
get_offset(const void *a, const void *b)
{
   return (const char *) b - (const char *) a;
}
141 static struct x86_reg
142 get_const(struct translate_sse
*p
, unsigned id
)
147 if (p
->const_to_reg
[id
] >= 0)
148 return x86_make_reg(file_XMM
, p
->const_to_reg
[id
]);
150 for (i
= 2; i
< 8; ++i
) {
151 if (p
->reg_to_const
[i
] < 0)
155 /* TODO: be smarter here */
159 reg
= x86_make_reg(file_XMM
, i
);
161 if (p
->reg_to_const
[i
] >= 0)
162 p
->const_to_reg
[p
->reg_to_const
[i
]] = -1;
164 p
->reg_to_const
[i
] = id
;
165 p
->const_to_reg
[id
] = i
;
167 /* TODO: this should happen outside the loop, if possible */
168 sse_movaps(p
->func
, reg
,
169 x86_make_disp(p
->machine_EDI
,
170 get_offset(p
, &p
->consts
[id
][0])));
176 /* load the data in a SSE2 register, padding with zeros */
178 emit_load_sse2(struct translate_sse
*p
,
179 struct x86_reg data
, struct x86_reg src
, unsigned size
)
181 struct x86_reg tmpXMM
= x86_make_reg(file_XMM
, 1);
182 struct x86_reg tmp
= p
->tmp_EAX
;
185 x86_movzx8(p
->func
, tmp
, src
);
186 sse2_movd(p
->func
, data
, tmp
);
189 x86_movzx16(p
->func
, tmp
, src
);
190 sse2_movd(p
->func
, data
, tmp
);
193 x86_movzx8(p
->func
, tmp
, x86_make_disp(src
, 2));
194 x86_shl_imm(p
->func
, tmp
, 16);
195 x86_mov16(p
->func
, tmp
, src
);
196 sse2_movd(p
->func
, data
, tmp
);
199 sse2_movd(p
->func
, data
, src
);
202 sse2_movd(p
->func
, data
, src
);
203 x86_movzx16(p
->func
, tmp
, x86_make_disp(src
, 4));
204 sse2_movd(p
->func
, tmpXMM
, tmp
);
205 sse2_punpckldq(p
->func
, data
, tmpXMM
);
208 sse2_movq(p
->func
, data
, src
);
211 sse2_movq(p
->func
, data
, src
);
212 sse2_movd(p
->func
, tmpXMM
, x86_make_disp(src
, 8));
213 sse2_punpcklqdq(p
->func
, data
, tmpXMM
);
216 sse2_movdqu(p
->func
, data
, src
);
/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5
229 /* this function will load #chans float values, and will
230 * pad the register with zeroes at least up to out_chans.
232 * If out_chans is set to CHANNELS_0001, then the fourth
233 * value will be padded with 1. Only pass this value if
234 * chans < 4 or results are undefined.
237 emit_load_float32(struct translate_sse
*p
, struct x86_reg data
,
238 struct x86_reg arg0
, unsigned out_chans
, unsigned chans
)
245 sse_movss(p
->func
, data
, arg0
);
246 if (out_chans
== CHANNELS_0001
)
247 sse_orps(p
->func
, data
, get_const(p
, CONST_IDENTITY
));
253 if (out_chans
== CHANNELS_0001
)
254 sse_shufps(p
->func
, data
, get_const(p
, CONST_IDENTITY
),
256 else if (out_chans
> 2)
257 sse_movlhps(p
->func
, data
, get_const(p
, CONST_IDENTITY
));
258 sse_movlps(p
->func
, data
, arg0
);
261 /* Have to jump through some hoops:
264 * c 0 0 1 if out_chans == CHANNELS_0001
268 sse_movss(p
->func
, data
, x86_make_disp(arg0
, 8));
269 if (out_chans
== CHANNELS_0001
)
270 sse_shufps(p
->func
, data
, get_const(p
, CONST_IDENTITY
),
272 sse_shufps(p
->func
, data
, data
, SHUF(Y
, Z
, X
, W
));
273 sse_movlps(p
->func
, data
, arg0
);
276 sse_movups(p
->func
, data
, arg0
);
281 /* this function behaves like emit_load_float32, but loads
282 64-bit floating point numbers, converting them to 32-bit
285 emit_load_float64to32(struct translate_sse
*p
, struct x86_reg data
,
286 struct x86_reg arg0
, unsigned out_chans
, unsigned chans
)
288 struct x86_reg tmpXMM
= x86_make_reg(file_XMM
, 1);
291 sse2_movsd(p
->func
, data
, arg0
);
293 sse2_cvtpd2ps(p
->func
, data
, data
);
295 sse2_cvtsd2ss(p
->func
, data
, data
);
296 if (out_chans
== CHANNELS_0001
)
297 sse_shufps(p
->func
, data
, get_const(p
, CONST_IDENTITY
),
301 sse2_movupd(p
->func
, data
, arg0
);
302 sse2_cvtpd2ps(p
->func
, data
, data
);
303 if (out_chans
== CHANNELS_0001
)
304 sse_shufps(p
->func
, data
, get_const(p
, CONST_IDENTITY
),
306 else if (out_chans
> 2)
307 sse_movlhps(p
->func
, data
, get_const(p
, CONST_IDENTITY
));
310 sse2_movupd(p
->func
, data
, arg0
);
311 sse2_cvtpd2ps(p
->func
, data
, data
);
312 sse2_movsd(p
->func
, tmpXMM
, x86_make_disp(arg0
, 16));
314 sse2_cvtpd2ps(p
->func
, tmpXMM
, tmpXMM
);
316 sse2_cvtsd2ss(p
->func
, tmpXMM
, tmpXMM
);
317 sse_movlhps(p
->func
, data
, tmpXMM
);
318 if (out_chans
== CHANNELS_0001
)
319 sse_orps(p
->func
, data
, get_const(p
, CONST_IDENTITY
));
322 sse2_movupd(p
->func
, data
, arg0
);
323 sse2_cvtpd2ps(p
->func
, data
, data
);
324 sse2_movupd(p
->func
, tmpXMM
, x86_make_disp(arg0
, 16));
325 sse2_cvtpd2ps(p
->func
, tmpXMM
, tmpXMM
);
326 sse_movlhps(p
->func
, data
, tmpXMM
);
333 emit_mov64(struct translate_sse
*p
, struct x86_reg dst_gpr
,
334 struct x86_reg dst_xmm
, struct x86_reg src_gpr
,
335 struct x86_reg src_xmm
)
337 if (x86_target(p
->func
) != X86_32
)
338 x64_mov64(p
->func
, dst_gpr
, src_gpr
);
340 /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
341 if (x86_target_caps(p
->func
) & X86_SSE2
)
342 sse2_movq(p
->func
, dst_xmm
, src_xmm
);
344 sse_movlps(p
->func
, dst_xmm
, src_xmm
);
350 emit_load64(struct translate_sse
*p
, struct x86_reg dst_gpr
,
351 struct x86_reg dst_xmm
, struct x86_reg src
)
353 emit_mov64(p
, dst_gpr
, dst_xmm
, src
, src
);
358 emit_store64(struct translate_sse
*p
, struct x86_reg dst
,
359 struct x86_reg src_gpr
, struct x86_reg src_xmm
)
361 emit_mov64(p
, dst
, dst
, src_gpr
, src_xmm
);
366 emit_mov128(struct translate_sse
*p
, struct x86_reg dst
, struct x86_reg src
)
368 if (x86_target_caps(p
->func
) & X86_SSE2
)
369 sse2_movdqu(p
->func
, dst
, src
);
371 sse_movups(p
->func
, dst
, src
);
375 /* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
376 * but may or may not be good on older processors
377 * TODO: may perhaps want to use non-temporal stores here if possible
380 emit_memcpy(struct translate_sse
*p
, struct x86_reg dst
, struct x86_reg src
,
383 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
384 struct x86_reg dataXMM2
= x86_make_reg(file_XMM
, 1);
385 struct x86_reg dataGPR
= p
->tmp_EAX
;
386 struct x86_reg dataGPR2
= p
->tmp2_EDX
;
391 x86_mov8(p
->func
, dataGPR
, src
);
392 x86_mov8(p
->func
, dst
, dataGPR
);
395 x86_mov16(p
->func
, dataGPR
, src
);
396 x86_mov16(p
->func
, dst
, dataGPR
);
399 x86_mov16(p
->func
, dataGPR
, src
);
400 x86_mov8(p
->func
, dataGPR2
, x86_make_disp(src
, 2));
401 x86_mov16(p
->func
, dst
, dataGPR
);
402 x86_mov8(p
->func
, x86_make_disp(dst
, 2), dataGPR2
);
405 x86_mov(p
->func
, dataGPR
, src
);
406 x86_mov(p
->func
, dst
, dataGPR
);
409 x86_mov(p
->func
, dataGPR
, src
);
410 x86_mov16(p
->func
, dataGPR2
, x86_make_disp(src
, 4));
411 x86_mov(p
->func
, dst
, dataGPR
);
412 x86_mov16(p
->func
, x86_make_disp(dst
, 4), dataGPR2
);
416 else if (!(x86_target_caps(p
->func
) & X86_SSE
)) {
418 assert((size
& 3) == 0);
419 for (i
= 0; i
< size
; i
+= 4) {
420 x86_mov(p
->func
, dataGPR
, x86_make_disp(src
, i
));
421 x86_mov(p
->func
, x86_make_disp(dst
, i
), dataGPR
);
427 emit_load64(p
, dataGPR
, dataXMM
, src
);
428 emit_store64(p
, dst
, dataGPR
, dataXMM
);
431 emit_load64(p
, dataGPR2
, dataXMM
, src
);
432 x86_mov(p
->func
, dataGPR
, x86_make_disp(src
, 8));
433 emit_store64(p
, dst
, dataGPR2
, dataXMM
);
434 x86_mov(p
->func
, x86_make_disp(dst
, 8), dataGPR
);
437 emit_mov128(p
, dataXMM
, src
);
438 emit_mov128(p
, dst
, dataXMM
);
441 emit_mov128(p
, dataXMM
, src
);
442 emit_load64(p
, dataGPR
, dataXMM2
, x86_make_disp(src
, 16));
443 emit_mov128(p
, dst
, dataXMM
);
444 emit_store64(p
, x86_make_disp(dst
, 16), dataGPR
, dataXMM2
);
447 emit_mov128(p
, dataXMM
, src
);
448 emit_mov128(p
, dataXMM2
, x86_make_disp(src
, 16));
449 emit_mov128(p
, dst
, dataXMM
);
450 emit_mov128(p
, x86_make_disp(dst
, 16), dataXMM2
);
459 translate_attr_convert(struct translate_sse
*p
,
460 const struct translate_element
*a
,
461 struct x86_reg src
, struct x86_reg dst
)
463 const struct util_format_description
*input_desc
=
464 util_format_description(a
->input_format
);
465 const struct util_format_description
*output_desc
=
466 util_format_description(a
->output_format
);
468 boolean id_swizzle
= TRUE
;
469 unsigned swizzle
[4] =
470 { UTIL_FORMAT_SWIZZLE_NONE
, UTIL_FORMAT_SWIZZLE_NONE
,
471 UTIL_FORMAT_SWIZZLE_NONE
, UTIL_FORMAT_SWIZZLE_NONE
};
472 unsigned needed_chans
= 0;
473 unsigned imms
[2] = { 0, 0x3f800000 };
475 if (a
->output_format
== PIPE_FORMAT_NONE
476 || a
->input_format
== PIPE_FORMAT_NONE
)
479 if (input_desc
->channel
[0].size
& 7)
482 if (input_desc
->colorspace
!= output_desc
->colorspace
)
485 for (i
= 1; i
< input_desc
->nr_channels
; ++i
) {
487 (&input_desc
->channel
[i
], &input_desc
->channel
[0],
488 sizeof(input_desc
->channel
[0])))
492 for (i
= 1; i
< output_desc
->nr_channels
; ++i
) {
494 (&output_desc
->channel
[i
], &output_desc
->channel
[0],
495 sizeof(output_desc
->channel
[0]))) {
500 for (i
= 0; i
< output_desc
->nr_channels
; ++i
) {
501 if (output_desc
->swizzle
[i
] < 4)
502 swizzle
[output_desc
->swizzle
[i
]] = input_desc
->swizzle
[i
];
505 if ((x86_target_caps(p
->func
) & X86_SSE
) &&
506 (0 || a
->output_format
== PIPE_FORMAT_R32_FLOAT
507 || a
->output_format
== PIPE_FORMAT_R32G32_FLOAT
508 || a
->output_format
== PIPE_FORMAT_R32G32B32_FLOAT
509 || a
->output_format
== PIPE_FORMAT_R32G32B32A32_FLOAT
)) {
510 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
512 for (i
= 0; i
< output_desc
->nr_channels
; ++i
) {
513 if (swizzle
[i
] == UTIL_FORMAT_SWIZZLE_0
514 && i
>= input_desc
->nr_channels
)
518 for (i
= 0; i
< output_desc
->nr_channels
; ++i
) {
520 needed_chans
= MAX2(needed_chans
, swizzle
[i
] + 1);
521 if (swizzle
[i
] < UTIL_FORMAT_SWIZZLE_0
&& swizzle
[i
] != i
)
525 if (needed_chans
> 0) {
526 switch (input_desc
->channel
[0].type
) {
527 case UTIL_FORMAT_TYPE_UNSIGNED
:
528 if (!(x86_target_caps(p
->func
) & X86_SSE2
))
530 emit_load_sse2(p
, dataXMM
, src
,
531 input_desc
->channel
[0].size
*
532 input_desc
->nr_channels
>> 3);
534 /* TODO: add support for SSE4.1 pmovzx */
535 switch (input_desc
->channel
[0].size
) {
537 /* TODO: this may be inefficient due to get_identity() being
538 * used both as a float and integer register.
540 sse2_punpcklbw(p
->func
, dataXMM
, get_const(p
, CONST_IDENTITY
));
541 sse2_punpcklbw(p
->func
, dataXMM
, get_const(p
, CONST_IDENTITY
));
544 sse2_punpcklwd(p
->func
, dataXMM
, get_const(p
, CONST_IDENTITY
));
546 case 32: /* we lose precision here */
547 sse2_psrld_imm(p
->func
, dataXMM
, 1);
552 sse2_cvtdq2ps(p
->func
, dataXMM
, dataXMM
);
553 if (input_desc
->channel
[0].normalized
) {
554 struct x86_reg factor
;
555 switch (input_desc
->channel
[0].size
) {
557 factor
= get_const(p
, CONST_INV_255
);
560 factor
= get_const(p
, CONST_INV_65535
);
563 factor
= get_const(p
, CONST_INV_2147483647
);
573 sse_mulps(p
->func
, dataXMM
, factor
);
575 else if (input_desc
->channel
[0].size
== 32)
576 /* compensate for the bit we threw away to fit u32 into s32 */
577 sse_addps(p
->func
, dataXMM
, dataXMM
);
579 case UTIL_FORMAT_TYPE_SIGNED
:
580 if (!(x86_target_caps(p
->func
) & X86_SSE2
))
582 emit_load_sse2(p
, dataXMM
, src
,
583 input_desc
->channel
[0].size
*
584 input_desc
->nr_channels
>> 3);
586 /* TODO: add support for SSE4.1 pmovsx */
587 switch (input_desc
->channel
[0].size
) {
589 sse2_punpcklbw(p
->func
, dataXMM
, dataXMM
);
590 sse2_punpcklbw(p
->func
, dataXMM
, dataXMM
);
591 sse2_psrad_imm(p
->func
, dataXMM
, 24);
594 sse2_punpcklwd(p
->func
, dataXMM
, dataXMM
);
595 sse2_psrad_imm(p
->func
, dataXMM
, 16);
597 case 32: /* we lose precision here */
602 sse2_cvtdq2ps(p
->func
, dataXMM
, dataXMM
);
603 if (input_desc
->channel
[0].normalized
) {
604 struct x86_reg factor
;
605 switch (input_desc
->channel
[0].size
) {
607 factor
= get_const(p
, CONST_INV_127
);
610 factor
= get_const(p
, CONST_INV_32767
);
613 factor
= get_const(p
, CONST_INV_2147483647
);
623 sse_mulps(p
->func
, dataXMM
, factor
);
628 case UTIL_FORMAT_TYPE_FLOAT
:
629 if (input_desc
->channel
[0].size
!= 32
630 && input_desc
->channel
[0].size
!= 64) {
633 if (swizzle
[3] == UTIL_FORMAT_SWIZZLE_1
634 && input_desc
->nr_channels
<= 3) {
635 swizzle
[3] = UTIL_FORMAT_SWIZZLE_W
;
636 needed_chans
= CHANNELS_0001
;
638 switch (input_desc
->channel
[0].size
) {
640 emit_load_float32(p
, dataXMM
, src
, needed_chans
,
641 input_desc
->nr_channels
);
643 case 64: /* we lose precision here */
644 if (!(x86_target_caps(p
->func
) & X86_SSE2
))
646 emit_load_float64to32(p
, dataXMM
, src
, needed_chans
,
647 input_desc
->nr_channels
);
658 sse_shufps(p
->func
, dataXMM
, dataXMM
,
659 SHUF(swizzle
[0], swizzle
[1], swizzle
[2], swizzle
[3]));
663 if (output_desc
->nr_channels
>= 4
664 && swizzle
[0] < UTIL_FORMAT_SWIZZLE_0
665 && swizzle
[1] < UTIL_FORMAT_SWIZZLE_0
666 && swizzle
[2] < UTIL_FORMAT_SWIZZLE_0
667 && swizzle
[3] < UTIL_FORMAT_SWIZZLE_0
) {
668 sse_movups(p
->func
, dst
, dataXMM
);
671 if (output_desc
->nr_channels
>= 2
672 && swizzle
[0] < UTIL_FORMAT_SWIZZLE_0
673 && swizzle
[1] < UTIL_FORMAT_SWIZZLE_0
) {
674 sse_movlps(p
->func
, dst
, dataXMM
);
677 if (swizzle
[0] < UTIL_FORMAT_SWIZZLE_0
) {
678 sse_movss(p
->func
, dst
, dataXMM
);
681 x86_mov_imm(p
->func
, dst
,
682 imms
[swizzle
[0] - UTIL_FORMAT_SWIZZLE_0
]);
685 if (output_desc
->nr_channels
>= 2) {
686 if (swizzle
[1] < UTIL_FORMAT_SWIZZLE_0
) {
687 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(1, 1, 2, 3));
688 sse_movss(p
->func
, x86_make_disp(dst
, 4), dataXMM
);
691 x86_mov_imm(p
->func
, x86_make_disp(dst
, 4),
692 imms
[swizzle
[1] - UTIL_FORMAT_SWIZZLE_0
]);
697 if (output_desc
->nr_channels
>= 3) {
698 if (output_desc
->nr_channels
>= 4
699 && swizzle
[2] < UTIL_FORMAT_SWIZZLE_0
700 && swizzle
[3] < UTIL_FORMAT_SWIZZLE_0
) {
701 sse_movhps(p
->func
, x86_make_disp(dst
, 8), dataXMM
);
704 if (swizzle
[2] < UTIL_FORMAT_SWIZZLE_0
) {
705 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(2, 2, 2, 3));
706 sse_movss(p
->func
, x86_make_disp(dst
, 8), dataXMM
);
709 x86_mov_imm(p
->func
, x86_make_disp(dst
, 8),
710 imms
[swizzle
[2] - UTIL_FORMAT_SWIZZLE_0
]);
713 if (output_desc
->nr_channels
>= 4) {
714 if (swizzle
[3] < UTIL_FORMAT_SWIZZLE_0
) {
715 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(3, 3, 3, 3));
716 sse_movss(p
->func
, x86_make_disp(dst
, 12), dataXMM
);
719 x86_mov_imm(p
->func
, x86_make_disp(dst
, 12),
720 imms
[swizzle
[3] - UTIL_FORMAT_SWIZZLE_0
]);
728 else if ((x86_target_caps(p
->func
) & X86_SSE2
)
729 && input_desc
->channel
[0].size
== 8
730 && output_desc
->channel
[0].size
== 16
731 && output_desc
->channel
[0].normalized
==
732 input_desc
->channel
[0].normalized
&&
733 (0 || (input_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_UNSIGNED
734 && output_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_UNSIGNED
)
735 || (input_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_UNSIGNED
736 && output_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_SIGNED
)
737 || (input_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_SIGNED
738 && output_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_SIGNED
))) {
739 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
740 struct x86_reg tmpXMM
= x86_make_reg(file_XMM
, 1);
741 struct x86_reg tmp
= p
->tmp_EAX
;
742 unsigned imms
[2] = { 0, 1 };
744 for (i
= 0; i
< output_desc
->nr_channels
; ++i
) {
745 if (swizzle
[i
] == UTIL_FORMAT_SWIZZLE_0
746 && i
>= input_desc
->nr_channels
) {
751 for (i
= 0; i
< output_desc
->nr_channels
; ++i
) {
753 needed_chans
= MAX2(needed_chans
, swizzle
[i
] + 1);
754 if (swizzle
[i
] < UTIL_FORMAT_SWIZZLE_0
&& swizzle
[i
] != i
)
758 if (needed_chans
> 0) {
759 emit_load_sse2(p
, dataXMM
, src
,
760 input_desc
->channel
[0].size
*
761 input_desc
->nr_channels
>> 3);
763 switch (input_desc
->channel
[0].type
) {
764 case UTIL_FORMAT_TYPE_UNSIGNED
:
765 if (input_desc
->channel
[0].normalized
) {
766 sse2_punpcklbw(p
->func
, dataXMM
, dataXMM
);
767 if (output_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_SIGNED
)
768 sse2_psrlw_imm(p
->func
, dataXMM
, 1);
771 sse2_punpcklbw(p
->func
, dataXMM
, get_const(p
, CONST_IDENTITY
));
773 case UTIL_FORMAT_TYPE_SIGNED
:
774 if (input_desc
->channel
[0].normalized
) {
775 sse2_movq(p
->func
, tmpXMM
, get_const(p
, CONST_IDENTITY
));
776 sse2_punpcklbw(p
->func
, tmpXMM
, dataXMM
);
777 sse2_psllw_imm(p
->func
, dataXMM
, 9);
778 sse2_psrlw_imm(p
->func
, dataXMM
, 8);
779 sse2_por(p
->func
, tmpXMM
, dataXMM
);
780 sse2_psrlw_imm(p
->func
, dataXMM
, 7);
781 sse2_por(p
->func
, tmpXMM
, dataXMM
);
783 struct x86_reg t
= dataXMM
;
789 sse2_punpcklbw(p
->func
, dataXMM
, dataXMM
);
790 sse2_psraw_imm(p
->func
, dataXMM
, 8);
797 if (output_desc
->channel
[0].normalized
)
799 (output_desc
->channel
[0].type
==
800 UTIL_FORMAT_TYPE_UNSIGNED
) ? 0xffff : 0x7ffff;
803 sse2_pshuflw(p
->func
, dataXMM
, dataXMM
,
804 (swizzle
[0] & 3) | ((swizzle
[1] & 3) << 2) |
805 ((swizzle
[2] & 3) << 4) | ((swizzle
[3] & 3) << 6));
808 if (output_desc
->nr_channels
>= 4
809 && swizzle
[0] < UTIL_FORMAT_SWIZZLE_0
810 && swizzle
[1] < UTIL_FORMAT_SWIZZLE_0
811 && swizzle
[2] < UTIL_FORMAT_SWIZZLE_0
812 && swizzle
[3] < UTIL_FORMAT_SWIZZLE_0
) {
813 sse2_movq(p
->func
, dst
, dataXMM
);
816 if (swizzle
[0] < UTIL_FORMAT_SWIZZLE_0
) {
817 if (output_desc
->nr_channels
>= 2
818 && swizzle
[1] < UTIL_FORMAT_SWIZZLE_0
) {
819 sse2_movd(p
->func
, dst
, dataXMM
);
822 sse2_movd(p
->func
, tmp
, dataXMM
);
823 x86_mov16(p
->func
, dst
, tmp
);
824 if (output_desc
->nr_channels
>= 2)
825 x86_mov16_imm(p
->func
, x86_make_disp(dst
, 2),
826 imms
[swizzle
[1] - UTIL_FORMAT_SWIZZLE_0
]);
830 if (output_desc
->nr_channels
>= 2
831 && swizzle
[1] >= UTIL_FORMAT_SWIZZLE_0
) {
832 x86_mov_imm(p
->func
, dst
,
833 (imms
[swizzle
[1] - UTIL_FORMAT_SWIZZLE_0
] << 16) |
834 imms
[swizzle
[0] - UTIL_FORMAT_SWIZZLE_0
]);
837 x86_mov16_imm(p
->func
, dst
,
838 imms
[swizzle
[0] - UTIL_FORMAT_SWIZZLE_0
]);
839 if (output_desc
->nr_channels
>= 2) {
840 sse2_movd(p
->func
, tmp
, dataXMM
);
841 x86_shr_imm(p
->func
, tmp
, 16);
842 x86_mov16(p
->func
, x86_make_disp(dst
, 2), tmp
);
847 if (output_desc
->nr_channels
>= 3) {
848 if (swizzle
[2] < UTIL_FORMAT_SWIZZLE_0
) {
849 if (output_desc
->nr_channels
>= 4
850 && swizzle
[3] < UTIL_FORMAT_SWIZZLE_0
) {
851 sse2_psrlq_imm(p
->func
, dataXMM
, 32);
852 sse2_movd(p
->func
, x86_make_disp(dst
, 4), dataXMM
);
855 sse2_psrlq_imm(p
->func
, dataXMM
, 32);
856 sse2_movd(p
->func
, tmp
, dataXMM
);
857 x86_mov16(p
->func
, x86_make_disp(dst
, 4), tmp
);
858 if (output_desc
->nr_channels
>= 4) {
859 x86_mov16_imm(p
->func
, x86_make_disp(dst
, 6),
860 imms
[swizzle
[3] - UTIL_FORMAT_SWIZZLE_0
]);
865 if (output_desc
->nr_channels
>= 4
866 && swizzle
[3] >= UTIL_FORMAT_SWIZZLE_0
) {
867 x86_mov_imm(p
->func
, x86_make_disp(dst
, 4),
868 (imms
[swizzle
[3] - UTIL_FORMAT_SWIZZLE_0
] << 16)
869 | imms
[swizzle
[2] - UTIL_FORMAT_SWIZZLE_0
]);
872 x86_mov16_imm(p
->func
, x86_make_disp(dst
, 4),
873 imms
[swizzle
[2] - UTIL_FORMAT_SWIZZLE_0
]);
875 if (output_desc
->nr_channels
>= 4) {
876 sse2_psrlq_imm(p
->func
, dataXMM
, 48);
877 sse2_movd(p
->func
, tmp
, dataXMM
);
878 x86_mov16(p
->func
, x86_make_disp(dst
, 6), tmp
);
886 else if (!memcmp(&output_desc
->channel
[0], &input_desc
->channel
[0],
887 sizeof(output_desc
->channel
[0]))) {
888 struct x86_reg tmp
= p
->tmp_EAX
;
891 if (input_desc
->channel
[0].size
== 8 && input_desc
->nr_channels
== 4
892 && output_desc
->nr_channels
== 4
893 && swizzle
[0] == UTIL_FORMAT_SWIZZLE_W
894 && swizzle
[1] == UTIL_FORMAT_SWIZZLE_Z
895 && swizzle
[2] == UTIL_FORMAT_SWIZZLE_Y
896 && swizzle
[3] == UTIL_FORMAT_SWIZZLE_X
) {
897 /* TODO: support movbe */
898 x86_mov(p
->func
, tmp
, src
);
899 x86_bswap(p
->func
, tmp
);
900 x86_mov(p
->func
, dst
, tmp
);
904 for (i
= 0; i
< output_desc
->nr_channels
; ++i
) {
905 switch (output_desc
->channel
[0].size
) {
907 if (swizzle
[i
] >= UTIL_FORMAT_SWIZZLE_0
) {
909 if (swizzle
[i
] == UTIL_FORMAT_SWIZZLE_1
) {
910 switch (output_desc
->channel
[0].type
) {
911 case UTIL_FORMAT_TYPE_UNSIGNED
:
912 v
= output_desc
->channel
[0].normalized
? 0xff : 1;
914 case UTIL_FORMAT_TYPE_SIGNED
:
915 v
= output_desc
->channel
[0].normalized
? 0x7f : 1;
921 x86_mov8_imm(p
->func
, x86_make_disp(dst
, i
* 1), v
);
924 x86_mov8(p
->func
, tmp
, x86_make_disp(src
, swizzle
[i
] * 1));
925 x86_mov8(p
->func
, x86_make_disp(dst
, i
* 1), tmp
);
929 if (swizzle
[i
] >= UTIL_FORMAT_SWIZZLE_0
) {
931 if (swizzle
[i
] == UTIL_FORMAT_SWIZZLE_1
) {
932 switch (output_desc
->channel
[1].type
) {
933 case UTIL_FORMAT_TYPE_UNSIGNED
:
934 v
= output_desc
->channel
[1].normalized
? 0xffff : 1;
936 case UTIL_FORMAT_TYPE_SIGNED
:
937 v
= output_desc
->channel
[1].normalized
? 0x7fff : 1;
939 case UTIL_FORMAT_TYPE_FLOAT
:
946 x86_mov16_imm(p
->func
, x86_make_disp(dst
, i
* 2), v
);
948 else if (swizzle
[i
] == UTIL_FORMAT_SWIZZLE_0
) {
949 x86_mov16_imm(p
->func
, x86_make_disp(dst
, i
* 2), 0);
952 x86_mov16(p
->func
, tmp
, x86_make_disp(src
, swizzle
[i
] * 2));
953 x86_mov16(p
->func
, x86_make_disp(dst
, i
* 2), tmp
);
957 if (swizzle
[i
] >= UTIL_FORMAT_SWIZZLE_0
) {
959 if (swizzle
[i
] == UTIL_FORMAT_SWIZZLE_1
) {
960 switch (output_desc
->channel
[1].type
) {
961 case UTIL_FORMAT_TYPE_UNSIGNED
:
962 v
= output_desc
->channel
[1].normalized
? 0xffffffff : 1;
964 case UTIL_FORMAT_TYPE_SIGNED
:
965 v
= output_desc
->channel
[1].normalized
? 0x7fffffff : 1;
967 case UTIL_FORMAT_TYPE_FLOAT
:
974 x86_mov_imm(p
->func
, x86_make_disp(dst
, i
* 4), v
);
977 x86_mov(p
->func
, tmp
, x86_make_disp(src
, swizzle
[i
] * 4));
978 x86_mov(p
->func
, x86_make_disp(dst
, i
* 4), tmp
);
982 if (swizzle
[i
] >= UTIL_FORMAT_SWIZZLE_0
) {
985 if (swizzle
[i
] == UTIL_FORMAT_SWIZZLE_1
) {
986 switch (output_desc
->channel
[1].type
) {
987 case UTIL_FORMAT_TYPE_UNSIGNED
:
988 h
= output_desc
->channel
[1].normalized
? 0xffffffff : 0;
989 l
= output_desc
->channel
[1].normalized
? 0xffffffff : 1;
991 case UTIL_FORMAT_TYPE_SIGNED
:
992 h
= output_desc
->channel
[1].normalized
? 0x7fffffff : 0;
993 l
= output_desc
->channel
[1].normalized
? 0xffffffff : 1;
995 case UTIL_FORMAT_TYPE_FLOAT
:
1003 x86_mov_imm(p
->func
, x86_make_disp(dst
, i
* 8), l
);
1004 x86_mov_imm(p
->func
, x86_make_disp(dst
, i
* 8 + 4), h
);
1007 if (x86_target_caps(p
->func
) & X86_SSE
) {
1008 struct x86_reg tmpXMM
= x86_make_reg(file_XMM
, 0);
1009 emit_load64(p
, tmp
, tmpXMM
,
1010 x86_make_disp(src
, swizzle
[i
] * 8));
1011 emit_store64(p
, x86_make_disp(dst
, i
* 8), tmp
, tmpXMM
);
1014 x86_mov(p
->func
, tmp
, x86_make_disp(src
, swizzle
[i
] * 8));
1015 x86_mov(p
->func
, x86_make_disp(dst
, i
* 8), tmp
);
1016 x86_mov(p
->func
, tmp
,
1017 x86_make_disp(src
, swizzle
[i
] * 8 + 4));
1018 x86_mov(p
->func
, x86_make_disp(dst
, i
* 8 + 4), tmp
);
1028 /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
1029 else if ((x86_target_caps(p
->func
) & X86_SSE2
) &&
1030 a
->input_format
== PIPE_FORMAT_R32G32B32A32_FLOAT
&&
1031 (0 || a
->output_format
== PIPE_FORMAT_B8G8R8A8_UNORM
1032 || a
-> output_format
== PIPE_FORMAT_R8G8B8A8_UNORM
)) {
1033 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
1036 sse_movups(p
->func
, dataXMM
, src
);
1038 if (a
->output_format
== PIPE_FORMAT_B8G8R8A8_UNORM
) {
1039 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(2, 1, 0, 3));
1042 /* scale by 255.0 */
1043 sse_mulps(p
->func
, dataXMM
, get_const(p
, CONST_255
));
1046 sse2_cvtps2dq(p
->func
, dataXMM
, dataXMM
);
1047 sse2_packssdw(p
->func
, dataXMM
, dataXMM
);
1048 sse2_packuswb(p
->func
, dataXMM
, dataXMM
);
1049 sse2_movd(p
->func
, dst
, dataXMM
);
1059 translate_attr(struct translate_sse
*p
,
1060 const struct translate_element
*a
,
1061 struct x86_reg src
, struct x86_reg dst
)
1063 if (a
->input_format
== a
->output_format
) {
1064 emit_memcpy(p
, dst
, src
, util_format_get_stride(a
->input_format
, 1));
1068 return translate_attr_convert(p
, a
, src
, dst
);
1073 init_inputs(struct translate_sse
*p
, unsigned index_size
)
1076 struct x86_reg instance_id
=
1077 x86_make_disp(p
->machine_EDI
, get_offset(p
, &p
->instance_id
));
1078 struct x86_reg start_instance
=
1079 x86_make_disp(p
->machine_EDI
, get_offset(p
, &p
->start_instance
));
1081 for (i
= 0; i
< p
->nr_buffer_variants
; i
++) {
1082 struct translate_buffer_variant
*variant
= &p
->buffer_variant
[i
];
1083 struct translate_buffer
*buffer
= &p
->buffer
[variant
->buffer_index
];
1085 if (!index_size
|| variant
->instance_divisor
) {
1086 struct x86_reg buf_max_index
=
1087 x86_make_disp(p
->machine_EDI
, get_offset(p
, &buffer
->max_index
));
1088 struct x86_reg buf_stride
=
1089 x86_make_disp(p
->machine_EDI
, get_offset(p
, &buffer
->stride
));
1090 struct x86_reg buf_ptr
=
1091 x86_make_disp(p
->machine_EDI
, get_offset(p
, &variant
->ptr
));
1092 struct x86_reg buf_base_ptr
=
1093 x86_make_disp(p
->machine_EDI
, get_offset(p
, &buffer
->base_ptr
));
1094 struct x86_reg elt
= p
->idx_ESI
;
1095 struct x86_reg tmp_EAX
= p
->tmp_EAX
;
1097 /* Calculate pointer to first attrib:
1098 * base_ptr + stride * index, where index depends on instance divisor
1100 if (variant
->instance_divisor
) {
1101 /* Start with instance = instance_id
1102 * which is true if divisor is 1.
1104 x86_mov(p
->func
, tmp_EAX
, instance_id
);
1106 if (variant
->instance_divisor
!= 1) {
1107 struct x86_reg tmp_EDX
= p
->tmp2_EDX
;
1108 struct x86_reg tmp_ECX
= p
->src_ECX
;
1110 /* TODO: Add x86_shr() to rtasm and use it whenever
1111 * instance divisor is power of two.
1113 x86_xor(p
->func
, tmp_EDX
, tmp_EDX
);
1114 x86_mov_reg_imm(p
->func
, tmp_ECX
, variant
->instance_divisor
);
1115 x86_div(p
->func
, tmp_ECX
); /* EAX = EDX:EAX / ECX */
1117 /* instance = (instance_id - start_instance) / divisor +
1120 x86_mov(p
->func
, tmp_EDX
, start_instance
);
1121 x86_add(p
->func
, tmp_EAX
, tmp_EDX
);
1124 /* XXX we need to clamp the index here too, but to a
1125 * per-array max value, not the draw->pt.max_index value
1126 * that's being given to us via translate->set_buffer().
1130 x86_mov(p
->func
, tmp_EAX
, elt
);
1132 /* Clamp to max_index
1134 x86_cmp(p
->func
, tmp_EAX
, buf_max_index
);
1135 x86_cmovcc(p
->func
, tmp_EAX
, buf_max_index
, cc_AE
);
1138 x86_mov(p
->func
, p
->tmp2_EDX
, buf_stride
);
1140 x86_imul(p
->func
, tmp_EAX
, p
->tmp2_EDX
);
1142 x86_add(p
->func
, tmp_EAX
, buf_base_ptr
);
1144 x86_cmp(p
->func
, p
->count_EBP
, p
->tmp_EAX
);
1146 /* In the linear case, keep the buffer pointer instead of the
1149 if (!index_size
&& p
->nr_buffer_variants
== 1) {
1151 x86_mov(p
->func
, elt
, tmp_EAX
);
1155 x86_mov(p
->func
, buf_ptr
, tmp_EAX
);
1164 static struct x86_reg
1165 get_buffer_ptr(struct translate_sse
*p
,
1166 unsigned index_size
, unsigned var_idx
, struct x86_reg elt
)
1168 if (var_idx
== ELEMENT_BUFFER_INSTANCE_ID
) {
1169 return x86_make_disp(p
->machine_EDI
, get_offset(p
, &p
->instance_id
));
1171 if (!index_size
&& p
->nr_buffer_variants
== 1) {
1174 else if (!index_size
|| p
->buffer_variant
[var_idx
].instance_divisor
) {
1175 struct x86_reg ptr
= p
->src_ECX
;
1176 struct x86_reg buf_ptr
=
1177 x86_make_disp(p
->machine_EDI
,
1178 get_offset(p
, &p
->buffer_variant
[var_idx
].ptr
));
1181 x86_mov(p
->func
, ptr
, buf_ptr
);
1185 struct x86_reg ptr
= p
->src_ECX
;
1186 const struct translate_buffer_variant
*variant
=
1187 &p
->buffer_variant
[var_idx
];
1188 struct x86_reg buf_stride
=
1189 x86_make_disp(p
->machine_EDI
,
1190 get_offset(p
, &p
->buffer
[variant
->buffer_index
].stride
));
1191 struct x86_reg buf_base_ptr
=
1192 x86_make_disp(p
->machine_EDI
,
1193 get_offset(p
, &p
->buffer
[variant
->buffer_index
].base_ptr
));
1194 struct x86_reg buf_max_index
=
1195 x86_make_disp(p
->machine_EDI
,
1196 get_offset(p
, &p
->buffer
[variant
->buffer_index
].max_index
));
1198 /* Calculate pointer to current attrib:
1200 switch (index_size
) {
1202 x86_movzx8(p
->func
, ptr
, elt
);
1205 x86_movzx16(p
->func
, ptr
, elt
);
1208 x86_mov(p
->func
, ptr
, elt
);
1212 /* Clamp to max_index
1214 x86_cmp(p
->func
, ptr
, buf_max_index
);
1215 x86_cmovcc(p
->func
, ptr
, buf_max_index
, cc_AE
);
1217 x86_mov(p
->func
, p
->tmp2_EDX
, buf_stride
);
1219 x86_imul(p
->func
, ptr
, p
->tmp2_EDX
);
1221 x86_add(p
->func
, ptr
, buf_base_ptr
);
/*
 * Emit code that advances the input pointers after one vertex has been
 * translated.
 *
 * Three cases, fastest first:
 *  - linear fetch, single buffer variant, no instancing: bump the index
 *    register directly by the buffer stride;
 *  - linear fetch, several variants: advance each variant's cached pointer
 *    by its buffer's stride (instanced variants are left alone -- they only
 *    advance per instance, not per vertex);
 *  - indexed fetch: step the element pointer by the index size.
 */
static void
incr_inputs(struct translate_sse *p, unsigned index_size)
{
   if (!index_size && p->nr_buffer_variants == 1) {
      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
      struct x86_reg stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[buffer_index].stride));

      if (p->buffer_variant[0].instance_divisor == 0) {
         /* ESI doubles as the attribute pointer here: add the stride and
          * prefetch a few cache lines ahead of the new position.
          */
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI,
                          get_offset(p,
                                     &p->buffer[variant->buffer_index].stride));

         if (variant->instance_divisor == 0) {
            /* variant->ptr += stride, staged through EAX since both
             * operands are memory.
             */
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0)
               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      /* Indexed path: step ESI to the next element in the index buffer. */
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }
}
/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            unsigned *elts,
 *            unsigned count,
 *            void *output_buffer )
 *
 *  Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 * index_size selects which entry point is generated: 0 = linear run,
 * 1/2/4 = indexed run with 8/16/32-bit elements.
 */
static boolean
build_vertex_emit(struct translate_sse *p,
                  struct x86_function *func, unsigned index_size)
{
   int fixup, label;
   unsigned j;

   /* Invalidate the constant-register cache before generating code. */
   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
       * above the return address; use it to save XMM6/XMM7, which are
       * callee-saved on Win64.
       */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
                  x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func,
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
                  x86_make_reg(file_XMM, 7));
   }

   /* Save callee-saved GPRs we are about to clobber. */
   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      if (x86_target(p->func) != X86_32) {
         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
      else {
         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
   }

   /* arg 3: vertex/element count */
   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   /* arg 6: output buffer pointer (64-bit move on 64-bit targets) */
   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      /* arg 4: start_instance, arg 5: instance_id -- stashed in the
       * machine struct so generated code can read them later.
       */
      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->start_instance)), p->tmp2_EDX);

      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      /* For the linear path ESI is the index itself; for the indexed
       * path it points at the current element.
       */
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address; reuse the last computed
          * pointer when consecutive elements share a buffer variant.
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr(p, a,
                             x86_make_disp(vb, a->input_offset),
                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x86_lea(p->func, p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs(p, index_size);
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* Restore XMM6/XMM7 from the shadow space. */
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }

   x86_ret(p->func);

   return TRUE;
}
1446 translate_sse_set_buffer(struct translate
*translate
,
1448 const void *ptr
, unsigned stride
, unsigned max_index
)
1450 struct translate_sse
*p
= (struct translate_sse
*) translate
;
1452 if (buf
< p
->nr_buffers
) {
1453 p
->buffer
[buf
].base_ptr
= (char *) ptr
;
1454 p
->buffer
[buf
].stride
= stride
;
1455 p
->buffer
[buf
].max_index
= max_index
;
1459 debug_printf("%s %d/%d: %p %d\n",
1460 __FUNCTION__
, buf
, p
->nr_buffers
, ptr
, stride
);
/*
 * translate::release vtable entry: free the four generated entry points
 * and the translate_sse object itself (allocated with os_malloc_aligned
 * in translate_sse2_create, hence os_free_aligned).
 */
static void
translate_sse_release(struct translate *translate)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   x86_release_func(&p->elt8_func);
   x86_release_func(&p->elt16_func);
   x86_release_func(&p->elt_func);
   x86_release_func(&p->linear_func);

   os_free_aligned(p);
}
/*
 * Create a translate object that fetches/converts vertex attributes with
 * JIT-generated SSE2 code.  Builds four entry points (linear plus 8/16/32
 * bit indexed) up front; returns NULL if rtasm is unavailable or any
 * generation step fails.
 */
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   struct translate_sse *p = NULL;
   unsigned i;

   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
   if (!rtasm_cpu_has_sse())
      goto fail;

   /* 16-byte alignment so the embedded SSE constants can be loaded aligned. */
   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   if (!p)
      goto fail;

   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers =
            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer variant.
          * A variant is a (buffer_index, instance_divisor) pair; reuse an
          * existing one if it matches, otherwise append a new entry
          * (j == nr_buffer_variants after the search loop means not found).
          */
         for (j = 0; j < p->nr_buffer_variants; j++) {
            if (p->buffer_variant[j].buffer_index ==
                key->element[i].input_buffer
                && p->buffer_variant[j].instance_divisor ==
                key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_variants) {
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
            p->buffer_variant[j].instance_divisor =
               key->element[i].instance_divisor;
            p->nr_buffer_variants++;
         }
         p->element_to_buffer_variant[i] = j;
      }
      else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         /* Sentinel: this element reads the instance ID, not a buffer. */
         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   /* Dead debug hook -- flip to 1 when tracing buffer mapping. */
   if (0)
      debug_printf("nr_buffers: %d\n", p->nr_buffers);

   /* Generate all four entry points; index_size selects the variant. */
   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (run_func) x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

 fail:
   /* release frees the partially-built functions and p itself. */
   if (p)
      translate_sse_release(&p->translate);

   return NULL;
}
1582 translate_sse2_create(const struct translate_key
*key
)