2 * Copyright 2003 Tungsten Graphics, inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 * Keith Whitwell <keithw@tungstengraphics.com>
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_math.h"
33 #include "util/u_format.h"
35 #include "translate.h"
38 #if (defined(PIPE_ARCH_X86) || (defined(PIPE_ARCH_X86_64) && !defined(__MINGW32__))) && !defined(PIPE_SUBSYSTEM_EMBEDDED)
40 #include "rtasm/rtasm_cpu.h"
41 #include "rtasm/rtasm_x86sse.h"
/* Describes one source vertex buffer as seen by the generated code.
 * Member list restored from the visible uses elsewhere in this file
 * (base_ptr / stride / max_index are read via get_offset()).
 */
struct translate_buffer
{
   const void *base_ptr;   /* start of the vertex buffer in memory */
   uintptr_t stride;       /* byte distance between consecutive vertices */
   unsigned max_index;     /* highest valid index; fetches are clamped to it */
};
/* One way of walking a buffer: a buffer may be referenced by several
 * variants that differ in instance divisor.
 */
struct translate_buffer_variant
{
   unsigned buffer_index;       /* index into translate_sse::buffer[] */
   unsigned instance_divisor;   /* 0 = per-vertex, otherwise per-instance step */
   void *ptr;                   /* updated either per vertex or per instance */
};
63 #define ELEMENT_BUFFER_INSTANCE_ID 1001
78 #define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
79 static float consts
[NUM_CONSTS
][4] = {
85 C(1.0 / 2147483647.0),
90 struct translate_sse
{
91 struct translate translate
;
93 struct x86_function linear_func
;
94 struct x86_function elt_func
;
95 struct x86_function elt16_func
;
96 struct x86_function elt8_func
;
97 struct x86_function
*func
;
99 PIPE_ALIGN_VAR(16) float consts
[NUM_CONSTS
][4];
100 int8_t reg_to_const
[16];
101 int8_t const_to_reg
[NUM_CONSTS
];
103 struct translate_buffer buffer
[PIPE_MAX_ATTRIBS
];
106 /* Multiple buffer variants can map to a single buffer. */
107 struct translate_buffer_variant buffer_variant
[PIPE_MAX_ATTRIBS
];
108 unsigned nr_buffer_variants
;
110 /* Multiple elements can map to a single buffer variant. */
111 unsigned element_to_buffer_variant
[PIPE_MAX_ATTRIBS
];
113 boolean use_instancing
;
114 unsigned instance_id
;
115 unsigned start_instance
;
117 /* these are actually known values, but putting them in a struct
118 * like this is helpful to keep them in sync across the file.
120 struct x86_reg tmp_EAX
;
121 struct x86_reg tmp2_EDX
;
122 struct x86_reg src_ECX
;
123 struct x86_reg idx_ESI
; /* either start+i or &elt[i] */
124 struct x86_reg machine_EDI
;
125 struct x86_reg outbuf_EBX
;
126 struct x86_reg count_EBP
; /* decrements to zero */
/* Byte offset of b relative to a; used to address struct translate_sse
 * members from the generated code via the machine pointer register.
 */
static int get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}
134 static struct x86_reg
get_const( struct translate_sse
*p
, unsigned id
)
139 if(p
->const_to_reg
[id
] >= 0)
140 return x86_make_reg(file_XMM
, p
->const_to_reg
[id
]);
142 for(i
= 2; i
< 8; ++i
)
144 if(p
->reg_to_const
[i
] < 0)
148 /* TODO: be smarter here */
152 reg
= x86_make_reg(file_XMM
, i
);
154 if(p
->reg_to_const
[i
] >= 0)
155 p
->const_to_reg
[p
->reg_to_const
[i
]] = -1;
157 p
->reg_to_const
[i
] = id
;
158 p
->const_to_reg
[id
] = i
;
160 /* TODO: this should happen outside the loop, if possible */
161 sse_movaps(p
->func
, reg
,
162 x86_make_disp(p
->machine_EDI
,
163 get_offset(p
, &p
->consts
[id
][0])));
168 /* load the data in a SSE2 register, padding with zeros */
169 static boolean
emit_load_sse2( struct translate_sse
*p
,
174 struct x86_reg tmpXMM
= x86_make_reg(file_XMM
, 1);
175 struct x86_reg tmp
= p
->tmp_EAX
;
179 x86_movzx8(p
->func
, tmp
, src
);
180 sse2_movd(p
->func
, data
, tmp
);
183 x86_movzx16(p
->func
, tmp
, src
);
184 sse2_movd(p
->func
, data
, tmp
);
187 x86_movzx8(p
->func
, tmp
, x86_make_disp(src
, 2));
188 x86_shl_imm(p
->func
, tmp
, 16);
189 x86_mov16(p
->func
, tmp
, src
);
190 sse2_movd(p
->func
, data
, tmp
);
193 sse2_movd(p
->func
, data
, src
);
196 sse2_movd(p
->func
, data
, src
);
197 x86_movzx16(p
->func
, tmp
, x86_make_disp(src
, 4));
198 sse2_movd(p
->func
, tmpXMM
, tmp
);
199 sse2_punpckldq(p
->func
, data
, tmpXMM
);
202 sse2_movq(p
->func
, data
, src
);
205 sse2_movq(p
->func
, data
, src
);
206 sse2_movd(p
->func
, tmpXMM
, x86_make_disp(src
, 8));
207 sse2_punpcklqdq(p
->func
, data
, tmpXMM
);
210 sse2_movdqu(p
->func
, data
, src
);
218 /* this value can be passed for the out_chans argument */
219 #define CHANNELS_0001 5
221 /* this function will load #chans float values, and will
222 * pad the register with zeroes at least up to out_chans.
224 * If out_chans is set to CHANNELS_0001, then the fourth
225 * value will be padded with 1. Only pass this value if
226 * chans < 4 or results are undefined.
228 static void emit_load_float32( struct translate_sse
*p
,
240 sse_movss(p
->func
, data
, arg0
);
241 if(out_chans
== CHANNELS_0001
)
242 sse_orps(p
->func
, data
, get_const(p
, CONST_IDENTITY
) );
248 if(out_chans
== CHANNELS_0001
)
249 sse_shufps(p
->func
, data
, get_const(p
, CONST_IDENTITY
), SHUF(X
, Y
, Z
, W
) );
250 else if(out_chans
> 2)
251 sse_movlhps(p
->func
, data
, get_const(p
, CONST_IDENTITY
) );
252 sse_movlps(p
->func
, data
, arg0
);
255 /* Have to jump through some hoops:
258 * c 0 0 1 if out_chans == CHANNELS_0001
262 sse_movss(p
->func
, data
, x86_make_disp(arg0
, 8));
263 if(out_chans
== CHANNELS_0001
)
264 sse_shufps(p
->func
, data
, get_const(p
, CONST_IDENTITY
), SHUF(X
,Y
,Z
,W
) );
265 sse_shufps(p
->func
, data
, data
, SHUF(Y
,Z
,X
,W
) );
266 sse_movlps(p
->func
, data
, arg0
);
269 sse_movups(p
->func
, data
, arg0
);
274 /* this function behaves like emit_load_float32, but loads
275 64-bit floating point numbers, converting them to 32-bit
277 static void emit_load_float64to32( struct translate_sse
*p
,
283 struct x86_reg tmpXMM
= x86_make_reg(file_XMM
, 1);
287 sse2_movsd(p
->func
, data
, arg0
);
289 sse2_cvtpd2ps(p
->func
, data
, data
);
291 sse2_cvtsd2ss(p
->func
, data
, data
);
292 if(out_chans
== CHANNELS_0001
)
293 sse_shufps(p
->func
, data
, get_const(p
, CONST_IDENTITY
), SHUF(X
, Y
, Z
, W
) );
296 sse2_movupd(p
->func
, data
, arg0
);
297 sse2_cvtpd2ps(p
->func
, data
, data
);
298 if(out_chans
== CHANNELS_0001
)
299 sse_shufps(p
->func
, data
, get_const(p
, CONST_IDENTITY
), SHUF(X
, Y
, Z
, W
) );
300 else if(out_chans
> 2)
301 sse_movlhps(p
->func
, data
, get_const(p
, CONST_IDENTITY
) );
304 sse2_movupd(p
->func
, data
, arg0
);
305 sse2_cvtpd2ps(p
->func
, data
, data
);
306 sse2_movsd(p
->func
, tmpXMM
, x86_make_disp(arg0
, 16));
308 sse2_cvtpd2ps(p
->func
, tmpXMM
, tmpXMM
);
310 sse2_cvtsd2ss(p
->func
, tmpXMM
, tmpXMM
);
311 sse_movlhps(p
->func
, data
, tmpXMM
);
312 if(out_chans
== CHANNELS_0001
)
313 sse_orps(p
->func
, data
, get_const(p
, CONST_IDENTITY
) );
316 sse2_movupd(p
->func
, data
, arg0
);
317 sse2_cvtpd2ps(p
->func
, data
, data
);
318 sse2_movupd(p
->func
, tmpXMM
, x86_make_disp(arg0
, 16));
319 sse2_cvtpd2ps(p
->func
, tmpXMM
, tmpXMM
);
320 sse_movlhps(p
->func
, data
, tmpXMM
);
325 static void emit_mov64(struct translate_sse
*p
, struct x86_reg dst_gpr
, struct x86_reg dst_xmm
, struct x86_reg src_gpr
, struct x86_reg src_xmm
)
327 if(x86_target(p
->func
) != X86_32
)
328 x64_mov64(p
->func
, dst_gpr
, src_gpr
);
331 /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
332 if(x86_target_caps(p
->func
) & X86_SSE2
)
333 sse2_movq(p
->func
, dst_xmm
, src_xmm
);
335 sse_movlps(p
->func
, dst_xmm
, src_xmm
);
339 static void emit_load64(struct translate_sse
*p
, struct x86_reg dst_gpr
, struct x86_reg dst_xmm
, struct x86_reg src
)
341 emit_mov64(p
, dst_gpr
, dst_xmm
, src
, src
);
344 static void emit_store64(struct translate_sse
*p
, struct x86_reg dst
, struct x86_reg src_gpr
, struct x86_reg src_xmm
)
346 emit_mov64(p
, dst
, dst
, src_gpr
, src_xmm
);
349 static void emit_mov128(struct translate_sse
*p
, struct x86_reg dst
, struct x86_reg src
)
351 if(x86_target_caps(p
->func
) & X86_SSE2
)
352 sse2_movdqu(p
->func
, dst
, src
);
354 sse_movups(p
->func
, dst
, src
);
357 /* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
358 * but may or may not be good on older processors
359 * TODO: may perhaps want to use non-temporal stores here if possible
361 static void emit_memcpy(struct translate_sse
*p
, struct x86_reg dst
, struct x86_reg src
, unsigned size
)
363 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
364 struct x86_reg dataXMM2
= x86_make_reg(file_XMM
, 1);
365 struct x86_reg dataGPR
= p
->tmp_EAX
;
366 struct x86_reg dataGPR2
= p
->tmp2_EDX
;
373 x86_mov8(p
->func
, dataGPR
, src
);
374 x86_mov8(p
->func
, dst
, dataGPR
);
377 x86_mov16(p
->func
, dataGPR
, src
);
378 x86_mov16(p
->func
, dst
, dataGPR
);
381 x86_mov16(p
->func
, dataGPR
, src
);
382 x86_mov8(p
->func
, dataGPR2
, x86_make_disp(src
, 2));
383 x86_mov16(p
->func
, dst
, dataGPR
);
384 x86_mov8(p
->func
, x86_make_disp(dst
, 2), dataGPR2
);
387 x86_mov(p
->func
, dataGPR
, src
);
388 x86_mov(p
->func
, dst
, dataGPR
);
391 x86_mov(p
->func
, dataGPR
, src
);
392 x86_mov16(p
->func
, dataGPR2
, x86_make_disp(src
, 4));
393 x86_mov(p
->func
, dst
, dataGPR
);
394 x86_mov16(p
->func
, x86_make_disp(dst
, 4), dataGPR2
);
398 else if(!(x86_target_caps(p
->func
) & X86_SSE
))
401 assert((size
& 3) == 0);
402 for(i
= 0; i
< size
; i
+= 4)
404 x86_mov(p
->func
, dataGPR
, x86_make_disp(src
, i
));
405 x86_mov(p
->func
, x86_make_disp(dst
, i
), dataGPR
);
413 emit_load64(p
, dataGPR
, dataXMM
, src
);
414 emit_store64(p
, dst
, dataGPR
, dataXMM
);
417 emit_load64(p
, dataGPR2
, dataXMM
, src
);
418 x86_mov(p
->func
, dataGPR
, x86_make_disp(src
, 8));
419 emit_store64(p
, dst
, dataGPR2
, dataXMM
);
420 x86_mov(p
->func
, x86_make_disp(dst
, 8), dataGPR
);
423 emit_mov128(p
, dataXMM
, src
);
424 emit_mov128(p
, dst
, dataXMM
);
427 emit_mov128(p
, dataXMM
, src
);
428 emit_load64(p
, dataGPR
, dataXMM2
, x86_make_disp(src
, 16));
429 emit_mov128(p
, dst
, dataXMM
);
430 emit_store64(p
, x86_make_disp(dst
, 16), dataGPR
, dataXMM2
);
433 emit_mov128(p
, dataXMM
, src
);
434 emit_mov128(p
, dataXMM2
, x86_make_disp(src
, 16));
435 emit_mov128(p
, dst
, dataXMM
);
436 emit_mov128(p
, x86_make_disp(dst
, 16), dataXMM2
);
444 static boolean
translate_attr_convert( struct translate_sse
*p
,
445 const struct translate_element
*a
,
450 const struct util_format_description
* input_desc
= util_format_description(a
->input_format
);
451 const struct util_format_description
* output_desc
= util_format_description(a
->output_format
);
453 boolean id_swizzle
= TRUE
;
454 unsigned swizzle
[4] = {UTIL_FORMAT_SWIZZLE_NONE
, UTIL_FORMAT_SWIZZLE_NONE
, UTIL_FORMAT_SWIZZLE_NONE
, UTIL_FORMAT_SWIZZLE_NONE
};
455 unsigned needed_chans
= 0;
456 unsigned imms
[2] = {0, 0x3f800000};
458 if(a
->output_format
== PIPE_FORMAT_NONE
|| a
->input_format
== PIPE_FORMAT_NONE
)
461 if(input_desc
->channel
[0].size
& 7)
464 if(input_desc
->colorspace
!= output_desc
->colorspace
)
467 for(i
= 1; i
< input_desc
->nr_channels
; ++i
)
469 if(memcmp(&input_desc
->channel
[i
], &input_desc
->channel
[0], sizeof(input_desc
->channel
[0])))
473 for(i
= 1; i
< output_desc
->nr_channels
; ++i
)
475 if(memcmp(&output_desc
->channel
[i
], &output_desc
->channel
[0], sizeof(output_desc
->channel
[0])))
479 for(i
= 0; i
< output_desc
->nr_channels
; ++i
)
481 if(output_desc
->swizzle
[i
] < 4)
482 swizzle
[output_desc
->swizzle
[i
]] = input_desc
->swizzle
[i
];
485 if((x86_target_caps(p
->func
) & X86_SSE
) && (0
486 || a
->output_format
== PIPE_FORMAT_R32_FLOAT
487 || a
->output_format
== PIPE_FORMAT_R32G32_FLOAT
488 || a
->output_format
== PIPE_FORMAT_R32G32B32_FLOAT
489 || a
->output_format
== PIPE_FORMAT_R32G32B32A32_FLOAT
))
491 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
493 for(i
= 0; i
< output_desc
->nr_channels
; ++i
)
495 if(swizzle
[i
] == UTIL_FORMAT_SWIZZLE_0
&& i
>= input_desc
->nr_channels
)
499 for(i
= 0; i
< output_desc
->nr_channels
; ++i
)
502 needed_chans
= MAX2(needed_chans
, swizzle
[i
] + 1);
503 if(swizzle
[i
] < UTIL_FORMAT_SWIZZLE_0
&& swizzle
[i
] != i
)
509 switch(input_desc
->channel
[0].type
)
511 case UTIL_FORMAT_TYPE_UNSIGNED
:
512 if(!(x86_target_caps(p
->func
) & X86_SSE2
))
514 emit_load_sse2(p
, dataXMM
, src
, input_desc
->channel
[0].size
* input_desc
->nr_channels
>> 3);
516 /* TODO: add support for SSE4.1 pmovzx */
517 switch(input_desc
->channel
[0].size
)
520 /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
521 sse2_punpcklbw(p
->func
, dataXMM
, get_const(p
, CONST_IDENTITY
));
522 sse2_punpcklbw(p
->func
, dataXMM
, get_const(p
, CONST_IDENTITY
));
525 sse2_punpcklwd(p
->func
, dataXMM
, get_const(p
, CONST_IDENTITY
));
527 case 32: /* we lose precision here */
528 sse2_psrld_imm(p
->func
, dataXMM
, 1);
533 sse2_cvtdq2ps(p
->func
, dataXMM
, dataXMM
);
534 if(input_desc
->channel
[0].normalized
)
536 struct x86_reg factor
;
537 switch(input_desc
->channel
[0].size
)
540 factor
= get_const(p
, CONST_INV_255
);
543 factor
= get_const(p
, CONST_INV_65535
);
546 factor
= get_const(p
, CONST_INV_2147483647
);
556 sse_mulps(p
->func
, dataXMM
, factor
);
558 else if(input_desc
->channel
[0].size
== 32)
559 sse_addps(p
->func
, dataXMM
, dataXMM
); /* compensate for the bit we threw away to fit u32 into s32 */
561 case UTIL_FORMAT_TYPE_SIGNED
:
562 if(!(x86_target_caps(p
->func
) & X86_SSE2
))
564 emit_load_sse2(p
, dataXMM
, src
, input_desc
->channel
[0].size
* input_desc
->nr_channels
>> 3);
566 /* TODO: add support for SSE4.1 pmovsx */
567 switch(input_desc
->channel
[0].size
)
570 sse2_punpcklbw(p
->func
, dataXMM
, dataXMM
);
571 sse2_punpcklbw(p
->func
, dataXMM
, dataXMM
);
572 sse2_psrad_imm(p
->func
, dataXMM
, 24);
575 sse2_punpcklwd(p
->func
, dataXMM
, dataXMM
);
576 sse2_psrad_imm(p
->func
, dataXMM
, 16);
578 case 32: /* we lose precision here */
583 sse2_cvtdq2ps(p
->func
, dataXMM
, dataXMM
);
584 if(input_desc
->channel
[0].normalized
)
586 struct x86_reg factor
;
587 switch(input_desc
->channel
[0].size
)
590 factor
= get_const(p
, CONST_INV_127
);
593 factor
= get_const(p
, CONST_INV_32767
);
596 factor
= get_const(p
, CONST_INV_2147483647
);
606 sse_mulps(p
->func
, dataXMM
, factor
);
611 case UTIL_FORMAT_TYPE_FLOAT
:
612 if(input_desc
->channel
[0].size
!= 32 && input_desc
->channel
[0].size
!= 64)
614 if(swizzle
[3] == UTIL_FORMAT_SWIZZLE_1
&& input_desc
->nr_channels
<= 3)
616 swizzle
[3] = UTIL_FORMAT_SWIZZLE_W
;
617 needed_chans
= CHANNELS_0001
;
619 switch(input_desc
->channel
[0].size
)
622 emit_load_float32(p
, dataXMM
, src
, needed_chans
, input_desc
->nr_channels
);
624 case 64: /* we lose precision here */
625 if(!(x86_target_caps(p
->func
) & X86_SSE2
))
627 emit_load_float64to32(p
, dataXMM
, src
, needed_chans
, input_desc
->nr_channels
);
638 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(swizzle
[0], swizzle
[1], swizzle
[2], swizzle
[3]) );
641 if(output_desc
->nr_channels
>= 4
642 && swizzle
[0] < UTIL_FORMAT_SWIZZLE_0
643 && swizzle
[1] < UTIL_FORMAT_SWIZZLE_0
644 && swizzle
[2] < UTIL_FORMAT_SWIZZLE_0
645 && swizzle
[3] < UTIL_FORMAT_SWIZZLE_0
647 sse_movups(p
->func
, dst
, dataXMM
);
650 if(output_desc
->nr_channels
>= 2
651 && swizzle
[0] < UTIL_FORMAT_SWIZZLE_0
652 && swizzle
[1] < UTIL_FORMAT_SWIZZLE_0
)
653 sse_movlps(p
->func
, dst
, dataXMM
);
656 if(swizzle
[0] < UTIL_FORMAT_SWIZZLE_0
)
657 sse_movss(p
->func
, dst
, dataXMM
);
659 x86_mov_imm(p
->func
, dst
, imms
[swizzle
[0] - UTIL_FORMAT_SWIZZLE_0
]);
661 if(output_desc
->nr_channels
>= 2)
663 if(swizzle
[1] < UTIL_FORMAT_SWIZZLE_0
)
665 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(1, 1, 2, 3));
666 sse_movss(p
->func
, x86_make_disp(dst
, 4), dataXMM
);
669 x86_mov_imm(p
->func
, x86_make_disp(dst
, 4), imms
[swizzle
[1] - UTIL_FORMAT_SWIZZLE_0
]);
673 if(output_desc
->nr_channels
>= 3)
675 if(output_desc
->nr_channels
>= 4
676 && swizzle
[2] < UTIL_FORMAT_SWIZZLE_0
677 && swizzle
[3] < UTIL_FORMAT_SWIZZLE_0
)
678 sse_movhps(p
->func
, x86_make_disp(dst
, 8), dataXMM
);
681 if(swizzle
[2] < UTIL_FORMAT_SWIZZLE_0
)
683 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(2, 2, 2, 3));
684 sse_movss(p
->func
, x86_make_disp(dst
, 8), dataXMM
);
687 x86_mov_imm(p
->func
, x86_make_disp(dst
, 8), imms
[swizzle
[2] - UTIL_FORMAT_SWIZZLE_0
]);
689 if(output_desc
->nr_channels
>= 4)
691 if(swizzle
[3] < UTIL_FORMAT_SWIZZLE_0
)
693 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(3, 3, 3, 3));
694 sse_movss(p
->func
, x86_make_disp(dst
, 12), dataXMM
);
697 x86_mov_imm(p
->func
, x86_make_disp(dst
, 12), imms
[swizzle
[3] - UTIL_FORMAT_SWIZZLE_0
]);
704 else if((x86_target_caps(p
->func
) & X86_SSE2
) && input_desc
->channel
[0].size
== 8 && output_desc
->channel
[0].size
== 16
705 && output_desc
->channel
[0].normalized
== input_desc
->channel
[0].normalized
707 || (input_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_UNSIGNED
&& output_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_UNSIGNED
)
708 || (input_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_UNSIGNED
&& output_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_SIGNED
)
709 || (input_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_SIGNED
&& output_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_SIGNED
)
712 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
713 struct x86_reg tmpXMM
= x86_make_reg(file_XMM
, 1);
714 struct x86_reg tmp
= p
->tmp_EAX
;
715 unsigned imms
[2] = {0, 1};
717 for(i
= 0; i
< output_desc
->nr_channels
; ++i
)
719 if(swizzle
[i
] == UTIL_FORMAT_SWIZZLE_0
&& i
>= input_desc
->nr_channels
)
723 for(i
= 0; i
< output_desc
->nr_channels
; ++i
)
726 needed_chans
= MAX2(needed_chans
, swizzle
[i
] + 1);
727 if(swizzle
[i
] < UTIL_FORMAT_SWIZZLE_0
&& swizzle
[i
] != i
)
733 emit_load_sse2(p
, dataXMM
, src
, input_desc
->channel
[0].size
* input_desc
->nr_channels
>> 3);
735 switch(input_desc
->channel
[0].type
)
737 case UTIL_FORMAT_TYPE_UNSIGNED
:
738 if(input_desc
->channel
[0].normalized
)
740 sse2_punpcklbw(p
->func
, dataXMM
, dataXMM
);
741 if(output_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_SIGNED
)
742 sse2_psrlw_imm(p
->func
, dataXMM
, 1);
745 sse2_punpcklbw(p
->func
, dataXMM
, get_const(p
, CONST_IDENTITY
));
747 case UTIL_FORMAT_TYPE_SIGNED
:
748 if(input_desc
->channel
[0].normalized
)
750 sse2_movq(p
->func
, tmpXMM
, get_const(p
, CONST_IDENTITY
));
751 sse2_punpcklbw(p
->func
, tmpXMM
, dataXMM
);
752 sse2_psllw_imm(p
->func
, dataXMM
, 9);
753 sse2_psrlw_imm(p
->func
, dataXMM
, 8);
754 sse2_por(p
->func
, tmpXMM
, dataXMM
);
755 sse2_psrlw_imm(p
->func
, dataXMM
, 7);
756 sse2_por(p
->func
, tmpXMM
, dataXMM
);
758 struct x86_reg t
= dataXMM
;
765 sse2_punpcklbw(p
->func
, dataXMM
, dataXMM
);
766 sse2_psraw_imm(p
->func
, dataXMM
, 8);
773 if(output_desc
->channel
[0].normalized
)
774 imms
[1] = (output_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_UNSIGNED
) ? 0xffff : 0x7ffff;
777 sse2_pshuflw(p
->func
, dataXMM
, dataXMM
, (swizzle
[0] & 3) | ((swizzle
[1] & 3) << 2) | ((swizzle
[2] & 3) << 4) | ((swizzle
[3] & 3) << 6));
780 if(output_desc
->nr_channels
>= 4
781 && swizzle
[0] < UTIL_FORMAT_SWIZZLE_0
782 && swizzle
[1] < UTIL_FORMAT_SWIZZLE_0
783 && swizzle
[2] < UTIL_FORMAT_SWIZZLE_0
784 && swizzle
[3] < UTIL_FORMAT_SWIZZLE_0
786 sse2_movq(p
->func
, dst
, dataXMM
);
789 if(swizzle
[0] < UTIL_FORMAT_SWIZZLE_0
)
791 if(output_desc
->nr_channels
>= 2 && swizzle
[1] < UTIL_FORMAT_SWIZZLE_0
)
792 sse2_movd(p
->func
, dst
, dataXMM
);
795 sse2_movd(p
->func
, tmp
, dataXMM
);
796 x86_mov16(p
->func
, dst
, tmp
);
797 if(output_desc
->nr_channels
>= 2)
798 x86_mov16_imm(p
->func
, x86_make_disp(dst
, 2), imms
[swizzle
[1] - UTIL_FORMAT_SWIZZLE_0
]);
803 if(output_desc
->nr_channels
>= 2 && swizzle
[1] >= UTIL_FORMAT_SWIZZLE_0
)
804 x86_mov_imm(p
->func
, dst
, (imms
[swizzle
[1] - UTIL_FORMAT_SWIZZLE_0
] << 16) | imms
[swizzle
[0] - UTIL_FORMAT_SWIZZLE_0
]);
807 x86_mov16_imm(p
->func
, dst
, imms
[swizzle
[0] - UTIL_FORMAT_SWIZZLE_0
]);
808 if(output_desc
->nr_channels
>= 2)
810 sse2_movd(p
->func
, tmp
, dataXMM
);
811 x86_shr_imm(p
->func
, tmp
, 16);
812 x86_mov16(p
->func
, x86_make_disp(dst
, 2), tmp
);
817 if(output_desc
->nr_channels
>= 3)
819 if(swizzle
[2] < UTIL_FORMAT_SWIZZLE_0
)
821 if(output_desc
->nr_channels
>= 4 && swizzle
[3] < UTIL_FORMAT_SWIZZLE_0
)
823 sse2_psrlq_imm(p
->func
, dataXMM
, 32);
824 sse2_movd(p
->func
, x86_make_disp(dst
, 4), dataXMM
);
828 sse2_psrlq_imm(p
->func
, dataXMM
, 32);
829 sse2_movd(p
->func
, tmp
, dataXMM
);
830 x86_mov16(p
->func
, x86_make_disp(dst
, 4), tmp
);
831 if(output_desc
->nr_channels
>= 4)
833 x86_mov16_imm(p
->func
, x86_make_disp(dst
, 6), imms
[swizzle
[3] - UTIL_FORMAT_SWIZZLE_0
]);
839 if(output_desc
->nr_channels
>= 4 && swizzle
[3] >= UTIL_FORMAT_SWIZZLE_0
)
840 x86_mov_imm(p
->func
, x86_make_disp(dst
, 4), (imms
[swizzle
[3] - UTIL_FORMAT_SWIZZLE_0
] << 16) | imms
[swizzle
[2] - UTIL_FORMAT_SWIZZLE_0
]);
843 x86_mov16_imm(p
->func
, x86_make_disp(dst
, 4), imms
[swizzle
[2] - UTIL_FORMAT_SWIZZLE_0
]);
845 if(output_desc
->nr_channels
>= 4)
847 sse2_psrlq_imm(p
->func
, dataXMM
, 48);
848 sse2_movd(p
->func
, tmp
, dataXMM
);
849 x86_mov16(p
->func
, x86_make_disp(dst
, 6), tmp
);
857 else if(!memcmp(&output_desc
->channel
[0], &input_desc
->channel
[0], sizeof(output_desc
->channel
[0])))
859 struct x86_reg tmp
= p
->tmp_EAX
;
861 if(input_desc
->channel
[0].size
== 8 && input_desc
->nr_channels
== 4 && output_desc
->nr_channels
== 4
862 && swizzle
[0] == UTIL_FORMAT_SWIZZLE_W
863 && swizzle
[1] == UTIL_FORMAT_SWIZZLE_Z
864 && swizzle
[2] == UTIL_FORMAT_SWIZZLE_Y
865 && swizzle
[3] == UTIL_FORMAT_SWIZZLE_X
)
867 /* TODO: support movbe */
868 x86_mov(p
->func
, tmp
, src
);
869 x86_bswap(p
->func
, tmp
);
870 x86_mov(p
->func
, dst
, tmp
);
874 for(i
= 0; i
< output_desc
->nr_channels
; ++i
)
876 switch(output_desc
->channel
[0].size
)
879 if(swizzle
[i
] >= UTIL_FORMAT_SWIZZLE_0
)
882 if(swizzle
[i
] == UTIL_FORMAT_SWIZZLE_1
)
884 switch(output_desc
->channel
[0].type
)
886 case UTIL_FORMAT_TYPE_UNSIGNED
:
887 v
= output_desc
->channel
[0].normalized
? 0xff : 1;
889 case UTIL_FORMAT_TYPE_SIGNED
:
890 v
= output_desc
->channel
[0].normalized
? 0x7f : 1;
896 x86_mov8_imm(p
->func
, x86_make_disp(dst
, i
* 1), v
);
900 x86_mov8(p
->func
, tmp
, x86_make_disp(src
, swizzle
[i
] * 1));
901 x86_mov8(p
->func
, x86_make_disp(dst
, i
* 1), tmp
);
905 if(swizzle
[i
] >= UTIL_FORMAT_SWIZZLE_0
)
908 if(swizzle
[i
] == UTIL_FORMAT_SWIZZLE_1
)
910 switch(output_desc
->channel
[1].type
)
912 case UTIL_FORMAT_TYPE_UNSIGNED
:
913 v
= output_desc
->channel
[1].normalized
? 0xffff : 1;
915 case UTIL_FORMAT_TYPE_SIGNED
:
916 v
= output_desc
->channel
[1].normalized
? 0x7fff : 1;
918 case UTIL_FORMAT_TYPE_FLOAT
:
925 x86_mov16_imm(p
->func
, x86_make_disp(dst
, i
* 2), v
);
927 else if(swizzle
[i
] == UTIL_FORMAT_SWIZZLE_0
)
928 x86_mov16_imm(p
->func
, x86_make_disp(dst
, i
* 2), 0);
931 x86_mov16(p
->func
, tmp
, x86_make_disp(src
, swizzle
[i
] * 2));
932 x86_mov16(p
->func
, x86_make_disp(dst
, i
* 2), tmp
);
936 if(swizzle
[i
] >= UTIL_FORMAT_SWIZZLE_0
)
939 if(swizzle
[i
] == UTIL_FORMAT_SWIZZLE_1
)
941 switch(output_desc
->channel
[1].type
)
943 case UTIL_FORMAT_TYPE_UNSIGNED
:
944 v
= output_desc
->channel
[1].normalized
? 0xffffffff : 1;
946 case UTIL_FORMAT_TYPE_SIGNED
:
947 v
= output_desc
->channel
[1].normalized
? 0x7fffffff : 1;
949 case UTIL_FORMAT_TYPE_FLOAT
:
956 x86_mov_imm(p
->func
, x86_make_disp(dst
, i
* 4), v
);
960 x86_mov(p
->func
, tmp
, x86_make_disp(src
, swizzle
[i
] * 4));
961 x86_mov(p
->func
, x86_make_disp(dst
, i
* 4), tmp
);
965 if(swizzle
[i
] >= UTIL_FORMAT_SWIZZLE_0
)
969 if(swizzle
[i
] == UTIL_FORMAT_SWIZZLE_1
)
971 switch(output_desc
->channel
[1].type
)
973 case UTIL_FORMAT_TYPE_UNSIGNED
:
974 h
= output_desc
->channel
[1].normalized
? 0xffffffff : 0;
975 l
= output_desc
->channel
[1].normalized
? 0xffffffff : 1;
977 case UTIL_FORMAT_TYPE_SIGNED
:
978 h
= output_desc
->channel
[1].normalized
? 0x7fffffff : 0;
979 l
= output_desc
->channel
[1].normalized
? 0xffffffff : 1;
981 case UTIL_FORMAT_TYPE_FLOAT
:
989 x86_mov_imm(p
->func
, x86_make_disp(dst
, i
* 8), l
);
990 x86_mov_imm(p
->func
, x86_make_disp(dst
, i
* 8 + 4), h
);
994 if(x86_target_caps(p
->func
) & X86_SSE
)
996 struct x86_reg tmpXMM
= x86_make_reg(file_XMM
, 0);
997 emit_load64(p
, tmp
, tmpXMM
, x86_make_disp(src
, swizzle
[i
] * 8));
998 emit_store64(p
, x86_make_disp(dst
, i
* 8), tmp
, tmpXMM
);
1002 x86_mov(p
->func
, tmp
, x86_make_disp(src
, swizzle
[i
] * 8));
1003 x86_mov(p
->func
, x86_make_disp(dst
, i
* 8), tmp
);
1004 x86_mov(p
->func
, tmp
, x86_make_disp(src
, swizzle
[i
] * 8 + 4));
1005 x86_mov(p
->func
, x86_make_disp(dst
, i
* 8 + 4), tmp
);
1015 /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
1016 else if((x86_target_caps(p
->func
) & X86_SSE2
) &&
1017 a
->input_format
== PIPE_FORMAT_R32G32B32A32_FLOAT
&& (0
1018 || a
->output_format
== PIPE_FORMAT_B8G8R8A8_UNORM
1019 || a
->output_format
== PIPE_FORMAT_R8G8B8A8_UNORM
1022 struct x86_reg dataXMM
= x86_make_reg(file_XMM
, 0);
1025 sse_movups(p
->func
, dataXMM
, src
);
1027 if (a
->output_format
== PIPE_FORMAT_B8G8R8A8_UNORM
)
1028 sse_shufps(p
->func
, dataXMM
, dataXMM
, SHUF(2,1,0,3));
1030 /* scale by 255.0 */
1031 sse_mulps(p
->func
, dataXMM
, get_const(p
, CONST_255
));
1034 sse2_cvtps2dq(p
->func
, dataXMM
, dataXMM
);
1035 sse2_packssdw(p
->func
, dataXMM
, dataXMM
);
1036 sse2_packuswb(p
->func
, dataXMM
, dataXMM
);
1037 sse2_movd(p
->func
, dst
, dataXMM
);
1045 static boolean
translate_attr( struct translate_sse
*p
,
1046 const struct translate_element
*a
,
1050 if(a
->input_format
== a
->output_format
)
1052 emit_memcpy(p
, dst
, src
, util_format_get_stride(a
->input_format
, 1));
1056 return translate_attr_convert(p
, a
, src
, dst
);
1059 static boolean
init_inputs( struct translate_sse
*p
,
1060 unsigned index_size
)
1063 struct x86_reg instance_id
= x86_make_disp(p
->machine_EDI
,
1064 get_offset(p
, &p
->instance_id
));
1065 struct x86_reg start_instance
= x86_make_disp(p
->machine_EDI
,
1066 get_offset(p
, &p
->start_instance
));
1068 for (i
= 0; i
< p
->nr_buffer_variants
; i
++) {
1069 struct translate_buffer_variant
*variant
= &p
->buffer_variant
[i
];
1070 struct translate_buffer
*buffer
= &p
->buffer
[variant
->buffer_index
];
1072 if (!index_size
|| variant
->instance_divisor
) {
1073 struct x86_reg buf_max_index
= x86_make_disp(p
->machine_EDI
,
1074 get_offset(p
, &buffer
->max_index
));
1075 struct x86_reg buf_stride
= x86_make_disp(p
->machine_EDI
,
1076 get_offset(p
, &buffer
->stride
));
1077 struct x86_reg buf_ptr
= x86_make_disp(p
->machine_EDI
,
1078 get_offset(p
, &variant
->ptr
));
1079 struct x86_reg buf_base_ptr
= x86_make_disp(p
->machine_EDI
,
1080 get_offset(p
, &buffer
->base_ptr
));
1081 struct x86_reg elt
= p
->idx_ESI
;
1082 struct x86_reg tmp_EAX
= p
->tmp_EAX
;
1084 /* Calculate pointer to first attrib:
1085 * base_ptr + stride * index, where index depends on instance divisor
1087 if (variant
->instance_divisor
) {
1088 /* Start with instance = instance_id
1089 * which is true if divisor is 1.
1091 x86_mov(p
->func
, tmp_EAX
, instance_id
);
1093 if (variant
->instance_divisor
!= 1) {
1094 struct x86_reg tmp_EDX
= p
->tmp2_EDX
;
1095 struct x86_reg tmp_ECX
= p
->src_ECX
;
1097 /* TODO: Add x86_shr() to rtasm and use it whenever
1098 * instance divisor is power of two.
1100 x86_xor(p
->func
, tmp_EDX
, tmp_EDX
);
1101 x86_mov_reg_imm(p
->func
, tmp_ECX
, variant
->instance_divisor
);
1102 x86_div(p
->func
, tmp_ECX
); /* EAX = EDX:EAX / ECX */
1104 /* instance = (instance_id - start_instance) / divisor +
1107 x86_mov(p
->func
, tmp_EDX
, start_instance
);
1108 x86_add(p
->func
, tmp_EAX
, tmp_EDX
);
1111 /* XXX we need to clamp the index here too, but to a
1112 * per-array max value, not the draw->pt.max_index value
1113 * that's being given to us via translate->set_buffer().
1116 x86_mov(p
->func
, tmp_EAX
, elt
);
1118 /* Clamp to max_index
1120 x86_cmp(p
->func
, tmp_EAX
, buf_max_index
);
1121 x86_cmovcc(p
->func
, tmp_EAX
, buf_max_index
, cc_AE
);
1124 x86_imul(p
->func
, tmp_EAX
, buf_stride
);
1126 x86_add(p
->func
, tmp_EAX
, buf_base_ptr
);
1128 x86_cmp(p
->func
, p
->count_EBP
, p
->tmp_EAX
);
1130 /* In the linear case, keep the buffer pointer instead of the
1133 if (!index_size
&& p
->nr_buffer_variants
== 1)
1136 x86_mov(p
->func
, elt
, tmp_EAX
);
1141 x86_mov(p
->func
, buf_ptr
, tmp_EAX
);
1150 static struct x86_reg
get_buffer_ptr( struct translate_sse
*p
,
1151 unsigned index_size
,
1153 struct x86_reg elt
)
1155 if (var_idx
== ELEMENT_BUFFER_INSTANCE_ID
) {
1156 return x86_make_disp(p
->machine_EDI
,
1157 get_offset(p
, &p
->instance_id
));
1159 if (!index_size
&& p
->nr_buffer_variants
== 1) {
1162 else if (!index_size
|| p
->buffer_variant
[var_idx
].instance_divisor
) {
1163 struct x86_reg ptr
= p
->src_ECX
;
1164 struct x86_reg buf_ptr
=
1165 x86_make_disp(p
->machine_EDI
,
1166 get_offset(p
, &p
->buffer_variant
[var_idx
].ptr
));
1169 x86_mov(p
->func
, ptr
, buf_ptr
);
1173 struct x86_reg ptr
= p
->src_ECX
;
1174 const struct translate_buffer_variant
*variant
= &p
->buffer_variant
[var_idx
];
1176 struct x86_reg buf_stride
=
1177 x86_make_disp(p
->machine_EDI
,
1178 get_offset(p
, &p
->buffer
[variant
->buffer_index
].stride
));
1180 struct x86_reg buf_base_ptr
=
1181 x86_make_disp(p
->machine_EDI
,
1182 get_offset(p
, &p
->buffer
[variant
->buffer_index
].base_ptr
));
1184 struct x86_reg buf_max_index
=
1185 x86_make_disp(p
->machine_EDI
,
1186 get_offset(p
, &p
->buffer
[variant
->buffer_index
].max_index
));
1190 /* Calculate pointer to current attrib:
1195 x86_movzx8(p
->func
, ptr
, elt
);
1198 x86_movzx16(p
->func
, ptr
, elt
);
1201 x86_mov(p
->func
, ptr
, elt
);
1205 /* Clamp to max_index
1207 x86_cmp(p
->func
, ptr
, buf_max_index
);
1208 x86_cmovcc(p
->func
, ptr
, buf_max_index
, cc_AE
);
1210 x86_imul(p
->func
, ptr
, buf_stride
);
1212 x86_add(p
->func
, ptr
, buf_base_ptr
);
1219 static boolean
incr_inputs( struct translate_sse
*p
,
1220 unsigned index_size
)
1222 if (!index_size
&& p
->nr_buffer_variants
== 1) {
1223 struct x86_reg stride
= x86_make_disp(p
->machine_EDI
,
1224 get_offset(p
, &p
->buffer
[0].stride
));
1226 if (p
->buffer_variant
[0].instance_divisor
== 0) {
1228 x86_add(p
->func
, p
->idx_ESI
, stride
);
1229 sse_prefetchnta(p
->func
, x86_make_disp(p
->idx_ESI
, 192));
1232 else if (!index_size
) {
1235 /* Is this worthwhile??
1237 for (i
= 0; i
< p
->nr_buffer_variants
; i
++) {
1238 struct translate_buffer_variant
*variant
= &p
->buffer_variant
[i
];
1239 struct x86_reg buf_ptr
= x86_make_disp(p
->machine_EDI
,
1240 get_offset(p
, &variant
->ptr
));
1241 struct x86_reg buf_stride
= x86_make_disp(p
->machine_EDI
,
1242 get_offset(p
, &p
->buffer
[variant
->buffer_index
].stride
));
1244 if (variant
->instance_divisor
== 0) {
1245 x86_mov(p
->func
, p
->tmp_EAX
, buf_stride
);
1247 x86_add(p
->func
, p
->tmp_EAX
, buf_ptr
);
1248 if (i
== 0) sse_prefetchnta(p
->func
, x86_make_disp(p
->tmp_EAX
, 192));
1250 x86_mov(p
->func
, buf_ptr
, p
->tmp_EAX
);
1256 x86_lea(p
->func
, p
->idx_ESI
, x86_make_disp(p
->idx_ESI
, index_size
));
1263 /* Build run( struct translate *machine,
1266 * void *output_buffer )
1268 * run_elts( struct translate *machine,
1271 * void *output_buffer )
1273 * Lots of hardcoding
1275 * EAX -- pointer to current output vertex
1276 * ECX -- pointer to current attribute
1279 static boolean
build_vertex_emit( struct translate_sse
*p
,
1280 struct x86_function
*func
,
1281 unsigned index_size
)
1286 memset(p
->reg_to_const
, 0xff, sizeof(p
->reg_to_const
));
1287 memset(p
->const_to_reg
, 0xff, sizeof(p
->const_to_reg
));
1289 p
->tmp_EAX
= x86_make_reg(file_REG32
, reg_AX
);
1290 p
->idx_ESI
= x86_make_reg(file_REG32
, reg_SI
);
1291 p
->outbuf_EBX
= x86_make_reg(file_REG32
, reg_BX
);
1292 p
->machine_EDI
= x86_make_reg(file_REG32
, reg_DI
);
1293 p
->count_EBP
= x86_make_reg(file_REG32
, reg_BP
);
1294 p
->tmp2_EDX
= x86_make_reg(file_REG32
, reg_DX
);
1295 p
->src_ECX
= x86_make_reg(file_REG32
, reg_CX
);
1299 x86_init_func(p
->func
);
1301 if(x86_target(p
->func
) == X86_64_WIN64_ABI
)
1303 /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
1304 sse2_movdqa(p
->func
, x86_make_disp(x86_make_reg(file_REG32
, reg_SP
), 8), x86_make_reg(file_XMM
, 6));
1305 sse2_movdqa(p
->func
, x86_make_disp(x86_make_reg(file_REG32
, reg_SP
), 24), x86_make_reg(file_XMM
, 7));
1308 x86_push(p
->func
, p
->outbuf_EBX
);
1309 x86_push(p
->func
, p
->count_EBP
);
1311 /* on non-Win64 x86-64, these are already in the right registers */
1312 if(x86_target(p
->func
) != X86_64_STD_ABI
)
1314 x86_push(p
->func
, p
->machine_EDI
);
1315 x86_push(p
->func
, p
->idx_ESI
);
1317 x86_mov(p
->func
, p
->machine_EDI
, x86_fn_arg(p
->func
, 1));
1318 x86_mov(p
->func
, p
->idx_ESI
, x86_fn_arg(p
->func
, 2));
1321 x86_mov(p
->func
, p
->count_EBP
, x86_fn_arg(p
->func
, 3));
1323 if(x86_target(p
->func
) != X86_32
)
1324 x64_mov64(p
->func
, p
->outbuf_EBX
, x86_fn_arg(p
->func
, 6));
1326 x86_mov(p
->func
, p
->outbuf_EBX
, x86_fn_arg(p
->func
, 6));
1328 /* Load instance ID.
1330 if (p
->use_instancing
) {
1333 x86_fn_arg(p
->func
, 4));
1335 x86_make_disp(p
->machine_EDI
, get_offset(p
, &p
->start_instance
)),
1340 x86_fn_arg(p
->func
, 5));
1342 x86_make_disp(p
->machine_EDI
, get_offset(p
, &p
->instance_id
)),
1346 /* Get vertex count, compare to zero
1348 x86_xor(p
->func
, p
->tmp_EAX
, p
->tmp_EAX
);
1349 x86_cmp(p
->func
, p
->count_EBP
, p
->tmp_EAX
);
1350 fixup
= x86_jcc_forward(p
->func
, cc_E
);
1352 /* always load, needed or not:
1354 init_inputs(p
, index_size
);
1356 /* Note address for loop jump
1358 label
= x86_get_label(p
->func
);
1360 struct x86_reg elt
= !index_size
? p
->idx_ESI
: x86_deref(p
->idx_ESI
);
1361 int last_variant
= -1;
1364 for (j
= 0; j
< p
->translate
.key
.nr_elements
; j
++) {
1365 const struct translate_element
*a
= &p
->translate
.key
.element
[j
];
1366 unsigned variant
= p
->element_to_buffer_variant
[j
];
1368 /* Figure out source pointer address:
1370 if (variant
!= last_variant
) {
1371 last_variant
= variant
;
1372 vb
= get_buffer_ptr(p
, index_size
, variant
, elt
);
1375 if (!translate_attr( p
, a
,
1376 x86_make_disp(vb
, a
->input_offset
),
1377 x86_make_disp(p
->outbuf_EBX
, a
->output_offset
)))
1381 /* Next output vertex:
1386 x86_make_disp(p
->outbuf_EBX
,
1387 p
->translate
.key
.output_stride
));
1391 incr_inputs( p
, index_size
);
1394 /* decr count, loop if not zero
1396 x86_dec(p
->func
, p
->count_EBP
);
1397 x86_jcc(p
->func
, cc_NZ
, label
);
1401 if (p
->func
->need_emms
)
1404 /* Land forward jump here:
1406 x86_fixup_fwd_jump(p
->func
, fixup
);
1408 /* Pop regs and return
1411 if(x86_target(p
->func
) != X86_64_STD_ABI
)
1413 x86_pop(p
->func
, p
->idx_ESI
);
1414 x86_pop(p
->func
, p
->machine_EDI
);
1417 x86_pop(p
->func
, p
->count_EBP
);
1418 x86_pop(p
->func
, p
->outbuf_EBX
);
1420 if(x86_target(p
->func
) == X86_64_WIN64_ABI
)
1422 sse2_movdqa(p
->func
, x86_make_reg(file_XMM
, 6), x86_make_disp(x86_make_reg(file_REG32
, reg_SP
), 8));
1423 sse2_movdqa(p
->func
, x86_make_reg(file_XMM
, 7), x86_make_disp(x86_make_reg(file_REG32
, reg_SP
), 24));
1436 static void translate_sse_set_buffer( struct translate
*translate
,
1440 unsigned max_index
)
1442 struct translate_sse
*p
= (struct translate_sse
*)translate
;
1444 if (buf
< p
->nr_buffers
) {
1445 p
->buffer
[buf
].base_ptr
= (char *)ptr
;
1446 p
->buffer
[buf
].stride
= stride
;
1447 p
->buffer
[buf
].max_index
= max_index
;
1450 if (0) debug_printf("%s %d/%d: %p %d\n",
1457 static void translate_sse_release( struct translate
*translate
)
1459 struct translate_sse
*p
= (struct translate_sse
*)translate
;
1461 x86_release_func( &p
->elt8_func
);
1462 x86_release_func( &p
->elt16_func
);
1463 x86_release_func( &p
->elt_func
);
1464 x86_release_func( &p
->linear_func
);
1470 struct translate
*translate_sse2_create( const struct translate_key
*key
)
1472 struct translate_sse
*p
= NULL
;
1475 /* this is misnamed, it actually refers to whether rtasm is enabled or not */
1476 if (!rtasm_cpu_has_sse())
1479 p
= os_malloc_aligned(sizeof(struct translate_sse
), 16);
1482 memset(p
, 0, sizeof(*p
));
1483 memcpy(p
->consts
, consts
, sizeof(consts
));
1485 p
->translate
.key
= *key
;
1486 p
->translate
.release
= translate_sse_release
;
1487 p
->translate
.set_buffer
= translate_sse_set_buffer
;
1489 for (i
= 0; i
< key
->nr_elements
; i
++) {
1490 if (key
->element
[i
].type
== TRANSLATE_ELEMENT_NORMAL
) {
1493 p
->nr_buffers
= MAX2(p
->nr_buffers
, key
->element
[i
].input_buffer
+ 1);
1495 if (key
->element
[i
].instance_divisor
) {
1496 p
->use_instancing
= TRUE
;
1500 * Map vertex element to vertex buffer variant.
1502 for (j
= 0; j
< p
->nr_buffer_variants
; j
++) {
1503 if (p
->buffer_variant
[j
].buffer_index
== key
->element
[i
].input_buffer
&&
1504 p
->buffer_variant
[j
].instance_divisor
== key
->element
[i
].instance_divisor
) {
1508 if (j
== p
->nr_buffer_variants
) {
1509 p
->buffer_variant
[j
].buffer_index
= key
->element
[i
].input_buffer
;
1510 p
->buffer_variant
[j
].instance_divisor
= key
->element
[i
].instance_divisor
;
1511 p
->nr_buffer_variants
++;
1513 p
->element_to_buffer_variant
[i
] = j
;
1515 assert(key
->element
[i
].type
== TRANSLATE_ELEMENT_INSTANCE_ID
);
1517 p
->element_to_buffer_variant
[i
] = ELEMENT_BUFFER_INSTANCE_ID
;
1521 if (0) debug_printf("nr_buffers: %d\n", p
->nr_buffers
);
1523 if (!build_vertex_emit(p
, &p
->linear_func
, 0))
1526 if (!build_vertex_emit(p
, &p
->elt_func
, 4))
1529 if (!build_vertex_emit(p
, &p
->elt16_func
, 2))
1532 if (!build_vertex_emit(p
, &p
->elt8_func
, 1))
1535 p
->translate
.run
= (run_func
) x86_get_func(&p
->linear_func
);
1536 if (p
->translate
.run
== NULL
)
1539 p
->translate
.run_elts
= (run_elts_func
) x86_get_func(&p
->elt_func
);
1540 if (p
->translate
.run_elts
== NULL
)
1543 p
->translate
.run_elts16
= (run_elts16_func
) x86_get_func(&p
->elt16_func
);
1544 if (p
->translate
.run_elts16
== NULL
)
1547 p
->translate
.run_elts8
= (run_elts8_func
) x86_get_func(&p
->elt8_func
);
1548 if (p
->translate
.run_elts8
== NULL
)
1551 return &p
->translate
;
1555 translate_sse_release( &p
->translate
);
1564 struct translate
*translate_sse2_create( const struct translate_key
*key
)