src/gallium/auxiliary/translate/translate_sse.c
/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */


#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"

#include "translate.h"


#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)

#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"


#define X 0
#define Y 1
#define Z 2
#define W 3


struct translate_buffer {
   const void *base_ptr;
   uintptr_t stride;
   unsigned max_index;
};

struct translate_buffer_varient {
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                   /* updated either per vertex or per instance */
};


#define ELEMENT_BUFFER_INSTANCE_ID 1001


struct translate_sse {
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;

   boolean loaded_identity;
   boolean loaded_const[5];

   float identity[4];
   float const_value[5][4];

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer varients can map to a single buffer. */
   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_varients;

   /* Multiple elements can map to a single buffer varient. */
   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};

static int get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}
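
/* All run-time state (buffer pointers, strides, lazily loaded float
 * constants) lives inside struct translate_sse itself, so the generated
 * code can reach any field as a fixed displacement from the machine
 * pointer kept in EDI, e.g.
 *
 *    x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id))
 *
 * addresses [EDI + displacement-of-instance_id].
 */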

static struct x86_reg get_identity( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 7);

   if (!p->loaded_identity) {
      p->loaded_identity = TRUE;
      p->identity[0] = 0;
      p->identity[1] = 0;
      p->identity[2] = 0;
      p->identity[3] = 1;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDI,
                               get_offset(p, &p->identity[0])));
   }

   return reg;
}

static struct x86_reg get_const( struct translate_sse *p, unsigned i, float v)
{
   struct x86_reg reg = x86_make_reg(file_XMM, 2 + i);

   if (!p->loaded_const[i]) {
      p->loaded_const[i] = TRUE;
      p->const_value[i][0] =
      p->const_value[i][1] =
      p->const_value[i][2] =
      p->const_value[i][3] = v;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDI,
                               get_offset(p, &p->const_value[i][0])));
   }

   return reg;
}

static struct x86_reg get_inv_127( struct translate_sse *p )
{
   return get_const(p, 0, 1.0f / 127.0f);
}

static struct x86_reg get_inv_255( struct translate_sse *p )
{
   return get_const(p, 1, 1.0f / 255.0f);
}

static struct x86_reg get_inv_32767( struct translate_sse *p )
{
   return get_const(p, 2, 1.0f / 32767.0f);
}

static struct x86_reg get_inv_65535( struct translate_sse *p )
{
   return get_const(p, 3, 1.0f / 65535.0f);
}

static struct x86_reg get_inv_2147483647( struct translate_sse *p )
{
   return get_const(p, 4, 1.0f / 2147483647.0f);
}
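
/* Each constant above is materialized at most once per generated
 * function: get_identity() caches {0, 0, 0, 1} in XMM7 and get_const()
 * caches a splatted scalar in XMM2+i, so repeated conversions reuse the
 * register instead of reloading from memory.  This reserves XMM2-XMM7;
 * the conversion code below only scratches XMM0/XMM1.
 */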

/* load the data into an SSE2 register, padding with zeros */
static boolean emit_load_sse2( struct translate_sse *p,
                               struct x86_reg data,
                               struct x86_reg src,
                               unsigned size)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   struct x86_reg tmp = p->tmp_EAX;
   switch(size)
   {
   case 1:
      x86_movzx8(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 2:
      x86_movzx16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 3:
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
      x86_shl_imm(p->func, tmp, 16);
      x86_mov16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 4:
      sse2_movd(p->func, data, src);
      break;
   case 6:
      sse2_movd(p->func, data, src);
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
      sse2_movd(p->func, tmpXMM, tmp);
      sse2_punpckldq(p->func, data, tmpXMM);
      break;
   case 8:
      sse2_movq(p->func, data, src);
      break;
   case 12:
      sse2_movq(p->func, data, src);
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
      sse2_punpcklqdq(p->func, data, tmpXMM);
      break;
   case 16:
      sse2_movdqu(p->func, data, src);
      break;
   default:
      return FALSE;
   }
   return TRUE;
}
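
/* Example: a 3-byte source {b0, b1, b2} is assembled in EAX as
 * (b2 << 16) | (b1 << 8) | b0 -- movzx8 of byte 2, a shift left by 16,
 * then a 16-bit mov merging bytes 0-1 into the low half -- and finally
 * transferred with movd, which zeroes the upper XMM lanes.
 */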

/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5

/* This function loads <chans> float values and pads the register
 * with zeroes at least up to out_chans channels.
 *
 * If out_chans is set to CHANNELS_0001, then the fourth
 * value will be padded with 1.  Only pass this value if
 * chans < 4; otherwise the results are undefined.
 */
static void emit_load_float32( struct translate_sse *p,
                               struct x86_reg data,
                               struct x86_reg arg0,
                               unsigned out_chans,
                               unsigned chans)
{
   switch(chans)
   {
   case 1:
      /* a 0 0 0
       * a 0 0 1
       */
      sse_movss(p->func, data, arg0);
      if(out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_identity(p) );
      break;
   case 2:
      /* 0 0 0 1
       * a b 0 1
       */
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
      else if(out_chans > 2)
         sse_movlhps(p->func, data, get_identity(p) );
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       *
       * c 0 0 0
       * c 0 0 1 if out_chans == CHANNELS_0001
       * 0 0 c 0/1
       * a b c 0/1
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
      sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      sse_movups(p->func, data, arg0);
      break;
   }
}
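
/* For instance, chans == 2 with out_chans == CHANNELS_0001 yields
 * { a, b, 0, 1 }: the shufps builds { x, x, 0, 1 } from the identity
 * vector, then movlps overwrites the two low lanes with a and b.
 */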

/* this function behaves like emit_load_float32, but loads
 * 64-bit floating point numbers, converting them to 32-bit
 * ones */
static void emit_load_float64to32( struct translate_sse *p,
                                   struct x86_reg data,
                                   struct x86_reg arg0,
                                   unsigned out_chans,
                                   unsigned chans)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   switch(chans)
   {
   case 1:
      sse2_movsd(p->func, data, arg0);
      if(out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);
      else
         sse2_cvtsd2ss(p->func, data, data);
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
      else if(out_chans > 2)
         sse_movlhps(p->func, data, get_identity(p) );
      break;
   case 3:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if(out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      if(out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_identity(p) );
      break;
   case 4:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}
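
/* The 64-bit path converts in two halves: cvtpd2ps turns two doubles
 * into two floats in the low lanes (zeroing the high ones), and
 * movlhps glues the halves back together, so 3- and 4-channel double
 * inputs cost two loads and two conversions.
 */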

static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   if(x86_target(p->func) != X86_32)
      x64_mov64(p->func, dst_gpr, src_gpr);
   else
   {
      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
      if(x86_target_caps(p->func) & X86_SSE2)
         sse2_movq(p->func, dst_xmm, src_xmm);
      else
         sse_movlps(p->func, dst_xmm, src_xmm);
   }
}

static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
{
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
}

static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
}

static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
{
   if(x86_target_caps(p->func) & X86_SSE2)
      sse2_movdqu(p->func, dst, src);
   else
      sse_movups(p->func, dst, src);
}

/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 * but may or may not be good on older processors
 * TODO: may perhaps want to use non-temporal stores here if possible
 */
static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
   struct x86_reg dataGPR = p->tmp_EAX;
   struct x86_reg dataGPR2 = p->tmp2_EDX;

   if(size < 8)
   {
      switch (size)
      {
      case 1:
         x86_mov8(p->func, dataGPR, src);
         x86_mov8(p->func, dst, dataGPR);
         break;
      case 2:
         x86_mov16(p->func, dataGPR, src);
         x86_mov16(p->func, dst, dataGPR);
         break;
      case 3:
         x86_mov16(p->func, dataGPR, src);
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
         x86_mov16(p->func, dst, dataGPR);
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
         break;
      case 4:
         x86_mov(p->func, dataGPR, src);
         x86_mov(p->func, dst, dataGPR);
         break;
      case 6:
         x86_mov(p->func, dataGPR, src);
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
         x86_mov(p->func, dst, dataGPR);
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
         break;
      }
   }
   else if(!(x86_target_caps(p->func) & X86_SSE))
   {
      unsigned i = 0;
      assert((size & 3) == 0);
      for(i = 0; i < size; i += 4)
      {
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
      }
   }
   else
   {
      switch(size)
      {
      case 8:
         emit_load64(p, dataGPR, dataXMM, src);
         emit_store64(p, dst, dataGPR, dataXMM);
         break;
      case 12:
         emit_load64(p, dataGPR2, dataXMM, src);
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
         emit_store64(p, dst, dataGPR2, dataXMM);
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
         break;
      case 16:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dst, dataXMM);
         break;
      case 24:
         emit_mov128(p, dataXMM, src);
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
         break;
      case 32:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
         break;
      default:
         assert(0);
      }
   }
}
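
/* emit_memcpy dispatches on size class: plain GPR moves below 8 bytes,
 * a 4-byte GPR loop when SSE is unavailable, and 64/128-bit XMM moves
 * otherwise.  Only the sizes vertex formats actually produce
 * (1-4, 6, 8, 12, 16, 24 and 32 bytes) are handled.
 */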
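
/* translate_attr_convert emits one of three strategies: widen integer
 * or float inputs to a float output vector, widen 8-bit channels to
 * 16-bit integer channels, or do a pure channel-by-channel move/fill
 * when the input and output channel descriptions already match.
 */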
static boolean translate_attr_convert( struct translate_sse *p,
                                       const struct translate_element *a,
                                       struct x86_reg src,
                                       struct x86_reg dst)
{
   const struct util_format_description* input_desc = util_format_description(a->input_format);
   const struct util_format_description* output_desc = util_format_description(a->output_format);
   unsigned i;
   boolean id_swizzle = TRUE;
   unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
   unsigned needed_chans = 0;
   unsigned imms[2] = {0, 0x3f800000};

   if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
      return FALSE;

   if(input_desc->channel[0].size & 7)
      return FALSE;

   if(input_desc->colorspace != output_desc->colorspace)
      return FALSE;

   for(i = 1; i < input_desc->nr_channels; ++i)
   {
      if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
         return FALSE;
   }

   for(i = 1; i < output_desc->nr_channels; ++i)
   {
      if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
         return FALSE;
   }

   for(i = 0; i < output_desc->nr_channels; ++i)
   {
      if(output_desc->swizzle[i] < 4)
         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
   }

   if((x86_target_caps(p->func) & X86_SSE) && (0
         || a->output_format == PIPE_FORMAT_R32_FLOAT
         || a->output_format == PIPE_FORMAT_R32G32_FLOAT
         || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
         || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
   {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if(needed_chans > 0)
      {
         switch(input_desc->channel[0].type)
         {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if(!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovzx */
            switch(input_desc->channel[0].size)
            {
            case 8:
               /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
               sse2_punpcklbw(p->func, dataXMM, get_identity(p));
               sse2_punpcklbw(p->func, dataXMM, get_identity(p));
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, get_identity(p));
               break;
            case 32: /* we lose precision here */
               sse2_psrld_imm(p->func, dataXMM, 1);
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if(input_desc->channel[0].normalized)
            {
               struct x86_reg factor;
               switch(input_desc->channel[0].size)
               {
               case 8:
                  factor = get_inv_255(p);
                  break;
               case 16:
                  factor = get_inv_65535(p);
                  break;
               case 32:
                  factor = get_inv_2147483647(p);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            else if(input_desc->channel[0].size == 32)
               sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if(!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovsx */
            switch(input_desc->channel[0].size)
            {
            case 8:
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 24);
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 16);
               break;
            case 32: /* we lose precision here */
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if(input_desc->channel[0].normalized)
            {
               struct x86_reg factor;
               switch(input_desc->channel[0].size)
               {
               case 8:
                  factor = get_inv_127(p);
                  break;
               case 16:
                  factor = get_inv_32767(p);
                  break;
               case 32:
                  factor = get_inv_2147483647(p);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            break;
         case UTIL_FORMAT_TYPE_FLOAT:
            if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
               return FALSE;
            if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
            {
               swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
               needed_chans = CHANNELS_0001;
            }
            switch(input_desc->channel[0].size)
            {
            case 32:
               emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
               break;
            case 64: /* we lose precision here */
               if(!(x86_target_caps(p->func) & X86_SSE2))
                  return FALSE;
               emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
               break;
            default:
               return FALSE;
            }
            break;
         default:
            return FALSE;
         }

         if(!id_swizzle)
            sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
      }

      if(output_desc->nr_channels >= 4
            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
         sse_movups(p->func, dst, dataXMM);
      else
      {
         if(output_desc->nr_channels >= 2
               && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
               && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
            sse_movlps(p->func, dst, dataXMM);
         else
         {
            if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
               sse_movss(p->func, dst, dataXMM);
            else
               x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);

            if(output_desc->nr_channels >= 2)
            {
               if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
               {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else
                  x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
            }
         }

         if(output_desc->nr_channels >= 3)
         {
            if(output_desc->nr_channels >= 4
                  && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
                  && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
            else
            {
               if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
               {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
               }
               else
                  x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);

               if(output_desc->nr_channels >= 4)
               {
                  if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
                  {
                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
                  }
                  else
                     x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
               }
            }
         }
      }
      return TRUE;
   }
   else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
         && output_desc->channel[0].normalized == input_desc->channel[0].normalized
         && (0
            || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
            || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
            || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
         ))
   {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
      struct x86_reg tmp = p->tmp_EAX;
      unsigned imms[2] = {0, 1};

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if(needed_chans > 0)
      {
         emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);

         switch(input_desc->channel[0].type)
         {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if(input_desc->channel[0].normalized)
            {
               /* b -> (b << 8) | b == b * 257: the exact unorm8 -> unorm16 rescale */
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
                  sse2_psrlw_imm(p->func, dataXMM, 1);
            }
            else
               sse2_punpcklbw(p->func, dataXMM, get_identity(p));
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if(input_desc->channel[0].normalized)
            {
               /* Widen snorm8 to snorm16 without a multiply: punpcklbw
                * against the all-zero low half of the identity constant
                * places each byte in the high half of a 16-bit lane, then
                * shifted copies of its bits are OR-ed in below it to
                * approximate the full-range rescale.
                */
               sse2_movq(p->func, tmpXMM, get_identity(p));
               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
               sse2_psllw_imm(p->func, dataXMM, 9);
               sse2_psrlw_imm(p->func, dataXMM, 8);
               sse2_por(p->func, tmpXMM, dataXMM);
               sse2_psrlw_imm(p->func, dataXMM, 7);
               sse2_por(p->func, tmpXMM, dataXMM);
               {
                  struct x86_reg t = dataXMM;
                  dataXMM = tmpXMM;
                  tmpXMM = t;
               }
            }
            else
            {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psraw_imm(p->func, dataXMM, 8);
            }
            break;
         default:
            assert(0);
         }

         if(output_desc->channel[0].normalized)
            imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff;

         if(!id_swizzle)
            sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
      }

      if(output_desc->nr_channels >= 4
            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
         sse2_movq(p->func, dst, dataXMM);
      else
      {
         if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
         {
            if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
               sse2_movd(p->func, dst, dataXMM);
            else
            {
               sse2_movd(p->func, tmp, dataXMM);
               x86_mov16(p->func, dst, tmp);
               if(output_desc->nr_channels >= 2)
                  x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
            }
         }
         else
         {
            if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
               x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
            else
            {
               x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
               if(output_desc->nr_channels >= 2)
               {
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_shr_imm(p->func, tmp, 16);
                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
               }
            }
         }

         if(output_desc->nr_channels >= 3)
         {
            if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
            {
               if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
               {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else
               {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
                  if(output_desc->nr_channels >= 4)
                  {
                     x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
                  }
               }
            }
            else
            {
               if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
                  x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
               else
               {
                  x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);

                  if(output_desc->nr_channels >= 4)
                  {
                     sse2_psrlq_imm(p->func, dataXMM, 48);
                     sse2_movd(p->func, tmp, dataXMM);
                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
                  }
               }
            }
         }
      }
      return TRUE;
   }
   else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
   {
      struct x86_reg tmp = p->tmp_EAX;
      unsigned i;
      if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
            && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
            && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
            && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
            && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
      {
         /* TODO: support movbe */
         x86_mov(p->func, tmp, src);
         x86_bswap(p->func, tmp);
         x86_mov(p->func, dst, tmp);
         return TRUE;
      }

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         switch(output_desc->channel[0].size)
         {
         case 8:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned v = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[0].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[0].normalized ? 0xff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[0].normalized ? 0x7f : 1;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
            }
            else
            {
               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
            }
            break;
         case 16:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned v = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[0].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[0].normalized ? 0xffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[0].normalized ? 0x7fff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3c00;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
            }
            else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
            else
            {
               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
            }
            break;
         case 32:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned v = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[0].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[0].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[0].normalized ? 0x7fffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3f800000;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
            }
            else
            {
               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
            }
            break;
         case 64:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned l = 0;
               unsigned h = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[0].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     h = output_desc->channel[0].normalized ? 0xffffffff : 0;
                     l = output_desc->channel[0].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     h = output_desc->channel[0].normalized ? 0x7fffffff : 0;
                     l = output_desc->channel[0].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     h = 0x3ff00000;
                     l = 0;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
            }
            else
            {
               if(x86_target_caps(p->func) & X86_SSE)
               {
                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
                  emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
               }
               else
               {
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
               }
            }
            break;
         default:
            return FALSE;
         }
      }
      return TRUE;
   }
   return FALSE;
}

static boolean translate_attr( struct translate_sse *p,
                               const struct translate_element *a,
                               struct x86_reg src,
                               struct x86_reg dst)
{
   if(a->input_format == a->output_format)
   {
      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
      return TRUE;
   }

   return translate_attr_convert(p, a, src, dst);
}
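
/* util_format_get_stride(format, 1) is the byte size of a single
 * element, so an attribute whose input and output formats match is
 * emitted as a plain fixed-size copy rather than a conversion.
 */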

static boolean init_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   unsigned i;
   struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
                                              get_offset(p, &p->instance_id));

   for (i = 0; i < p->nr_buffer_varients; i++) {
      struct translate_buffer_varient *varient = &p->buffer_varient[i];
      struct translate_buffer *buffer = &p->buffer[varient->buffer_index];

      if (!index_size || varient->instance_divisor) {
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                   get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *    base_ptr + stride * index, where index depends on instance divisor
          */
         if (varient->instance_divisor) {
            /* Our index is instance ID divided by instance divisor.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (varient->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->tmp2_EDX;
               struct x86_reg tmp_ECX = p->src_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */

               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
            }
         } else {
            x86_mov(p->func, tmp_EAX, elt);
         }

         /*
          * TODO: Respect translate_buffer::max_index.
          */

         x86_imul(p->func, tmp_EAX, buf_stride);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_varients == 1)
         {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else
         {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return TRUE;
}
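
/* Example: with instance_divisor == 4, instances 0-3 all fetch element 0
 * of the buffer, instances 4-7 fetch element 1, and so on.  The generated
 * code performs instance_id / divisor with a 32-bit DIV (EDX:EAX / ECX);
 * the TODO above notes that a shift would do when the divisor is a power
 * of two.
 */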


static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      unsigned index_size,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDI,
                           get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_varients == 1) {
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_varient[var_idx].instance_divisor) {
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_varient[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[varient->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[varient->buffer_index].base_ptr));

      /* Calculate pointer to current attrib:
       */
      switch(index_size)
      {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }
      x86_imul(p->func, ptr, buf_stride);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}
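
/* Summary of the source-pointer cases above: the instance-ID pseudo
 * buffer reads straight from the machine struct; a linear run over a
 * single buffer keeps the advancing pointer in ESI itself; per-instance
 * (or linear multi-buffer) attributes reload a precomputed pointer; and
 * indexed runs compute base_ptr + elt * stride for every vertex.
 */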


static boolean incr_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   if (!index_size && p->nr_buffer_varients == 1) {
      struct x86_reg stride = x86_make_disp(p->machine_EDI,
                                            get_offset(p, &p->buffer[0].stride));

      if (p->buffer_varient[0].instance_divisor == 0) {
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_varients; i++) {
         struct translate_buffer_varient *varient = &p->buffer_varient[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                   get_offset(p, &p->buffer[varient->buffer_index].stride));

         if (varient->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return TRUE;
}


/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            unsigned instance_id,
 *            void *output_buffer )
 * or
 *    run_elts( struct translate *machine,
 *              unsigned *elts,
 *              unsigned count,
 *              unsigned instance_id,
 *              void *output_buffer )
 *
 * Lots of hardcoding
 *
 * EBX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 */
static boolean build_vertex_emit( struct translate_sse *p,
                                  struct x86_function *func,
                                  unsigned index_size )
{
   int fixup, label;
   unsigned j;

   p->tmp_EAX     = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI     = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX  = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP   = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX    = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX     = x86_make_reg(file_REG32, reg_CX);

   p->func = func;
   memset(&p->loaded_const, 0, sizeof(p->loaded_const));
   p->loaded_identity = FALSE;

   x86_init_func(p->func);

   if(x86_target(p->func) == X86_64_WIN64_ABI)
   {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if(x86_target(p->func) != X86_64_STD_ABI)
   {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
      x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
   }

   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   if(x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_varient = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned varient = p->element_to_buffer_varient[j];

         /* Figure out source pointer address:
          */
         if (varient != last_varient) {
            last_varient = varient;
            vb = get_buffer_ptr(p, index_size, varient, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func,
              p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, index_size );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */

   if(x86_target(p->func) != X86_64_STD_ABI)
   {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if(x86_target(p->func) == X86_64_WIN64_ABI)
   {
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return TRUE;
}


static void translate_sse_set_buffer( struct translate *translate,
                                      unsigned buf,
                                      const void *ptr,
                                      unsigned stride,
                                      unsigned max_index )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *)ptr;
      p->buffer[buf].stride = stride;
      p->buffer[buf].max_index = max_index;
   }

   if (0) debug_printf("%s %d/%d: %p %d\n",
                       __FUNCTION__, buf,
                       p->nr_buffers,
                       ptr, stride);
}


static void translate_sse_release( struct translate *translate )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   x86_release_func( &p->linear_func );
   x86_release_func( &p->elt_func );
   x86_release_func( &p->elt16_func );
   x86_release_func( &p->elt8_func );

   FREE(p);
}


struct translate *translate_sse2_create( const struct translate_key *key )
{
   struct translate_sse *p = NULL;
   unsigned i;

   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
   if (!rtasm_cpu_has_sse())
      goto fail;

   p = CALLOC_STRUCT( translate_sse );
   if (p == NULL)
      goto fail;

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer varient.
          */
         for (j = 0; j < p->nr_buffer_varients; j++) {
            if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
                p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_varients) {
            p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
            p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
            p->nr_buffer_varients++;
         }
         p->element_to_buffer_varient[i] = j;
      } else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (void*)x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (void*)x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

fail:
   if (p)
      translate_sse_release( &p->translate );

   return NULL;
}
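
/* A minimal usage sketch (key and buffer setup elided; the variable
 * names here are hypothetical):
 *
 *    struct translate *t = translate_sse2_create(&key);
 *    if (t) {
 *       t->set_buffer(t, 0, vertices, vertex_stride, max_index);
 *       t->run(t, start, count, instance_id, output);
 *       t->release(t);
 *    }
 *
 * run_elts/run_elts16/run_elts8 take an index array in place of start,
 * matching the generated functions described above build_vertex_emit.
 */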


#else

struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}

#endif