translate_sse: don't overwrite source buffer pointer
[mesa.git] / src / gallium / auxiliary / translate / translate_sse.c
1 /*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 * Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_math.h"
33 #include "util/u_format.h"
34
35 #include "translate.h"
36
37
38 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
39
40 #include "rtasm/rtasm_cpu.h"
41 #include "rtasm/rtasm_x86sse.h"
42
43
44 #define X 0
45 #define Y 1
46 #define Z 2
47 #define W 3
48
49
50 struct translate_buffer {
51 const void *base_ptr;
52 uintptr_t stride;
53 unsigned max_index;
54 };
55
56 struct translate_buffer_varient {
57 unsigned buffer_index;
58 unsigned instance_divisor;
59 void *ptr; /* updated either per vertex or per instance */
60 };
61
62
63 #define ELEMENT_BUFFER_INSTANCE_ID 1001
64
65
66 struct translate_sse {
67 struct translate translate;
68
69 struct x86_function linear_func;
70 struct x86_function elt_func;
71 struct x86_function elt16_func;
72 struct x86_function elt8_func;
73 struct x86_function *func;
74
75 boolean loaded_identity;
76 boolean loaded_const[5];
77
78 float identity[4];
79 float const_value[5][4];
80
81 struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
82 unsigned nr_buffers;
83
84 /* Multiple buffer varients can map to a single buffer. */
85 struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
86 unsigned nr_buffer_varients;
87
88 /* Multiple elements can map to a single buffer varient. */
89 unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];
90
91 boolean use_instancing;
92 unsigned instance_id;
93
94 /* these are actually known values, but putting them in a struct
95 * like this is helpful to keep them in sync across the file.
96 */
97 struct x86_reg tmp_EAX;
98 struct x86_reg tmp2_EDX;
99 struct x86_reg src_ECX;
100 struct x86_reg idx_ESI; /* either start+i or &elt[i] */
101 struct x86_reg machine_EDI;
102 struct x86_reg outbuf_EBX;
103 struct x86_reg count_EBP; /* decrements to zero */
104 };
105
106 static int get_offset( const void *a, const void *b )
107 {
108 return (const char *)b - (const char *)a;
109 }
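/* A minimal sketch of the addressing pattern this enables: machine_EDI holds
 * the struct translate_sse pointer at run time, so any field of it can be
 * touched as an EDI-relative displacement, e.g.
 *
 *    x86_mov(p->func, p->tmp_EAX,
 *            x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)));
 *
 * which makes the generated code read p->instance_id.
 */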
110
111
112
113 static struct x86_reg get_identity( struct translate_sse *p )
114 {
115 struct x86_reg reg = x86_make_reg(file_XMM, 7);
116
117 if (!p->loaded_identity) {
118 p->loaded_identity = TRUE;
119 p->identity[0] = 0;
120 p->identity[1] = 0;
121 p->identity[2] = 0;
122 p->identity[3] = 1;
123
124 sse_movups(p->func, reg,
125 x86_make_disp(p->machine_EDI,
126 get_offset(p, &p->identity[0])));
127 }
128
129 return reg;
130 }
131
132 static struct x86_reg get_const( struct translate_sse *p, unsigned i, float v)
133 {
134 struct x86_reg reg = x86_make_reg(file_XMM, 2 + i);
135
136 if (!p->loaded_const[i]) {
137 p->loaded_const[i] = TRUE;
138 p->const_value[i][0] =
139 p->const_value[i][1] =
140 p->const_value[i][2] =
141 p->const_value[i][3] = v;
142
143 sse_movups(p->func, reg,
144 x86_make_disp(p->machine_EDI,
145 get_offset(p, &p->const_value[i][0])));
146 }
147
148 return reg;
149 }
150
151 static struct x86_reg get_inv_127( struct translate_sse *p )
152 {
153 return get_const(p, 0, 1.0f / 127.0f);
154 }
155
156 static struct x86_reg get_inv_255( struct translate_sse *p )
157 {
158 return get_const(p, 1, 1.0f / 255.0f);
159 }
160
161 static struct x86_reg get_inv_32767( struct translate_sse *p )
162 {
163 return get_const(p, 2, 1.0f / 32767.0f);
164 }
165
166 static struct x86_reg get_inv_65535( struct translate_sse *p )
167 {
168 return get_const(p, 3, 1.0f / 65535.0f);
169 }
170
171 static struct x86_reg get_inv_2147483647( struct translate_sse *p )
172 {
173 return get_const(p, 4, 1.0f / 2147483647.0f);
174 }
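/* Each constant above is splatted across all four lanes and cached in one of
 * xmm2-xmm6 (see get_const), so it is emitted at most once per generated
 * function.  A hypothetical normalization step using it:
 *
 *    sse_mulps(p->func, dataXMM, get_inv_255(p));
 */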
175
176 /* load the data in an SSE2 register, padding with zeros */
177 static boolean emit_load_sse2( struct translate_sse *p,
178 struct x86_reg data,
179 struct x86_reg src,
180 unsigned size)
181 {
182 struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
183 struct x86_reg tmp = p->tmp_EAX;
184 switch(size)
185 {
186 case 1:
187 x86_movzx8(p->func, tmp, src);
188 sse2_movd(p->func, data, tmp);
189 break;
190 case 2:
191 x86_movzx16(p->func, tmp, src);
192 sse2_movd(p->func, data, tmp);
break;
193 case 3:
194 x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
195 x86_shl_imm(p->func, tmp, 16);
196 x86_mov16(p->func, tmp, src);
197 sse2_movd(p->func, data, tmp);
break;
198 case 4:
199 sse2_movd(p->func, data, src);
200 break;
201 case 6:
202 sse2_movd(p->func, data, src);
203 x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
204 sse2_movd(p->func, tmpXMM, tmp);
205 sse2_punpckldq(p->func, data, tmpXMM);
206 break;
207 case 8:
208 sse2_movq(p->func, data, src);
209 break;
210 case 12:
211 sse2_movq(p->func, data, src);
212 sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
213 sse2_punpcklqdq(p->func, data, tmpXMM);
214 break;
215 case 16:
216 sse2_movdqu(p->func, data, src);
217 break;
218 default:
219 return FALSE;
220 }
221 return TRUE;
222 }
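/* For reference, the 3-byte path above assembles, in plain C terms (assuming
 * little-endian):
 *
 *    tmp = ((uint32_t)src[2] << 16) | *(const uint16_t *)src;
 *
 * before moving tmp into the XMM register.
 */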
223
224 /* this value can be passed for the out_chans argument */
225 #define CHANNELS_0001 5
226
227 /* this function will load #chans float values, and will
228 * pad the register with zeroes at least up to out_chans.
229 *
230 * If out_chans is set to CHANNELS_0001, then the fourth
231 * value will be padded with 1. Only pass this value if
232 * chans < 4; otherwise the results are undefined.
233 */
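/* For example, with chans == 2 a source holding (a, b) is loaded as
 * (a, b, 0, 1) when out_chans == CHANNELS_0001 and as (a, b, 0, 0) when
 * out_chans is 3 or 4.  A hypothetical call site:
 *
 *    emit_load_float32(p, dataXMM, src, CHANNELS_0001, 2);
 */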
234 static void emit_load_float32( struct translate_sse *p,
235 struct x86_reg data,
236 struct x86_reg arg0,
237 unsigned out_chans,
238 unsigned chans)
239 {
240 switch(chans)
241 {
242 case 1:
243 /* a 0 0 0
244 * a 0 0 1
245 */
246 sse_movss(p->func, data, arg0);
247 if(out_chans == CHANNELS_0001)
248 sse_orps(p->func, data, get_identity(p) );
249 break;
250 case 2:
251 /* 0 0 0 1
252 * a b 0 1
253 */
254 if(out_chans == CHANNELS_0001)
255 sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
256 else if(out_chans > 2)
257 sse_movlhps(p->func, data, get_identity(p) );
258 sse_movlps(p->func, data, arg0);
259 break;
260 case 3:
261 /* Have to jump through some hoops:
262 *
263 * c 0 0 0
264 * c 0 0 1 if out_chans == CHANNELS_0001
265 * 0 0 c 0/1
266 * a b c 0/1
267 */
268 sse_movss(p->func, data, x86_make_disp(arg0, 8));
269 if(out_chans == CHANNELS_0001)
270 sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
271 sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
272 sse_movlps(p->func, data, arg0);
273 break;
274 case 4:
275 sse_movups(p->func, data, arg0);
276 break;
277 }
278 }
279
280 /* this function behaves like emit_load_float32, but loads
281 * 64-bit floating-point numbers, converting them to 32-bit
282 * ones */
283 static void emit_load_float64to32( struct translate_sse *p,
284 struct x86_reg data,
285 struct x86_reg arg0,
286 unsigned out_chans,
287 unsigned chans)
288 {
289 struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
290 switch(chans)
291 {
292 case 1:
293 sse2_movsd(p->func, data, arg0);
294 if(out_chans > 1)
295 sse2_cvtpd2ps(p->func, data, data);
296 else
297 sse2_cvtsd2ss(p->func, data, data);
298 if(out_chans == CHANNELS_0001)
299 sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
300 break;
301 case 2:
302 sse2_movupd(p->func, data, arg0);
303 sse2_cvtpd2ps(p->func, data, data);
304 if(out_chans == CHANNELS_0001)
305 sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
306 else if(out_chans > 2)
307 sse_movlhps(p->func, data, get_identity(p) );
308 break;
309 case 3:
310 sse2_movupd(p->func, data, arg0);
311 sse2_cvtpd2ps(p->func, data, data);
312 sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
313 if(out_chans > 3)
314 sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
315 else
316 sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
317 sse_movlhps(p->func, data, tmpXMM);
318 if(out_chans == CHANNELS_0001)
319 sse_orps(p->func, data, get_identity(p) );
320 break;
321 case 4:
322 sse2_movupd(p->func, data, arg0);
323 sse2_cvtpd2ps(p->func, data, data);
324 sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
325 sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
326 sse_movlhps(p->func, data, tmpXMM);
327 break;
328 }
329 }
330
331 static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm)
332 {
333 if(x86_target(p->func) != X86_32)
334 x64_mov64(p->func, dst_gpr, src_gpr);
335 else
336 {
337 /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
338 if(x86_target_caps(p->func) & X86_SSE2)
339 sse2_movq(p->func, dst_xmm, src_xmm);
340 else
341 sse_movlps(p->func, dst_xmm, src_xmm);
342 }
343 }
344
345 static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
346 {
347 emit_mov64(p, dst_gpr, dst_xmm, src, src);
348 }
349
350 static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
351 {
352 emit_mov64(p, dst, dst, src_gpr, src_xmm);
353 }
354
355 static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
356 {
357 if(x86_target_caps(p->func) & X86_SSE2)
358 sse2_movdqu(p->func, dst, src);
359 else
360 sse_movups(p->func, dst, src);
361 }
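/* movups works on any SSE CPU while movdqu requires SSE2, hence the
 * x86_target_caps() checks in emit_mov64 and emit_mov128 above.
 */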
362
363 /* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
364 * but may or may not be good on older processors
365 * TODO: may perhaps want to use non-temporal stores here if possible
366 */
367 static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
368 {
369 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
370 struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
371 struct x86_reg dataGPR = p->tmp_EAX;
372 struct x86_reg dataGPR2 = p->tmp2_EDX;
373
374 if(size < 8)
375 {
376 switch (size)
377 {
378 case 1:
379 x86_mov8(p->func, dataGPR, src);
380 x86_mov8(p->func, dst, dataGPR);
381 break;
382 case 2:
383 x86_mov16(p->func, dataGPR, src);
384 x86_mov16(p->func, dst, dataGPR);
385 break;
386 case 3:
387 x86_mov16(p->func, dataGPR, src);
388 x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
389 x86_mov16(p->func, dst, dataGPR);
390 x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
391 break;
392 case 4:
393 x86_mov(p->func, dataGPR, src);
394 x86_mov(p->func, dst, dataGPR);
395 break;
396 case 6:
397 x86_mov(p->func, dataGPR, src);
398 x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
399 x86_mov(p->func, dst, dataGPR);
400 x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
401 break;
402 }
403 }
404 else if(!(x86_target_caps(p->func) & X86_SSE))
405 {
406 unsigned i = 0;
407 assert((size & 3) == 0);
408 for(i = 0; i < size; i += 4)
409 {
410 x86_mov(p->func, dataGPR, x86_make_disp(src, i));
411 x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
412 }
413 }
414 else
415 {
416 switch(size)
417 {
418 case 8:
419 emit_load64(p, dataGPR, dataXMM, src);
420 emit_store64(p, dst, dataGPR, dataXMM);
421 break;
422 case 12:
423 emit_load64(p, dataGPR2, dataXMM, src);
424 x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
425 emit_store64(p, dst, dataGPR2, dataXMM);
426 x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
427 break;
428 case 16:
429 emit_mov128(p, dataXMM, src);
430 emit_mov128(p, dst, dataXMM);
431 break;
432 case 24:
433 emit_mov128(p, dataXMM, src);
434 emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
435 emit_mov128(p, dst, dataXMM);
436 emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
437 break;
438 case 32:
439 emit_mov128(p, dataXMM, src);
440 emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
441 emit_mov128(p, dst, dataXMM);
442 emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
443 break;
444 default:
445 assert(0);
446 }
447 }
448 }
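/* Every path above is behaviorally a fixed-size copy; a plain-C reference,
 * assuming size is one of the handled values (dst_ptr/src_ptr are
 * hypothetical run-time pointers):
 *
 *    memcpy(dst_ptr, src_ptr, size);
 */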
449
450 static boolean translate_attr_convert( struct translate_sse *p,
451 const struct translate_element *a,
452 struct x86_reg src,
453 struct x86_reg dst)
454
455 {
456 const struct util_format_description* input_desc = util_format_description(a->input_format);
457 const struct util_format_description* output_desc = util_format_description(a->output_format);
458 unsigned i;
459 boolean id_swizzle = TRUE;
460 unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
461 unsigned needed_chans = 0;
462 unsigned imms[2] = {0, 0x3f800000};
463
464 if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
465 return FALSE;
466
467 if(input_desc->channel[0].size & 7)
468 return FALSE;
469
470 if(input_desc->colorspace != output_desc->colorspace)
471 return FALSE;
472
473 for(i = 1; i < input_desc->nr_channels; ++i)
474 {
475 if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
476 return FALSE;
477 }
478
479 for(i = 1; i < output_desc->nr_channels; ++i)
480 {
481 if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
482 return FALSE;
483 }
484
485 for(i = 0; i < output_desc->nr_channels; ++i)
486 {
487 if(output_desc->swizzle[i] < 4)
488 swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
489 }
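/* After this loop, swizzle[c] names the input channel that feeds output
 * channel c, or UTIL_FORMAT_SWIZZLE_0/1 when that output channel is a
 * constant, or UTIL_FORMAT_SWIZZLE_NONE when nothing writes it.
 */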
490
491 if((x86_target_caps(p->func) & X86_SSE) && (0
492 || a->output_format == PIPE_FORMAT_R32_FLOAT
493 || a->output_format == PIPE_FORMAT_R32G32_FLOAT
494 || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
495 || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
496 {
497 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
498
499 for(i = 0; i < output_desc->nr_channels; ++i)
500 {
501 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
502 swizzle[i] = i;
503 }
504
505 for(i = 0; i < output_desc->nr_channels; ++i)
506 {
507 if(swizzle[i] < 4)
508 needed_chans = MAX2(needed_chans, swizzle[i] + 1);
509 if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
510 id_swizzle = FALSE;
511 }
512
513 if(needed_chans > 0)
514 {
515 switch(input_desc->channel[0].type)
516 {
517 case UTIL_FORMAT_TYPE_UNSIGNED:
518 if(!(x86_target_caps(p->func) & X86_SSE2))
519 return FALSE;
520 emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
521
522 /* TODO: add support for SSE4.1 pmovzx */
523 switch(input_desc->channel[0].size)
524 {
525 case 8:
526 /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
527 sse2_punpcklbw(p->func, dataXMM, get_identity(p));
528 sse2_punpcklbw(p->func, dataXMM, get_identity(p));
529 break;
530 case 16:
531 sse2_punpcklwd(p->func, dataXMM, get_identity(p));
532 break;
533 case 32: /* we lose precision here */
534 sse2_psrld_imm(p->func, dataXMM, 1);
535 break;
536 default:
537 return FALSE;
538 }
539 sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
540 if(input_desc->channel[0].normalized)
541 {
542 struct x86_reg factor;
543 switch(input_desc->channel[0].size)
544 {
545 case 8:
546 factor = get_inv_255(p);
547 break;
548 case 16:
549 factor = get_inv_65535(p);
550 break;
551 case 32:
552 factor = get_inv_2147483647(p);
553 break;
554 }
555 sse_mulps(p->func, dataXMM, factor);
556 }
557 else if(input_desc->channel[0].size == 32)
558 sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
559 break;
560 case UTIL_FORMAT_TYPE_SIGNED:
561 if(!(x86_target_caps(p->func) & X86_SSE2))
562 return FALSE;
563 emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
564
565 /* TODO: add support for SSE4.1 pmovsx */
566 switch(input_desc->channel[0].size)
567 {
568 case 8:
569 sse2_punpcklbw(p->func, dataXMM, dataXMM);
570 sse2_punpcklbw(p->func, dataXMM, dataXMM);
571 sse2_psrad_imm(p->func, dataXMM, 24);
572 break;
573 case 16:
574 sse2_punpcklwd(p->func, dataXMM, dataXMM);
575 sse2_psrad_imm(p->func, dataXMM, 16);
576 break;
577 case 32: /* we lose precision here */
578 break;
579 default:
580 return FALSE;
581 }
582 sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
583 if(input_desc->channel[0].normalized)
584 {
585 struct x86_reg factor;
586 switch(input_desc->channel[0].size)
587 {
588 case 8:
589 factor = get_inv_127(p);
590 break;
591 case 16:
592 factor = get_inv_32767(p);
593 break;
594 case 32:
595 factor = get_inv_2147483647(p);
596 break;
597 }
598 sse_mulps(p->func, dataXMM, factor);
599 }
600 break;
603 case UTIL_FORMAT_TYPE_FLOAT:
604 if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
605 return FALSE;
606 if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
607 {
608 swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
609 needed_chans = CHANNELS_0001;
610 }
611 switch(input_desc->channel[0].size)
612 {
613 case 32:
614 emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
615 break;
616 case 64: /* we lose precision here */
617 if(!(x86_target_caps(p->func) & X86_SSE2))
618 return FALSE;
619 emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
620 break;
621 default:
622 return FALSE;
623 }
624 break;
625 default:
626 return FALSE;
627 }
628
629 if(!id_swizzle)
630 sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
631 }
632
633 if(output_desc->nr_channels >= 4
634 && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
635 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
636 && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
637 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
638 )
639 sse_movups(p->func, dst, dataXMM);
640 else
641 {
642 if(output_desc->nr_channels >= 2
643 && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
644 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
645 sse_movlps(p->func, dst, dataXMM);
646 else
647 {
648 if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
649 sse_movss(p->func, dst, dataXMM);
650 else
651 x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
652
653 if(output_desc->nr_channels >= 2)
654 {
655 if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
656 {
657 sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
658 sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
659 }
660 else
661 x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
662 }
663 }
664
665 if(output_desc->nr_channels >= 3)
666 {
667 if(output_desc->nr_channels >= 4
668 && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
669 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
670 sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
671 else
672 {
673 if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
674 {
675 sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
676 sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
677 }
678 else
679 x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
680
681 if(output_desc->nr_channels >= 4)
682 {
683 if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
684 {
685 sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
686 sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
687 }
688 else
689 x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
690 }
691 }
692 }
693 }
694 return TRUE;
695 }
696 else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
697 && output_desc->channel[0].normalized == input_desc->channel[0].normalized
698 && (0
699 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
700 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
701 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
702 ))
703 {
704 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
705 struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
706 struct x86_reg tmp = p->tmp_EAX;
707 unsigned imms[2] = {0, 1};
708
709 for(i = 0; i < output_desc->nr_channels; ++i)
710 {
711 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
712 swizzle[i] = i;
713 }
714
715 for(i = 0; i < output_desc->nr_channels; ++i)
716 {
717 if(swizzle[i] < 4)
718 needed_chans = MAX2(needed_chans, swizzle[i] + 1);
719 if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
720 id_swizzle = FALSE;
721 }
722
723 if(needed_chans > 0)
724 {
725 emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
726
727 switch(input_desc->channel[0].type)
728 {
729 case UTIL_FORMAT_TYPE_UNSIGNED:
730 if(input_desc->channel[0].normalized)
731 {
732 sse2_punpcklbw(p->func, dataXMM, dataXMM);
733 if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
734 sse2_psrlw_imm(p->func, dataXMM, 1);
735 }
736 else
737 sse2_punpcklbw(p->func, dataXMM, get_identity(p));
738 break;
739 case UTIL_FORMAT_TYPE_SIGNED:
740 if(input_desc->channel[0].normalized)
741 {
742 sse2_movq(p->func, tmpXMM, get_identity(p));
743 sse2_punpcklbw(p->func, tmpXMM, dataXMM);
744 sse2_psllw_imm(p->func, dataXMM, 9);
745 sse2_psrlw_imm(p->func, dataXMM, 8);
746 sse2_por(p->func, tmpXMM, dataXMM);
747 sse2_psrlw_imm(p->func, dataXMM, 7);
748 sse2_por(p->func, tmpXMM, dataXMM);
749 {
750 struct x86_reg t = dataXMM;
751 dataXMM = tmpXMM;
752 tmpXMM = t;
753 }
754 }
755 else
756 {
757 sse2_punpcklbw(p->func, dataXMM, dataXMM);
758 sse2_psraw_imm(p->func, dataXMM, 8);
759 }
760 break;
761 default:
762 assert(0);
763 }
764
765 if(output_desc->channel[0].normalized)
766 imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff;
767
768 if(!id_swizzle)
769 sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
770 }
771
772 if(output_desc->nr_channels >= 4
773 && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
774 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
775 && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
776 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
777 )
778 sse2_movq(p->func, dst, dataXMM);
779 else
780 {
781 if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
782 {
783 if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
784 sse2_movd(p->func, dst, dataXMM);
785 else
786 {
787 sse2_movd(p->func, tmp, dataXMM);
788 x86_mov16(p->func, dst, tmp);
789 if(output_desc->nr_channels >= 2)
790 x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
791 }
792 }
793 else
794 {
795 if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
796 x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
797 else
798 {
799 x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
800 if(output_desc->nr_channels >= 2)
801 {
802 sse2_movd(p->func, tmp, dataXMM);
803 x86_shr_imm(p->func, tmp, 16);
804 x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
805 }
806 }
807 }
808
809 if(output_desc->nr_channels >= 3)
810 {
811 if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
812 {
813 if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
814 {
815 sse2_psrlq_imm(p->func, dataXMM, 32);
816 sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
817 }
818 else
819 {
820 sse2_psrlq_imm(p->func, dataXMM, 32);
821 sse2_movd(p->func, tmp, dataXMM);
822 x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
823 if(output_desc->nr_channels >= 4)
824 {
825 x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
826 }
827 }
828 }
829 else
830 {
831 if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
832 x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
833 else
834 {
835 x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
836
837 if(output_desc->nr_channels >= 4)
838 {
839 sse2_psrlq_imm(p->func, dataXMM, 48);
840 sse2_movd(p->func, tmp, dataXMM);
841 x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
842 }
843 }
844 }
845 }
846 }
847 return TRUE;
848 }
849 else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
850 {
851 struct x86_reg tmp = p->tmp_EAX;
852 unsigned i;
853 if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
854 && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
855 && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
856 && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
857 && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
858 {
859 /* TODO: support movbe */
860 x86_mov(p->func, tmp, src);
861 x86_bswap(p->func, tmp);
862 x86_mov(p->func, dst, tmp);
863 return TRUE;
864 }
865
866 for(i = 0; i < output_desc->nr_channels; ++i)
867 {
868 switch(output_desc->channel[0].size)
869 {
870 case 8:
871 if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
872 {
873 unsigned v = 0;
874 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
875 {
876 switch(output_desc->channel[0].type)
877 {
878 case UTIL_FORMAT_TYPE_UNSIGNED:
879 v = output_desc->channel[0].normalized ? 0xff : 1;
880 break;
881 case UTIL_FORMAT_TYPE_SIGNED:
882 v = output_desc->channel[0].normalized ? 0x7f : 1;
883 break;
884 default:
885 return FALSE;
886 }
887 }
888 x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
889 }
890 else
891 {
892 x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
893 x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
894 }
895 break;
896 case 16:
897 if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
898 {
899 unsigned v = 0;
900 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
901 {
902 switch(output_desc->channel[1].type)
903 {
904 case UTIL_FORMAT_TYPE_UNSIGNED:
905 v = output_desc->channel[1].normalized ? 0xffff : 1;
906 break;
907 case UTIL_FORMAT_TYPE_SIGNED:
908 v = output_desc->channel[1].normalized ? 0x7fff : 1;
909 break;
910 case UTIL_FORMAT_TYPE_FLOAT:
911 v = 0x3c00;
912 break;
913 default:
914 return FALSE;
915 }
916 }
917 x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
918 }
919 else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
920 x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
921 else
922 {
923 x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
924 x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
925 }
926 break;
927 case 32:
928 if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
929 {
930 unsigned v = 0;
931 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
932 {
933 switch(output_desc->channel[1].type)
934 {
935 case UTIL_FORMAT_TYPE_UNSIGNED:
936 v = output_desc->channel[1].normalized ? 0xffffffff : 1;
937 break;
938 case UTIL_FORMAT_TYPE_SIGNED:
939 v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
940 break;
941 case UTIL_FORMAT_TYPE_FLOAT:
942 v = 0x3f800000;
943 break;
944 default:
945 return FALSE;
946 }
947 }
948 x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
949 }
950 else
951 {
952 x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
953 x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
954 }
955 break;
956 case 64:
957 if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
958 {
959 unsigned l = 0;
960 unsigned h = 0;
961 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
962 {
963 switch(output_desc->channel[1].type)
964 {
965 case UTIL_FORMAT_TYPE_UNSIGNED:
966 h = output_desc->channel[1].normalized ? 0xffffffff : 0;
967 l = output_desc->channel[1].normalized ? 0xffffffff : 1;
968 break;
969 case UTIL_FORMAT_TYPE_SIGNED:
970 h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
971 l = output_desc->channel[1].normalized ? 0xffffffff : 1;
972 break;
973 case UTIL_FORMAT_TYPE_FLOAT:
974 h = 0x3ff00000;
975 l = 0;
976 break;
977 default:
978 return FALSE;
979 }
980 }
981 x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
982 x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
983 }
984 else
985 {
986 if(x86_target_caps(p->func) & X86_SSE)
987 {
988 struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
989 emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
990 emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
991 }
992 else
993 {
994 x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
995 x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
996 x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
997 x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
998 }
999 }
1000 break;
1001 default:
1002 return FALSE;
1003 }
1004 }
1005 return TRUE;
1006 }
1007 return FALSE;
1008 }
1009
1010 static boolean translate_attr( struct translate_sse *p,
1011 const struct translate_element *a,
1012 struct x86_reg src,
1013 struct x86_reg dst)
1014 {
1015 if(a->input_format == a->output_format)
1016 {
1017 emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
1018 return TRUE;
1019 }
1020
1021 return translate_attr_convert(p, a, src, dst);
1022 }
1023
1024 static boolean init_inputs( struct translate_sse *p,
1025 unsigned index_size )
1026 {
1027 unsigned i;
1028 struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
1029 get_offset(p, &p->instance_id));
1030
1031 for (i = 0; i < p->nr_buffer_varients; i++) {
1032 struct translate_buffer_varient *varient = &p->buffer_varient[i];
1033 struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
1034
1035 if (!index_size || varient->instance_divisor) {
1036 struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
1037 get_offset(p, &buffer->stride));
1038 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
1039 get_offset(p, &varient->ptr));
1040 struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
1041 get_offset(p, &buffer->base_ptr));
1042 struct x86_reg elt = p->idx_ESI;
1043 struct x86_reg tmp_EAX = p->tmp_EAX;
1044
1045 /* Calculate pointer to first attrib:
1046 * base_ptr + stride * index, where index depends on instance divisor
1047 */
1048 if (varient->instance_divisor) {
1049 /* Our index is instance ID divided by instance divisor.
1050 */
1051 x86_mov(p->func, tmp_EAX, instance_id);
1052
1053 if (varient->instance_divisor != 1) {
1054 struct x86_reg tmp_EDX = p->tmp2_EDX;
1055 struct x86_reg tmp_ECX = p->src_ECX;
1056
1057 /* TODO: Add x86_shr() to rtasm and use it whenever
1058 * instance divisor is power of two.
1059 */
1060
1061 x86_xor(p->func, tmp_EDX, tmp_EDX);
1062 x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
1063 x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */
1064 }
1065 } else {
1066 x86_mov(p->func, tmp_EAX, elt);
1067 }
1068
1069 /*
1070 * TODO: Respect translate_buffer::max_index.
1071 */
1072
1073 x86_imul(p->func, tmp_EAX, buf_stride);
1074 x64_rexw(p->func);
1075 x86_add(p->func, tmp_EAX, buf_base_ptr);
1076
1077
1078 /* In the linear case, keep the buffer pointer instead of the
1079 * index number.
1080 */
1081 if (!index_size && p->nr_buffer_varients == 1)
1082 {
1083 x64_rexw(p->func);
1084 x86_mov(p->func, elt, tmp_EAX);
1085 }
1086 else
1087 {
1088 x64_rexw(p->func);
1089 x86_mov(p->func, buf_ptr, tmp_EAX);
1090 }
1091 }
1092 }
1093
1094 return TRUE;
1095 }
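/* A plain-C sketch of the pointer init_inputs computes for each buffer
 * varient (reference only; ref_first_ptr is a hypothetical helper, not part
 * of the translate interface):
 */
static const char *ref_first_ptr( const struct translate_buffer *buffer,
                                  const struct translate_buffer_varient *varient,
                                  unsigned start, unsigned instance_id )
{
   unsigned index = varient->instance_divisor
      ? instance_id / varient->instance_divisor
      : start;
   return (const char *)buffer->base_ptr + buffer->stride * index;
}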
1096
1097
1098 static struct x86_reg get_buffer_ptr( struct translate_sse *p,
1099 unsigned index_size,
1100 unsigned var_idx,
1101 struct x86_reg elt )
1102 {
1103 if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
1104 return x86_make_disp(p->machine_EDI,
1105 get_offset(p, &p->instance_id));
1106 }
1107 if (!index_size && p->nr_buffer_varients == 1) {
1108 return p->idx_ESI;
1109 }
1110 else if (!index_size || p->buffer_varient[var_idx].instance_divisor) {
1111 struct x86_reg ptr = p->src_ECX;
1112 struct x86_reg buf_ptr =
1113 x86_make_disp(p->machine_EDI,
1114 get_offset(p, &p->buffer_varient[var_idx].ptr));
1115
1116 x64_rexw(p->func);
1117 x86_mov(p->func, ptr, buf_ptr);
1118 return ptr;
1119 }
1120 else {
1121 struct x86_reg ptr = p->src_ECX;
1122 const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
1123
1124 struct x86_reg buf_stride =
1125 x86_make_disp(p->machine_EDI,
1126 get_offset(p, &p->buffer[varient->buffer_index].stride));
1127
1128 struct x86_reg buf_base_ptr =
1129 x86_make_disp(p->machine_EDI,
1130 get_offset(p, &p->buffer[varient->buffer_index].base_ptr));
1131
1132
1133
1134 /* Calculate pointer to current attrib:
1135 */
1136 switch(index_size)
1137 {
1138 case 1:
1139 x86_movzx8(p->func, ptr, elt);
1140 break;
1141 case 2:
1142 x86_movzx16(p->func, ptr, elt);
1143 break;
1144 case 4:
1145 x86_mov(p->func, ptr, elt);
1146 break;
1147 }
1148 x86_imul(p->func, ptr, buf_stride);
1149 x64_rexw(p->func);
1150 x86_add(p->func, ptr, buf_base_ptr);
1151 return ptr;
1152 }
1153 }
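/* Reference sketch for the indexed path above: the returned register holds
 *
 *    base_ptr + stride * elt
 *
 * where elt is widened from 8, 16 or 32 bits according to index_size, while
 * the instanced and linear varients reuse the pointer stored by init_inputs.
 */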
1154
1155
1156
1157 static boolean incr_inputs( struct translate_sse *p,
1158 unsigned index_size )
1159 {
1160 if (!index_size && p->nr_buffer_varients == 1) {
1161 struct x86_reg stride = x86_make_disp(p->machine_EDI,
1162 get_offset(p, &p->buffer[0].stride));
1163
1164 if (p->buffer_varient[0].instance_divisor == 0) {
1165 x64_rexw(p->func);
1166 x86_add(p->func, p->idx_ESI, stride);
1167 sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
1168 }
1169 }
1170 else if (!index_size) {
1171 unsigned i;
1172
1173 /* Is this worthwhile??
1174 */
1175 for (i = 0; i < p->nr_buffer_varients; i++) {
1176 struct translate_buffer_varient *varient = &p->buffer_varient[i];
1177 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
1178 get_offset(p, &varient->ptr));
1179 struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
1180 get_offset(p, &p->buffer[varient->buffer_index].stride));
1181
1182 if (varient->instance_divisor == 0) {
1183 x86_mov(p->func, p->tmp_EAX, buf_stride);
1184 x64_rexw(p->func);
1185 x86_add(p->func, p->tmp_EAX, buf_ptr);
1186 if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
1187 x64_rexw(p->func);
1188 x86_mov(p->func, buf_ptr, p->tmp_EAX);
1189 }
1190 }
1191 }
1192 else {
1193 x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
1194 }
1195
1196 return TRUE;
1197 }
1198
1199
1200 /* Build run( struct translate *machine,
1201 * unsigned start,
1202 * unsigned count,
* unsigned instance_id,
1203 * void *output_buffer )
1204 * or
1205 * run_elts( struct translate *machine,
1206 * unsigned *elts,
1207 * unsigned count,
* unsigned instance_id,
1208 * void *output_buffer )
1209 *
1210 * Lots of hardcoding
1211 *
1212 * EBX -- pointer to current output vertex
1213 * ECX -- pointer to current source attribute
1214 *
1215 */
1216 static boolean build_vertex_emit( struct translate_sse *p,
1217 struct x86_function *func,
1218 unsigned index_size )
1219 {
1220 int fixup, label;
1221 unsigned j;
1222
1223 p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
1224 p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
1225 p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
1226 p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
1227 p->count_EBP = x86_make_reg(file_REG32, reg_BP);
1228 p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
1229 p->src_ECX = x86_make_reg(file_REG32, reg_CX);
1230
1231 p->func = func;
1232 memset(&p->loaded_const, 0, sizeof(p->loaded_const));
1233 p->loaded_identity = FALSE;
1234
1235 x86_init_func(p->func);
1236
1237 if(x86_target(p->func) == X86_64_WIN64_ABI)
1238 {
1239 /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
1240 sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
1241 sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
1242 }
1243
1244 x86_push(p->func, p->outbuf_EBX);
1245 x86_push(p->func, p->count_EBP);
1246
1247 /* on non-Win64 x86-64, these are already in the right registers */
1248 if(x86_target(p->func) != X86_64_STD_ABI)
1249 {
1250 x86_push(p->func, p->machine_EDI);
1251 x86_push(p->func, p->idx_ESI);
1252
1253 x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1254 x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1255 }
1256
1257 x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
1258
1259 if(x86_target(p->func) != X86_32)
1260 x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
1261 else
1262 x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
1263
1264 /* Load instance ID.
1265 */
1266 if (p->use_instancing) {
1267 x86_mov(p->func,
1268 p->tmp_EAX,
1269 x86_fn_arg(p->func, 4));
1270 x86_mov(p->func,
1271 x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
1272 p->tmp_EAX);
1273 }
1274
1275 /* Get vertex count, compare to zero
1276 */
1277 x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
1278 x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1279 fixup = x86_jcc_forward(p->func, cc_E);
1280
1281 /* always load, needed or not:
1282 */
1283 init_inputs(p, index_size);
1284
1285 /* Note address for loop jump
1286 */
1287 label = x86_get_label(p->func);
1288 {
1289 struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
1290 int last_varient = -1;
1291 struct x86_reg vb;
1292
1293 for (j = 0; j < p->translate.key.nr_elements; j++) {
1294 const struct translate_element *a = &p->translate.key.element[j];
1295 unsigned varient = p->element_to_buffer_varient[j];
1296
1297 /* Figure out source pointer address:
1298 */
1299 if (varient != last_varient) {
1300 last_varient = varient;
1301 vb = get_buffer_ptr(p, index_size, varient, elt);
1302 }
1303
1304 if (!translate_attr( p, a,
1305 x86_make_disp(vb, a->input_offset),
1306 x86_make_disp(p->outbuf_EBX, a->output_offset)))
1307 return FALSE;
1308 }
1309
1310 /* Next output vertex:
1311 */
1312 x64_rexw(p->func);
1313 x86_lea(p->func,
1314 p->outbuf_EBX,
1315 x86_make_disp(p->outbuf_EBX,
1316 p->translate.key.output_stride));
1317
1318 /* Incr index
1319 */
1320 incr_inputs( p, index_size );
1321 }
1322
1323 /* decr count, loop if not zero
1324 */
1325 x86_dec(p->func, p->count_EBP);
1326 x86_jcc(p->func, cc_NZ, label);
1327
1328 /* Exit mmx state?
1329 */
1330 if (p->func->need_emms)
1331 mmx_emms(p->func);
1332
1333 /* Land forward jump here:
1334 */
1335 x86_fixup_fwd_jump(p->func, fixup);
1336
1337 /* Pop regs and return
1338 */
1339
1340 if(x86_target(p->func) != X86_64_STD_ABI)
1341 {
1342 x86_pop(p->func, p->idx_ESI);
1343 x86_pop(p->func, p->machine_EDI);
1344 }
1345
1346 x86_pop(p->func, p->count_EBP);
1347 x86_pop(p->func, p->outbuf_EBX);
1348
1349 if(x86_target(p->func) == X86_64_WIN64_ABI)
1350 {
1351 sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
1352 sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
1353 }
1354 x86_ret(p->func);
1355
1356 return TRUE;
1357 }
1358
1359
1360
1361
1362
1363
1364
1365 static void translate_sse_set_buffer( struct translate *translate,
1366 unsigned buf,
1367 const void *ptr,
1368 unsigned stride,
1369 unsigned max_index )
1370 {
1371 struct translate_sse *p = (struct translate_sse *)translate;
1372
1373 if (buf < p->nr_buffers) {
1374 p->buffer[buf].base_ptr = (char *)ptr;
1375 p->buffer[buf].stride = stride;
1376 p->buffer[buf].max_index = max_index;
1377 }
1378
1379 if (0) debug_printf("%s %d/%d: %p %d\n",
1380 __FUNCTION__, buf,
1381 p->nr_buffers,
1382 ptr, stride);
1383 }
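/* Typical caller-side usage, with hypothetical locals (the draw module does
 * roughly this before running the generated function):
 *
 *    translate->set_buffer(translate, i, vb_ptr, vb_stride, max_index);
 *    translate->run(translate, start, count, instance_id, out);
 */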
1384
1385
1386 static void translate_sse_release( struct translate *translate )
1387 {
1388 struct translate_sse *p = (struct translate_sse *)translate;
1389
1390 x86_release_func( &p->linear_func );
1391 x86_release_func( &p->elt_func );
x86_release_func( &p->elt16_func );
x86_release_func( &p->elt8_func );
1392
1393 FREE(p);
1394 }
1395
1396
1397 struct translate *translate_sse2_create( const struct translate_key *key )
1398 {
1399 struct translate_sse *p = NULL;
1400 unsigned i;
1401
1402 /* this is misnamed, it actually refers to whether rtasm is enabled or not */
1403 if (!rtasm_cpu_has_sse())
1404 goto fail;
1405
1406 p = CALLOC_STRUCT( translate_sse );
1407 if (p == NULL)
1408 goto fail;
1409
1410 p->translate.key = *key;
1411 p->translate.release = translate_sse_release;
1412 p->translate.set_buffer = translate_sse_set_buffer;
1413
1414 for (i = 0; i < key->nr_elements; i++) {
1415 if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
1416 unsigned j;
1417
1418 p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
1419
1420 if (key->element[i].instance_divisor) {
1421 p->use_instancing = TRUE;
1422 }
1423
1424 /*
1425 * Map vertex element to vertex buffer varient.
1426 */
1427 for (j = 0; j < p->nr_buffer_varients; j++) {
1428 if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
1429 p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
1430 break;
1431 }
1432 }
1433 if (j == p->nr_buffer_varients) {
1434 p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
1435 p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
1436 p->nr_buffer_varients++;
1437 }
1438 p->element_to_buffer_varient[i] = j;
1439 } else {
1440 assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
1441
1442 p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
1443 }
1444 }
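/* For example, a key where elements 0 and 2 read buffer 0 with divisor 0 and
 * element 1 reads buffer 1 yields nr_buffer_varients == 2 and
 * element_to_buffer_varient == { 0, 1, 0 }.
 */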
1445
1446 if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
1447
1448 if (!build_vertex_emit(p, &p->linear_func, 0))
1449 goto fail;
1450
1451 if (!build_vertex_emit(p, &p->elt_func, 4))
1452 goto fail;
1453
1454 if (!build_vertex_emit(p, &p->elt16_func, 2))
1455 goto fail;
1456
1457 if (!build_vertex_emit(p, &p->elt8_func, 1))
1458 goto fail;
1459
1460 p->translate.run = (void*)x86_get_func(&p->linear_func);
1461 if (p->translate.run == NULL)
1462 goto fail;
1463
1464 p->translate.run_elts = (void*)x86_get_func(&p->elt_func);
1465 if (p->translate.run_elts == NULL)
1466 goto fail;
1467
1468 p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func);
1469 if (p->translate.run_elts16 == NULL)
1470 goto fail;
1471
1472 p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func);
1473 if (p->translate.run_elts8 == NULL)
1474 goto fail;
1475
1476 return &p->translate;
1477
1478 fail:
1479 if (p)
1480 translate_sse_release( &p->translate );
1481
1482 return NULL;
1483 }
1484
1485
1486
1487 #else
1488
1489 struct translate *translate_sse2_create( const struct translate_key *key )
1490 {
1491 return NULL;
1492 }
1493
1494 #endif