/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */


#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"

#include "translate.h"


#if (defined(PIPE_ARCH_X86) || (defined(PIPE_ARCH_X86_64) && !defined(__MINGW32__))) && !defined(PIPE_SUBSYSTEM_EMBEDDED)

#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"


#define X    0
#define Y    1
#define Z    2
#define W    3
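
/* Descriptive note: these are the lane indices used with the SHUF()
 * selector macro (from the rtasm headers), so SHUF(X, Y, Z, W) is the
 * identity shuffle and SHUF(W, Z, Y, X) reverses the four 32-bit lanes.
 */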


struct translate_buffer {
   const void *base_ptr;
   uintptr_t stride;
   unsigned max_index;
};

struct translate_buffer_variant {
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                    /* updated either per vertex or per instance */
};


#define ELEMENT_BUFFER_INSTANCE_ID  1001

#define NUM_CONSTS 7

enum
{
   CONST_IDENTITY,
   CONST_INV_127,
   CONST_INV_255,
   CONST_INV_32767,
   CONST_INV_65535,
   CONST_INV_2147483647,
   CONST_255
};

#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
static float consts[NUM_CONSTS][4] = {
   {0, 0, 0, 1},
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
   C(255.0)
};
#undef C
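
/* Each constant is splatted across all four lanes by C(), except
 * CONST_IDENTITY, which is (0, 0, 0, 1) so it can both zero-pad a
 * partially loaded register and supply the conventional w = 1.  The
 * CONST_INV_* entries are the scale factors used when converting
 * normalized integers to floats.
 */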

struct translate_sse {
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;

   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
   int8_t reg_to_const[16];
   int8_t const_to_reg[NUM_CONSTS];

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[PIPE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};

static int get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}
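
/* Return an XMM register holding the given constant, loading it from
 * the machine struct if it is not already cached.  XMM0 and XMM1 are
 * reserved as scratch by the emit_* helpers, so constants live in
 * XMM2-XMM7; when all six slots are taken the last one is evicted.
 */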

static struct x86_reg get_const( struct translate_sse *p, unsigned id)
{
   struct x86_reg reg;
   unsigned i;

   if(p->const_to_reg[id] >= 0)
      return x86_make_reg(file_XMM, p->const_to_reg[id]);

   for(i = 2; i < 8; ++i)
   {
      if(p->reg_to_const[i] < 0)
         break;
   }

   /* TODO: be smarter here */
   if(i == 8)
      --i;

   reg = x86_make_reg(file_XMM, i);

   if(p->reg_to_const[i] >= 0)
      p->const_to_reg[p->reg_to_const[i]] = -1;

   p->reg_to_const[i] = id;
   p->const_to_reg[id] = i;

   /* TODO: this should happen outside the loop, if possible */
   sse_movaps(p->func, reg,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->consts[id][0])));

   return reg;
}

/* load the data into an SSE2 register, padding with zeros */
static boolean emit_load_sse2( struct translate_sse *p,
                               struct x86_reg data,
                               struct x86_reg src,
                               unsigned size)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   struct x86_reg tmp = p->tmp_EAX;
   switch(size)
   {
   case 1:
      x86_movzx8(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 2:
      x86_movzx16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 3:
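      /* Assemble the 3-byte value in the GPR: put the high byte into
       * bits 16..23, then overlay the low 16 bits; movzx already
       * cleared the upper bits, so the XMM register ends up
       * zero-padded.
       */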
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
      x86_shl_imm(p->func, tmp, 16);
      x86_mov16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 4:
      sse2_movd(p->func, data, src);
      break;
   case 6:
      sse2_movd(p->func, data, src);
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
      sse2_movd(p->func, tmpXMM, tmp);
      sse2_punpckldq(p->func, data, tmpXMM);
      break;
   case 8:
      sse2_movq(p->func, data, src);
      break;
   case 12:
      sse2_movq(p->func, data, src);
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
      sse2_punpcklqdq(p->func, data, tmpXMM);
      break;
   case 16:
      sse2_movdqu(p->func, data, src);
      break;
   default:
      return FALSE;
   }
   return TRUE;
}

/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5

/* This function will load #chans float values, and will
 * pad the register with zeroes at least up to out_chans.
 *
 * If out_chans is set to CHANNELS_0001, then the fourth
 * value will be padded with 1.  Only pass this value if
 * chans < 4; otherwise the results are undefined.
 */
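/* For example, with chans == 2 and out_chans == CHANNELS_0001, a
 * source of (a, b) ends up as (a, b, 0, 1) in the XMM register.
 */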
static void emit_load_float32( struct translate_sse *p,
                               struct x86_reg data,
                               struct x86_reg arg0,
                               unsigned out_chans,
                               unsigned chans)
{
   switch(chans)
   {
   case 1:
      /* a 0 0 0
       * a 0 0 1
       */
      sse_movss(p->func, data, arg0);
      if(out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
      break;
   case 2:
      /* 0 0 0 1
       * a b 0 1
       */
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
      else if(out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       *
       * c 0 0 0
       * c 0 0 1 if out_chans == CHANNELS_0001
       * 0 0 c 0/1
       * a b c 0/1
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
      sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      sse_movups(p->func, data, arg0);
      break;
   }
}

/* This function behaves like emit_load_float32, but loads
 * 64-bit floating point numbers, converting them to 32-bit
 * ones.
 */
static void emit_load_float64to32( struct translate_sse *p,
                                   struct x86_reg data,
                                   struct x86_reg arg0,
                                   unsigned out_chans,
                                   unsigned chans)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   switch(chans)
   {
   case 1:
      sse2_movsd(p->func, data, arg0);
      if(out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);
      else
         sse2_cvtsd2ss(p->func, data, data);
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
      else if(out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
      break;
   case 3:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if(out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      if(out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
      break;
   case 4:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}

static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   if(x86_target(p->func) != X86_32)
      x64_mov64(p->func, dst_gpr, src_gpr);
   else
   {
      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
      if(x86_target_caps(p->func) & X86_SSE2)
         sse2_movq(p->func, dst_xmm, src_xmm);
      else
         sse_movlps(p->func, dst_xmm, src_xmm);
   }
}

static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
{
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
}

static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
}

static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
{
   if(x86_target_caps(p->func) & X86_SSE2)
      sse2_movdqu(p->func, dst, src);
   else
      sse_movups(p->func, dst, src);
}

/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 * but may or may not be good on older processors
 * TODO: may perhaps want to use non-temporal stores here if possible
 */
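/* Strategy, by size: sizes below 8 bytes go through one or two GPR
 * moves; without SSE, anything larger is copied as a 4-byte GPR loop
 * (the size must then be a multiple of 4); with SSE available,
 * 8/12/16/24/32-byte attributes use 64- and 128-bit XMM moves.
 */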
static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
   struct x86_reg dataGPR = p->tmp_EAX;
   struct x86_reg dataGPR2 = p->tmp2_EDX;

   if(size < 8)
   {
      switch (size)
      {
      case 1:
         x86_mov8(p->func, dataGPR, src);
         x86_mov8(p->func, dst, dataGPR);
         break;
      case 2:
         x86_mov16(p->func, dataGPR, src);
         x86_mov16(p->func, dst, dataGPR);
         break;
      case 3:
         x86_mov16(p->func, dataGPR, src);
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
         x86_mov16(p->func, dst, dataGPR);
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
         break;
      case 4:
         x86_mov(p->func, dataGPR, src);
         x86_mov(p->func, dst, dataGPR);
         break;
      case 6:
         x86_mov(p->func, dataGPR, src);
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
         x86_mov(p->func, dst, dataGPR);
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
         break;
      }
   }
   else if(!(x86_target_caps(p->func) & X86_SSE))
   {
      unsigned i = 0;
      assert((size & 3) == 0);
      for(i = 0; i < size; i += 4)
      {
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
      }
   }
   else
   {
      switch(size)
      {
      case 8:
         emit_load64(p, dataGPR, dataXMM, src);
         emit_store64(p, dst, dataGPR, dataXMM);
         break;
      case 12:
         emit_load64(p, dataGPR2, dataXMM, src);
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
         emit_store64(p, dst, dataGPR2, dataXMM);
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
         break;
      case 16:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dst, dataXMM);
         break;
      case 24:
         emit_mov128(p, dataXMM, src);
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
         break;
      case 32:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
         break;
      default:
         assert(0);
      }
   }
}

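/* Convert a single attribute from input_format to output_format while
 * copying it.  The general path requires all input channels to share
 * one channel description (likewise for output channels); it then
 * picks one of three code paths: SSE float output, SSE2 8-bit to
 * 16-bit widening, or a channel-by-channel GPR copy with swizzling,
 * plus a dedicated SSE2 path for float4 -> unorm8 RGBA/BGRA packing.
 */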
static boolean translate_attr_convert( struct translate_sse *p,
                                       const struct translate_element *a,
                                       struct x86_reg src,
                                       struct x86_reg dst)
{
   const struct util_format_description* input_desc = util_format_description(a->input_format);
   const struct util_format_description* output_desc = util_format_description(a->output_format);
   unsigned i;
   boolean id_swizzle = TRUE;
   unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
   unsigned needed_chans = 0;
   unsigned imms[2] = {0, 0x3f800000};

   if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
      return FALSE;

   if(input_desc->channel[0].size & 7)
      return FALSE;

   if(input_desc->colorspace != output_desc->colorspace)
      return FALSE;

   for(i = 1; i < input_desc->nr_channels; ++i)
   {
      if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
         return FALSE;
   }

   for(i = 1; i < output_desc->nr_channels; ++i)
   {
      if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
         return FALSE;
   }

   for(i = 0; i < output_desc->nr_channels; ++i)
   {
      if(output_desc->swizzle[i] < 4)
         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
   }

   if((x86_target_caps(p->func) & X86_SSE) && (0
      || a->output_format == PIPE_FORMAT_R32_FLOAT
      || a->output_format == PIPE_FORMAT_R32G32_FLOAT
      || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
      || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
   {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if(needed_chans > 0)
      {
         switch(input_desc->channel[0].type)
         {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if(!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovzx */
            switch(input_desc->channel[0].size)
            {
            case 8:
               /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 32: /* we lose precision here */
               sse2_psrld_imm(p->func, dataXMM, 1);
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if(input_desc->channel[0].normalized)
            {
               struct x86_reg factor;
               switch(input_desc->channel[0].size)
               {
               case 8:
                  factor = get_const(p, CONST_INV_255);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_65535);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            else if(input_desc->channel[0].size == 32)
               sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if(!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovsx */
            switch(input_desc->channel[0].size)
            {
            case 8:
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 24);
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 16);
               break;
            case 32: /* we lose precision here */
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if(input_desc->channel[0].normalized)
            {
               struct x86_reg factor;
               switch(input_desc->channel[0].size)
               {
               case 8:
                  factor = get_const(p, CONST_INV_127);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_32767);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            break;
         case UTIL_FORMAT_TYPE_FLOAT:
            if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
               return FALSE;
            if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
            {
               swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
               needed_chans = CHANNELS_0001;
            }
            switch(input_desc->channel[0].size)
            {
            case 32:
               emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
               break;
            case 64: /* we lose precision here */
               if(!(x86_target_caps(p->func) & X86_SSE2))
                  return FALSE;
               emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
               break;
            default:
               return FALSE;
            }
            break;
         default:
            return FALSE;
         }

         if(!id_swizzle)
            sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
      }

      if(output_desc->nr_channels >= 4
         && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
         && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
         && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
         && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
         )
         sse_movups(p->func, dst, dataXMM);
      else
      {
         if(output_desc->nr_channels >= 2
            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
            sse_movlps(p->func, dst, dataXMM);
         else
         {
            if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
               sse_movss(p->func, dst, dataXMM);
            else
               x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);

            if(output_desc->nr_channels >= 2)
            {
               if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
               {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else
                  x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
            }
         }

         if(output_desc->nr_channels >= 3)
         {
            if(output_desc->nr_channels >= 4
               && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
               && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
            else
            {
               if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
               {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
               }
               else
                  x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);

               if(output_desc->nr_channels >= 4)
               {
                  if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
                  {
                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
                  }
                  else
                     x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
               }
            }
         }
      }
      return TRUE;
   }
   else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
           && output_desc->channel[0].normalized == input_desc->channel[0].normalized
           && (0
               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
               ))
   {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
      struct x86_reg tmp = p->tmp_EAX;
      unsigned imms[2] = {0, 1};

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if(needed_chans > 0)
      {
         emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);

         switch(input_desc->channel[0].type)
         {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if(input_desc->channel[0].normalized)
            {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
                  sse2_psrlw_imm(p->func, dataXMM, 1);
            }
            else
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if(input_desc->channel[0].normalized)
            {
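               /* Widen snorm8 -> snorm16 by bit replication: the
                * signed byte b lands in the high byte (b << 8 via
                * punpcklbw with zero), then its low 7 bits are folded
                * back in as ((b & 0x7f) << 1) | ((b & 0x7f) >> 6), so
                * e.g. 127 maps exactly to 0x7fff.
                */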
               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
               sse2_psllw_imm(p->func, dataXMM, 9);
               sse2_psrlw_imm(p->func, dataXMM, 8);
               sse2_por(p->func, tmpXMM, dataXMM);
               sse2_psrlw_imm(p->func, dataXMM, 7);
               sse2_por(p->func, tmpXMM, dataXMM);
               {
                  struct x86_reg t = dataXMM;
                  dataXMM = tmpXMM;
                  tmpXMM = t;
               }
            }
            else
            {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psraw_imm(p->func, dataXMM, 8);
            }
            break;
         default:
            assert(0);
         }

         if(output_desc->channel[0].normalized)
            imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff;

         if(!id_swizzle)
            sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
      }

      if(output_desc->nr_channels >= 4
         && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
         && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
         && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
         && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
         )
         sse2_movq(p->func, dst, dataXMM);
      else
      {
         if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
         {
            if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
               sse2_movd(p->func, dst, dataXMM);
            else
            {
               sse2_movd(p->func, tmp, dataXMM);
               x86_mov16(p->func, dst, tmp);
               if(output_desc->nr_channels >= 2)
                  x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
            }
         }
         else
         {
            if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
               x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
            else
            {
               x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
               if(output_desc->nr_channels >= 2)
               {
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_shr_imm(p->func, tmp, 16);
                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
               }
            }
         }

         if(output_desc->nr_channels >= 3)
         {
            if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
            {
               if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
               {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else
               {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
                  if(output_desc->nr_channels >= 4)
                  {
                     x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
                  }
               }
            }
            else
            {
               if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
                  x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
               else
               {
                  x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);

                  if(output_desc->nr_channels >= 4)
                  {
                     sse2_psrlq_imm(p->func, dataXMM, 48);
                     sse2_movd(p->func, tmp, dataXMM);
                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
                  }
               }
            }
         }
      }
      return TRUE;
   }
   else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
   {
      struct x86_reg tmp = p->tmp_EAX;
      unsigned i;
      if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
         && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
         && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
         && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
         && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
      {
         /* TODO: support movbe */
         x86_mov(p->func, tmp, src);
         x86_bswap(p->func, tmp);
         x86_mov(p->func, dst, tmp);
         return TRUE;
      }

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         switch(output_desc->channel[0].size)
         {
         case 8:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned v = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[0].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[0].normalized ? 0xff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[0].normalized ? 0x7f : 1;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
            }
            else
            {
               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
            }
            break;
         case 16:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned v = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[1].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3c00;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
            }
            else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
            else
            {
               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
            }
            break;
         case 32:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned v = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[1].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3f800000;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
            }
            else
            {
               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
            }
            break;
         case 64:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned l = 0;
               unsigned h = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[1].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     h = 0x3ff00000;
                     l = 0;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
            }
            else
            {
               if(x86_target_caps(p->func) & X86_SSE)
               {
                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
                  emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
               }
               else
               {
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
               }
            }
            break;
         default:
            return FALSE;
         }
      }
      return TRUE;
   }
   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
   else if((x86_target_caps(p->func) & X86_SSE2) &&
           a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0
              || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
              || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM
              ))
   {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      /* load */
      sse_movups(p->func, dataXMM, src);

      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM)
         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3));

      /* scale by 255.0 */
      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));

      /* pack and emit */
      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
      sse2_packssdw(p->func, dataXMM, dataXMM);
      sse2_packuswb(p->func, dataXMM, dataXMM);
      sse2_movd(p->func, dst, dataXMM);

      return TRUE;
   }

   return FALSE;
}

static boolean translate_attr( struct translate_sse *p,
                               const struct translate_element *a,
                               struct x86_reg src,
                               struct x86_reg dst)
{
   if(a->input_format == a->output_format)
   {
      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
      return TRUE;
   }

   return translate_attr_convert(p, a, src, dst);
}

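/* Precompute the source pointer for every buffer variant that is not
 * advanced per element: all variants in the linear (non-indexed) case,
 * and instanced variants always.  The index is scaled by the buffer
 * stride and added to the base pointer; non-instanced indices are
 * first clamped to the buffer's max_index.
 */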
static boolean init_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   unsigned i;
   struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
                                              get_offset(p, &p->instance_id));
   struct x86_reg start_instance = x86_make_disp(p->machine_EDI,
                                                 get_offset(p, &p->start_instance));

   for (i = 0; i < p->nr_buffer_variants; i++) {
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];

      if (!index_size || variant->instance_divisor) {
         struct x86_reg buf_max_index = x86_make_disp(p->machine_EDI,
                                                      get_offset(p, &buffer->max_index));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                   get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            /* Start with instance = instance_id
             * which is true if divisor is 1.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->tmp2_EDX;
               struct x86_reg tmp_ECX = p->src_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */

               /* instance = (instance_id - start_instance) / divisor +
                *            start_instance
                */
               x86_mov(p->func, tmp_EDX, start_instance);
               x86_add(p->func, tmp_EAX, tmp_EDX);
            }

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         } else {
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         x86_imul(p->func, tmp_EAX, buf_stride);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1)
         {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else
         {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return TRUE;
}

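/* Return an x86 operand through which the current element's source data
 * can be addressed: the instance-ID pseudo buffer yields a memory
 * operand on the machine struct, the single-buffer linear case reuses
 * idx_ESI directly (it already holds a pointer), and the indexed case
 * loads the element index, clamps it to max_index and converts it to
 * base_ptr + index * stride in ECX.
 */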
static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      unsigned index_size,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDI,
                           get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant = &p->buffer_variant[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].base_ptr));

      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].max_index));

      /* Calculate pointer to current attrib:
       */
      switch(index_size)
      {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      x86_imul(p->func, ptr, buf_stride);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}

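/* Advance the per-vertex inputs after each emitted vertex: bump the
 * pointer kept in idx_ESI by one stride in the single-buffer linear
 * case, bump each stored variant pointer in the multi-buffer linear
 * case, and simply step to the next index in the indexed case.
 * Instanced variants are left alone; they only change per instance.
 */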
static boolean incr_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   if (!index_size && p->nr_buffer_variants == 1) {
      struct x86_reg stride = x86_make_disp(p->machine_EDI,
                                            get_offset(p, &p->buffer[0].stride));

      if (p->buffer_variant[0].instance_divisor == 0) {
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                   get_offset(p, &p->buffer[variant->buffer_index].stride));

         if (variant->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x64_rexw(p->func);
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return TRUE;
}

/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            unsigned *elts,
 *            unsigned count,
 *            void *output_buffer )
 *
 * Lots of hardcoding
 *
 * EBX -- pointer to current output vertex
 * ECX -- pointer to current source attribute
 */
static boolean build_vertex_emit( struct translate_sse *p,
                                  struct x86_function *func,
                                  unsigned index_size )
{
   int fixup, label;
   unsigned j;

   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if(x86_target(p->func) == X86_64_WIN64_ABI)
   {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if(x86_target(p->func) != X86_64_STD_ABI)
   {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      if(x86_target(p->func) != X86_32)
      {
         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
      else
      {
         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
   }

   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   if(x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func,
              p->tmp2_EDX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)),
              p->tmp2_EDX);

      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address:
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func,
              p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, index_size );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */

   if(x86_target(p->func) != X86_64_STD_ABI)
   {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if(x86_target(p->func) == X86_64_WIN64_ABI)
   {
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return TRUE;
}


static void translate_sse_set_buffer( struct translate *translate,
                                      unsigned buf,
                                      const void *ptr,
                                      unsigned stride,
                                      unsigned max_index )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *)ptr;
      p->buffer[buf].stride = stride;
      p->buffer[buf].max_index = max_index;
   }

   if (0) debug_printf("%s %d/%d: %p %d\n",
                       __FUNCTION__, buf,
                       p->nr_buffers,
                       ptr, stride);
}


static void translate_sse_release( struct translate *translate )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   x86_release_func( &p->elt8_func );
   x86_release_func( &p->elt16_func );
   x86_release_func( &p->elt_func );
   x86_release_func( &p->linear_func );

   os_free_aligned(p);
}

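/* Build a translate instance for the given key: one code variant is
 * generated for each run mode (linear, 32-, 16- and 8-bit element
 * indices), and vertex elements are grouped into buffer variants keyed
 * by (input_buffer, instance_divisor) so that a source pointer is
 * computed only once per variant per vertex.
 */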
struct translate *translate_sse2_create( const struct translate_key *key )
{
   struct translate_sse *p = NULL;
   unsigned i;

   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
   if (!rtasm_cpu_has_sse())
      goto fail;

   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   if (p == NULL)
      goto fail;
   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer variant.
          */
         for (j = 0; j < p->nr_buffer_variants; j++) {
            if (p->buffer_variant[j].buffer_index == key->element[i].input_buffer &&
                p->buffer_variant[j].instance_divisor == key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_variants) {
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
            p->buffer_variant[j].instance_divisor = key->element[i].instance_divisor;
            p->nr_buffer_variants++;
         }
         p->element_to_buffer_variant[i] = j;
      } else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (run_func) x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release( &p->translate );

   return NULL;
}


#else

struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}

#endif