translate: check for PIPE_SUBSYSTEM_EMBEDDED
[mesa.git] / src / gallium / auxiliary / translate / translate_sse.c
1 /*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 * Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_math.h"
33 #include "util/u_format.h"
34
35 #include "translate.h"
36
37
38 #if (defined(PIPE_ARCH_X86) || (defined(PIPE_ARCH_X86_64) && !defined(__MINGW32__))) && !defined(PIPE_SUBSYSTEM_EMBEDDED)
39
40 #include "rtasm/rtasm_cpu.h"
41 #include "rtasm/rtasm_x86sse.h"
42
43
44 #define X 0
45 #define Y 1
46 #define Z 2
47 #define W 3
48
49
/* One source vertex buffer, as consumed by init_inputs(). */
struct translate_buffer {
   const void *base_ptr;   /* added to stride * index to form the attribute pointer */
   uintptr_t stride;       /* multiplied with the (clamped) index */
   unsigned max_index;     /* linear indices are clamped to this value */
};
55
/* A (buffer, instance divisor) pairing plus the pointer computed for it. */
struct translate_buffer_variant {
   unsigned buffer_index;       /* index into translate_sse::buffer[] */
   unsigned instance_divisor;   /* non-zero: index is instance_id / divisor */
   void *ptr;                   /* updated either per vertex or per instance */
};
61
62
63 #define ELEMENT_BUFFER_INSTANCE_ID 1001
64
65 #define NUM_CONSTS 7
66
/* Row indices into the consts[][] tables; there must be NUM_CONSTS of them. */
enum
{
   CONST_IDENTITY       = 0,
   CONST_INV_127        = 1,
   CONST_INV_255        = 2,
   CONST_INV_32767      = 3,
   CONST_INV_65535      = 4,
   CONST_INV_2147483647 = 5,
   CONST_255            = 6
};
77
78 #define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
79 static float consts[NUM_CONSTS][4] = {
80 {0, 0, 0, 1},
81 C(1.0 / 127.0),
82 C(1.0 / 255.0),
83 C(1.0 / 32767.0),
84 C(1.0 / 65535.0),
85 C(1.0 / 2147483647.0),
86 C(255.0)
87 };
88 #undef C
89
90 struct translate_sse {
91 struct translate translate;
92
93 struct x86_function linear_func;
94 struct x86_function elt_func;
95 struct x86_function elt16_func;
96 struct x86_function elt8_func;
97 struct x86_function *func;
98
99 PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
100 int8_t reg_to_const[16];
101 int8_t const_to_reg[NUM_CONSTS];
102
103 struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
104 unsigned nr_buffers;
105
106 /* Multiple buffer variants can map to a single buffer. */
107 struct translate_buffer_variant buffer_variant[PIPE_MAX_ATTRIBS];
108 unsigned nr_buffer_variants;
109
110 /* Multiple elements can map to a single buffer variant. */
111 unsigned element_to_buffer_variant[PIPE_MAX_ATTRIBS];
112
113 boolean use_instancing;
114 unsigned instance_id;
115
116 /* these are actually known values, but putting them in a struct
117 * like this is helpful to keep them in sync across the file.
118 */
119 struct x86_reg tmp_EAX;
120 struct x86_reg tmp2_EDX;
121 struct x86_reg src_ECX;
122 struct x86_reg idx_ESI; /* either start+i or &elt[i] */
123 struct x86_reg machine_EDI;
124 struct x86_reg outbuf_EBX;
125 struct x86_reg count_EBP; /* decrements to zero */
126 };
127
/* Byte distance from a to b (i.e. b - a), returned as an int. */
static int get_offset( const void *a, const void *b )
{
   const char *from = (const char *)a;
   const char *to = (const char *)b;

   return (int)(to - from);
}
132
133 static struct x86_reg get_const( struct translate_sse *p, unsigned id)
134 {
135 struct x86_reg reg;
136 unsigned i;
137
138 if(p->const_to_reg[id] >= 0)
139 return x86_make_reg(file_XMM, p->const_to_reg[id]);
140
141 for(i = 2; i < 8; ++i)
142 {
143 if(p->reg_to_const[i] < 0)
144 break;
145 }
146
147 /* TODO: be smarter here */
148 if(i == 8)
149 --i;
150
151 reg = x86_make_reg(file_XMM, i);
152
153 if(p->reg_to_const[i] >= 0)
154 p->const_to_reg[p->reg_to_const[i]] = -1;
155
156 p->reg_to_const[i] = id;
157 p->const_to_reg[id] = i;
158
159 /* TODO: this should happen outside the loop, if possible */
160 sse_movaps(p->func, reg,
161 x86_make_disp(p->machine_EDI,
162 get_offset(p, &p->consts[id][0])));
163
164 return reg;
165 }
166
167 /* load the data in a SSE2 register, padding with zeros */
168 static boolean emit_load_sse2( struct translate_sse *p,
169 struct x86_reg data,
170 struct x86_reg src,
171 unsigned size)
172 {
173 struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
174 struct x86_reg tmp = p->tmp_EAX;
175 switch(size)
176 {
177 case 1:
178 x86_movzx8(p->func, tmp, src);
179 sse2_movd(p->func, data, tmp);
180 break;
181 case 2:
182 x86_movzx16(p->func, tmp, src);
183 sse2_movd(p->func, data, tmp);
184 break;
185 case 3:
186 x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
187 x86_shl_imm(p->func, tmp, 16);
188 x86_mov16(p->func, tmp, src);
189 sse2_movd(p->func, data, tmp);
190 break;
191 case 4:
192 sse2_movd(p->func, data, src);
193 break;
194 case 6:
195 sse2_movd(p->func, data, src);
196 x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
197 sse2_movd(p->func, tmpXMM, tmp);
198 sse2_punpckldq(p->func, data, tmpXMM);
199 break;
200 case 8:
201 sse2_movq(p->func, data, src);
202 break;
203 case 12:
204 sse2_movq(p->func, data, src);
205 sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
206 sse2_punpcklqdq(p->func, data, tmpXMM);
207 break;
208 case 16:
209 sse2_movdqu(p->func, data, src);
210 break;
211 default:
212 return FALSE;
213 }
214 return TRUE;
215 }
216
217 /* this value can be passed for the out_chans argument */
218 #define CHANNELS_0001 5
219
220 /* this function will load #chans float values, and will
221 * pad the register with zeroes at least up to out_chans.
222 *
223 * If out_chans is set to CHANNELS_0001, then the fourth
224 * value will be padded with 1. Only pass this value if
225 * chans < 4 or results are undefined.
226 */
227 static void emit_load_float32( struct translate_sse *p,
228 struct x86_reg data,
229 struct x86_reg arg0,
230 unsigned out_chans,
231 unsigned chans)
232 {
233 switch(chans)
234 {
235 case 1:
236 /* a 0 0 0
237 * a 0 0 1
238 */
239 sse_movss(p->func, data, arg0);
240 if(out_chans == CHANNELS_0001)
241 sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
242 break;
243 case 2:
244 /* 0 0 0 1
245 * a b 0 1
246 */
247 if(out_chans == CHANNELS_0001)
248 sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
249 else if(out_chans > 2)
250 sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
251 sse_movlps(p->func, data, arg0);
252 break;
253 case 3:
254 /* Have to jump through some hoops:
255 *
256 * c 0 0 0
257 * c 0 0 1 if out_chans == CHANNELS_0001
258 * 0 0 c 0/1
259 * a b c 0/1
260 */
261 sse_movss(p->func, data, x86_make_disp(arg0, 8));
262 if(out_chans == CHANNELS_0001)
263 sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
264 sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
265 sse_movlps(p->func, data, arg0);
266 break;
267 case 4:
268 sse_movups(p->func, data, arg0);
269 break;
270 }
271 }
272
273 /* this function behaves like emit_load_float32, but loads
274 64-bit floating point numbers, converting them to 32-bit
275 ones */
276 static void emit_load_float64to32( struct translate_sse *p,
277 struct x86_reg data,
278 struct x86_reg arg0,
279 unsigned out_chans,
280 unsigned chans)
281 {
282 struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
283 switch(chans)
284 {
285 case 1:
286 sse2_movsd(p->func, data, arg0);
287 if(out_chans > 1)
288 sse2_cvtpd2ps(p->func, data, data);
289 else
290 sse2_cvtsd2ss(p->func, data, data);
291 if(out_chans == CHANNELS_0001)
292 sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
293 break;
294 case 2:
295 sse2_movupd(p->func, data, arg0);
296 sse2_cvtpd2ps(p->func, data, data);
297 if(out_chans == CHANNELS_0001)
298 sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
299 else if(out_chans > 2)
300 sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
301 break;
302 case 3:
303 sse2_movupd(p->func, data, arg0);
304 sse2_cvtpd2ps(p->func, data, data);
305 sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
306 if(out_chans > 3)
307 sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
308 else
309 sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
310 sse_movlhps(p->func, data, tmpXMM);
311 if(out_chans == CHANNELS_0001)
312 sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
313 break;
314 case 4:
315 sse2_movupd(p->func, data, arg0);
316 sse2_cvtpd2ps(p->func, data, data);
317 sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
318 sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
319 sse_movlhps(p->func, data, tmpXMM);
320 break;
321 }
322 }
323
324 static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm)
325 {
326 if(x86_target(p->func) != X86_32)
327 x64_mov64(p->func, dst_gpr, src_gpr);
328 else
329 {
330 /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
331 if(x86_target_caps(p->func) & X86_SSE2)
332 sse2_movq(p->func, dst_xmm, src_xmm);
333 else
334 sse_movlps(p->func, dst_xmm, src_xmm);
335 }
336 }
337
338 static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
339 {
340 emit_mov64(p, dst_gpr, dst_xmm, src, src);
341 }
342
343 static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
344 {
345 emit_mov64(p, dst, dst, src_gpr, src_xmm);
346 }
347
348 static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
349 {
350 if(x86_target_caps(p->func) & X86_SSE2)
351 sse2_movdqu(p->func, dst, src);
352 else
353 sse_movups(p->func, dst, src);
354 }
355
356 /* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
357 * but may or may not be good on older processors
358 * TODO: may perhaps want to use non-temporal stores here if possible
359 */
360 static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
361 {
362 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
363 struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
364 struct x86_reg dataGPR = p->tmp_EAX;
365 struct x86_reg dataGPR2 = p->tmp2_EDX;
366
367 if(size < 8)
368 {
369 switch (size)
370 {
371 case 1:
372 x86_mov8(p->func, dataGPR, src);
373 x86_mov8(p->func, dst, dataGPR);
374 break;
375 case 2:
376 x86_mov16(p->func, dataGPR, src);
377 x86_mov16(p->func, dst, dataGPR);
378 break;
379 case 3:
380 x86_mov16(p->func, dataGPR, src);
381 x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
382 x86_mov16(p->func, dst, dataGPR);
383 x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
384 break;
385 case 4:
386 x86_mov(p->func, dataGPR, src);
387 x86_mov(p->func, dst, dataGPR);
388 break;
389 case 6:
390 x86_mov(p->func, dataGPR, src);
391 x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
392 x86_mov(p->func, dst, dataGPR);
393 x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
394 break;
395 }
396 }
397 else if(!(x86_target_caps(p->func) & X86_SSE))
398 {
399 unsigned i = 0;
400 assert((size & 3) == 0);
401 for(i = 0; i < size; i += 4)
402 {
403 x86_mov(p->func, dataGPR, x86_make_disp(src, i));
404 x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
405 }
406 }
407 else
408 {
409 switch(size)
410 {
411 case 8:
412 emit_load64(p, dataGPR, dataXMM, src);
413 emit_store64(p, dst, dataGPR, dataXMM);
414 break;
415 case 12:
416 emit_load64(p, dataGPR2, dataXMM, src);
417 x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
418 emit_store64(p, dst, dataGPR2, dataXMM);
419 x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
420 break;
421 case 16:
422 emit_mov128(p, dataXMM, src);
423 emit_mov128(p, dst, dataXMM);
424 break;
425 case 24:
426 emit_mov128(p, dataXMM, src);
427 emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
428 emit_mov128(p, dst, dataXMM);
429 emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
430 break;
431 case 32:
432 emit_mov128(p, dataXMM, src);
433 emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
434 emit_mov128(p, dst, dataXMM);
435 emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
436 break;
437 default:
438 assert(0);
439 }
440 }
441 }
442
443 static boolean translate_attr_convert( struct translate_sse *p,
444 const struct translate_element *a,
445 struct x86_reg src,
446 struct x86_reg dst)
447
448 {
449 const struct util_format_description* input_desc = util_format_description(a->input_format);
450 const struct util_format_description* output_desc = util_format_description(a->output_format);
451 unsigned i;
452 boolean id_swizzle = TRUE;
453 unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
454 unsigned needed_chans = 0;
455 unsigned imms[2] = {0, 0x3f800000};
456
457 if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
458 return FALSE;
459
460 if(input_desc->channel[0].size & 7)
461 return FALSE;
462
463 if(input_desc->colorspace != output_desc->colorspace)
464 return FALSE;
465
466 for(i = 1; i < input_desc->nr_channels; ++i)
467 {
468 if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
469 return FALSE;
470 }
471
472 for(i = 1; i < output_desc->nr_channels; ++i)
473 {
474 if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
475 return FALSE;
476 }
477
478 for(i = 0; i < output_desc->nr_channels; ++i)
479 {
480 if(output_desc->swizzle[i] < 4)
481 swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
482 }
483
484 if((x86_target_caps(p->func) & X86_SSE) && (0
485 || a->output_format == PIPE_FORMAT_R32_FLOAT
486 || a->output_format == PIPE_FORMAT_R32G32_FLOAT
487 || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
488 || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
489 {
490 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
491
492 for(i = 0; i < output_desc->nr_channels; ++i)
493 {
494 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
495 swizzle[i] = i;
496 }
497
498 for(i = 0; i < output_desc->nr_channels; ++i)
499 {
500 if(swizzle[i] < 4)
501 needed_chans = MAX2(needed_chans, swizzle[i] + 1);
502 if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
503 id_swizzle = FALSE;
504 }
505
506 if(needed_chans > 0)
507 {
508 switch(input_desc->channel[0].type)
509 {
510 case UTIL_FORMAT_TYPE_UNSIGNED:
511 if(!(x86_target_caps(p->func) & X86_SSE2))
512 return FALSE;
513 emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
514
515 /* TODO: add support for SSE4.1 pmovzx */
516 switch(input_desc->channel[0].size)
517 {
518 case 8:
519 /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
520 sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
521 sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
522 break;
523 case 16:
524 sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
525 break;
526 case 32: /* we lose precision here */
527 sse2_psrld_imm(p->func, dataXMM, 1);
528 break;
529 default:
530 return FALSE;
531 }
532 sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
533 if(input_desc->channel[0].normalized)
534 {
535 struct x86_reg factor;
536 switch(input_desc->channel[0].size)
537 {
538 case 8:
539 factor = get_const(p, CONST_INV_255);
540 break;
541 case 16:
542 factor = get_const(p, CONST_INV_65535);
543 break;
544 case 32:
545 factor = get_const(p, CONST_INV_2147483647);
546 break;
547 default:
548 assert(0);
549 factor.disp = 0;
550 factor.file = 0;
551 factor.idx = 0;
552 factor.mod = 0;
553 break;
554 }
555 sse_mulps(p->func, dataXMM, factor);
556 }
557 else if(input_desc->channel[0].size == 32)
558 sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
559 break;
560 case UTIL_FORMAT_TYPE_SIGNED:
561 if(!(x86_target_caps(p->func) & X86_SSE2))
562 return FALSE;
563 emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
564
565 /* TODO: add support for SSE4.1 pmovsx */
566 switch(input_desc->channel[0].size)
567 {
568 case 8:
569 sse2_punpcklbw(p->func, dataXMM, dataXMM);
570 sse2_punpcklbw(p->func, dataXMM, dataXMM);
571 sse2_psrad_imm(p->func, dataXMM, 24);
572 break;
573 case 16:
574 sse2_punpcklwd(p->func, dataXMM, dataXMM);
575 sse2_psrad_imm(p->func, dataXMM, 16);
576 break;
577 case 32: /* we lose precision here */
578 break;
579 default:
580 return FALSE;
581 }
582 sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
583 if(input_desc->channel[0].normalized)
584 {
585 struct x86_reg factor;
586 switch(input_desc->channel[0].size)
587 {
588 case 8:
589 factor = get_const(p, CONST_INV_127);
590 break;
591 case 16:
592 factor = get_const(p, CONST_INV_32767);
593 break;
594 case 32:
595 factor = get_const(p, CONST_INV_2147483647);
596 break;
597 default:
598 assert(0);
599 factor.disp = 0;
600 factor.file = 0;
601 factor.idx = 0;
602 factor.mod = 0;
603 break;
604 }
605 sse_mulps(p->func, dataXMM, factor);
606 }
607 break;
608
609 break;
610 case UTIL_FORMAT_TYPE_FLOAT:
611 if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
612 return FALSE;
613 if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
614 {
615 swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
616 needed_chans = CHANNELS_0001;
617 }
618 switch(input_desc->channel[0].size)
619 {
620 case 32:
621 emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
622 break;
623 case 64: /* we lose precision here */
624 if(!(x86_target_caps(p->func) & X86_SSE2))
625 return FALSE;
626 emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
627 break;
628 default:
629 return FALSE;
630 }
631 break;
632 default:
633 return FALSE;
634 }
635
636 if(!id_swizzle)
637 sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
638 }
639
640 if(output_desc->nr_channels >= 4
641 && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
642 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
643 && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
644 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
645 )
646 sse_movups(p->func, dst, dataXMM);
647 else
648 {
649 if(output_desc->nr_channels >= 2
650 && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
651 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
652 sse_movlps(p->func, dst, dataXMM);
653 else
654 {
655 if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
656 sse_movss(p->func, dst, dataXMM);
657 else
658 x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
659
660 if(output_desc->nr_channels >= 2)
661 {
662 if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
663 {
664 sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
665 sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
666 }
667 else
668 x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
669 }
670 }
671
672 if(output_desc->nr_channels >= 3)
673 {
674 if(output_desc->nr_channels >= 4
675 && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
676 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
677 sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
678 else
679 {
680 if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
681 {
682 sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
683 sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
684 }
685 else
686 x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
687
688 if(output_desc->nr_channels >= 4)
689 {
690 if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
691 {
692 sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
693 sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
694 }
695 else
696 x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
697 }
698 }
699 }
700 }
701 return TRUE;
702 }
703 else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
704 && output_desc->channel[0].normalized == input_desc->channel[0].normalized
705 && (0
706 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
707 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
708 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
709 ))
710 {
711 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
712 struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
713 struct x86_reg tmp = p->tmp_EAX;
714 unsigned imms[2] = {0, 1};
715
716 for(i = 0; i < output_desc->nr_channels; ++i)
717 {
718 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
719 swizzle[i] = i;
720 }
721
722 for(i = 0; i < output_desc->nr_channels; ++i)
723 {
724 if(swizzle[i] < 4)
725 needed_chans = MAX2(needed_chans, swizzle[i] + 1);
726 if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
727 id_swizzle = FALSE;
728 }
729
730 if(needed_chans > 0)
731 {
732 emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
733
734 switch(input_desc->channel[0].type)
735 {
736 case UTIL_FORMAT_TYPE_UNSIGNED:
737 if(input_desc->channel[0].normalized)
738 {
739 sse2_punpcklbw(p->func, dataXMM, dataXMM);
740 if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
741 sse2_psrlw_imm(p->func, dataXMM, 1);
742 }
743 else
744 sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
745 break;
746 case UTIL_FORMAT_TYPE_SIGNED:
747 if(input_desc->channel[0].normalized)
748 {
749 sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
750 sse2_punpcklbw(p->func, tmpXMM, dataXMM);
751 sse2_psllw_imm(p->func, dataXMM, 9);
752 sse2_psrlw_imm(p->func, dataXMM, 8);
753 sse2_por(p->func, tmpXMM, dataXMM);
754 sse2_psrlw_imm(p->func, dataXMM, 7);
755 sse2_por(p->func, tmpXMM, dataXMM);
756 {
757 struct x86_reg t = dataXMM;
758 dataXMM = tmpXMM;
759 tmpXMM = t;
760 }
761 }
762 else
763 {
764 sse2_punpcklbw(p->func, dataXMM, dataXMM);
765 sse2_psraw_imm(p->func, dataXMM, 8);
766 }
767 break;
768 default:
769 assert(0);
770 }
771
772 if(output_desc->channel[0].normalized)
773 imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
774
775 if(!id_swizzle)
776 sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
777 }
778
779 if(output_desc->nr_channels >= 4
780 && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
781 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
782 && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
783 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
784 )
785 sse2_movq(p->func, dst, dataXMM);
786 else
787 {
788 if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
789 {
790 if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
791 sse2_movd(p->func, dst, dataXMM);
792 else
793 {
794 sse2_movd(p->func, tmp, dataXMM);
795 x86_mov16(p->func, dst, tmp);
796 if(output_desc->nr_channels >= 2)
797 x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
798 }
799 }
800 else
801 {
802 if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
803 x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
804 else
805 {
806 x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
807 if(output_desc->nr_channels >= 2)
808 {
809 sse2_movd(p->func, tmp, dataXMM);
810 x86_shr_imm(p->func, tmp, 16);
811 x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
812 }
813 }
814 }
815
816 if(output_desc->nr_channels >= 3)
817 {
818 if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
819 {
820 if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
821 {
822 sse2_psrlq_imm(p->func, dataXMM, 32);
823 sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
824 }
825 else
826 {
827 sse2_psrlq_imm(p->func, dataXMM, 32);
828 sse2_movd(p->func, tmp, dataXMM);
829 x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
830 if(output_desc->nr_channels >= 4)
831 {
832 x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
833 }
834 }
835 }
836 else
837 {
838 if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
839 x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
840 else
841 {
842 x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
843
844 if(output_desc->nr_channels >= 4)
845 {
846 sse2_psrlq_imm(p->func, dataXMM, 48);
847 sse2_movd(p->func, tmp, dataXMM);
848 x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
849 }
850 }
851 }
852 }
853 }
854 return TRUE;
855 }
856 else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
857 {
858 struct x86_reg tmp = p->tmp_EAX;
859 unsigned i;
860 if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
861 && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
862 && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
863 && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
864 && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
865 {
866 /* TODO: support movbe */
867 x86_mov(p->func, tmp, src);
868 x86_bswap(p->func, tmp);
869 x86_mov(p->func, dst, tmp);
870 return TRUE;
871 }
872
873 for(i = 0; i < output_desc->nr_channels; ++i)
874 {
875 switch(output_desc->channel[0].size)
876 {
877 case 8:
878 if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
879 {
880 unsigned v = 0;
881 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
882 {
883 switch(output_desc->channel[0].type)
884 {
885 case UTIL_FORMAT_TYPE_UNSIGNED:
886 v = output_desc->channel[0].normalized ? 0xff : 1;
887 break;
888 case UTIL_FORMAT_TYPE_SIGNED:
889 v = output_desc->channel[0].normalized ? 0x7f : 1;
890 break;
891 default:
892 return FALSE;
893 }
894 }
895 x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
896 }
897 else
898 {
899 x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
900 x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
901 }
902 break;
903 case 16:
904 if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
905 {
906 unsigned v = 0;
907 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
908 {
909 switch(output_desc->channel[1].type)
910 {
911 case UTIL_FORMAT_TYPE_UNSIGNED:
912 v = output_desc->channel[1].normalized ? 0xffff : 1;
913 break;
914 case UTIL_FORMAT_TYPE_SIGNED:
915 v = output_desc->channel[1].normalized ? 0x7fff : 1;
916 break;
917 case UTIL_FORMAT_TYPE_FLOAT:
918 v = 0x3c00;
919 break;
920 default:
921 return FALSE;
922 }
923 }
924 x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
925 }
926 else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
927 x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
928 else
929 {
930 x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
931 x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
932 }
933 break;
934 case 32:
935 if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
936 {
937 unsigned v = 0;
938 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
939 {
940 switch(output_desc->channel[1].type)
941 {
942 case UTIL_FORMAT_TYPE_UNSIGNED:
943 v = output_desc->channel[1].normalized ? 0xffffffff : 1;
944 break;
945 case UTIL_FORMAT_TYPE_SIGNED:
946 v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
947 break;
948 case UTIL_FORMAT_TYPE_FLOAT:
949 v = 0x3f800000;
950 break;
951 default:
952 return FALSE;
953 }
954 }
955 x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
956 }
957 else
958 {
959 x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
960 x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
961 }
962 break;
963 case 64:
964 if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
965 {
966 unsigned l = 0;
967 unsigned h = 0;
968 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
969 {
970 switch(output_desc->channel[1].type)
971 {
972 case UTIL_FORMAT_TYPE_UNSIGNED:
973 h = output_desc->channel[1].normalized ? 0xffffffff : 0;
974 l = output_desc->channel[1].normalized ? 0xffffffff : 1;
975 break;
976 case UTIL_FORMAT_TYPE_SIGNED:
977 h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
978 l = output_desc->channel[1].normalized ? 0xffffffff : 1;
979 break;
980 case UTIL_FORMAT_TYPE_FLOAT:
981 h = 0x3ff00000;
982 l = 0;
983 break;
984 default:
985 return FALSE;
986 }
987 }
988 x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
989 x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
990 }
991 else
992 {
993 if(x86_target_caps(p->func) & X86_SSE)
994 {
995 struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
996 emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
997 emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
998 }
999 else
1000 {
1001 x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
1002 x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
1003 x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
1004 x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
1005 }
1006 }
1007 break;
1008 default:
1009 return FALSE;
1010 }
1011 }
1012 return TRUE;
1013 }
1014 /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
1015 else if((x86_target_caps(p->func) & X86_SSE2) &&
1016 a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0
1017 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
1018 || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM
1019 ))
1020 {
1021 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
1022
1023 /* load */
1024 sse_movups(p->func, dataXMM, src);
1025
1026 if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM)
1027 sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3));
1028
1029 /* scale by 255.0 */
1030 sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
1031
1032 /* pack and emit */
1033 sse2_cvtps2dq(p->func, dataXMM, dataXMM);
1034 sse2_packssdw(p->func, dataXMM, dataXMM);
1035 sse2_packuswb(p->func, dataXMM, dataXMM);
1036 sse2_movd(p->func, dst, dataXMM);
1037
1038 return TRUE;
1039 }
1040
1041 return FALSE;
1042 }
1043
1044 static boolean translate_attr( struct translate_sse *p,
1045 const struct translate_element *a,
1046 struct x86_reg src,
1047 struct x86_reg dst)
1048 {
1049 if(a->input_format == a->output_format)
1050 {
1051 emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
1052 return TRUE;
1053 }
1054
1055 return translate_attr_convert(p, a, src, dst);
1056 }
1057
1058 static boolean init_inputs( struct translate_sse *p,
1059 unsigned index_size )
1060 {
1061 unsigned i;
1062 struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
1063 get_offset(p, &p->instance_id));
1064
1065 for (i = 0; i < p->nr_buffer_variants; i++) {
1066 struct translate_buffer_variant *variant = &p->buffer_variant[i];
1067 struct translate_buffer *buffer = &p->buffer[variant->buffer_index];
1068
1069 if (!index_size || variant->instance_divisor) {
1070 struct x86_reg buf_max_index = x86_make_disp(p->machine_EDI,
1071 get_offset(p, &buffer->max_index));
1072 struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
1073 get_offset(p, &buffer->stride));
1074 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
1075 get_offset(p, &variant->ptr));
1076 struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
1077 get_offset(p, &buffer->base_ptr));
1078 struct x86_reg elt = p->idx_ESI;
1079 struct x86_reg tmp_EAX = p->tmp_EAX;
1080
1081 /* Calculate pointer to first attrib:
1082 * base_ptr + stride * index, where index depends on instance divisor
1083 */
1084 if (variant->instance_divisor) {
1085 /* Our index is instance ID divided by instance divisor.
1086 */
1087 x86_mov(p->func, tmp_EAX, instance_id);
1088
1089 if (variant->instance_divisor != 1) {
1090 struct x86_reg tmp_EDX = p->tmp2_EDX;
1091 struct x86_reg tmp_ECX = p->src_ECX;
1092
1093 /* TODO: Add x86_shr() to rtasm and use it whenever
1094 * instance divisor is power of two.
1095 */
1096
1097 x86_xor(p->func, tmp_EDX, tmp_EDX);
1098 x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
1099 x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */
1100 }
1101
1102 /* XXX we need to clamp the index here too, but to a
1103 * per-array max value, not the draw->pt.max_index value
1104 * that's being given to us via translate->set_buffer().
1105 */
1106 } else {
1107 x86_mov(p->func, tmp_EAX, elt);
1108
1109 /* Clamp to max_index
1110 */
1111 x86_cmp(p->func, tmp_EAX, buf_max_index);
1112 x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
1113 }
1114
1115 x86_imul(p->func, tmp_EAX, buf_stride);
1116 x64_rexw(p->func);
1117 x86_add(p->func, tmp_EAX, buf_base_ptr);
1118
1119 x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1120
1121 /* In the linear case, keep the buffer pointer instead of the
1122 * index number.
1123 */
1124 if (!index_size && p->nr_buffer_variants == 1)
1125 {
1126 x64_rexw(p->func);
1127 x86_mov(p->func, elt, tmp_EAX);
1128 }
1129 else
1130 {
1131 x64_rexw(p->func);
1132 x86_mov(p->func, buf_ptr, tmp_EAX);
1133 }
1134 }
1135 }
1136
1137 return TRUE;
1138 }
1139
1140
1141 static struct x86_reg get_buffer_ptr( struct translate_sse *p,
1142 unsigned index_size,
1143 unsigned var_idx,
1144 struct x86_reg elt )
1145 {
1146 if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
1147 return x86_make_disp(p->machine_EDI,
1148 get_offset(p, &p->instance_id));
1149 }
1150 if (!index_size && p->nr_buffer_variants == 1) {
1151 return p->idx_ESI;
1152 }
1153 else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
1154 struct x86_reg ptr = p->src_ECX;
1155 struct x86_reg buf_ptr =
1156 x86_make_disp(p->machine_EDI,
1157 get_offset(p, &p->buffer_variant[var_idx].ptr));
1158
1159 x64_rexw(p->func);
1160 x86_mov(p->func, ptr, buf_ptr);
1161 return ptr;
1162 }
1163 else {
1164 struct x86_reg ptr = p->src_ECX;
1165 const struct translate_buffer_variant *variant = &p->buffer_variant[var_idx];
1166
1167 struct x86_reg buf_stride =
1168 x86_make_disp(p->machine_EDI,
1169 get_offset(p, &p->buffer[variant->buffer_index].stride));
1170
1171 struct x86_reg buf_base_ptr =
1172 x86_make_disp(p->machine_EDI,
1173 get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
1174
1175 struct x86_reg buf_max_index =
1176 x86_make_disp(p->machine_EDI,
1177 get_offset(p, &p->buffer[variant->buffer_index].max_index));
1178
1179
1180
1181 /* Calculate pointer to current attrib:
1182 */
1183 switch(index_size)
1184 {
1185 case 1:
1186 x86_movzx8(p->func, ptr, elt);
1187 break;
1188 case 2:
1189 x86_movzx16(p->func, ptr, elt);
1190 break;
1191 case 4:
1192 x86_mov(p->func, ptr, elt);
1193 break;
1194 }
1195
1196 /* Clamp to max_index
1197 */
1198 x86_cmp(p->func, ptr, buf_max_index);
1199 x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);
1200
1201 x86_imul(p->func, ptr, buf_stride);
1202 x64_rexw(p->func);
1203 x86_add(p->func, ptr, buf_base_ptr);
1204 return ptr;
1205 }
1206 }
1207
1208
1209
1210 static boolean incr_inputs( struct translate_sse *p,
1211 unsigned index_size )
1212 {
1213 if (!index_size && p->nr_buffer_variants == 1) {
1214 struct x86_reg stride = x86_make_disp(p->machine_EDI,
1215 get_offset(p, &p->buffer[0].stride));
1216
1217 if (p->buffer_variant[0].instance_divisor == 0) {
1218 x64_rexw(p->func);
1219 x86_add(p->func, p->idx_ESI, stride);
1220 sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
1221 }
1222 }
1223 else if (!index_size) {
1224 unsigned i;
1225
1226 /* Is this worthwhile??
1227 */
1228 for (i = 0; i < p->nr_buffer_variants; i++) {
1229 struct translate_buffer_variant *variant = &p->buffer_variant[i];
1230 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
1231 get_offset(p, &variant->ptr));
1232 struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
1233 get_offset(p, &p->buffer[variant->buffer_index].stride));
1234
1235 if (variant->instance_divisor == 0) {
1236 x86_mov(p->func, p->tmp_EAX, buf_stride);
1237 x64_rexw(p->func);
1238 x86_add(p->func, p->tmp_EAX, buf_ptr);
1239 if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
1240 x64_rexw(p->func);
1241 x86_mov(p->func, buf_ptr, p->tmp_EAX);
1242 }
1243 }
1244 }
1245 else {
1246 x64_rexw(p->func);
1247 x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
1248 }
1249
1250 return TRUE;
1251 }
1252
1253
1254 /* Build run( struct translate *machine,
1255 * unsigned start,
1256 * unsigned count,
1257 * void *output_buffer )
1258 * or
1259 * run_elts( struct translate *machine,
1260 * unsigned *elts,
1261 * unsigned count,
1262 * void *output_buffer )
1263 *
1264 * Lots of hardcoding
1265 *
1266 * EAX -- pointer to current output vertex
1267 * ECX -- pointer to current attribute
1268 *
1269 */
1270 static boolean build_vertex_emit( struct translate_sse *p,
1271 struct x86_function *func,
1272 unsigned index_size )
1273 {
1274 int fixup, label;
1275 unsigned j;
1276
1277 memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
1278 memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
1279
1280 p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
1281 p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
1282 p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
1283 p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
1284 p->count_EBP = x86_make_reg(file_REG32, reg_BP);
1285 p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
1286 p->src_ECX = x86_make_reg(file_REG32, reg_CX);
1287
1288 p->func = func;
1289
1290 x86_init_func(p->func);
1291
1292 if(x86_target(p->func) == X86_64_WIN64_ABI)
1293 {
1294 /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
1295 sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
1296 sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
1297 }
1298
1299 x86_push(p->func, p->outbuf_EBX);
1300 x86_push(p->func, p->count_EBP);
1301
1302 /* on non-Win64 x86-64, these are already in the right registers */
1303 if(x86_target(p->func) != X86_64_STD_ABI)
1304 {
1305 x86_push(p->func, p->machine_EDI);
1306 x86_push(p->func, p->idx_ESI);
1307
1308 x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1309 x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1310 }
1311
1312 x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
1313
1314 if(x86_target(p->func) != X86_32)
1315 x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
1316 else
1317 x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
1318
1319 /* Load instance ID.
1320 */
1321 if (p->use_instancing) {
1322 x86_mov(p->func,
1323 p->tmp_EAX,
1324 x86_fn_arg(p->func, 4));
1325 x86_mov(p->func,
1326 x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
1327 p->tmp_EAX);
1328 }
1329
1330 /* Get vertex count, compare to zero
1331 */
1332 x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
1333 x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1334 fixup = x86_jcc_forward(p->func, cc_E);
1335
1336 /* always load, needed or not:
1337 */
1338 init_inputs(p, index_size);
1339
1340 /* Note address for loop jump
1341 */
1342 label = x86_get_label(p->func);
1343 {
1344 struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
1345 int last_variant = -1;
1346 struct x86_reg vb;
1347
1348 for (j = 0; j < p->translate.key.nr_elements; j++) {
1349 const struct translate_element *a = &p->translate.key.element[j];
1350 unsigned variant = p->element_to_buffer_variant[j];
1351
1352 /* Figure out source pointer address:
1353 */
1354 if (variant != last_variant) {
1355 last_variant = variant;
1356 vb = get_buffer_ptr(p, index_size, variant, elt);
1357 }
1358
1359 if (!translate_attr( p, a,
1360 x86_make_disp(vb, a->input_offset),
1361 x86_make_disp(p->outbuf_EBX, a->output_offset)))
1362 return FALSE;
1363 }
1364
1365 /* Next output vertex:
1366 */
1367 x64_rexw(p->func);
1368 x86_lea(p->func,
1369 p->outbuf_EBX,
1370 x86_make_disp(p->outbuf_EBX,
1371 p->translate.key.output_stride));
1372
1373 /* Incr index
1374 */
1375 incr_inputs( p, index_size );
1376 }
1377
1378 /* decr count, loop if not zero
1379 */
1380 x86_dec(p->func, p->count_EBP);
1381 x86_jcc(p->func, cc_NZ, label);
1382
1383 /* Exit mmx state?
1384 */
1385 if (p->func->need_emms)
1386 mmx_emms(p->func);
1387
1388 /* Land forward jump here:
1389 */
1390 x86_fixup_fwd_jump(p->func, fixup);
1391
1392 /* Pop regs and return
1393 */
1394
1395 if(x86_target(p->func) != X86_64_STD_ABI)
1396 {
1397 x86_pop(p->func, p->idx_ESI);
1398 x86_pop(p->func, p->machine_EDI);
1399 }
1400
1401 x86_pop(p->func, p->count_EBP);
1402 x86_pop(p->func, p->outbuf_EBX);
1403
1404 if(x86_target(p->func) == X86_64_WIN64_ABI)
1405 {
1406 sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
1407 sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
1408 }
1409 x86_ret(p->func);
1410
1411 return TRUE;
1412 }
1413
1414
1415
1416
1417
1418
1419
1420 static void translate_sse_set_buffer( struct translate *translate,
1421 unsigned buf,
1422 const void *ptr,
1423 unsigned stride,
1424 unsigned max_index )
1425 {
1426 struct translate_sse *p = (struct translate_sse *)translate;
1427
1428 if (buf < p->nr_buffers) {
1429 p->buffer[buf].base_ptr = (char *)ptr;
1430 p->buffer[buf].stride = stride;
1431 p->buffer[buf].max_index = max_index;
1432 }
1433
1434 if (0) debug_printf("%s %d/%d: %p %d\n",
1435 __FUNCTION__, buf,
1436 p->nr_buffers,
1437 ptr, stride);
1438 }
1439
1440
1441 static void translate_sse_release( struct translate *translate )
1442 {
1443 struct translate_sse *p = (struct translate_sse *)translate;
1444
1445 x86_release_func( &p->linear_func );
1446 x86_release_func( &p->elt_func );
1447
1448 os_free_aligned(p);
1449 }
1450
1451
1452 struct translate *translate_sse2_create( const struct translate_key *key )
1453 {
1454 struct translate_sse *p = NULL;
1455 unsigned i;
1456
1457 /* this is misnamed, it actually refers to whether rtasm is enabled or not */
1458 if (!rtasm_cpu_has_sse())
1459 goto fail;
1460
1461 p = os_malloc_aligned(sizeof(struct translate_sse), 16);
1462 if (p == NULL)
1463 goto fail;
1464 memset(p, 0, sizeof(*p));
1465 memcpy(p->consts, consts, sizeof(consts));
1466
1467 p->translate.key = *key;
1468 p->translate.release = translate_sse_release;
1469 p->translate.set_buffer = translate_sse_set_buffer;
1470
1471 for (i = 0; i < key->nr_elements; i++) {
1472 if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
1473 unsigned j;
1474
1475 p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
1476
1477 if (key->element[i].instance_divisor) {
1478 p->use_instancing = TRUE;
1479 }
1480
1481 /*
1482 * Map vertex element to vertex buffer variant.
1483 */
1484 for (j = 0; j < p->nr_buffer_variants; j++) {
1485 if (p->buffer_variant[j].buffer_index == key->element[i].input_buffer &&
1486 p->buffer_variant[j].instance_divisor == key->element[i].instance_divisor) {
1487 break;
1488 }
1489 }
1490 if (j == p->nr_buffer_variants) {
1491 p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
1492 p->buffer_variant[j].instance_divisor = key->element[i].instance_divisor;
1493 p->nr_buffer_variants++;
1494 }
1495 p->element_to_buffer_variant[i] = j;
1496 } else {
1497 assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
1498
1499 p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
1500 }
1501 }
1502
1503 if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
1504
1505 if (!build_vertex_emit(p, &p->linear_func, 0))
1506 goto fail;
1507
1508 if (!build_vertex_emit(p, &p->elt_func, 4))
1509 goto fail;
1510
1511 if (!build_vertex_emit(p, &p->elt16_func, 2))
1512 goto fail;
1513
1514 if (!build_vertex_emit(p, &p->elt8_func, 1))
1515 goto fail;
1516
1517 p->translate.run = (run_func) x86_get_func(&p->linear_func);
1518 if (p->translate.run == NULL)
1519 goto fail;
1520
1521 p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
1522 if (p->translate.run_elts == NULL)
1523 goto fail;
1524
1525 p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
1526 if (p->translate.run_elts16 == NULL)
1527 goto fail;
1528
1529 p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
1530 if (p->translate.run_elts8 == NULL)
1531 goto fail;
1532
1533 return &p->translate;
1534
1535 fail:
1536 if (p)
1537 translate_sse_release( &p->translate );
1538
1539 return NULL;
1540 }
1541
1542
1543
1544 #else
1545
/**
 * Stub used when the runtime-assembled SSE path is compiled out.
 *
 * \param key  ignored
 * \return always NULL
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   (void) key;

   return NULL;
}
1550
1551 #endif