/*
 * Copyright 2003 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@vmware.com>
 */


#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"

#include "translate.h"


#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(PIPE_SUBSYSTEM_EMBEDDED)

#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"


#define X 0
#define Y 1
#define Z 2
#define W 3
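
/* Lane names for use with the SHUF() helper from rtasm_x86sse.h, which
 * packs four 2-bit lane selectors into the 8-bit immediate used by
 * shufps/pshufd (roughly SHUF(x, y, z, w) == x | y << 2 | z << 4 | w << 6,
 * lowest lane first).
 */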


struct translate_buffer
{
   const void *base_ptr;
   uintptr_t stride;
   unsigned max_index;
};

struct translate_buffer_variant
{
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                   /* updated either per vertex or per instance */
};


#define ELEMENT_BUFFER_INSTANCE_ID 1001

#define NUM_CONSTS 7

enum
{
   CONST_IDENTITY,
   CONST_INV_127,
   CONST_INV_255,
   CONST_INV_32767,
   CONST_INV_65535,
   CONST_INV_2147483647,
   CONST_255
};

#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
static float consts[NUM_CONSTS][4] = {
   {0, 0, 0, 1},
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
   C(255.0)
};

#undef C
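
/* Note that consts[CONST_IDENTITY] is (0, 0, 0, 1): besides providing the
 * float identity pad for the w channel, its low three dwords are zero bits,
 * so the same cached register doubles as a zero source for the integer
 * punpck zero-extension paths below.
 */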

struct translate_sse
{
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;

   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
   int8_t reg_to_const[16];
   int8_t const_to_reg[NUM_CONSTS];

   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};


static int
get_offset(const void *a, const void *b)
{
   return (const char *) b - (const char *) a;
}


static struct x86_reg
get_const(struct translate_sse *p, unsigned id)
{
   struct x86_reg reg;
   unsigned i;

   if (p->const_to_reg[id] >= 0)
      return x86_make_reg(file_XMM, p->const_to_reg[id]);

   for (i = 2; i < 8; ++i) {
      if (p->reg_to_const[i] < 0)
         break;
   }

   /* TODO: be smarter here */
   if (i == 8)
      --i;

   reg = x86_make_reg(file_XMM, i);

   if (p->reg_to_const[i] >= 0)
      p->const_to_reg[p->reg_to_const[i]] = -1;

   p->reg_to_const[i] = id;
   p->const_to_reg[id] = i;

   /* TODO: this should happen outside the loop, if possible */
   sse_movaps(p->func, reg,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->consts[id][0])));

   return reg;
}
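
/* Constants are cached in xmm2-xmm7; xmm0 and xmm1 stay free because the
 * emit helpers below use them as scratch (dataXMM/tmpXMM).  When all six
 * slots are occupied, the code simply evicts whatever lives in xmm7.
 */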


/* load the data in an SSE2 register, padding with zeros */
static boolean
emit_load_sse2(struct translate_sse *p,
               struct x86_reg data, struct x86_reg src, unsigned size)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   struct x86_reg tmp = p->tmp_EAX;
   switch (size) {
   case 1:
      x86_movzx8(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 2:
      x86_movzx16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 3:
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
      x86_shl_imm(p->func, tmp, 16);
      x86_mov16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 4:
      sse2_movd(p->func, data, src);
      break;
   case 6:
      sse2_movd(p->func, data, src);
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
      sse2_movd(p->func, tmpXMM, tmp);
      sse2_punpckldq(p->func, data, tmpXMM);
      break;
   case 8:
      sse2_movq(p->func, data, src);
      break;
   case 12:
      sse2_movq(p->func, data, src);
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
      sse2_punpcklqdq(p->func, data, tmpXMM);
      break;
   case 16:
      sse2_movdqu(p->func, data, src);
      break;
   default:
      return FALSE;
   }
   return TRUE;
}
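
/* For the odd sizes the value is assembled in EAX first; e.g. for size 3
 * the movzx8/shl/mov16 sequence builds (b2 << 16) | (b1 << 8) | b0 in EAX
 * and a single movd transfers it, so the generated code never reads past
 * the end of a 3-byte attribute.
 */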


/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5


/* this function will load #chans float values, and will
 * pad the register with zeroes at least up to out_chans.
 *
 * If out_chans is set to CHANNELS_0001, then the fourth
 * value will be padded with 1.  Only pass this value if
 * chans < 4; otherwise results are undefined.
 */
static void
emit_load_float32(struct translate_sse *p, struct x86_reg data,
                  struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   switch (chans) {
   case 1:
      /* a 0 0 0
       * a 0 0 1
       */
      sse_movss(p->func, data, arg0);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 2:
      /* 0 0 0 1
       * a b 0 1
       */
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       *
       * c 0 0 0
       * c 0 0 1 if out_chans == CHANNELS_0001
       * 0 0 c 0/1
       * a b c 0/1
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      sse_movups(p->func, data, arg0);
      break;
   }
}
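
/* As an example, loading a two-channel float attribute into a four-channel
 * output (out_chans == CHANNELS_0001) costs just two instructions: shufps
 * merges the upper (0, 1) pad from CONST_IDENTITY, then movlps overwrites
 * the low two lanes with (a, b) from memory, yielding (a, b, 0, 1).
 */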

/* This function behaves like emit_load_float32, but loads 64-bit floating
 * point numbers, converting them to 32-bit ones.
 */
static void
emit_load_float64to32(struct translate_sse *p, struct x86_reg data,
                      struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   switch (chans) {
   case 1:
      sse2_movsd(p->func, data, arg0);
      if (out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);
      else
         sse2_cvtsd2ss(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 3:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if (out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 4:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}
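
/* cvtpd2ps converts a pair of doubles per instruction and zeroes the upper
 * two result lanes, which is why the 3- and 4-channel cases convert the two
 * halves separately and then recombine them with movlhps.
 */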


static void
emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,
           struct x86_reg dst_xmm, struct x86_reg src_gpr,
           struct x86_reg src_xmm)
{
   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, dst_gpr, src_gpr);
   else {
      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
      if (x86_target_caps(p->func) & X86_SSE2)
         sse2_movq(p->func, dst_xmm, src_xmm);
      else
         sse_movlps(p->func, dst_xmm, src_xmm);
   }
}


static void
emit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
            struct x86_reg dst_xmm, struct x86_reg src)
{
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
}


static void
emit_store64(struct translate_sse *p, struct x86_reg dst,
             struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
}


static void
emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
{
   if (x86_target_caps(p->func) & X86_SSE2)
      sse2_movdqu(p->func, dst, src);
   else
      sse_movups(p->func, dst, src);
}


/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 * but may or may not be good on older processors
 * TODO: may perhaps want to use non-temporal stores here if possible
 */
static void
emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,
            unsigned size)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
   struct x86_reg dataGPR = p->tmp_EAX;
   struct x86_reg dataGPR2 = p->tmp2_EDX;

   if (size < 8) {
      switch (size) {
      case 1:
         x86_mov8(p->func, dataGPR, src);
         x86_mov8(p->func, dst, dataGPR);
         break;
      case 2:
         x86_mov16(p->func, dataGPR, src);
         x86_mov16(p->func, dst, dataGPR);
         break;
      case 3:
         x86_mov16(p->func, dataGPR, src);
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
         x86_mov16(p->func, dst, dataGPR);
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
         break;
      case 4:
         x86_mov(p->func, dataGPR, src);
         x86_mov(p->func, dst, dataGPR);
         break;
      case 6:
         x86_mov(p->func, dataGPR, src);
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
         x86_mov(p->func, dst, dataGPR);
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
         break;
      }
   }
   else if (!(x86_target_caps(p->func) & X86_SSE)) {
      unsigned i = 0;
      assert((size & 3) == 0);
      for (i = 0; i < size; i += 4) {
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
      }
   }
   else {
      switch (size) {
      case 8:
         emit_load64(p, dataGPR, dataXMM, src);
         emit_store64(p, dst, dataGPR, dataXMM);
         break;
      case 12:
         emit_load64(p, dataGPR2, dataXMM, src);
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
         emit_store64(p, dst, dataGPR2, dataXMM);
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
         break;
      case 16:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dst, dataXMM);
         break;
      case 24:
         emit_mov128(p, dataXMM, src);
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
         break;
      case 32:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
         break;
      default:
         assert(0);
      }
   }
}
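
/* The sizes handled above are the strides that actually occur for the
 * supported vertex formats (1-, 2-, 4- and 8-byte channels, 1-4 channels
 * each); translate_attr() below passes util_format_get_stride() of such a
 * format, hence the assert in the default case.
 */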

static boolean
translate_attr_convert(struct translate_sse *p,
                       const struct translate_element *a,
                       struct x86_reg src, struct x86_reg dst)
{
   const struct util_format_description *input_desc =
      util_format_description(a->input_format);
   const struct util_format_description *output_desc =
      util_format_description(a->output_format);
   unsigned i;
   boolean id_swizzle = TRUE;
   unsigned swizzle[4] =
      { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE,
        PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE };
   unsigned needed_chans = 0;
   unsigned imms[2] = { 0, 0x3f800000 };

   if (a->output_format == PIPE_FORMAT_NONE
       || a->input_format == PIPE_FORMAT_NONE)
      return FALSE;

   if (input_desc->channel[0].size & 7)
      return FALSE;

   if (input_desc->colorspace != output_desc->colorspace)
      return FALSE;

   for (i = 1; i < input_desc->nr_channels; ++i) {
      if (memcmp(&input_desc->channel[i], &input_desc->channel[0],
                 sizeof(input_desc->channel[0])))
         return FALSE;
   }

   for (i = 1; i < output_desc->nr_channels; ++i) {
      if (memcmp(&output_desc->channel[i], &output_desc->channel[0],
                 sizeof(output_desc->channel[0]))) {
         return FALSE;
      }
   }

   for (i = 0; i < output_desc->nr_channels; ++i) {
      if (output_desc->swizzle[i] < 4)
         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
   }

   if ((x86_target_caps(p->func) & X86_SSE) &&
       (0 || a->output_format == PIPE_FORMAT_R32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0 && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if (needed_chans > 0) {
         switch (input_desc->channel[0].type) {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src,
                           input_desc->channel[0].size *
                           input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovzx */
            switch (input_desc->channel[0].size) {
            case 8:
               /* TODO: this may be inefficient due to the CONST_IDENTITY
                * register being used both as a float and integer register.
                */
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 32:           /* we lose precision here */
               sse2_psrld_imm(p->func, dataXMM, 1);
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
               case 8:
                  factor = get_const(p, CONST_INV_255);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_65535);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            else if (input_desc->channel[0].size == 32)
               /* compensate for the bit we threw away to fit u32 into s32 */
               sse_addps(p->func, dataXMM, dataXMM);
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src,
                           input_desc->channel[0].size *
                           input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovsx */
            switch (input_desc->channel[0].size) {
            case 8:
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 24);
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 16);
               break;
            case 32:           /* we lose precision here */
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
               case 8:
                  factor = get_const(p, CONST_INV_127);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_32767);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            break;
         case UTIL_FORMAT_TYPE_FLOAT:
            if (input_desc->channel[0].size != 32
                && input_desc->channel[0].size != 64) {
               return FALSE;
            }
            if (swizzle[3] == PIPE_SWIZZLE_1 && input_desc->nr_channels <= 3) {
               swizzle[3] = PIPE_SWIZZLE_W;
               needed_chans = CHANNELS_0001;
            }
            switch (input_desc->channel[0].size) {
            case 32:
               emit_load_float32(p, dataXMM, src, needed_chans,
                                 input_desc->nr_channels);
               break;
            case 64:           /* we lose precision here */
               if (!(x86_target_caps(p->func) & X86_SSE2))
                  return FALSE;
               emit_load_float64to32(p, dataXMM, src, needed_chans,
                                     input_desc->nr_channels);
               break;
            default:
               return FALSE;
            }
            break;
         default:
            return FALSE;
         }

         if (!id_swizzle) {
            sse_shufps(p->func, dataXMM, dataXMM,
                       SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));
         }
      }

      if (output_desc->nr_channels >= 4
          && swizzle[0] < PIPE_SWIZZLE_0
          && swizzle[1] < PIPE_SWIZZLE_0
          && swizzle[2] < PIPE_SWIZZLE_0
          && swizzle[3] < PIPE_SWIZZLE_0) {
         sse_movups(p->func, dst, dataXMM);
      }
      else {
         if (output_desc->nr_channels >= 2
             && swizzle[0] < PIPE_SWIZZLE_0
             && swizzle[1] < PIPE_SWIZZLE_0) {
            sse_movlps(p->func, dst, dataXMM);
         }
         else {
            if (swizzle[0] < PIPE_SWIZZLE_0) {
               sse_movss(p->func, dst, dataXMM);
            }
            else {
               x86_mov_imm(p->func, dst, imms[swizzle[0] - PIPE_SWIZZLE_0]);
            }

            if (output_desc->nr_channels >= 2) {
               if (swizzle[1] < PIPE_SWIZZLE_0) {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else {
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
                              imms[swizzle[1] - PIPE_SWIZZLE_0]);
               }
            }
         }

         if (output_desc->nr_channels >= 3) {
            if (output_desc->nr_channels >= 4
                && swizzle[2] < PIPE_SWIZZLE_0
                && swizzle[3] < PIPE_SWIZZLE_0) {
               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
            }
            else {
               if (swizzle[2] < PIPE_SWIZZLE_0) {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
               }
               else {
                  x86_mov_imm(p->func, x86_make_disp(dst, 8),
                              imms[swizzle[2] - PIPE_SWIZZLE_0]);
               }

               if (output_desc->nr_channels >= 4) {
                  if (swizzle[3] < PIPE_SWIZZLE_0) {
                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
                  }
                  else {
                     x86_mov_imm(p->func, x86_make_disp(dst, 12),
                                 imms[swizzle[3] - PIPE_SWIZZLE_0]);
                  }
               }
            }
         }
      }
      return TRUE;
   }
   else if ((x86_target_caps(p->func) & X86_SSE2)
            && input_desc->channel[0].size == 8
            && output_desc->channel[0].size == 16
            && output_desc->channel[0].normalized ==
               input_desc->channel[0].normalized &&
            (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
                   && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
      struct x86_reg tmp = p->tmp_EAX;
      unsigned imms[2] = { 0, 1 };

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0 && i >= input_desc->nr_channels) {
            swizzle[i] = i;
         }
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if (needed_chans > 0) {
         emit_load_sse2(p, dataXMM, src,
                        input_desc->channel[0].size *
                        input_desc->nr_channels >> 3);

         switch (input_desc->channel[0].type) {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if (input_desc->channel[0].normalized) {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
                  sse2_psrlw_imm(p->func, dataXMM, 1);
            }
            else
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (input_desc->channel[0].normalized) {
               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
               sse2_psllw_imm(p->func, dataXMM, 9);
               sse2_psrlw_imm(p->func, dataXMM, 8);
               sse2_por(p->func, tmpXMM, dataXMM);
               sse2_psrlw_imm(p->func, dataXMM, 7);
               sse2_por(p->func, tmpXMM, dataXMM);
               {
                  struct x86_reg t = dataXMM;
                  dataXMM = tmpXMM;
                  tmpXMM = t;
               }
            }
            else {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psraw_imm(p->func, dataXMM, 8);
            }
            break;
         default:
            assert(0);
         }

         if (output_desc->channel[0].normalized)
            imms[1] = (output_desc->channel[0].type ==
                       UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff;

         if (!id_swizzle)
            sse2_pshuflw(p->func, dataXMM, dataXMM,
                         (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |
                         ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
      }

      if (output_desc->nr_channels >= 4
          && swizzle[0] < PIPE_SWIZZLE_0
          && swizzle[1] < PIPE_SWIZZLE_0
          && swizzle[2] < PIPE_SWIZZLE_0
          && swizzle[3] < PIPE_SWIZZLE_0) {
         sse2_movq(p->func, dst, dataXMM);
      }
      else {
         if (swizzle[0] < PIPE_SWIZZLE_0) {
            if (output_desc->nr_channels >= 2
                && swizzle[1] < PIPE_SWIZZLE_0) {
               sse2_movd(p->func, dst, dataXMM);
            }
            else {
               sse2_movd(p->func, tmp, dataXMM);
               x86_mov16(p->func, dst, tmp);
               if (output_desc->nr_channels >= 2)
                  x86_mov16_imm(p->func, x86_make_disp(dst, 2),
                                imms[swizzle[1] - PIPE_SWIZZLE_0]);
            }
         }
         else {
            if (output_desc->nr_channels >= 2
                && swizzle[1] >= PIPE_SWIZZLE_0) {
               x86_mov_imm(p->func, dst,
                           (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) |
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
            }
            else {
               x86_mov16_imm(p->func, dst, imms[swizzle[0] - PIPE_SWIZZLE_0]);
               if (output_desc->nr_channels >= 2) {
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_shr_imm(p->func, tmp, 16);
                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
               }
            }
         }

         if (output_desc->nr_channels >= 3) {
            if (swizzle[2] < PIPE_SWIZZLE_0) {
               if (output_desc->nr_channels >= 4
                   && swizzle[3] < PIPE_SWIZZLE_0) {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
                  if (output_desc->nr_channels >= 4) {
                     x86_mov16_imm(p->func, x86_make_disp(dst, 6),
                                   imms[swizzle[3] - PIPE_SWIZZLE_0]);
                  }
               }
            }
            else {
               if (output_desc->nr_channels >= 4
                   && swizzle[3] >= PIPE_SWIZZLE_0) {
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
                              (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16)
                              | imms[swizzle[2] - PIPE_SWIZZLE_0]);
               }
               else {
                  x86_mov16_imm(p->func, x86_make_disp(dst, 4),
                                imms[swizzle[2] - PIPE_SWIZZLE_0]);

                  if (output_desc->nr_channels >= 4) {
                     sse2_psrlq_imm(p->func, dataXMM, 48);
                     sse2_movd(p->func, tmp, dataXMM);
                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
                  }
               }
            }
         }
      }
      return TRUE;
   }
   else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],
                    sizeof(output_desc->channel[0]))) {
      struct x86_reg tmp = p->tmp_EAX;
      unsigned i;

      if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4
          && output_desc->nr_channels == 4
          && swizzle[0] == PIPE_SWIZZLE_W
          && swizzle[1] == PIPE_SWIZZLE_Z
          && swizzle[2] == PIPE_SWIZZLE_Y
          && swizzle[3] == PIPE_SWIZZLE_X) {
         /* TODO: support movbe */
         x86_mov(p->func, tmp, src);
         x86_bswap(p->func, tmp);
         x86_mov(p->func, dst, tmp);
         return TRUE;
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         switch (output_desc->channel[0].size) {
         case 8:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[0].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[0].normalized ? 0xff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[0].normalized ? 0x7f : 1;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
            }
            else {
               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
            }
            break;
         case 16:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3c00;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
            }
            else if (swizzle[i] == PIPE_SWIZZLE_0) {
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
            }
            else {
               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
            }
            break;
         case 32:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3f800000;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
            }
            else {
               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
            }
            break;
         case 64:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned l = 0;
               unsigned h = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     h = 0x3ff00000;
                     l = 0;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
            }
            else {
               if (x86_target_caps(p->func) & X86_SSE) {
                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
                  emit_load64(p, tmp, tmpXMM,
                              x86_make_disp(src, swizzle[i] * 8));
                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
               }
               else {
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
                  x86_mov(p->func, tmp,
                          x86_make_disp(src, swizzle[i] * 8 + 4));
                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
               }
            }
            break;
         default:
            return FALSE;
         }
      }
      return TRUE;
   }
   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
   else if ((x86_target_caps(p->func) & X86_SSE2) &&
            a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&
            (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
             || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      /* load */
      sse_movups(p->func, dataXMM, src);

      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));
      }

      /* scale by 255.0 */
      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));

      /* pack and emit */
      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
      sse2_packssdw(p->func, dataXMM, dataXMM);
      sse2_packuswb(p->func, dataXMM, dataXMM);
      sse2_movd(p->func, dst, dataXMM);

      return TRUE;
   }

   return FALSE;
}
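
/* Summary of the conversion paths above: float32 outputs are produced in
 * an XMM register (integer inputs are widened, converted with cvtdq2ps and
 * scaled by 1/max when normalized); 8-bit channels widen to 16-bit ones
 * with punpcklbw plus shifts; bit-identical channel descriptions reduce to
 * per-channel moves, with a single bswap catching the common WZYX reversal
 * of four 8-bit channels; and R32G32B32A32_FLOAT packs straight to unorm8
 * RGBA/BGRA via cvtps2dq/packssdw/packuswb.
 */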


static boolean
translate_attr(struct translate_sse *p,
               const struct translate_element *a,
               struct x86_reg src, struct x86_reg dst)
{
   if (a->input_format == a->output_format) {
      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
      return TRUE;
   }

   return translate_attr_convert(p, a, src, dst);
}


static boolean
init_inputs(struct translate_sse *p, unsigned index_size)
{
   unsigned i;
   struct x86_reg instance_id =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   struct x86_reg start_instance =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));

   for (i = 0; i < p->nr_buffer_variants; i++) {
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];

      if (!index_size || variant->instance_divisor) {
         struct x86_reg buf_max_index =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            struct x86_reg tmp_EDX = p->tmp2_EDX;

            /* Start with instance = instance_id
             * which is true if divisor is 1.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_ECX = p->src_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                * instance divisor is power of two.
                */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX);       /* EAX = EDX:EAX / ECX */
            }

            /* instance = (instance_id / divisor) + start_instance
             */
            x86_mov(p->func, tmp_EDX, start_instance);
            x86_add(p->func, tmp_EAX, tmp_EDX);

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         }
         else {
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         x86_mov(p->func, p->tmp2_EDX, buf_stride);
         x64_rexw(p->func);
         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1) {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return TRUE;
}
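
/* For instanced arrays the generated code computes
 *    index = instance_id / instance_divisor + start_instance
 * with a real div (EAX = EDX:EAX / ECX); e.g. with divisor 4, instances
 * 0-3 of the current draw all fetch element start_instance of the buffer.
 */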


static struct x86_reg
get_buffer_ptr(struct translate_sse *p,
               unsigned index_size, unsigned var_idx, struct x86_reg elt)
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant =
         &p->buffer_variant[var_idx];
      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));
      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].max_index));

      /* Calculate pointer to current attrib:
       */
      switch (index_size) {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      x86_mov(p->func, p->tmp2_EDX, buf_stride);
      x64_rexw(p->func);
      x86_imul(p->func, ptr, p->tmp2_EDX);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}
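
/* In the indexed case this yields
 *    ptr = base_ptr + stride * MIN2(elt, max_index)
 * in ECX; the unsigned-above cmov makes out-of-range indices read the
 * element at max_index instead of faulting.
 */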


static boolean
incr_inputs(struct translate_sse *p, unsigned index_size)
{
   if (!index_size && p->nr_buffer_variants == 1) {
      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
      struct x86_reg stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[buffer_index].stride));

      if (p->buffer_variant[0].instance_divisor == 0) {
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI,
                          get_offset(p, &p->buffer[variant->buffer_index].stride));

         if (variant->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0)
               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x64_rexw(p->func);
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return TRUE;
}
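
/* The prefetchnta 192 bytes ahead of the walk pointer fetches roughly a
 * few vertices ahead for typical strides while minimizing cache pollution,
 * matching the streaming access pattern of a linear draw.
 */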


/* Build run(struct translate *machine,
 *           unsigned start,
 *           unsigned count,
 *           unsigned start_instance,
 *           unsigned instance_id,
 *           void *output_buffer)
 * or
 * run_elts(struct translate *machine,
 *          unsigned *elts,
 *          unsigned count,
 *          unsigned start_instance,
 *          unsigned instance_id,
 *          void *output_buffer)
 *
 * Lots of hardcoding
 *
 * EBX -- pointer to current output vertex
 * ECX -- pointer to current source attribute
 *
 */
static boolean
build_vertex_emit(struct translate_sse *p,
                  struct x86_function *func, unsigned index_size)
{
   int fixup, label;
   unsigned j;

   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
       * above the return address
       */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
                  x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func,
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
                  x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      if (x86_target(p->func) != X86_32) {
         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
      else {
         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
   }

   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->start_instance)), p->tmp2_EDX);

      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address:
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr(p, a,
                             x86_make_disp(vb, a->input_offset),
                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func, p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs(p, index_size);
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return TRUE;
}
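
/* The generated loop therefore looks roughly like (pseudo-assembly):
 *
 *    loop:  for each element: ECX = source ptr, convert into [EBX + offset]
 *           lea  EBX, [EBX + output_stride]
 *           add/lea ESI                ; advance vertex or index pointer
 *           dec  EBP
 *           jnz  loop
 */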


static void
translate_sse_set_buffer(struct translate *translate,
                         unsigned buf,
                         const void *ptr, unsigned stride, unsigned max_index)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *) ptr;
      p->buffer[buf].stride = stride;
      p->buffer[buf].max_index = max_index;
   }

   if (0)
      debug_printf("%s %d/%d: %p %d\n",
                   __FUNCTION__, buf, p->nr_buffers, ptr, stride);
}


static void
translate_sse_release(struct translate *translate)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   x86_release_func(&p->elt8_func);
   x86_release_func(&p->elt16_func);
   x86_release_func(&p->elt_func);
   x86_release_func(&p->linear_func);

   os_free_aligned(p);
}


struct translate *
translate_sse2_create(const struct translate_key *key)
{
   struct translate_sse *p = NULL;
   unsigned i;

   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
   if (!rtasm_cpu_has_sse())
      goto fail;

   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   if (!p)
      goto fail;

   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers =
            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer variant.
          */
         for (j = 0; j < p->nr_buffer_variants; j++) {
            if (p->buffer_variant[j].buffer_index ==
                key->element[i].input_buffer
                && p->buffer_variant[j].instance_divisor ==
                key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_variants) {
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
            p->buffer_variant[j].instance_divisor =
               key->element[i].instance_divisor;
            p->nr_buffer_variants++;
         }
         p->element_to_buffer_variant[i] = j;
      }
      else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0)
      debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (run_func) x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release(&p->translate);

   return NULL;
}
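
/* A minimal usage sketch (assuming a caller that has already filled in a
 * struct translate_key describing its vertex elements):
 *
 *    struct translate *t = translate_sse2_create(&key);
 *    if (t) {
 *       t->set_buffer(t, 0, vb_ptr, vb_stride, max_index);
 *       t->run(t, start, count, 0, 0, out_ptr);   // linear, non-instanced
 *       t->release(t);
 *    }
 *
 * The key, buffer and count values here are illustrative, not taken from
 * this file.
 */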


#else

struct translate *
translate_sse2_create(const struct translate_key *key)
{
   return NULL;
}

#endif