8e152a002a3bf2d6941fc31d00415ab3fae8ee81
[mesa.git] / src / gallium / auxiliary / translate / translate_sse.c
1 /*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 * Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_math.h"
33
34 #include "translate.h"
35
36
37 #if defined(PIPE_ARCH_X86)
38
39 #include "rtasm/rtasm_cpu.h"
40 #include "rtasm/rtasm_x86sse.h"
41
42
43 #define X 0
44 #define Y 1
45 #define Z 2
46 #define W 3
47
48
/* Signature of the generated "linear" function: processes vertices
 * [start, start+count).  instance_id_float duplicates instance_id
 * because TGSI has no integer support yet (see XXX below).
 */
typedef void (PIPE_CDECL *run_func)( struct translate *translate,
                                     unsigned start,
                                     unsigned count,
                                     unsigned instance_id,
                                     void *output_buffer,
                                     float instance_id_float );

/* Signature of the generated indexed function: processes the vertices
 * named by the elts[] array.
 */
typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
                                          const unsigned *elts,
                                          unsigned count,
                                          unsigned instance_id,
                                          void *output_buffer,
                                          float instance_id_float );

/* One source vertex buffer, as set by translate_sse_set_buffer(). */
struct translate_buffer {
   const void *base_ptr;        /* start of the buffer's vertex data */
   unsigned stride;             /* byte distance between vertices */
};

/* A (buffer, instance_divisor) pair.  Several vertex elements that
 * share both values share one varient.
 */
struct translate_buffer_varient {
   unsigned buffer_index;       /* index into translate_sse::buffer[] */
   unsigned instance_divisor;   /* 0 = per-vertex, else per-instance step */
   void *ptr; /* updated either per vertex or per instance */
};


/* Sentinel varient index marking an element fed by the instance ID
 * rather than by a vertex buffer.
 */
#define ELEMENT_BUFFER_INSTANCE_ID 1001


/* The translate machine.  The generated code addresses this struct
 * through EDX, so run-time state (buffer pointers, constants, the
 * instance id) lives here where get_offset() can locate it.
 */
struct translate_sse {
   struct translate translate;  /* public interface; must be first */

   struct x86_function linear_func;   /* code for run() */
   struct x86_function elt_func;      /* code for run_elts() */
   struct x86_function *func;         /* the one currently being built */

   /* Lazy-load flags for the XMM constants below (reset per function). */
   boolean loaded_identity;
   boolean loaded_255;
   boolean loaded_inv_255;

   float identity[4];           /* {0,0,0,1} — default attrib */
   float float_255[4];          /* {255,255,255,255} for UNORM packing */
   float inv_255[4];            /* {1/255,...} for UNORM unpacking */

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer varients can map to a single buffer. */
   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_varients;

   /* Multiple elements can map to a single buffer varient. */
   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];

   boolean use_instancing;      /* any element has a nonzero divisor? */
   unsigned instance_id;
   float instance_id_float; /* XXX: needed while no integer support in TGSI */

   run_func gen_run;            /* entry point of linear_func */
   run_elts_func gen_run_elts;  /* entry point of elt_func */

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg idx_EBX; /* either start+i or &elt[i] */
   struct x86_reg outbuf_ECX;
   struct x86_reg machine_EDX;
   struct x86_reg count_ESI; /* decrements to zero */
};
119
/* Byte offset of b relative to a.  Used to address fields of the
 * translate_sse struct from generated code as displacements off the
 * machine pointer held in EDX.
 */
static int get_offset( const void *a, const void *b )
{
   const char *base  = (const char *) a;
   const char *field = (const char *) b;

   return field - base;
}
124
125
126
127 static struct x86_reg get_identity( struct translate_sse *p )
128 {
129 struct x86_reg reg = x86_make_reg(file_XMM, 6);
130
131 if (!p->loaded_identity) {
132 p->loaded_identity = TRUE;
133 p->identity[0] = 0;
134 p->identity[1] = 0;
135 p->identity[2] = 0;
136 p->identity[3] = 1;
137
138 sse_movups(p->func, reg,
139 x86_make_disp(p->machine_EDX,
140 get_offset(p, &p->identity[0])));
141 }
142
143 return reg;
144 }
145
146 static struct x86_reg get_255( struct translate_sse *p )
147 {
148 struct x86_reg reg = x86_make_reg(file_XMM, 7);
149
150 if (!p->loaded_255) {
151 p->loaded_255 = TRUE;
152 p->float_255[0] =
153 p->float_255[1] =
154 p->float_255[2] =
155 p->float_255[3] = 255.0f;
156
157 sse_movups(p->func, reg,
158 x86_make_disp(p->machine_EDX,
159 get_offset(p, &p->float_255[0])));
160 }
161
162 return reg;
163 }
164
165 static struct x86_reg get_inv_255( struct translate_sse *p )
166 {
167 struct x86_reg reg = x86_make_reg(file_XMM, 5);
168
169 if (!p->loaded_inv_255) {
170 p->loaded_inv_255 = TRUE;
171 p->inv_255[0] =
172 p->inv_255[1] =
173 p->inv_255[2] =
174 p->inv_255[3] = 1.0f / 255.0f;
175
176 sse_movups(p->func, reg,
177 x86_make_disp(p->machine_EDX,
178 get_offset(p, &p->inv_255[0])));
179 }
180
181 return reg;
182 }
183
184
/* Load four consecutive floats into 'data' with a single unaligned
 * 128-bit load (source vertices need not be 16-byte aligned).
 */
static void emit_load_R32G32B32A32( struct translate_sse *p,
                                    struct x86_reg data,
                                    struct x86_reg arg0 )
{
   sse_movups(p->func, data, arg0);
}
191
/* Load three floats {a,b,c} and synthesize W=1, i.e. produce
 * (a, b, c, 1) in 'data' without reading past the 12 source bytes.
 */
static void emit_load_R32G32B32( struct translate_sse *p,
                                 struct x86_reg data,
                                 struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0        -- movss loads the third float into lane X
    * c 0 0 1        -- shufps with identity brings in (0, 1) for Z/W
    * 0 0 c 1        -- shufps rotates c into lane Z
    * a b c 1        -- movlps overwrites the low two lanes with a, b
    */
   sse_movss(p->func, data, x86_make_disp(arg0, 8));
   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
   sse_movlps(p->func, data, arg0);
}
208
/* Load two floats {a,b} and default Z/W, producing (a, b, 0, 1). */
static void emit_load_R32G32( struct translate_sse *p,
                              struct x86_reg data,
                              struct x86_reg arg0 )
{
   /* 0 0 0 1        -- start from the identity vector
    * a b 0 1        -- movlps replaces only the low two lanes
    */
   sse_movups(p->func, data, get_identity(p) );
   sse_movlps(p->func, data, arg0);
}
219
220
/* Load one float {a} and default Y/Z/W, producing (a, 0, 0, 1). */
static void emit_load_R32( struct translate_sse *p,
                           struct x86_reg data,
                           struct x86_reg arg0 )
{
   /* a 0 0 0        -- movss zeroes the upper three lanes
    * a 0 0 1        -- OR with (0,0,0,1) sets W; a|0 == a bitwise
    */
   sse_movss(p->func, data, arg0);
   sse_orps(p->func, data, get_identity(p) );
}
231
232
/* Load four unsigned-normalized bytes and expand them to floats in
 * [0,1]: zero-extend bytes -> dwords, convert to float, scale by 1/255.
 */
static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
                                      struct x86_reg data,
                                      struct x86_reg src )
{

   /* Load and unpack twice:
    * punpcklbw interleaves with the identity register; its low 8 bytes
    * are all zero (identity[0..1] == 0.0f), so two successive byte
    * unpacks zero-extend the four source bytes to four dwords.
    */
   sse_movss(p->func, data, src);
   sse2_punpcklbw(p->func, data, get_identity(p));
   sse2_punpcklbw(p->func, data, get_identity(p));

   /* Convert to float:
    */
   sse2_cvtdq2ps(p->func, data, data);


   /* Scale by 1/255.0
    */
   sse_mulps(p->func, data, get_inv_255(p));
}
253
254
255
256
/* Store all four floats with one unaligned 128-bit store. */
static void emit_store_R32G32B32A32( struct translate_sse *p,
                                     struct x86_reg dest,
                                     struct x86_reg dataXMM )
{
   sse_movups(p->func, dest, dataXMM);
}
263
/* Store three floats as an 8-byte store plus a 4-byte store; never
 * writes the 4 bytes past the destination.  Clobbers dataXMM.
 */
static void emit_store_R32G32B32( struct translate_sse *p,
                                  struct x86_reg dest,
                                  struct x86_reg dataXMM )
{
   /* Emit two, shuffle, emit one.
    */
   sse_movlps(p->func, dest, dataXMM);
   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
}
274
/* Store the low two floats (8 bytes). */
static void emit_store_R32G32( struct translate_sse *p,
                               struct x86_reg dest,
                               struct x86_reg dataXMM )
{
   sse_movlps(p->func, dest, dataXMM);
}
281
/* Store the low float (4 bytes). */
static void emit_store_R32( struct translate_sse *p,
                            struct x86_reg dest,
                            struct x86_reg dataXMM )
{
   sse_movss(p->func, dest, dataXMM);
}
288
289
290
/* Pack four floats in [0,1] to unsigned-normalized bytes: scale by
 * 255, convert to int, then saturating-pack dwords->words->bytes
 * (the packs clamp out-of-range values).  Clobbers dataXMM.
 */
static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
                                       struct x86_reg dest,
                                       struct x86_reg dataXMM )
{
   /* Scale by 255.0
    */
   sse_mulps(p->func, dataXMM, get_255(p));

   /* Pack and emit:
    * NOTE(review): cvtps2dq rounds per the current MXCSR mode,
    * normally round-to-nearest — confirm if exact rounding matters.
    */
   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
   sse2_packssdw(p->func, dataXMM, dataXMM);
   sse2_packuswb(p->func, dataXMM, dataXMM);
   sse_movss(p->func, dest, dataXMM);
}
306
307
308
309
310
/* Extended swizzles?  Maybe later.
 */
/* Emit a 4-lane shuffle of src into dest (shuffle built with SHUF()). */
static void emit_swizzle( struct translate_sse *p,
                          struct x86_reg dest,
                          struct x86_reg src,
                          unsigned char shuffle )
{
   sse_shufps(p->func, dest, src, shuffle);
}
320
321
/* Emit the load/convert/store sequence for one vertex element:
 * fetch from srcECX in a->input_format, store to dstEAX in
 * a->output_format.  All conversion goes through a float vector in
 * XMM0.  Returns FALSE for unsupported formats so the caller can
 * fall back to a non-SSE path.
 */
static boolean translate_attr( struct translate_sse *p,
                               const struct translate_element *a,
                               struct x86_reg srcECX,
                               struct x86_reg dstEAX)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

   switch (a->input_format) {
   case PIPE_FORMAT_R32_FLOAT:
      emit_load_R32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32_FLOAT:
      emit_load_R32G32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32B32_FLOAT:
      emit_load_R32G32B32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      emit_load_R32G32B32A32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_B8G8R8A8_UNORM:
      /* BGRA is handled as RGBA plus a red/blue swap. */
      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
      break;
   case PIPE_FORMAT_R8G8B8A8_UNORM:
      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
      break;
   default:
      return FALSE;
   }

   switch (a->output_format) {
   case PIPE_FORMAT_R32_FLOAT:
      emit_store_R32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32_FLOAT:
      emit_store_R32G32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32B32_FLOAT:
      emit_store_R32G32B32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_B8G8R8A8_UNORM:
      /* Swap red/blue before packing back to bytes. */
      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R8G8B8A8_UNORM:
      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
      break;
   default:
      return FALSE;
   }

   return TRUE;
}
379
380
/* Emit per-run setup: compute the starting attribute pointer for each
 * buffer varient.  For linear runs and for instanced varients this is
 * base_ptr + stride * index; for the indexed (elt) path, pointers are
 * instead computed per vertex in get_buffer_ptr().
 */
static boolean init_inputs( struct translate_sse *p,
                            boolean linear )
{
   unsigned i;
   struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
                                              get_offset(p, &p->instance_id));

   for (i = 0; i < p->nr_buffer_varients; i++) {
      struct translate_buffer_varient *varient = &p->buffer_varient[i];
      struct translate_buffer *buffer = &p->buffer[varient->buffer_index];

      if (linear || varient->instance_divisor) {
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
                                                   get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_EBX;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (varient->instance_divisor) {
            /* Our index is instance ID divided by instance divisor.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (varient->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->machine_EDX;
               struct x86_reg tmp_ECX = p->outbuf_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */

               /* DIV clobbers EDX (machine ptr) and we borrow ECX
                * (output ptr), so save/restore both around it.
                */
               x86_push(p->func, tmp_EDX);
               x86_push(p->func, tmp_ECX);
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
               x86_pop(p->func, tmp_ECX);
               x86_pop(p->func, tmp_EDX);
            }
         } else {
            x86_mov(p->func, tmp_EAX, elt);
         }
         x86_imul(p->func, tmp_EAX, buf_stride);
         x86_add(p->func, tmp_EAX, buf_base_ptr);


         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (linear && p->nr_buffer_varients == 1)
            x86_mov(p->func, elt, tmp_EAX);
         else
            x86_mov(p->func, buf_ptr, tmp_EAX);
      }
   }

   return TRUE;
}
445
446
/* Emit code yielding the pointer to the current vertex's data for one
 * buffer varient, and return the register (or memory operand) holding
 * it.  Four cases:
 *   - instance-id pseudo buffer: address of machine->instance_id_float;
 *   - single linear buffer: EBX already holds the walking pointer;
 *   - linear or instanced varient: reload the pointer cached by
 *     init_inputs()/incr_inputs() into EAX;
 *   - indexed: compute base + stride * elt into EAX per vertex.
 */
static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      boolean linear,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDX,
                           get_offset(p, &p->instance_id_float));
   }
   if (linear && p->nr_buffer_varients == 1) {
      return p->idx_EBX;
   }
   else if (linear || p->buffer_varient[var_idx].instance_divisor) {
      struct x86_reg ptr = p->tmp_EAX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer_varient[var_idx].ptr));

      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->tmp_EAX;
      const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].base_ptr));



      /* Calculate pointer to current attrib:
       */
      x86_mov(p->func, ptr, buf_stride);
      x86_imul(p->func, ptr, elt);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}
490
491
492
/* Emit the per-iteration advance of the input pointers/index:
 * single linear buffer bumps EBX by the stride; multiple linear
 * buffers bump each cached varient pointer in memory; the indexed
 * path just advances EBX to the next elt (4 bytes).  Instanced
 * varients are not advanced per vertex.
 */
static boolean incr_inputs( struct translate_sse *p,
                            boolean linear )
{
   if (linear && p->nr_buffer_varients == 1) {
      struct x86_reg stride = x86_make_disp(p->machine_EDX,
                                            get_offset(p, &p->buffer[0].stride));

      if (p->buffer_varient[0].instance_divisor == 0) {
         x86_add(p->func, p->idx_EBX, stride);
         /* Prefetch a couple of vertices ahead (192 bytes). */
         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
      }
   }
   else if (linear) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_varients; i++) {
         struct translate_buffer_varient *varient = &p->buffer_varient[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
                                                   get_offset(p, &p->buffer[varient->buffer_index].stride));

         if (varient->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_ptr);
            x86_add(p->func, p->tmp_EAX, buf_stride);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      /* Indexed: step to the next element in the elt array. */
      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
   }

   return TRUE;
}
531
532
/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            unsigned instance_id,
 *            void *output_buffer,
 *            float instance_id_float )
 * or
 *    run_elts( struct translate *machine,
 *              const unsigned *elts,
 *              unsigned count,
 *              unsigned instance_id,
 *              void *output_buffer,
 *              float instance_id_float )
 *
 * Lots of hardcoding.  Register assignment (see the fields at the
 * bottom of struct translate_sse):
 *
 * EAX -- scratch / pointer to current input attribute
 * EBX -- start+i (linear) or &elt[i] (indexed)
 * ECX -- pointer to current output vertex
 * EDX -- pointer to the translate machine struct
 * ESI -- remaining vertex count, decrements to zero
 */
static boolean build_vertex_emit( struct translate_sse *p,
                                  struct x86_function *func,
                                  boolean linear )
{
   int fixup, label;
   unsigned j;

   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_EBX = x86_make_reg(file_REG32, reg_BX);
   p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
   p->machine_EDX = x86_make_reg(file_REG32, reg_DX);
   p->count_ESI = x86_make_reg(file_REG32, reg_SI);

   p->func = func;
   /* Constants must be reloaded in each generated function. */
   p->loaded_inv_255 = FALSE;
   p->loaded_255 = FALSE;
   p->loaded_identity = FALSE;

   x86_init_func(p->func);

   /* Push a few regs?
    * (EBX/ESI are callee-saved under cdecl.)
    */
   x86_push(p->func, p->idx_EBX);
   x86_push(p->func, p->count_ESI);

   /* Load arguments into regs:
    */
   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
              p->tmp_EAX);

      /* XXX: temporary -- stash the float copy too (no int support in TGSI) */
      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 6));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id_float)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, linear);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      /* Linear: EBX is the index/pointer itself; indexed: deref it. */
      struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
      int last_varient = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned varient = p->element_to_buffer_varient[j];

         /* Figure out source pointer address:
          * (reuse vb across consecutive elements of the same varient)
          */
         if (varient != last_varient) {
            last_varient = varient;
            vb = get_buffer_ptr(p, linear, varient, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x86_lea(p->func,
              p->outbuf_ECX,
              x86_make_disp(p->outbuf_ECX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, linear );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_ESI);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */

   x86_pop(p->func, p->count_ESI);
   x86_pop(p->func, p->idx_EBX);
   x86_ret(p->func);

   return TRUE;
}
670
671
672
673
674
675
676
677 static void translate_sse_set_buffer( struct translate *translate,
678 unsigned buf,
679 const void *ptr,
680 unsigned stride )
681 {
682 struct translate_sse *p = (struct translate_sse *)translate;
683
684 if (buf < p->nr_buffers) {
685 p->buffer[buf].base_ptr = (char *)ptr;
686 p->buffer[buf].stride = stride;
687 }
688
689 if (0) debug_printf("%s %d/%d: %p %d\n",
690 __FUNCTION__, buf,
691 p->nr_buffers,
692 ptr, stride);
693 }
694
695
696 static void translate_sse_release( struct translate *translate )
697 {
698 struct translate_sse *p = (struct translate_sse *)translate;
699
700 x86_release_func( &p->linear_func );
701 x86_release_func( &p->elt_func );
702
703 FREE(p);
704 }
705
706 static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
707 const unsigned *elts,
708 unsigned count,
709 unsigned instance_id,
710 void *output_buffer )
711 {
712 struct translate_sse *p = (struct translate_sse *)translate;
713
714 p->gen_run_elts( translate,
715 elts,
716 count,
717 instance_id,
718 output_buffer,
719 (float)instance_id );
720 }
721
722 static void PIPE_CDECL translate_sse_run( struct translate *translate,
723 unsigned start,
724 unsigned count,
725 unsigned instance_id,
726 void *output_buffer )
727 {
728 struct translate_sse *p = (struct translate_sse *)translate;
729
730 p->gen_run( translate,
731 start,
732 count,
733 instance_id,
734 output_buffer,
735 (float)instance_id);
736 }
737
738
/* Create an SSE2 translate machine for 'key': map each vertex element
 * to a (buffer, instance_divisor) varient, then JIT both the linear
 * and indexed run functions.  Returns NULL if the CPU lacks SSE/SSE2,
 * on OOM, or if any element uses an unsupported format (the caller is
 * expected to fall back to the generic path).
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   struct translate_sse *p = NULL;
   unsigned i;

   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
      goto fail;

   p = CALLOC_STRUCT( translate_sse );
   if (p == NULL)
      goto fail;

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;
   p->translate.run_elts = translate_sse_run_elts;
   p->translate.run = translate_sse_run;

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer varient.
          * (linear search; deduplicates identical pairs)
          */
         for (j = 0; j < p->nr_buffer_varients; j++) {
            if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
                p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_varients) {
            p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
            p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
            p->nr_buffer_varients++;
         }
         p->element_to_buffer_varient[i] = j;
      } else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         /* Element is fed by the instance ID, not a buffer. */
         p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, TRUE))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, FALSE))
      goto fail;

   p->gen_run = (run_func)x86_get_func(&p->linear_func);
   if (p->gen_run == NULL)
      goto fail;

   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
   if (p->gen_run_elts == NULL)
      goto fail;

   return &p->translate;

 fail:
   /* Safe on a partially built machine: p was CALLOCed, so unbuilt
    * x86_function structs are zeroed.
    */
   if (p)
      translate_sse_release( &p->translate );

   return NULL;
}
813
814
815
816 #else
817
/* Non-x86 build: SSE2 code generation unavailable; caller must use
 * the generic translate path.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}
822
823 #endif