Merge branch '7.8'
[mesa.git] / src / gallium / auxiliary / translate / translate_sse.c
1 /*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 * Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_math.h"
33
34 #include "translate.h"
35
36
37 #if defined(PIPE_ARCH_X86)
38
39 #include "rtasm/rtasm_cpu.h"
40 #include "rtasm/rtasm_x86sse.h"
41
42
43 #define X 0
44 #define Y 1
45 #define Z 2
46 #define W 3
47
48
/* Signature of the generated code for linear (start/count) runs.
 * PIPE_CDECL pins the calling convention so the generated prologue
 * can locate the arguments on the stack (see x86_fn_arg usage in
 * build_vertex_emit).
 */
typedef void (PIPE_CDECL *run_func)( struct translate *translate,
                                     unsigned start,
                                     unsigned count,
                                     unsigned instance_id,
                                     void *output_buffer);

/* Signature of the generated code for indexed (elts[]) runs. */
typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
                                          const unsigned *elts,
                                          unsigned count,
                                          unsigned instance_id,
                                          void *output_buffer);
60
/* Per-vertex-buffer state, read by the generated code via
 * displacements off the machine pointer (EDX).
 */
struct translate_buffer {
   const void *base_ptr;   /* start of the buffer's memory */
   unsigned stride;        /* bytes between successive vertices */
   unsigned max_index;     /* largest valid index; not yet honoured by the
                            * codegen (see TODO in init_inputs) */
};
66
/* One (buffer, instance_divisor) combination referenced by the key.
 * ("varient" [sic] is used for "variant" throughout this file.)
 */
struct translate_buffer_varient {
   unsigned buffer_index;       /* index into translate_sse::buffer[] */
   unsigned instance_divisor;   /* 0 = advance per vertex, else per instance(s) */
   void *ptr;                   /* updated either per vertex or per instance */
};


/* Sentinel value stored in element_to_buffer_varient[] for
 * TRANSLATE_ELEMENT_INSTANCE_ID elements (no buffer backs them).
 */
#define ELEMENT_BUFFER_INSTANCE_ID 1001
75
76
struct translate_sse {
   struct translate translate;   /* base "class"; must be first so that
                                  * struct translate * casts back to us */

   struct x86_function linear_func;   /* generated code for run() */
   struct x86_function elt_func;      /* generated code for run_elts() */
   struct x86_function *func;         /* the function currently being built */

   /* Lazy-load flags for the constant XMM registers below. */
   boolean loaded_identity;
   boolean loaded_255;
   boolean loaded_inv_255;

   /* Constants the generated code loads via displacements off the
    * machine pointer (EDX); they must live inside this struct.
    */
   float identity[4];     /* {0, 0, 0, 1} */
   float float_255[4];    /* {255, 255, 255, 255} */
   float inv_255[4];      /* {1/255, 1/255, 1/255, 1/255} */

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer varients can map to a single buffer. */
   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_varients;

   /* Multiple elements can map to a single buffer varient. */
   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];

   boolean use_instancing;   /* any element has a nonzero instance divisor */
   unsigned instance_id;     /* written by the generated prologue when instancing */

   run_func gen_run;              /* entry point of linear_func */
   run_elts_func gen_run_elts;    /* entry point of elt_func */

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg idx_EBX;      /* either start+i or &elt[i] */
   struct x86_reg outbuf_ECX;
   struct x86_reg machine_EDX;
   struct x86_reg count_ESI;    /* decrements to zero */
};
117
/* Byte offset of b relative to a.  Used with a == the machine struct to
 * produce EDX-relative displacements for the generated code.
 */
static int get_offset( const void *a, const void *b )
{
   const char *base = (const char *) a;
   const char *target = (const char *) b;

   return (int) (target - base);
}
122
123
124
125 static struct x86_reg get_identity( struct translate_sse *p )
126 {
127 struct x86_reg reg = x86_make_reg(file_XMM, 6);
128
129 if (!p->loaded_identity) {
130 p->loaded_identity = TRUE;
131 p->identity[0] = 0;
132 p->identity[1] = 0;
133 p->identity[2] = 0;
134 p->identity[3] = 1;
135
136 sse_movups(p->func, reg,
137 x86_make_disp(p->machine_EDX,
138 get_offset(p, &p->identity[0])));
139 }
140
141 return reg;
142 }
143
144 static struct x86_reg get_255( struct translate_sse *p )
145 {
146 struct x86_reg reg = x86_make_reg(file_XMM, 7);
147
148 if (!p->loaded_255) {
149 p->loaded_255 = TRUE;
150 p->float_255[0] =
151 p->float_255[1] =
152 p->float_255[2] =
153 p->float_255[3] = 255.0f;
154
155 sse_movups(p->func, reg,
156 x86_make_disp(p->machine_EDX,
157 get_offset(p, &p->float_255[0])));
158 }
159
160 return reg;
161 }
162
163 static struct x86_reg get_inv_255( struct translate_sse *p )
164 {
165 struct x86_reg reg = x86_make_reg(file_XMM, 5);
166
167 if (!p->loaded_inv_255) {
168 p->loaded_inv_255 = TRUE;
169 p->inv_255[0] =
170 p->inv_255[1] =
171 p->inv_255[2] =
172 p->inv_255[3] = 1.0f / 255.0f;
173
174 sse_movups(p->func, reg,
175 x86_make_disp(p->machine_EDX,
176 get_offset(p, &p->inv_255[0])));
177 }
178
179 return reg;
180 }
181
182
/* Load 4 contiguous floats into data with a single unaligned load. */
static void emit_load_R32G32B32A32( struct translate_sse *p,
                                    struct x86_reg data,
                                    struct x86_reg arg0 )
{
   sse_movups(p->func, data, arg0);
}
189
/* Load 3 floats and set w = 1, without reading more than 12 bytes
 * from the source attribute.
 */
static void emit_load_R32G32B32( struct translate_sse *p,
                                 struct x86_reg data,
                                 struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0
    * c 0 0 1
    * 0 0 c 1
    * a b c 1
    */
   sse_movss(p->func, data, x86_make_disp(arg0, 8));           /* c 0 0 0 */
   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) ); /* c 0 0 1 */
   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );            /* 0 0 c 1 */
   sse_movlps(p->func, data, arg0);                            /* a b c 1 */
}
206
/* Load 2 floats and fill in z = 0, w = 1. */
static void emit_load_R32G32( struct translate_sse *p,
                              struct x86_reg data,
                              struct x86_reg arg0 )
{
   /* 0 0 0 1
    * a b 0 1
    */
   sse_movups(p->func, data, get_identity(p) );   /* reg-to-reg copy of the constant */
   sse_movlps(p->func, data, arg0);               /* overwrite the low two lanes */
}
217
218
/* Load 1 float and fill in y = z = 0, w = 1. */
static void emit_load_R32( struct translate_sse *p,
                           struct x86_reg data,
                           struct x86_reg arg0 )
{
   /* a 0 0 0
    * a 0 0 1
    */
   sse_movss(p->func, data, arg0);
   /* OR with {0,0,0,1.0f}: lanes 1-2 stay zero, lane 3 becomes 1.0f */
   sse_orps(p->func, data, get_identity(p) );
}
229
230
/* Load 4 unorm bytes and expand to 4 floats in [0,1].
 *
 * NOTE: the punpcklbw source is the identity constant {0,0,0,1.0f},
 * whose low 8 bytes are all zero (only the last float is nonzero),
 * so the two byte-unpacks zero-extend bytes -> words -> dwords.
 */
static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
                                      struct x86_reg data,
                                      struct x86_reg src )
{

   /* Load and unpack twice:
    */
   sse_movss(p->func, data, src);
   sse2_punpcklbw(p->func, data, get_identity(p));
   sse2_punpcklbw(p->func, data, get_identity(p));

   /* Convert to float:
    */
   sse2_cvtdq2ps(p->func, data, data);


   /* Scale by 1/255.0
    */
   sse_mulps(p->func, data, get_inv_255(p));
}
251
252
253
254
/* Store all 4 floats with a single unaligned store. */
static void emit_store_R32G32B32A32( struct translate_sse *p,
                                     struct x86_reg dest,
                                     struct x86_reg dataXMM )
{
   sse_movups(p->func, dest, dataXMM);
}
261
/* Store exactly 12 bytes (x,y then z) so the byte after the
 * attribute is never written.
 */
static void emit_store_R32G32B32( struct translate_sse *p,
                                  struct x86_reg dest,
                                  struct x86_reg dataXMM )
{
   /* Emit two, shuffle, emit one.
    */
   sse_movlps(p->func, dest, dataXMM);
   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
}
272
/* Store the low two floats (8 bytes). */
static void emit_store_R32G32( struct translate_sse *p,
                               struct x86_reg dest,
                               struct x86_reg dataXMM )
{
   sse_movlps(p->func, dest, dataXMM);
}
279
/* Store the low float (4 bytes). */
static void emit_store_R32( struct translate_sse *p,
                            struct x86_reg dest,
                            struct x86_reg dataXMM )
{
   sse_movss(p->func, dest, dataXMM);
}
286
287
288
/* Convert 4 floats in [0,1] to 4 unorm bytes and store them.
 * Destroys dataXMM.  cvtps2dq rounds according to MXCSR; the two
 * pack steps saturate dwords -> words -> unsigned bytes, and movss
 * writes the packed 4 bytes.
 */
static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
                                       struct x86_reg dest,
                                       struct x86_reg dataXMM )
{
   /* Scale by 255.0
    */
   sse_mulps(p->func, dataXMM, get_255(p));

   /* Pack and emit:
    */
   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
   sse2_packssdw(p->func, dataXMM, dataXMM);
   sse2_packuswb(p->func, dataXMM, dataXMM);
   sse_movss(p->func, dest, dataXMM);
}
304
305
306
307
308
/* Extended swizzles? Maybe later.
 */
/* Emit a shufps to reorder the lanes of src into dest. */
static void emit_swizzle( struct translate_sse *p,
                          struct x86_reg dest,
                          struct x86_reg src,
                          unsigned char shuffle )
{
   sse_shufps(p->func, dest, src, shuffle);
}
318
319
320 static boolean translate_attr( struct translate_sse *p,
321 const struct translate_element *a,
322 struct x86_reg srcECX,
323 struct x86_reg dstEAX)
324 {
325 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
326
327 switch (a->input_format) {
328 case PIPE_FORMAT_R32_FLOAT:
329 emit_load_R32(p, dataXMM, srcECX);
330 break;
331 case PIPE_FORMAT_R32G32_FLOAT:
332 emit_load_R32G32(p, dataXMM, srcECX);
333 break;
334 case PIPE_FORMAT_R32G32B32_FLOAT:
335 emit_load_R32G32B32(p, dataXMM, srcECX);
336 break;
337 case PIPE_FORMAT_R32G32B32A32_FLOAT:
338 emit_load_R32G32B32A32(p, dataXMM, srcECX);
339 break;
340 case PIPE_FORMAT_B8G8R8A8_UNORM:
341 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
342 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
343 break;
344 case PIPE_FORMAT_R8G8B8A8_UNORM:
345 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
346 break;
347 default:
348 return FALSE;
349 }
350
351 switch (a->output_format) {
352 case PIPE_FORMAT_R32_FLOAT:
353 emit_store_R32(p, dstEAX, dataXMM);
354 break;
355 case PIPE_FORMAT_R32G32_FLOAT:
356 emit_store_R32G32(p, dstEAX, dataXMM);
357 break;
358 case PIPE_FORMAT_R32G32B32_FLOAT:
359 emit_store_R32G32B32(p, dstEAX, dataXMM);
360 break;
361 case PIPE_FORMAT_R32G32B32A32_FLOAT:
362 emit_store_R32G32B32A32(p, dstEAX, dataXMM);
363 break;
364 case PIPE_FORMAT_B8G8R8A8_UNORM:
365 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
366 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
367 break;
368 case PIPE_FORMAT_R8G8B8A8_UNORM:
369 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
370 break;
371 default:
372 return FALSE;
373 }
374
375 return TRUE;
376 }
377
378
/* Emit the per-run setup code: compute the initial source pointer for
 * each buffer varient.  Linear runs get a starting pointer for every
 * varient; indexed runs only need this for instanced varients (the
 * per-vertex ones are recomputed from the element index each iteration
 * in get_buffer_ptr).
 */
static boolean init_inputs( struct translate_sse *p,
                            boolean linear )
{
   unsigned i;
   struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
                                              get_offset(p, &p->instance_id));

   for (i = 0; i < p->nr_buffer_varients; i++) {
      struct translate_buffer_varient *varient = &p->buffer_varient[i];
      struct translate_buffer *buffer = &p->buffer[varient->buffer_index];

      if (linear || varient->instance_divisor) {
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
                                                   get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_EBX;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (varient->instance_divisor) {
            /* Our index is instance ID divided by instance divisor.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (varient->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->machine_EDX;
               struct x86_reg tmp_ECX = p->outbuf_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */

               /* DIV computes EDX:EAX / operand and clobbers both EDX
                * (remainder) and ECX is needed for the divisor, so save
                * the machine pointer (EDX) and output pointer (ECX)
                * around the division.
                */
               x86_push(p->func, tmp_EDX);
               x86_push(p->func, tmp_ECX);
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
               x86_pop(p->func, tmp_ECX);
               x86_pop(p->func, tmp_EDX);
            }
         } else {
            x86_mov(p->func, tmp_EAX, elt);
         }

         /*
          * TODO: Respect translate_buffer::max_index.
          */

         x86_imul(p->func, tmp_EAX, buf_stride);
         x86_add(p->func, tmp_EAX, buf_base_ptr);


         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (linear && p->nr_buffer_varients == 1)
            x86_mov(p->func, elt, tmp_EAX);
         else
            x86_mov(p->func, buf_ptr, tmp_EAX);
      }
   }

   return TRUE;
}
448
449
/* Emit code that leaves the current vertex's source pointer for buffer
 * varient var_idx in a register, and return that register.  elt holds
 * the current element index (or, for single-varient linear runs, the
 * pointer itself).
 */
static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      boolean linear,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   /* TRANSLATE_ELEMENT_INSTANCE_ID elements read the stored instance id
    * directly out of the machine struct.
    */
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDX,
                           get_offset(p, &p->instance_id));
   }
   /* Single-varient linear runs keep the pointer itself in EBX
    * (see init_inputs / incr_inputs).
    */
   if (linear && p->nr_buffer_varients == 1) {
      return p->idx_EBX;
   }
   /* Linear or instanced varients keep a pre-advanced pointer in the
    * machine struct; just load it.
    */
   else if (linear || p->buffer_varient[var_idx].instance_divisor) {
      struct x86_reg ptr = p->tmp_EAX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer_varient[var_idx].ptr));

      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   /* Indexed per-vertex fetch: compute base_ptr + stride * elt. */
   else {
      struct x86_reg ptr = p->tmp_EAX;
      const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].base_ptr));



      /* Calculate pointer to current attrib:
       */
      x86_mov(p->func, ptr, buf_stride);
      x86_imul(p->func, ptr, elt);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}
493
494
495
/* Emit the per-iteration pointer/index advance at the bottom of the
 * vertex loop.  Instanced varients are deliberately not advanced here —
 * they only change per instance, which is handled in init_inputs.
 */
static boolean incr_inputs( struct translate_sse *p,
                            boolean linear )
{
   if (linear && p->nr_buffer_varients == 1) {
      /* EBX is the source pointer itself; bump it by one stride. */
      struct x86_reg stride = x86_make_disp(p->machine_EDX,
                                            get_offset(p, &p->buffer[0].stride));

      if (p->buffer_varient[0].instance_divisor == 0) {
         x86_add(p->func, p->idx_EBX, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
      }
   }
   else if (linear) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_varients; i++) {
         struct translate_buffer_varient *varient = &p->buffer_varient[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
                                                   get_offset(p, &p->buffer[varient->buffer_index].stride));

         if (varient->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_ptr);
            x86_add(p->func, p->tmp_EAX, buf_stride);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      /* Indexed path: EBX is &elt[i]; step to the next unsigned index. */
      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
   }

   return TRUE;
}
534
535
/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            unsigned instance_id,
 *            void *output_buffer )
 * or
 *    run_elts( struct translate *machine,
 *              unsigned *elts,
 *              unsigned count,
 *              unsigned instance_id,
 *              void *output_buffer )
 *
 * Lots of hardcoding
 *
 * EAX -- scratch / pointer to current attribute
 * EBX -- start+i (linear) or &elt[i] (indexed)
 * ECX -- pointer to current output vertex
 * EDX -- pointer to the translate_sse machine struct
 * ESI -- remaining vertex count
 *
 * Returns FALSE if any element uses a format translate_attr cannot
 * handle.
 */
static boolean build_vertex_emit( struct translate_sse *p,
                                  struct x86_function *func,
                                  boolean linear )
{
   int fixup, label;
   unsigned j;

   /* Record the fixed register assignment so the helper emitters agree. */
   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_EBX = x86_make_reg(file_REG32, reg_BX);
   p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
   p->machine_EDX = x86_make_reg(file_REG32, reg_DX);
   p->count_ESI = x86_make_reg(file_REG32, reg_SI);

   p->func = func;
   /* Reset constant-register caches for this fresh function. */
   p->loaded_inv_255 = FALSE;
   p->loaded_255 = FALSE;
   p->loaded_identity = FALSE;

   x86_init_func(p->func);

   /* Push a few regs?
    */
   x86_push(p->func, p->idx_EBX);
   x86_push(p->func, p->count_ESI);

   /* Load arguments into regs:
    */
   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));

   /* Load instance ID (arg 4) into the machine struct so instanced
    * varients can read it.
    */
   if (p->use_instancing) {
      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero: skip the loop entirely for
    * count == 0.
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, linear);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
      int last_varient = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned varient = p->element_to_buffer_varient[j];

         /* Figure out source pointer address; reuse the previous
          * pointer when consecutive elements share a varient:
          */
         if (varient != last_varient) {
            last_varient = varient;
            vb = get_buffer_ptr(p, linear, varient, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x86_lea(p->func,
              p->outbuf_ECX,
              x86_make_disp(p->outbuf_ECX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, linear );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_ESI);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */

   x86_pop(p->func, p->count_ESI);
   x86_pop(p->func, p->idx_EBX);
   x86_ret(p->func);

   return TRUE;
}
665
666
667
668
669
670
671
672 static void translate_sse_set_buffer( struct translate *translate,
673 unsigned buf,
674 const void *ptr,
675 unsigned stride,
676 unsigned max_index )
677 {
678 struct translate_sse *p = (struct translate_sse *)translate;
679
680 if (buf < p->nr_buffers) {
681 p->buffer[buf].base_ptr = (char *)ptr;
682 p->buffer[buf].stride = stride;
683 p->buffer[buf].max_index = max_index;
684 }
685
686 if (0) debug_printf("%s %d/%d: %p %d\n",
687 __FUNCTION__, buf,
688 p->nr_buffers,
689 ptr, stride);
690 }
691
692
/* translate::release entry point: free the generated code and the
 * machine struct itself.  Safe on a partially-constructed instance
 * (used by the fail path of translate_sse2_create).
 */
static void translate_sse_release( struct translate *translate )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   x86_release_func( &p->linear_func );
   x86_release_func( &p->elt_func );

   FREE(p);
}
702
/* translate::run_elts entry point: thin trampoline into the code
 * generated by build_vertex_emit for the indexed path.
 */
static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
                                               const unsigned *elts,
                                               unsigned count,
                                               unsigned instance_id,
                                               void *output_buffer )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   p->gen_run_elts( translate,
                    elts,
                    count,
                    instance_id,
                    output_buffer);
}
717
/* translate::run entry point: thin trampoline into the code generated
 * by build_vertex_emit for the linear (start/count) path.
 */
static void PIPE_CDECL translate_sse_run( struct translate *translate,
                                          unsigned start,
                                          unsigned count,
                                          unsigned instance_id,
                                          void *output_buffer )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   p->gen_run( translate,
               start,
               count,
               instance_id,
               output_buffer);
}
732
733
/* Create an SSE2 translate instance for the given key.  Returns NULL
 * when the CPU lacks SSE/SSE2, on allocation failure, or when codegen
 * fails (e.g. an element format translate_attr does not handle) —
 * callers fall back to the generic implementation.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   struct translate_sse *p = NULL;
   unsigned i;

   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
      goto fail;

   p = CALLOC_STRUCT( translate_sse );
   if (p == NULL)
      goto fail;

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;
   p->translate.run_elts = translate_sse_run_elts;
   p->translate.run = translate_sse_run;

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer varient.
          */
         for (j = 0; j < p->nr_buffer_varients; j++) {
            if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
                p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_varients) {
            /* No existing varient matched: create a new one. */
            p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
            p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
            p->nr_buffer_varients++;
         }
         p->element_to_buffer_varient[i] = j;
      } else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);

   /* Generate both the linear and the indexed entry points. */
   if (!build_vertex_emit(p, &p->linear_func, TRUE))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, FALSE))
      goto fail;

   p->gen_run = (run_func)x86_get_func(&p->linear_func);
   if (p->gen_run == NULL)
      goto fail;

   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
   if (p->gen_run_elts == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release( &p->translate );

   return NULL;
}
808
809
810
811 #else
812
/* Non-x86 build: report no SSE translate available so callers fall
 * back to the generic implementation.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}
817
818 #endif