translate: Fix a call to indexed SSE run.
[mesa.git] src/gallium/auxiliary/translate/translate_sse.c
/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *   Keith Whitwell <keithw@tungstengraphics.com>
 */


#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "util/u_math.h"

#include "translate.h"


#if defined(PIPE_ARCH_X86)

#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"


#define X 0
#define Y 1
#define Z 2
#define W 3


typedef void (PIPE_CDECL *run_func)( struct translate *translate,
                                     unsigned start,
                                     unsigned count,
                                     unsigned instance_id,
                                     void *output_buffer );

typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
                                          const unsigned *elts,
                                          unsigned count,
                                          unsigned instance_id,
                                          void *output_buffer );

struct translate_buffer {
   const void *base_ptr;
   unsigned stride;
};

struct translate_buffer_varient {
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                   /* updated either per vertex or per instance */
};


struct translate_sse {
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function *func;

   boolean loaded_identity;
   boolean loaded_255;
   boolean loaded_inv_255;

   float identity[4];
   float float_255[4];
   float inv_255[4];

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer varients can map to a single buffer. */
   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_varients;

   /* Multiple elements can map to a single buffer varient. */
   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;

   run_func      gen_run;
   run_elts_func gen_run_elts;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg idx_EBX;      /* either start+i or &elt[i] */
   struct x86_reg outbuf_ECX;
   struct x86_reg machine_EDX;
   struct x86_reg count_ESI;    /* decrements to zero */
};

static int get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}



static struct x86_reg get_identity( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 6);

   if (!p->loaded_identity) {
      p->loaded_identity = TRUE;
      p->identity[0] = 0;
      p->identity[1] = 0;
      p->identity[2] = 0;
      p->identity[3] = 1;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDX,
                               get_offset(p, &p->identity[0])));
   }

   return reg;
}

static struct x86_reg get_255( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 7);

   if (!p->loaded_255) {
      p->loaded_255 = TRUE;
      p->float_255[0] =
      p->float_255[1] =
      p->float_255[2] =
      p->float_255[3] = 255.0f;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDX,
                               get_offset(p, &p->float_255[0])));
   }

   return reg;
}

static struct x86_reg get_inv_255( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 5);

   if (!p->loaded_inv_255) {
      p->loaded_inv_255 = TRUE;
      p->inv_255[0] =
      p->inv_255[1] =
      p->inv_255[2] =
      p->inv_255[3] = 1.0f / 255.0f;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDX,
                               get_offset(p, &p->inv_255[0])));
   }

   return reg;
}


static void emit_load_R32G32B32A32( struct translate_sse *p,
                                    struct x86_reg data,
                                    struct x86_reg arg0 )
{
   sse_movups(p->func, data, arg0);
}

static void emit_load_R32G32B32( struct translate_sse *p,
                                 struct x86_reg data,
                                 struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0
    * c 0 0 1
    * 0 0 c 1
    * a b c 1
    */
   sse_movss(p->func, data, x86_make_disp(arg0, 8));
   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
   sse_movlps(p->func, data, arg0);
}
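
/* For reference, what the movss/shufps/shufps/movlps dance above leaves in
 * the XMM register, written as a scalar sketch (hypothetical helper, kept
 * for illustration only):
 */
#if 0
static void load_R32G32B32_ref( const float *src, float dst[4] )
{
   dst[0] = src[0];
   dst[1] = src[1];
   dst[2] = src[2];
   dst[3] = 1.0f;      /* the w component comes from the identity constant */
}
#endif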

static void emit_load_R32G32( struct translate_sse *p,
                              struct x86_reg data,
                              struct x86_reg arg0 )
{
   /* 0 0 0 1
    * a b 0 1
    */
   sse_movups(p->func, data, get_identity(p) );
   sse_movlps(p->func, data, arg0);
}


static void emit_load_R32( struct translate_sse *p,
                           struct x86_reg data,
                           struct x86_reg arg0 )
{
   /* a 0 0 0
    * a 0 0 1
    */
   sse_movss(p->func, data, arg0);
   sse_orps(p->func, data, get_identity(p) );
}


static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
                                      struct x86_reg data,
                                      struct x86_reg src )
{

   /* Load and unpack twice:
    */
   sse_movss(p->func, data, src);
   sse2_punpcklbw(p->func, data, get_identity(p));
   sse2_punpcklbw(p->func, data, get_identity(p));

   /* Convert to float:
    */
   sse2_cvtdq2ps(p->func, data, data);


   /* Scale by 1/255.0
    */
   sse_mulps(p->func, data, get_inv_255(p));
}
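
/* For reference, a scalar sketch of the conversion the unpack/convert/scale
 * sequence above performs (hypothetical helper, illustration only; the low
 * bytes of the identity constant supply the zeros used by punpcklbw):
 */
#if 0
static void load_R8G8B8A8_UNORM_ref( const unsigned char *src, float dst[4] )
{
   unsigned i;
   for (i = 0; i < 4; i++)
      dst[i] = (float) src[i] * (1.0f / 255.0f);   /* widen byte, scale to [0,1] */
}
#endif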




static void emit_store_R32G32B32A32( struct translate_sse *p,
                                     struct x86_reg dest,
                                     struct x86_reg dataXMM )
{
   sse_movups(p->func, dest, dataXMM);
}

static void emit_store_R32G32B32( struct translate_sse *p,
                                  struct x86_reg dest,
                                  struct x86_reg dataXMM )
{
   /* Emit two, shuffle, emit one.
    */
   sse_movlps(p->func, dest, dataXMM);
   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
}

static void emit_store_R32G32( struct translate_sse *p,
                               struct x86_reg dest,
                               struct x86_reg dataXMM )
{
   sse_movlps(p->func, dest, dataXMM);
}

static void emit_store_R32( struct translate_sse *p,
                            struct x86_reg dest,
                            struct x86_reg dataXMM )
{
   sse_movss(p->func, dest, dataXMM);
}



static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
                                       struct x86_reg dest,
                                       struct x86_reg dataXMM )
{
   /* Scale by 255.0
    */
   sse_mulps(p->func, dataXMM, get_255(p));

   /* Pack and emit:
    */
   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
   sse2_packssdw(p->func, dataXMM, dataXMM);
   sse2_packuswb(p->func, dataXMM, dataXMM);
   sse_movss(p->func, dest, dataXMM);
}
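
/* Roughly what the scale/convert/pack sequence above computes, as a scalar
 * sketch (hypothetical helper, illustration only; cvtps2dq rounds and the
 * two pack steps saturate to unsigned bytes):
 */
#if 0
static void store_R8G8B8A8_UNORM_ref( const float src[4], unsigned char *dst )
{
   unsigned i;
   for (i = 0; i < 4; i++) {
      float f = src[i] * 255.0f;
      if (f < 0.0f)   f = 0.0f;        /* saturate low */
      if (f > 255.0f) f = 255.0f;      /* saturate high */
      dst[i] = (unsigned char) (f + 0.5f);
   }
}
#endif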





/* Extended swizzles? Maybe later.
 */
static void emit_swizzle( struct translate_sse *p,
                          struct x86_reg dest,
                          struct x86_reg src,
                          unsigned char shuffle )
{
   sse_shufps(p->func, dest, src, shuffle);
}


static boolean translate_attr( struct translate_sse *p,
                               const struct translate_element *a,
                               struct x86_reg srcECX,
                               struct x86_reg dstEAX)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

   switch (a->input_format) {
   case PIPE_FORMAT_R32_FLOAT:
      emit_load_R32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32_FLOAT:
      emit_load_R32G32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32B32_FLOAT:
      emit_load_R32G32B32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      emit_load_R32G32B32A32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_B8G8R8A8_UNORM:
      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
      break;
   case PIPE_FORMAT_R8G8B8A8_UNORM:
      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
      break;
   default:
      return FALSE;
   }

   switch (a->output_format) {
   case PIPE_FORMAT_R32_FLOAT:
      emit_store_R32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32_FLOAT:
      emit_store_R32G32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32B32_FLOAT:
      emit_store_R32G32B32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_B8G8R8A8_UNORM:
      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R8G8B8A8_UNORM:
      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
      break;
   default:
      return FALSE;
   }

   return TRUE;
}


static boolean init_inputs( struct translate_sse *p,
                            boolean linear )
{
   unsigned i;
   if (linear) {
      struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
                                                 get_offset(p, &p->instance_id));

      for (i = 0; i < p->nr_buffer_varients; i++) {
         struct translate_buffer_varient *varient = &p->buffer_varient[i];
         struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
                                                   get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_EBX;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (varient->instance_divisor) {
            /* Our index is instance ID divided by instance divisor.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (varient->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->machine_EDX;
               struct x86_reg tmp_ECX = p->outbuf_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */

               x86_push(p->func, tmp_EDX);
               x86_push(p->func, tmp_ECX);
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
               x86_pop(p->func, tmp_ECX);
               x86_pop(p->func, tmp_EDX);
            }
         } else {
            x86_mov(p->func, tmp_EAX, elt);
         }
         x86_imul(p->func, tmp_EAX, buf_stride);
         x86_add(p->func, tmp_EAX, buf_base_ptr);


         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (p->nr_buffer_varients == 1)
            x86_mov(p->func, elt, tmp_EAX);
         else
            x86_mov(p->func, buf_ptr, tmp_EAX);
      }
   }

   return TRUE;
}
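
/* What the generated setup above computes for each buffer varient in the
 * linear case, as a scalar sketch (hypothetical helper, illustration only):
 */
#if 0
static const char *first_attrib_ptr_ref( const struct translate_sse *p,
                                         unsigned var_idx,
                                         unsigned start,
                                         unsigned instance_id )
{
   const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
   const struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
   unsigned index;

   if (varient->instance_divisor)
      index = instance_id / varient->instance_divisor;   /* per-instance fetch */
   else
      index = start;                                     /* per-vertex fetch */

   return (const char *) buffer->base_ptr + buffer->stride * index;
}
#endif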


static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      boolean linear,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   if (linear && p->nr_buffer_varients == 1) {
      return p->idx_EBX;
   }
   else if (linear) {
      struct x86_reg ptr = p->tmp_EAX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer_varient[var_idx].ptr));

      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->tmp_EAX;
      const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].base_ptr));



      /* Calculate pointer to current attrib:
       */
      x86_mov(p->func, ptr, buf_stride);
      x86_imul(p->func, ptr, elt);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}
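
/* The indexed (non-linear) path above recomputes the source address from the
 * current element index; as a scalar sketch (hypothetical helper,
 * illustration only):
 */
#if 0
static const char *attrib_ptr_ref( const struct translate_sse *p,
                                   unsigned var_idx,
                                   unsigned elt )
{
   const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
   const struct translate_buffer *buffer = &p->buffer[varient->buffer_index];

   return (const char *) buffer->base_ptr + buffer->stride * elt;
}
#endif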



static boolean incr_inputs( struct translate_sse *p,
                            boolean linear )
{
   if (linear && p->nr_buffer_varients == 1) {
      struct x86_reg stride = x86_make_disp(p->machine_EDX,
                                            get_offset(p, &p->buffer[0].stride));

      if (p->buffer_varient[0].instance_divisor == 0) {
         x86_add(p->func, p->idx_EBX, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
      }
   }
   else if (linear) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_varients; i++) {
         struct translate_buffer_varient *varient = &p->buffer_varient[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
                                                   get_offset(p, &p->buffer[varient->buffer_index].stride));

         if (varient->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_ptr);
            x86_add(p->func, p->tmp_EAX, buf_stride);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
   }

   return TRUE;
}


/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            unsigned instance_id,
 *            void *output_buffer )
 * or
 *      run_elts( struct translate *machine,
 *                unsigned *elts,
 *                unsigned count,
 *                unsigned instance_id,
 *                void *output_buffer )
 *
 * Lots of hardcoding
 *
 * EAX -- scratch, also holds the current source attribute pointer
 * EBX -- start+i (linear) or &elt[i] (indexed)
 * ECX -- pointer to current output vertex
 * EDX -- pointer to the translate_sse machine
 * ESI -- remaining vertex count, decrements to zero
 */
static boolean build_vertex_emit( struct translate_sse *p,
                                  struct x86_function *func,
                                  boolean linear )
{
   int fixup, label;
   unsigned j;

   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_EBX = x86_make_reg(file_REG32, reg_BX);
   p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
   p->machine_EDX = x86_make_reg(file_REG32, reg_DX);
   p->count_ESI = x86_make_reg(file_REG32, reg_SI);

   p->func = func;
   p->loaded_inv_255 = FALSE;
   p->loaded_255 = FALSE;
   p->loaded_identity = FALSE;

   x86_init_func(p->func);

   /* Push a few regs?
    */
   x86_push(p->func, p->idx_EBX);
   x86_push(p->func, p->count_ESI);

   /* Load arguments into regs:
    */
   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, linear);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
      int last_varient = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned varient = p->element_to_buffer_varient[j];

         /* Figure out source pointer address:
          */
         if (varient != last_varient) {
            last_varient = varient;
            vb = get_buffer_ptr(p, linear, varient, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x86_lea(p->func,
              p->outbuf_ECX,
              x86_make_disp(p->outbuf_ECX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, linear );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_ESI);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */

   x86_pop(p->func, p->count_ESI);
   x86_pop(p->func, p->idx_EBX);
   x86_ret(p->func);

   return TRUE;
}
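
/* In effect, the code emitted above implements the following loop (scalar
 * sketch, hypothetical and for illustration only; "translate one attribute"
 * stands for the per-format load/convert/store helpers above):
 */
#if 0
static void run_reference( struct translate_sse *p,
                           unsigned start,        /* or: const unsigned *elts */
                           unsigned count,
                           unsigned instance_id,
                           void *output_buffer )
{
   char *out = (char *) output_buffer;
   unsigned i, j;

   for (i = 0; i < count; i++) {
      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned var = p->element_to_buffer_varient[j];
         const struct translate_buffer_varient *varient = &p->buffer_varient[var];
         const struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
         unsigned index = varient->instance_divisor
            ? instance_id / varient->instance_divisor
            : start + i;                        /* elts[i] in the indexed variant */
         const char *src = (const char *) buffer->base_ptr
            + buffer->stride * index + a->input_offset;

         /* translate one attribute: src (a->input_format)
          * into out + a->output_offset (a->output_format)
          */
      }
      out += p->translate.key.output_stride;
   }
}
#endif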







static void translate_sse_set_buffer( struct translate *translate,
                                      unsigned buf,
                                      const void *ptr,
                                      unsigned stride )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *)ptr;
      p->buffer[buf].stride = stride;
   }

   if (0) debug_printf("%s %d/%d: %p %d\n",
                       __FUNCTION__, buf,
                       p->nr_buffers,
                       ptr, stride);
}


static void translate_sse_release( struct translate *translate )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   x86_release_func( &p->linear_func );
   x86_release_func( &p->elt_func );

   FREE(p);
}

static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
                                               const unsigned *elts,
                                               unsigned count,
                                               void *output_buffer )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   p->gen_run_elts( translate,
                    elts,
                    count,
                    0,
                    output_buffer );
}

static void PIPE_CDECL translate_sse_run( struct translate *translate,
                                          unsigned start,
                                          unsigned count,
                                          unsigned instance_id,
                                          void *output_buffer )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   p->gen_run( translate,
               start,
               count,
               instance_id,
               output_buffer );
}


struct translate *translate_sse2_create( const struct translate_key *key )
{
   struct translate_sse *p = NULL;
   unsigned i;

   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
      goto fail;

   p = CALLOC_STRUCT( translate_sse );
   if (p == NULL)
      goto fail;

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;
   p->translate.run_elts = translate_sse_run_elts;
   p->translate.run = translate_sse_run;

   for (i = 0; i < key->nr_elements; i++) {
      unsigned j;

      p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + 1 );

      if (key->element[i].instance_divisor) {
         p->use_instancing = TRUE;
      }

      /*
       * Map vertex element to vertex buffer varient.
       */
      for (j = 0; j < p->nr_buffer_varients; j++) {
         if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
             p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
            break;
         }
      }
      if (j == p->nr_buffer_varients) {
         p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
         p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
         p->nr_buffer_varients++;
      }
      p->element_to_buffer_varient[i] = j;
   }

   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, TRUE))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, FALSE))
      goto fail;

   p->gen_run = (run_func)x86_get_func(&p->linear_func);
   if (p->gen_run == NULL)
      goto fail;

   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
   if (p->gen_run_elts == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release( &p->translate );

   return NULL;
}
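
/* Typical use of the returned translate object (hypothetical caller sketch,
 * illustration only; key setup and buffer contents are elided):
 */
#if 0
static void example_usage( const struct translate_key *key,
                           const void *vertex_data,
                           unsigned vertex_stride,
                           unsigned num_vertices,
                           void *output_vertices )
{
   struct translate *t = translate_sse2_create( key );

   if (t) {
      t->set_buffer( t, 0, vertex_data, vertex_stride );   /* bind buffer 0 */
      t->run( t, 0, num_vertices, 0, output_vertices );    /* instance_id 0 */
      t->release( t );
   }
}
#endif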



#else

struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}

#endif