Merge commit 'origin/gallium-master-merge'
[mesa.git] / src / gallium / auxiliary / translate / translate_sse.c
1 /*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 * Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_math.h"
33
34 #include "translate.h"
35
36
37 #if defined(PIPE_ARCH_X86)
38
39 #include "rtasm/rtasm_cpu.h"
40 #include "rtasm/rtasm_x86sse.h"
41
42
/* Vector component indices, used as arguments to SHUF() when building
 * shufps selectors.
 */
#define X   0
#define Y   1
#define Z   2
#define W   3
47
48
49 typedef void (PIPE_CDECL *run_func)( struct translate *translate,
50 unsigned start,
51 unsigned count,
52 void *output_buffer );
53
54 typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
55 const unsigned *elts,
56 unsigned count,
57 void *output_buffer );
58
/* Per-input-buffer state.  The generated code reads and writes these
 * fields through displacements from the translate_sse pointer.
 */
struct translate_buffer {
   const void *base_ptr;   /* start of the buffer, as given to set_buffer */
   unsigned stride;        /* byte distance between consecutive vertices */
   void *ptr;              /* current vertex pointer, updated per vertex */
};
64
65
66 struct translate_sse {
67 struct translate translate;
68
69 struct x86_function linear_func;
70 struct x86_function elt_func;
71 struct x86_function *func;
72
73 boolean loaded_identity;
74 boolean loaded_255;
75 boolean loaded_inv_255;
76
77 float identity[4];
78 float float_255[4];
79 float inv_255[4];
80
81 struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
82 unsigned nr_buffers;
83
84 run_func gen_run;
85 run_elts_func gen_run_elts;
86
87 /* these are actually known values, but putting them in a struct
88 * like this is helpful to keep them in sync across the file.
89 */
90 struct x86_reg tmp_EAX;
91 struct x86_reg idx_EBX; /* either start+i or &elt[i] */
92 struct x86_reg outbuf_ECX;
93 struct x86_reg machine_EDX;
94 struct x86_reg count_ESI; /* decrements to zero */
95 };
96
/* Byte distance from address `a` to address `b` (i.e. b - a), as an int.
 * Used to turn the address of a struct member into a displacement from
 * the struct's base pointer.
 */
static int get_offset( const void *a, const void *b )
{
   const char *from = (const char *)a;
   const char *to = (const char *)b;

   return (int)(to - from);
}
101
102
103
104 static struct x86_reg get_identity( struct translate_sse *p )
105 {
106 struct x86_reg reg = x86_make_reg(file_XMM, 6);
107
108 if (!p->loaded_identity) {
109 p->loaded_identity = TRUE;
110 p->identity[0] = 0;
111 p->identity[1] = 0;
112 p->identity[2] = 0;
113 p->identity[3] = 1;
114
115 sse_movups(p->func, reg,
116 x86_make_disp(p->machine_EDX,
117 get_offset(p, &p->identity[0])));
118 }
119
120 return reg;
121 }
122
123 static struct x86_reg get_255( struct translate_sse *p )
124 {
125 struct x86_reg reg = x86_make_reg(file_XMM, 7);
126
127 if (!p->loaded_255) {
128 p->loaded_255 = TRUE;
129 p->float_255[0] =
130 p->float_255[1] =
131 p->float_255[2] =
132 p->float_255[3] = 255.0f;
133
134 sse_movups(p->func, reg,
135 x86_make_disp(p->machine_EDX,
136 get_offset(p, &p->float_255[0])));
137 }
138
139 return reg;
140 }
141
142 static struct x86_reg get_inv_255( struct translate_sse *p )
143 {
144 struct x86_reg reg = x86_make_reg(file_XMM, 5);
145
146 if (!p->loaded_inv_255) {
147 p->loaded_inv_255 = TRUE;
148 p->inv_255[0] =
149 p->inv_255[1] =
150 p->inv_255[2] =
151 p->inv_255[3] = 1.0f / 255.0f;
152
153 sse_movups(p->func, reg,
154 x86_make_disp(p->machine_EDX,
155 get_offset(p, &p->inv_255[0])));
156 }
157
158 return reg;
159 }
160
161
162 static void emit_load_R32G32B32A32( struct translate_sse *p,
163 struct x86_reg data,
164 struct x86_reg arg0 )
165 {
166 sse_movups(p->func, data, arg0);
167 }
168
169 static void emit_load_R32G32B32( struct translate_sse *p,
170 struct x86_reg data,
171 struct x86_reg arg0 )
172 {
173 /* Have to jump through some hoops:
174 *
175 * c 0 0 0
176 * c 0 0 1
177 * 0 0 c 1
178 * a b c 1
179 */
180 sse_movss(p->func, data, x86_make_disp(arg0, 8));
181 sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
182 sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
183 sse_movlps(p->func, data, arg0);
184 }
185
186 static void emit_load_R32G32( struct translate_sse *p,
187 struct x86_reg data,
188 struct x86_reg arg0 )
189 {
190 /* 0 0 0 1
191 * a b 0 1
192 */
193 sse_movups(p->func, data, get_identity(p) );
194 sse_movlps(p->func, data, arg0);
195 }
196
197
198 static void emit_load_R32( struct translate_sse *p,
199 struct x86_reg data,
200 struct x86_reg arg0 )
201 {
202 /* a 0 0 0
203 * a 0 0 1
204 */
205 sse_movss(p->func, data, arg0);
206 sse_orps(p->func, data, get_identity(p) );
207 }
208
209
210 static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
211 struct x86_reg data,
212 struct x86_reg src )
213 {
214
215 /* Load and unpack twice:
216 */
217 sse_movss(p->func, data, src);
218 sse2_punpcklbw(p->func, data, get_identity(p));
219 sse2_punpcklbw(p->func, data, get_identity(p));
220
221 /* Convert to float:
222 */
223 sse2_cvtdq2ps(p->func, data, data);
224
225
226 /* Scale by 1/255.0
227 */
228 sse_mulps(p->func, data, get_inv_255(p));
229 }
230
231
232
233
234 static void emit_store_R32G32B32A32( struct translate_sse *p,
235 struct x86_reg dest,
236 struct x86_reg dataXMM )
237 {
238 sse_movups(p->func, dest, dataXMM);
239 }
240
241 static void emit_store_R32G32B32( struct translate_sse *p,
242 struct x86_reg dest,
243 struct x86_reg dataXMM )
244 {
245 /* Emit two, shuffle, emit one.
246 */
247 sse_movlps(p->func, dest, dataXMM);
248 sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
249 sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
250 }
251
252 static void emit_store_R32G32( struct translate_sse *p,
253 struct x86_reg dest,
254 struct x86_reg dataXMM )
255 {
256 sse_movlps(p->func, dest, dataXMM);
257 }
258
259 static void emit_store_R32( struct translate_sse *p,
260 struct x86_reg dest,
261 struct x86_reg dataXMM )
262 {
263 sse_movss(p->func, dest, dataXMM);
264 }
265
266
267
268 static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
269 struct x86_reg dest,
270 struct x86_reg dataXMM )
271 {
272 /* Scale by 255.0
273 */
274 sse_mulps(p->func, dataXMM, get_255(p));
275
276 /* Pack and emit:
277 */
278 sse2_cvtps2dq(p->func, dataXMM, dataXMM);
279 sse2_packssdw(p->func, dataXMM, dataXMM);
280 sse2_packuswb(p->func, dataXMM, dataXMM);
281 sse_movss(p->func, dest, dataXMM);
282 }
283
284
285
286
287
288 /* Extended swizzles? Maybe later.
289 */
290 static void emit_swizzle( struct translate_sse *p,
291 struct x86_reg dest,
292 struct x86_reg src,
293 unsigned char shuffle )
294 {
295 sse_shufps(p->func, dest, src, shuffle);
296 }
297
298
299 static boolean translate_attr( struct translate_sse *p,
300 const struct translate_element *a,
301 struct x86_reg srcECX,
302 struct x86_reg dstEAX)
303 {
304 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
305
306 switch (a->input_format) {
307 case PIPE_FORMAT_R32_FLOAT:
308 emit_load_R32(p, dataXMM, srcECX);
309 break;
310 case PIPE_FORMAT_R32G32_FLOAT:
311 emit_load_R32G32(p, dataXMM, srcECX);
312 break;
313 case PIPE_FORMAT_R32G32B32_FLOAT:
314 emit_load_R32G32B32(p, dataXMM, srcECX);
315 break;
316 case PIPE_FORMAT_R32G32B32A32_FLOAT:
317 emit_load_R32G32B32A32(p, dataXMM, srcECX);
318 break;
319 case PIPE_FORMAT_B8G8R8A8_UNORM:
320 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
321 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
322 break;
323 case PIPE_FORMAT_R8G8B8A8_UNORM:
324 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
325 break;
326 default:
327 return FALSE;
328 }
329
330 switch (a->output_format) {
331 case PIPE_FORMAT_R32_FLOAT:
332 emit_store_R32(p, dstEAX, dataXMM);
333 break;
334 case PIPE_FORMAT_R32G32_FLOAT:
335 emit_store_R32G32(p, dstEAX, dataXMM);
336 break;
337 case PIPE_FORMAT_R32G32B32_FLOAT:
338 emit_store_R32G32B32(p, dstEAX, dataXMM);
339 break;
340 case PIPE_FORMAT_R32G32B32A32_FLOAT:
341 emit_store_R32G32B32A32(p, dstEAX, dataXMM);
342 break;
343 case PIPE_FORMAT_B8G8R8A8_UNORM:
344 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
345 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
346 break;
347 case PIPE_FORMAT_R8G8B8A8_UNORM:
348 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
349 break;
350 default:
351 return FALSE;
352 }
353
354 return TRUE;
355 }
356
357
358 static boolean init_inputs( struct translate_sse *p,
359 boolean linear )
360 {
361 unsigned i;
362 if (linear) {
363 for (i = 0; i < p->nr_buffers; i++) {
364 struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
365 get_offset(p, &p->buffer[i].stride));
366 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
367 get_offset(p, &p->buffer[i].ptr));
368 struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
369 get_offset(p, &p->buffer[i].base_ptr));
370 struct x86_reg elt = p->idx_EBX;
371 struct x86_reg tmp = p->tmp_EAX;
372
373
374 /* Calculate pointer to first attrib:
375 */
376 x86_mov(p->func, tmp, buf_stride);
377 x86_imul(p->func, tmp, elt);
378 x86_add(p->func, tmp, buf_base_ptr);
379
380
381 /* In the linear case, keep the buffer pointer instead of the
382 * index number.
383 */
384 if (p->nr_buffers == 1)
385 x86_mov( p->func, elt, tmp );
386 else
387 x86_mov( p->func, buf_ptr, tmp );
388 }
389 }
390
391 return TRUE;
392 }
393
394
395 static struct x86_reg get_buffer_ptr( struct translate_sse *p,
396 boolean linear,
397 unsigned buf_idx,
398 struct x86_reg elt )
399 {
400 if (linear && p->nr_buffers == 1) {
401 return p->idx_EBX;
402 }
403 else if (linear) {
404 struct x86_reg ptr = p->tmp_EAX;
405 struct x86_reg buf_ptr =
406 x86_make_disp(p->machine_EDX,
407 get_offset(p, &p->buffer[buf_idx].ptr));
408
409 x86_mov(p->func, ptr, buf_ptr);
410 return ptr;
411 }
412 else {
413 struct x86_reg ptr = p->tmp_EAX;
414
415 struct x86_reg buf_stride =
416 x86_make_disp(p->machine_EDX,
417 get_offset(p, &p->buffer[buf_idx].stride));
418
419 struct x86_reg buf_base_ptr =
420 x86_make_disp(p->machine_EDX,
421 get_offset(p, &p->buffer[buf_idx].base_ptr));
422
423
424
425 /* Calculate pointer to current attrib:
426 */
427 x86_mov(p->func, ptr, buf_stride);
428 x86_imul(p->func, ptr, elt);
429 x86_add(p->func, ptr, buf_base_ptr);
430 return ptr;
431 }
432 }
433
434
435
436 static boolean incr_inputs( struct translate_sse *p,
437 boolean linear )
438 {
439 if (linear && p->nr_buffers == 1) {
440 struct x86_reg stride = x86_make_disp(p->machine_EDX,
441 get_offset(p, &p->buffer[0].stride));
442
443 x86_add(p->func, p->idx_EBX, stride);
444 sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
445 }
446 else if (linear) {
447 unsigned i;
448
449 /* Is this worthwhile??
450 */
451 for (i = 0; i < p->nr_buffers; i++) {
452 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
453 get_offset(p, &p->buffer[i].ptr));
454 struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
455 get_offset(p, &p->buffer[i].stride));
456
457 x86_mov(p->func, p->tmp_EAX, buf_ptr);
458 x86_add(p->func, p->tmp_EAX, buf_stride);
459 if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
460 x86_mov(p->func, buf_ptr, p->tmp_EAX);
461 }
462 }
463 else {
464 x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
465 }
466
467 return TRUE;
468 }
469
470
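/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            const unsigned *elts,
 *            unsigned count,
 *            void *output_buffer )
 *
 * Lots of hardcoding.  Register usage in the generated code:
 *
 * EAX -- scratch; holds the pointer to the current source buffer
 * EBX -- start+i or &elts[i] (in the linear, single-buffer case it is
 *        the source pointer itself)
 * ECX -- pointer to current output vertex
 * EDX -- pointer to the translate machine
 * ESI -- remaining vertex count, decrements to zero
 *
 */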
487 static boolean build_vertex_emit( struct translate_sse *p,
488 struct x86_function *func,
489 boolean linear )
490 {
491 int fixup, label;
492 unsigned j;
493
494 p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
495 p->idx_EBX = x86_make_reg(file_REG32, reg_BX);
496 p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
497 p->machine_EDX = x86_make_reg(file_REG32, reg_DX);
498 p->count_ESI = x86_make_reg(file_REG32, reg_SI);
499
500 p->func = func;
501 p->loaded_inv_255 = FALSE;
502 p->loaded_255 = FALSE;
503 p->loaded_identity = FALSE;
504
505 x86_init_func(p->func);
506
507 /* Push a few regs?
508 */
509 x86_push(p->func, p->idx_EBX);
510 x86_push(p->func, p->count_ESI);
511
512 /* Load arguments into regs:
513 */
514 x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
515 x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
516 x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
517 x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 4));
518
519 /* Get vertex count, compare to zero
520 */
521 x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
522 x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
523 fixup = x86_jcc_forward(p->func, cc_E);
524
525 /* always load, needed or not:
526 */
527 init_inputs(p, linear);
528
529 /* Note address for loop jump
530 */
531 label = x86_get_label(p->func);
532 {
533 struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
534 int last_vb = -1;
535 struct x86_reg vb;
536
537 for (j = 0; j < p->translate.key.nr_elements; j++) {
538 const struct translate_element *a = &p->translate.key.element[j];
539
540 /* Figure out source pointer address:
541 */
542 if (a->input_buffer != last_vb) {
543 last_vb = a->input_buffer;
544 vb = get_buffer_ptr(p, linear, a->input_buffer, elt);
545 }
546
547 if (!translate_attr( p, a,
548 x86_make_disp(vb, a->input_offset),
549 x86_make_disp(p->outbuf_ECX, a->output_offset)))
550 return FALSE;
551 }
552
553 /* Next output vertex:
554 */
555 x86_lea(p->func,
556 p->outbuf_ECX,
557 x86_make_disp(p->outbuf_ECX,
558 p->translate.key.output_stride));
559
560 /* Incr index
561 */
562 incr_inputs( p, linear );
563 }
564
565 /* decr count, loop if not zero
566 */
567 x86_dec(p->func, p->count_ESI);
568 x86_jcc(p->func, cc_NZ, label);
569
570 /* Exit mmx state?
571 */
572 if (p->func->need_emms)
573 mmx_emms(p->func);
574
575 /* Land forward jump here:
576 */
577 x86_fixup_fwd_jump(p->func, fixup);
578
579 /* Pop regs and return
580 */
581
582 x86_pop(p->func, p->count_ESI);
583 x86_pop(p->func, p->idx_EBX);
584 x86_ret(p->func);
585
586 return TRUE;
587 }
588
589
590
591
592
593
594
595 static void translate_sse_set_buffer( struct translate *translate,
596 unsigned buf,
597 const void *ptr,
598 unsigned stride )
599 {
600 struct translate_sse *p = (struct translate_sse *)translate;
601
602 if (buf < p->nr_buffers) {
603 p->buffer[buf].base_ptr = (char *)ptr;
604 p->buffer[buf].stride = stride;
605 }
606
607 if (0) debug_printf("%s %d/%d: %p %d\n",
608 __FUNCTION__, buf,
609 p->nr_buffers,
610 ptr, stride);
611 }
612
613
614 static void translate_sse_release( struct translate *translate )
615 {
616 struct translate_sse *p = (struct translate_sse *)translate;
617
618 x86_release_func( &p->linear_func );
619 x86_release_func( &p->elt_func );
620
621 FREE(p);
622 }
623
624 static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
625 const unsigned *elts,
626 unsigned count,
627 void *output_buffer )
628 {
629 struct translate_sse *p = (struct translate_sse *)translate;
630
631 p->gen_run_elts( translate,
632 elts,
633 count,
634 output_buffer );
635 }
636
637 static void PIPE_CDECL translate_sse_run( struct translate *translate,
638 unsigned start,
639 unsigned count,
640 void *output_buffer )
641 {
642 struct translate_sse *p = (struct translate_sse *)translate;
643
644 p->gen_run( translate,
645 start,
646 count,
647 output_buffer );
648 }
649
650
651 struct translate *translate_sse2_create( const struct translate_key *key )
652 {
653 struct translate_sse *p = NULL;
654 unsigned i;
655
656 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
657 goto fail;
658
659 p = CALLOC_STRUCT( translate_sse );
660 if (p == NULL)
661 goto fail;
662
663 p->translate.key = *key;
664 p->translate.release = translate_sse_release;
665 p->translate.set_buffer = translate_sse_set_buffer;
666 p->translate.run_elts = translate_sse_run_elts;
667 p->translate.run = translate_sse_run;
668
669 for (i = 0; i < key->nr_elements; i++)
670 p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + 1 );
671
672 if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
673
674 if (!build_vertex_emit(p, &p->linear_func, TRUE))
675 goto fail;
676
677 if (!build_vertex_emit(p, &p->elt_func, FALSE))
678 goto fail;
679
680 p->gen_run = (run_func)x86_get_func(&p->linear_func);
681 if (p->gen_run == NULL)
682 goto fail;
683
684 p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
685 if (p->gen_run_elts == NULL)
686 goto fail;
687
688 return &p->translate;
689
690 fail:
691 if (p)
692 translate_sse_release( &p->translate );
693
694 return NULL;
695 }
696
697
698
699 #else
700
/* Fallback used when the SSE code generator is compiled out: no
 * translate object can be created, so always report failure.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   (void) key;   /* unused */

   return NULL;
}
705
706 #endif