Merge commit 'origin/master' into gallium-0.2
[mesa.git] / src / gallium / auxiliary / translate / translate_sse.c
1 /*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 * Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_simple_list.h"
33
34 #include "translate.h"
35
36
37 #if defined(PIPE_ARCH_X86)
38
39 #include "rtasm/rtasm_cpu.h"
40 #include "rtasm/rtasm_x86sse.h"
41
42
43 #define X 0
44 #define Y 1
45 #define Z 2
46 #define W 3
47
48
49 typedef void (PIPE_CDECL *run_func)( struct translate *translate,
50 unsigned start,
51 unsigned count,
52 void *output_buffer );
53
54 typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
55 const unsigned *elts,
56 unsigned count,
57 void *output_buffer );
58
59
60
61 struct translate_sse {
62 struct translate translate;
63
64 struct x86_function linear_func;
65 struct x86_function elt_func;
66 struct x86_function *func;
67
68 boolean loaded_identity;
69 boolean loaded_255;
70 boolean loaded_inv_255;
71
72 float identity[4];
73 float float_255[4];
74 float inv_255[4];
75
76 struct {
77 char *input_ptr;
78 unsigned input_stride;
79 } attrib[PIPE_MAX_ATTRIBS];
80
81 run_func gen_run;
82 run_elts_func gen_run_elts;
83
84 };
85
/* Byte distance from address 'a' to address 'b' (negative when 'b' lies
 * before 'a').  Used to turn the address of a struct member into a
 * displacement from the start of the struct.
 */
static int get_offset( const void *a, const void *b )
{
   const char *from = (const char *)a;
   const char *to = (const char *)b;

   return to - from;
}
90
91
92
93 static struct x86_reg get_identity( struct translate_sse *p )
94 {
95 struct x86_reg reg = x86_make_reg(file_XMM, 6);
96
97 if (!p->loaded_identity) {
98 /* Nasty:
99 */
100 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
101
102 p->loaded_identity = TRUE;
103 p->identity[0] = 0;
104 p->identity[1] = 0;
105 p->identity[2] = 0;
106 p->identity[3] = 1;
107
108 sse_movups(p->func, reg,
109 x86_make_disp(translateESI,
110 get_offset(p, &p->identity[0])));
111 }
112
113 return reg;
114 }
115
116 static struct x86_reg get_255( struct translate_sse *p )
117 {
118 struct x86_reg reg = x86_make_reg(file_XMM, 6);
119
120 if (!p->loaded_255) {
121 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
122
123 p->loaded_255 = TRUE;
124 p->float_255[0] =
125 p->float_255[1] =
126 p->float_255[2] =
127 p->float_255[3] = 255.0f;
128
129 sse_movups(p->func, reg,
130 x86_make_disp(translateESI,
131 get_offset(p, &p->float_255[0])));
132 }
133
134 return reg;
135 return x86_make_reg(file_XMM, 7);
136 }
137
138 static struct x86_reg get_inv_255( struct translate_sse *p )
139 {
140 struct x86_reg reg = x86_make_reg(file_XMM, 5);
141
142 if (!p->loaded_inv_255) {
143 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
144
145 p->loaded_inv_255 = TRUE;
146 p->inv_255[0] =
147 p->inv_255[1] =
148 p->inv_255[2] =
149 p->inv_255[3] = 1.0f / 255.0f;
150
151 sse_movups(p->func, reg,
152 x86_make_disp(translateESI,
153 get_offset(p, &p->inv_255[0])));
154 }
155
156 return reg;
157 }
158
159
160 static void emit_load_R32G32B32A32( struct translate_sse *p,
161 struct x86_reg data,
162 struct x86_reg arg0 )
163 {
164 sse_movups(p->func, data, arg0);
165 }
166
167 static void emit_load_R32G32B32( struct translate_sse *p,
168 struct x86_reg data,
169 struct x86_reg arg0 )
170 {
171 /* Have to jump through some hoops:
172 *
173 * c 0 0 0
174 * c 0 0 1
175 * 0 0 c 1
176 * a b c 1
177 */
178 sse_movss(p->func, data, x86_make_disp(arg0, 8));
179 sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
180 sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
181 sse_movlps(p->func, data, arg0);
182 }
183
184 static void emit_load_R32G32( struct translate_sse *p,
185 struct x86_reg data,
186 struct x86_reg arg0 )
187 {
188 /* 0 0 0 1
189 * a b 0 1
190 */
191 sse_movups(p->func, data, get_identity(p) );
192 sse_movlps(p->func, data, arg0);
193 }
194
195
196 static void emit_load_R32( struct translate_sse *p,
197 struct x86_reg data,
198 struct x86_reg arg0 )
199 {
200 /* a 0 0 0
201 * a 0 0 1
202 */
203 sse_movss(p->func, data, arg0);
204 sse_orps(p->func, data, get_identity(p) );
205 }
206
207
208 static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
209 struct x86_reg data,
210 struct x86_reg src )
211 {
212
213 /* Load and unpack twice:
214 */
215 sse_movss(p->func, data, src);
216 sse2_punpcklbw(p->func, data, get_identity(p));
217 sse2_punpcklbw(p->func, data, get_identity(p));
218
219 /* Convert to float:
220 */
221 sse2_cvtdq2ps(p->func, data, data);
222
223
224 /* Scale by 1/255.0
225 */
226 sse_mulps(p->func, data, get_inv_255(p));
227 }
228
229
230
231
232 static void emit_store_R32G32B32A32( struct translate_sse *p,
233 struct x86_reg dest,
234 struct x86_reg dataXMM )
235 {
236 sse_movups(p->func, dest, dataXMM);
237 }
238
239 static void emit_store_R32G32B32( struct translate_sse *p,
240 struct x86_reg dest,
241 struct x86_reg dataXMM )
242 {
243 /* Emit two, shuffle, emit one.
244 */
245 sse_movlps(p->func, dest, dataXMM);
246 sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
247 sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
248 }
249
250 static void emit_store_R32G32( struct translate_sse *p,
251 struct x86_reg dest,
252 struct x86_reg dataXMM )
253 {
254 sse_movlps(p->func, dest, dataXMM);
255 }
256
257 static void emit_store_R32( struct translate_sse *p,
258 struct x86_reg dest,
259 struct x86_reg dataXMM )
260 {
261 sse_movss(p->func, dest, dataXMM);
262 }
263
264
265
266 static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
267 struct x86_reg dest,
268 struct x86_reg dataXMM )
269 {
270 /* Scale by 255.0
271 */
272 sse_mulps(p->func, dataXMM, get_255(p));
273
274 /* Pack and emit:
275 */
276 sse2_cvtps2dq(p->func, dataXMM, dataXMM);
277 sse2_packssdw(p->func, dataXMM, dataXMM);
278 sse2_packuswb(p->func, dataXMM, dataXMM);
279 sse_movss(p->func, dest, dataXMM);
280 }
281
282
283
284
285
286 static void get_src_ptr( struct translate_sse *p,
287 struct x86_reg srcEAX,
288 struct x86_reg translateREG,
289 struct x86_reg eltREG,
290 unsigned a )
291 {
292 struct x86_reg input_ptr =
293 x86_make_disp(translateREG,
294 get_offset(p, &p->attrib[a].input_ptr));
295
296 struct x86_reg input_stride =
297 x86_make_disp(translateREG,
298 get_offset(p, &p->attrib[a].input_stride));
299
300 /* Calculate pointer to current attrib:
301 */
302 x86_mov(p->func, srcEAX, input_stride);
303 x86_imul(p->func, srcEAX, eltREG);
304 x86_add(p->func, srcEAX, input_ptr);
305 }
306
307
308 /* Extended swizzles? Maybe later.
309 */
310 static void emit_swizzle( struct translate_sse *p,
311 struct x86_reg dest,
312 struct x86_reg src,
313 unsigned char shuffle )
314 {
315 sse_shufps(p->func, dest, src, shuffle);
316 }
317
318
319 static boolean translate_attr( struct translate_sse *p,
320 const struct translate_element *a,
321 struct x86_reg srcECX,
322 struct x86_reg dstEAX)
323 {
324 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
325
326 switch (a->input_format) {
327 case PIPE_FORMAT_R32_FLOAT:
328 emit_load_R32(p, dataXMM, srcECX);
329 break;
330 case PIPE_FORMAT_R32G32_FLOAT:
331 emit_load_R32G32(p, dataXMM, srcECX);
332 break;
333 case PIPE_FORMAT_R32G32B32_FLOAT:
334 emit_load_R32G32B32(p, dataXMM, srcECX);
335 break;
336 case PIPE_FORMAT_R32G32B32A32_FLOAT:
337 emit_load_R32G32B32A32(p, dataXMM, srcECX);
338 break;
339 case PIPE_FORMAT_B8G8R8A8_UNORM:
340 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
341 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
342 break;
343 case PIPE_FORMAT_R8G8B8A8_UNORM:
344 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
345 break;
346 default:
347 return FALSE;
348 }
349
350 switch (a->output_format) {
351 case PIPE_FORMAT_R32_FLOAT:
352 emit_store_R32(p, dstEAX, dataXMM);
353 break;
354 case PIPE_FORMAT_R32G32_FLOAT:
355 emit_store_R32G32(p, dstEAX, dataXMM);
356 break;
357 case PIPE_FORMAT_R32G32B32_FLOAT:
358 emit_store_R32G32B32(p, dstEAX, dataXMM);
359 break;
360 case PIPE_FORMAT_R32G32B32A32_FLOAT:
361 emit_store_R32G32B32A32(p, dstEAX, dataXMM);
362 break;
363 case PIPE_FORMAT_B8G8R8A8_UNORM:
364 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
365 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
366 break;
367 case PIPE_FORMAT_R8G8B8A8_UNORM:
368 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
369 break;
370 default:
371 return FALSE;
372 }
373
374 return TRUE;
375 }
376
377 /* Build run( struct translate *translate,
378 * unsigned start,
379 * unsigned count,
380 * void *output_buffer )
381 * or
382 * run_elts( struct translate *translate,
383 * unsigned *elts,
384 * unsigned count,
385 * void *output_buffer )
386 *
387 * Lots of hardcoding
388 *
389 * EAX -- pointer to current output vertex
390 * ECX -- pointer to current attribute
391 *
392 */
393 static boolean build_vertex_emit( struct translate_sse *p,
394 struct x86_function *func,
395 boolean linear )
396 {
397 struct x86_reg vertexECX = x86_make_reg(file_REG32, reg_AX);
398 struct x86_reg idxEBX = x86_make_reg(file_REG32, reg_BX);
399 struct x86_reg srcEAX = x86_make_reg(file_REG32, reg_CX);
400 struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
401 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
402 int fixup, label;
403 unsigned j;
404
405 p->func = func;
406 p->loaded_inv_255 = FALSE;
407 p->loaded_255 = FALSE;
408 p->loaded_identity = FALSE;
409
410 x86_init_func(p->func);
411
412 /* Push a few regs?
413 */
414 x86_push(p->func, countEBP);
415 x86_push(p->func, translateESI);
416 x86_push(p->func, idxEBX);
417
418 /* Get vertex count, compare to zero
419 */
420 x86_xor(p->func, idxEBX, idxEBX);
421 x86_mov(p->func, countEBP, x86_fn_arg(p->func, 3));
422 x86_cmp(p->func, countEBP, idxEBX);
423 fixup = x86_jcc_forward(p->func, cc_E);
424
425 /* If linear, idx is the current element, otherwise it is a pointer
426 * to the current element.
427 */
428 x86_mov(p->func, idxEBX, x86_fn_arg(p->func, 2));
429
430 /* Initialize destination register.
431 */
432 x86_mov(p->func, vertexECX, x86_fn_arg(p->func, 4));
433
434 /* Move argument 1 (translate_sse pointer) into a reg:
435 */
436 x86_mov(p->func, translateESI, x86_fn_arg(p->func, 1));
437
438
439 /* always load, needed or not:
440 */
441
442 /* Note address for loop jump */
443 label = x86_get_label(p->func);
444
445
446 for (j = 0; j < p->translate.key.nr_elements; j++) {
447 const struct translate_element *a = &p->translate.key.element[j];
448
449 struct x86_reg destEAX = x86_make_disp(vertexECX,
450 a->output_offset);
451
452 /* Figure out source pointer address:
453 */
454 if (linear) {
455 get_src_ptr(p, srcEAX, translateESI, idxEBX, j);
456 }
457 else {
458 get_src_ptr(p, srcEAX, translateESI, x86_deref(idxEBX), j);
459 }
460
461 if (!translate_attr( p, a, x86_deref(srcEAX), destEAX ))
462 return FALSE;
463 }
464
465 /* Next vertex:
466 */
467 x86_lea(p->func, vertexECX, x86_make_disp(vertexECX, p->translate.key.output_stride));
468
469 /* Incr index
470 */
471 if (linear) {
472 x86_inc(p->func, idxEBX);
473 }
474 else {
475 x86_lea(p->func, idxEBX, x86_make_disp(idxEBX, 4));
476 }
477
478 /* decr count, loop if not zero
479 */
480 x86_dec(p->func, countEBP);
481 x86_test(p->func, countEBP, countEBP);
482 x86_jcc(p->func, cc_NZ, label);
483
484 /* Exit mmx state?
485 */
486 if (p->func->need_emms)
487 mmx_emms(p->func);
488
489 /* Land forward jump here:
490 */
491 x86_fixup_fwd_jump(p->func, fixup);
492
493 /* Pop regs and return
494 */
495
496 x86_pop(p->func, idxEBX);
497 x86_pop(p->func, translateESI);
498 x86_pop(p->func, countEBP);
499 x86_ret(p->func);
500
501 return TRUE;
502 }
503
504
505
506
507
508
509
510 static void translate_sse_set_buffer( struct translate *translate,
511 unsigned buf,
512 const void *ptr,
513 unsigned stride )
514 {
515 struct translate_sse *p = (struct translate_sse *)translate;
516 unsigned i;
517
518 for (i = 0; i < p->translate.key.nr_elements; i++) {
519 if (p->translate.key.element[i].input_buffer == buf) {
520 p->attrib[i].input_ptr = ((char *)ptr +
521 p->translate.key.element[i].input_offset);
522 p->attrib[i].input_stride = stride;
523 }
524 }
525 }
526
527
528 static void translate_sse_release( struct translate *translate )
529 {
530 struct translate_sse *p = (struct translate_sse *)translate;
531
532 x86_release_func( &p->linear_func );
533 x86_release_func( &p->elt_func );
534
535 FREE(p);
536 }
537
538 static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
539 const unsigned *elts,
540 unsigned count,
541 void *output_buffer )
542 {
543 struct translate_sse *p = (struct translate_sse *)translate;
544
545 p->gen_run_elts( translate,
546 elts,
547 count,
548 output_buffer );
549 }
550
551 static void PIPE_CDECL translate_sse_run( struct translate *translate,
552 unsigned start,
553 unsigned count,
554 void *output_buffer )
555 {
556 struct translate_sse *p = (struct translate_sse *)translate;
557
558 p->gen_run( translate,
559 start,
560 count,
561 output_buffer );
562 }
563
564
565 struct translate *translate_sse2_create( const struct translate_key *key )
566 {
567 struct translate_sse *p = NULL;
568
569 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
570 goto fail;
571
572 p = CALLOC_STRUCT( translate_sse );
573 if (p == NULL)
574 goto fail;
575
576 p->translate.key = *key;
577 p->translate.release = translate_sse_release;
578 p->translate.set_buffer = translate_sse_set_buffer;
579 p->translate.run_elts = translate_sse_run_elts;
580 p->translate.run = translate_sse_run;
581
582 if (!build_vertex_emit(p, &p->linear_func, TRUE))
583 goto fail;
584
585 if (!build_vertex_emit(p, &p->elt_func, FALSE))
586 goto fail;
587
588 p->gen_run = (run_func)x86_get_func(&p->linear_func);
589 if (p->gen_run == NULL)
590 goto fail;
591
592 p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
593 if (p->gen_run_elts == NULL)
594 goto fail;
595
596 return &p->translate;
597
598 fail:
599 if (p)
600 translate_sse_release( &p->translate );
601
602 return NULL;
603 }
604
605
606
607 #else
608
/* Fallback for builds without PIPE_ARCH_X86: there is no SSE2 translate
 * path, so creation always fails and 'key' is ignored.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   (void) key;

   return NULL;
}
613
614 #endif