Merge branch 'gallium-vertex-linear' into gallium-tex-surfaces
[mesa.git] / src / gallium / auxiliary / translate / translate_sse.c
1 /*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 * Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29 #include "pipe/p_compiler.h"
30 #include "pipe/p_util.h"
31 #include "util/u_simple_list.h"
32
33 #include "translate.h"
34
35
36 #if defined(__i386__) || defined(__386__) || defined(i386)
37
38 #include "rtasm/rtasm_cpu.h"
39 #include "rtasm/rtasm_x86sse.h"
40
41
/* Component indices, used as arguments to SHUF() when building shufps
 * selectors.
 */
enum {
   X = 0,
   Y = 1,
   Z = 2,
   W = 3
};
46
47
48 typedef void (PIPE_CDECL *run_func)( struct translate *translate,
49 unsigned start,
50 unsigned count,
51 void *output_buffer );
52
53 typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
54 const unsigned *elts,
55 unsigned count,
56 void *output_buffer );
57
58
59
60 struct translate_sse {
61 struct translate translate;
62
63 struct x86_function linear_func;
64 struct x86_function elt_func;
65 struct x86_function *func;
66
67 boolean loaded_identity;
68 boolean loaded_255;
69 boolean loaded_inv_255;
70
71 float identity[4];
72 float float_255[4];
73 float inv_255[4];
74
75 struct {
76 char *input_ptr;
77 unsigned input_stride;
78 } attrib[PIPE_MAX_ATTRIBS];
79
80 run_func gen_run;
81 run_elts_func gen_run_elts;
82
83 };
84
/**
 * Byte distance from \p a to \p b.
 *
 * Used to turn the address of a translate_sse member into a displacement
 * relative to the start of the struct.
 */
static int get_offset( const void *a, const void *b )
{
   const char *base = (const char *)a;
   const char *target = (const char *)b;

   return target - base;
}
89
90
91
92 static struct x86_reg get_identity( struct translate_sse *p )
93 {
94 struct x86_reg reg = x86_make_reg(file_XMM, 6);
95
96 if (!p->loaded_identity) {
97 /* Nasty:
98 */
99 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
100
101 p->loaded_identity = TRUE;
102 p->identity[0] = 0;
103 p->identity[1] = 0;
104 p->identity[2] = 0;
105 p->identity[3] = 1;
106
107 sse_movups(p->func, reg,
108 x86_make_disp(translateESI,
109 get_offset(p, &p->identity[0])));
110 }
111
112 return reg;
113 }
114
115 static struct x86_reg get_255( struct translate_sse *p )
116 {
117 struct x86_reg reg = x86_make_reg(file_XMM, 6);
118
119 if (!p->loaded_255) {
120 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
121
122 p->loaded_255 = TRUE;
123 p->float_255[0] =
124 p->float_255[1] =
125 p->float_255[2] =
126 p->float_255[3] = 255.0f;
127
128 sse_movups(p->func, reg,
129 x86_make_disp(translateESI,
130 get_offset(p, &p->float_255[0])));
131 }
132
133 return reg;
134 return x86_make_reg(file_XMM, 7);
135 }
136
137 static struct x86_reg get_inv_255( struct translate_sse *p )
138 {
139 struct x86_reg reg = x86_make_reg(file_XMM, 5);
140
141 if (!p->loaded_inv_255) {
142 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
143
144 p->loaded_inv_255 = TRUE;
145 p->inv_255[0] =
146 p->inv_255[1] =
147 p->inv_255[2] =
148 p->inv_255[3] = 1.0f / 255.0f;
149
150 sse_movups(p->func, reg,
151 x86_make_disp(translateESI,
152 get_offset(p, &p->inv_255[0])));
153 }
154
155 return reg;
156 }
157
158
159 static void emit_load_R32G32B32A32( struct translate_sse *p,
160 struct x86_reg data,
161 struct x86_reg arg0 )
162 {
163 sse_movups(p->func, data, arg0);
164 }
165
166 static void emit_load_R32G32B32( struct translate_sse *p,
167 struct x86_reg data,
168 struct x86_reg arg0 )
169 {
170 /* Have to jump through some hoops:
171 *
172 * c 0 0 0
173 * c 0 0 1
174 * 0 0 c 1
175 * a b c 1
176 */
177 sse_movss(p->func, data, x86_make_disp(arg0, 8));
178 sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
179 sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
180 sse_movlps(p->func, data, arg0);
181 }
182
183 static void emit_load_R32G32( struct translate_sse *p,
184 struct x86_reg data,
185 struct x86_reg arg0 )
186 {
187 /* 0 0 0 1
188 * a b 0 1
189 */
190 sse_movups(p->func, data, get_identity(p) );
191 sse_movlps(p->func, data, arg0);
192 }
193
194
195 static void emit_load_R32( struct translate_sse *p,
196 struct x86_reg data,
197 struct x86_reg arg0 )
198 {
199 /* a 0 0 0
200 * a 0 0 1
201 */
202 sse_movss(p->func, data, arg0);
203 sse_orps(p->func, data, get_identity(p) );
204 }
205
206
207 static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
208 struct x86_reg data,
209 struct x86_reg src )
210 {
211
212 /* Load and unpack twice:
213 */
214 sse_movss(p->func, data, src);
215 sse2_punpcklbw(p->func, data, get_identity(p));
216 sse2_punpcklbw(p->func, data, get_identity(p));
217
218 /* Convert to float:
219 */
220 sse2_cvtdq2ps(p->func, data, data);
221
222
223 /* Scale by 1/255.0
224 */
225 sse_mulps(p->func, data, get_inv_255(p));
226 }
227
228
229
230
231 static void emit_store_R32G32B32A32( struct translate_sse *p,
232 struct x86_reg dest,
233 struct x86_reg dataXMM )
234 {
235 sse_movups(p->func, dest, dataXMM);
236 }
237
238 static void emit_store_R32G32B32( struct translate_sse *p,
239 struct x86_reg dest,
240 struct x86_reg dataXMM )
241 {
242 /* Emit two, shuffle, emit one.
243 */
244 sse_movlps(p->func, dest, dataXMM);
245 sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
246 sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
247 }
248
249 static void emit_store_R32G32( struct translate_sse *p,
250 struct x86_reg dest,
251 struct x86_reg dataXMM )
252 {
253 sse_movlps(p->func, dest, dataXMM);
254 }
255
256 static void emit_store_R32( struct translate_sse *p,
257 struct x86_reg dest,
258 struct x86_reg dataXMM )
259 {
260 sse_movss(p->func, dest, dataXMM);
261 }
262
263
264
265 static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
266 struct x86_reg dest,
267 struct x86_reg dataXMM )
268 {
269 /* Scale by 255.0
270 */
271 sse_mulps(p->func, dataXMM, get_255(p));
272
273 /* Pack and emit:
274 */
275 sse2_cvtps2dq(p->func, dataXMM, dataXMM);
276 sse2_packssdw(p->func, dataXMM, dataXMM);
277 sse2_packuswb(p->func, dataXMM, dataXMM);
278 sse_movss(p->func, dest, dataXMM);
279 }
280
281
282
283
284
285 static void get_src_ptr( struct translate_sse *p,
286 struct x86_reg srcEAX,
287 struct x86_reg translateREG,
288 struct x86_reg eltREG,
289 unsigned a )
290 {
291 struct x86_reg input_ptr =
292 x86_make_disp(translateREG,
293 get_offset(p, &p->attrib[a].input_ptr));
294
295 struct x86_reg input_stride =
296 x86_make_disp(translateREG,
297 get_offset(p, &p->attrib[a].input_stride));
298
299 /* Calculate pointer to current attrib:
300 */
301 x86_mov(p->func, srcEAX, input_stride);
302 x86_imul(p->func, srcEAX, eltREG);
303 x86_add(p->func, srcEAX, input_ptr);
304 }
305
306
307 /* Extended swizzles? Maybe later.
308 */
309 static void emit_swizzle( struct translate_sse *p,
310 struct x86_reg dest,
311 struct x86_reg src,
312 unsigned shuffle )
313 {
314 sse_shufps(p->func, dest, src, shuffle);
315 }
316
317
318 static boolean translate_attr( struct translate_sse *p,
319 const struct translate_element *a,
320 struct x86_reg srcECX,
321 struct x86_reg dstEAX)
322 {
323 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
324
325 switch (a->input_format) {
326 case PIPE_FORMAT_R32_FLOAT:
327 emit_load_R32(p, dataXMM, srcECX);
328 break;
329 case PIPE_FORMAT_R32G32_FLOAT:
330 emit_load_R32G32(p, dataXMM, srcECX);
331 break;
332 case PIPE_FORMAT_R32G32B32_FLOAT:
333 emit_load_R32G32B32(p, dataXMM, srcECX);
334 break;
335 case PIPE_FORMAT_R32G32B32A32_FLOAT:
336 emit_load_R32G32B32A32(p, dataXMM, srcECX);
337 break;
338 case PIPE_FORMAT_B8G8R8A8_UNORM:
339 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
340 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
341 break;
342 case PIPE_FORMAT_R8G8B8A8_UNORM:
343 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
344 break;
345 default:
346 return FALSE;
347 }
348
349 switch (a->output_format) {
350 case PIPE_FORMAT_R32_FLOAT:
351 emit_store_R32(p, dstEAX, dataXMM);
352 break;
353 case PIPE_FORMAT_R32G32_FLOAT:
354 emit_store_R32G32(p, dstEAX, dataXMM);
355 break;
356 case PIPE_FORMAT_R32G32B32_FLOAT:
357 emit_store_R32G32B32(p, dstEAX, dataXMM);
358 break;
359 case PIPE_FORMAT_R32G32B32A32_FLOAT:
360 emit_store_R32G32B32A32(p, dstEAX, dataXMM);
361 break;
362 case PIPE_FORMAT_B8G8R8A8_UNORM:
363 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
364 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
365 break;
366 case PIPE_FORMAT_R8G8B8A8_UNORM:
367 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
368 break;
369 default:
370 return FALSE;
371 }
372
373 return TRUE;
374 }
375
376 /* Build run( struct translate *translate,
377 * unsigned start,
378 * unsigned count,
379 * void *output_buffer )
380 * or
381 * run_elts( struct translate *translate,
382 * unsigned *elts,
383 * unsigned count,
384 * void *output_buffer )
385 *
386 * Lots of hardcoding
387 *
388 * EAX -- pointer to current output vertex
389 * ECX -- pointer to current attribute
390 *
391 */
392 static boolean build_vertex_emit( struct translate_sse *p,
393 struct x86_function *func,
394 boolean linear )
395 {
396 struct x86_reg vertexECX = x86_make_reg(file_REG32, reg_AX);
397 struct x86_reg idxEBX = x86_make_reg(file_REG32, reg_BX);
398 struct x86_reg srcEAX = x86_make_reg(file_REG32, reg_CX);
399 struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
400 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
401 int fixup, label;
402 unsigned j;
403
404 p->func = func;
405 p->loaded_inv_255 = FALSE;
406 p->loaded_255 = FALSE;
407 p->loaded_identity = FALSE;
408
409 x86_init_func(p->func);
410
411 /* Push a few regs?
412 */
413 x86_push(p->func, countEBP);
414 x86_push(p->func, translateESI);
415 x86_push(p->func, idxEBX);
416
417 /* Get vertex count, compare to zero
418 */
419 x86_xor(p->func, idxEBX, idxEBX);
420 x86_mov(p->func, countEBP, x86_fn_arg(p->func, 3));
421 x86_cmp(p->func, countEBP, idxEBX);
422 fixup = x86_jcc_forward(p->func, cc_E);
423
424 /* If linear, idx is the current element, otherwise it is a pointer
425 * to the current element.
426 */
427 x86_mov(p->func, idxEBX, x86_fn_arg(p->func, 2));
428
429 /* Initialize destination register.
430 */
431 x86_mov(p->func, vertexECX, x86_fn_arg(p->func, 4));
432
433 /* Move argument 1 (translate_sse pointer) into a reg:
434 */
435 x86_mov(p->func, translateESI, x86_fn_arg(p->func, 1));
436
437
438 /* always load, needed or not:
439 */
440
441 /* Note address for loop jump */
442 label = x86_get_label(p->func);
443
444
445 for (j = 0; j < p->translate.key.nr_elements; j++) {
446 const struct translate_element *a = &p->translate.key.element[j];
447
448 struct x86_reg destEAX = x86_make_disp(vertexECX,
449 a->output_offset);
450
451 /* Figure out source pointer address:
452 */
453 if (linear) {
454 get_src_ptr(p, srcEAX, translateESI, idxEBX, j);
455 }
456 else {
457 get_src_ptr(p, srcEAX, translateESI, x86_deref(idxEBX), j);
458 }
459
460 if (!translate_attr( p, a, x86_deref(srcEAX), destEAX ))
461 return FALSE;
462 }
463
464 /* Next vertex:
465 */
466 x86_lea(p->func, vertexECX, x86_make_disp(vertexECX, p->translate.key.output_stride));
467
468 /* Incr index
469 */
470 if (linear) {
471 x86_inc(p->func, idxEBX);
472 }
473 else {
474 x86_lea(p->func, idxEBX, x86_make_disp(idxEBX, 4));
475 }
476
477 /* decr count, loop if not zero
478 */
479 x86_dec(p->func, countEBP);
480 x86_test(p->func, countEBP, countEBP);
481 x86_jcc(p->func, cc_NZ, label);
482
483 /* Exit mmx state?
484 */
485 if (p->func->need_emms)
486 mmx_emms(p->func);
487
488 /* Land forward jump here:
489 */
490 x86_fixup_fwd_jump(p->func, fixup);
491
492 /* Pop regs and return
493 */
494
495 x86_pop(p->func, idxEBX);
496 x86_pop(p->func, translateESI);
497 x86_pop(p->func, countEBP);
498 x86_ret(p->func);
499
500 return TRUE;
501 }
502
503
504
505
506
507
508
509 static void translate_sse_set_buffer( struct translate *translate,
510 unsigned buf,
511 const void *ptr,
512 unsigned stride )
513 {
514 struct translate_sse *p = (struct translate_sse *)translate;
515 unsigned i;
516
517 for (i = 0; i < p->translate.key.nr_elements; i++) {
518 if (p->translate.key.element[i].input_buffer == buf) {
519 p->attrib[i].input_ptr = ((char *)ptr +
520 p->translate.key.element[i].input_offset);
521 p->attrib[i].input_stride = stride;
522 }
523 }
524 }
525
526
527 static void translate_sse_release( struct translate *translate )
528 {
529 struct translate_sse *p = (struct translate_sse *)translate;
530
531 x86_release_func( &p->linear_func );
532 x86_release_func( &p->elt_func );
533
534 FREE(p);
535 }
536
537 static void translate_sse_run_elts( struct translate *translate,
538 const unsigned *elts,
539 unsigned count,
540 void *output_buffer )
541 {
542 struct translate_sse *p = (struct translate_sse *)translate;
543
544 p->gen_run_elts( translate,
545 elts,
546 count,
547 output_buffer );
548 }
549
550 static void translate_sse_run( struct translate *translate,
551 unsigned start,
552 unsigned count,
553 void *output_buffer )
554 {
555 struct translate_sse *p = (struct translate_sse *)translate;
556
557 p->gen_run( translate,
558 start,
559 count,
560 output_buffer );
561 }
562
563
564 struct translate *translate_sse2_create( const struct translate_key *key )
565 {
566 struct translate_sse *p = NULL;
567
568 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
569 goto fail;
570
571 p = CALLOC_STRUCT( translate_sse );
572 if (p == NULL)
573 goto fail;
574
575 p->translate.key = *key;
576 p->translate.release = translate_sse_release;
577 p->translate.set_buffer = translate_sse_set_buffer;
578 p->translate.run_elts = translate_sse_run_elts;
579 p->translate.run = translate_sse_run;
580
581 if (!build_vertex_emit(p, &p->linear_func, TRUE))
582 goto fail;
583
584 if (!build_vertex_emit(p, &p->elt_func, FALSE))
585 goto fail;
586
587 p->gen_run = (run_func)x86_get_func(&p->linear_func);
588 if (p->gen_run == NULL)
589 goto fail;
590
591 p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
592 if (p->gen_run_elts == NULL)
593 goto fail;
594
595 return &p->translate;
596
597 fail:
598 if (p)
599 translate_sse_release( &p->translate );
600
601 return NULL;
602 }
603
604
605
606 #else
607
/**
 * Stub used when the x86 code generator is compiled out.
 *
 * It has the same name and signature as the x86 implementation so callers
 * build and link on every target.  It always returns NULL.
 *
 * The previous stub was named translate_create_sse and was declared void
 * while returning NULL, so it neither compiled cleanly nor provided the
 * symbol callers reference.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   (void)key;

   return NULL;
}
612
613 #endif