translate: fix several bugs
[mesa.git] / src / gallium / auxiliary / translate / translate_sse.c
1 /*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 * Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29 #include "pipe/p_compiler.h"
30 #include "pipe/p_util.h"
31 #include "util/u_simple_list.h"
32
33 #include "translate.h"
34
35
36 #if defined(__i386__) || defined(__386__) || defined(i386)
37
38 #include "rtasm/rtasm_cpu.h"
39 #include "rtasm/rtasm_x86sse.h"
40
41
42 #define X 0
43 #define Y 1
44 #define Z 2
45 #define W 3
46
47
48 #ifdef WIN32
49 #define RTASM __cdecl
50 #else
51 #define RTASM
52 #endif
53
54 typedef void (RTASM *run_func)( struct translate *translate,
55 unsigned start,
56 unsigned count,
57 void *output_buffer );
58
59 typedef void (RTASM *run_elts_func)( struct translate *translate,
60 const unsigned *elts,
61 unsigned count,
62 void *output_buffer );
63
64
65
66 struct translate_sse {
67 struct translate translate;
68
69 struct x86_function linear_func;
70 struct x86_function elt_func;
71 struct x86_function *func;
72
73 boolean loaded_identity;
74 boolean loaded_255;
75 boolean loaded_inv_255;
76
77 float identity[4];
78 float float_255[4];
79 float inv_255[4];
80
81 struct {
82 char *input_ptr;
83 unsigned input_stride;
84 } attrib[PIPE_MAX_ATTRIBS];
85
86 run_func gen_run;
87 run_elts_func gen_run_elts;
88
89 };
90
/**
 * Byte distance from address 'a' to address 'b' (b - a), as an int.
 * Used to turn the address of a struct member into a displacement
 * relative to the start of the struct.
 */
static int get_offset( const void *a, const void *b )
{
   const char *base = (const char *)a;
   const char *member = (const char *)b;

   return member - base;
}
95
96
97
98 static struct x86_reg get_identity( struct translate_sse *p )
99 {
100 struct x86_reg reg = x86_make_reg(file_XMM, 6);
101
102 if (!p->loaded_identity) {
103 /* Nasty:
104 */
105 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
106
107 p->loaded_identity = TRUE;
108 p->identity[0] = 0;
109 p->identity[1] = 0;
110 p->identity[2] = 0;
111 p->identity[3] = 1;
112
113 sse_movups(p->func, reg,
114 x86_make_disp(translateESI,
115 get_offset(p, &p->identity[0])));
116 }
117
118 return reg;
119 }
120
121 static struct x86_reg get_255( struct translate_sse *p )
122 {
123 struct x86_reg reg = x86_make_reg(file_XMM, 6);
124
125 if (!p->loaded_255) {
126 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
127
128 p->loaded_255 = TRUE;
129 p->float_255[0] =
130 p->float_255[1] =
131 p->float_255[2] =
132 p->float_255[3] = 255.0f;
133
134 sse_movups(p->func, reg,
135 x86_make_disp(translateESI,
136 get_offset(p, &p->float_255[0])));
137 }
138
139 return reg;
140 return x86_make_reg(file_XMM, 7);
141 }
142
143 static struct x86_reg get_inv_255( struct translate_sse *p )
144 {
145 struct x86_reg reg = x86_make_reg(file_XMM, 5);
146
147 if (!p->loaded_inv_255) {
148 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
149
150 p->loaded_inv_255 = TRUE;
151 p->inv_255[0] =
152 p->inv_255[1] =
153 p->inv_255[2] =
154 p->inv_255[3] = 1.0f / 255.0f;
155
156 sse_movups(p->func, reg,
157 x86_make_disp(translateESI,
158 get_offset(p, &p->inv_255[0])));
159 }
160
161 return reg;
162 }
163
164
165 static void emit_load_R32G32B32A32( struct translate_sse *p,
166 struct x86_reg data,
167 struct x86_reg arg0 )
168 {
169 sse_movups(p->func, data, arg0);
170 }
171
172 static void emit_load_R32G32B32( struct translate_sse *p,
173 struct x86_reg data,
174 struct x86_reg arg0 )
175 {
176 /* Have to jump through some hoops:
177 *
178 * c 0 0 0
179 * c 0 0 1
180 * 0 0 c 1
181 * a b c 1
182 */
183 sse_movss(p->func, data, x86_make_disp(arg0, 8));
184 sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
185 sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
186 sse_movlps(p->func, data, arg0);
187 }
188
189 static void emit_load_R32G32( struct translate_sse *p,
190 struct x86_reg data,
191 struct x86_reg arg0 )
192 {
193 /* 0 0 0 1
194 * a b 0 1
195 */
196 sse_movups(p->func, data, get_identity(p) );
197 sse_movlps(p->func, data, arg0);
198 }
199
200
201 static void emit_load_R32( struct translate_sse *p,
202 struct x86_reg data,
203 struct x86_reg arg0 )
204 {
205 /* a 0 0 0
206 * a 0 0 1
207 */
208 sse_movss(p->func, data, arg0);
209 sse_orps(p->func, data, get_identity(p) );
210 }
211
212
213 static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
214 struct x86_reg data,
215 struct x86_reg src )
216 {
217
218 /* Load and unpack twice:
219 */
220 sse_movss(p->func, data, src);
221 sse2_punpcklbw(p->func, data, get_identity(p));
222 sse2_punpcklbw(p->func, data, get_identity(p));
223
224 /* Convert to float:
225 */
226 sse2_cvtdq2ps(p->func, data, data);
227
228
229 /* Scale by 1/255.0
230 */
231 sse_mulps(p->func, data, get_inv_255(p));
232 }
233
234
235
236
237 static void emit_store_R32G32B32A32( struct translate_sse *p,
238 struct x86_reg dest,
239 struct x86_reg dataXMM )
240 {
241 sse_movups(p->func, dest, dataXMM);
242 }
243
244 static void emit_store_R32G32B32( struct translate_sse *p,
245 struct x86_reg dest,
246 struct x86_reg dataXMM )
247 {
248 /* Emit two, shuffle, emit one.
249 */
250 sse_movlps(p->func, dest, dataXMM);
251 sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
252 sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
253 }
254
255 static void emit_store_R32G32( struct translate_sse *p,
256 struct x86_reg dest,
257 struct x86_reg dataXMM )
258 {
259 sse_movlps(p->func, dest, dataXMM);
260 }
261
262 static void emit_store_R32( struct translate_sse *p,
263 struct x86_reg dest,
264 struct x86_reg dataXMM )
265 {
266 sse_movss(p->func, dest, dataXMM);
267 }
268
269
270
271 static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
272 struct x86_reg dest,
273 struct x86_reg dataXMM )
274 {
275 /* Scale by 255.0
276 */
277 sse_mulps(p->func, dataXMM, get_255(p));
278
279 /* Pack and emit:
280 */
281 sse2_cvtps2dq(p->func, dataXMM, dataXMM);
282 sse2_packssdw(p->func, dataXMM, dataXMM);
283 sse2_packuswb(p->func, dataXMM, dataXMM);
284 sse_movss(p->func, dest, dataXMM);
285 }
286
287
288
289
290
291 static void get_src_ptr( struct translate_sse *p,
292 struct x86_reg srcEAX,
293 struct x86_reg translateREG,
294 struct x86_reg eltREG,
295 unsigned a )
296 {
297 struct x86_reg input_ptr =
298 x86_make_disp(translateREG,
299 get_offset(p, &p->attrib[a].input_ptr));
300
301 struct x86_reg input_stride =
302 x86_make_disp(translateREG,
303 get_offset(p, &p->attrib[a].input_stride));
304
305 /* Calculate pointer to current attrib:
306 */
307 x86_mov(p->func, srcEAX, input_stride);
308 x86_imul(p->func, srcEAX, eltREG);
309 x86_add(p->func, srcEAX, input_ptr);
310 }
311
312
313 /* Extended swizzles? Maybe later.
314 */
315 static void emit_swizzle( struct translate_sse *p,
316 struct x86_reg dest,
317 struct x86_reg src,
318 unsigned shuffle )
319 {
320 sse_shufps(p->func, dest, src, shuffle);
321 }
322
323
324 static boolean translate_attr( struct translate_sse *p,
325 const struct translate_element *a,
326 struct x86_reg srcECX,
327 struct x86_reg dstEAX)
328 {
329 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
330
331 switch (a->input_format) {
332 case PIPE_FORMAT_R32_FLOAT:
333 emit_load_R32(p, dataXMM, srcECX);
334 break;
335 case PIPE_FORMAT_R32G32_FLOAT:
336 emit_load_R32G32(p, dataXMM, srcECX);
337 break;
338 case PIPE_FORMAT_R32G32B32_FLOAT:
339 emit_load_R32G32B32(p, dataXMM, srcECX);
340 break;
341 case PIPE_FORMAT_R32G32B32A32_FLOAT:
342 emit_load_R32G32B32A32(p, dataXMM, srcECX);
343 break;
344 case PIPE_FORMAT_B8G8R8A8_UNORM:
345 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
346 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
347 break;
348 case PIPE_FORMAT_R8G8B8A8_UNORM:
349 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
350 break;
351 default:
352 return FALSE;
353 }
354
355 switch (a->output_format) {
356 case PIPE_FORMAT_R32_FLOAT:
357 emit_store_R32(p, dstEAX, dataXMM);
358 break;
359 case PIPE_FORMAT_R32G32_FLOAT:
360 emit_store_R32G32(p, dstEAX, dataXMM);
361 break;
362 case PIPE_FORMAT_R32G32B32_FLOAT:
363 emit_store_R32G32B32(p, dstEAX, dataXMM);
364 break;
365 case PIPE_FORMAT_R32G32B32A32_FLOAT:
366 emit_store_R32G32B32A32(p, dstEAX, dataXMM);
367 break;
368 case PIPE_FORMAT_B8G8R8A8_UNORM:
369 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
370 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
371 break;
372 case PIPE_FORMAT_R8G8B8A8_UNORM:
373 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
374 break;
375 default:
376 return FALSE;
377 }
378
379 return TRUE;
380 }
381
382 /* Build run( struct translate *translate,
383 * unsigned start,
384 * unsigned count,
385 * void *output_buffer )
386 * or
387 * run_elts( struct translate *translate,
388 * unsigned *elts,
389 * unsigned count,
390 * void *output_buffer )
391 *
392 * Lots of hardcoding
393 *
394 * EAX -- pointer to current output vertex
395 * ECX -- pointer to current attribute
396 *
397 */
398 static boolean build_vertex_emit( struct translate_sse *p,
399 struct x86_function *func,
400 boolean linear )
401 {
402 struct x86_reg vertexECX = x86_make_reg(file_REG32, reg_AX);
403 struct x86_reg idxEBX = x86_make_reg(file_REG32, reg_BX);
404 struct x86_reg srcEAX = x86_make_reg(file_REG32, reg_CX);
405 struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
406 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
407 uint8_t *fixup, *label;
408 unsigned j;
409
410 p->func = func;
411 p->loaded_inv_255 = FALSE;
412 p->loaded_255 = FALSE;
413 p->loaded_identity = FALSE;
414
415 x86_init_func(p->func);
416
417 /* Push a few regs?
418 */
419 x86_push(p->func, countEBP);
420 x86_push(p->func, translateESI);
421 x86_push(p->func, idxEBX);
422
423 /* Get vertex count, compare to zero
424 */
425 x86_xor(p->func, idxEBX, idxEBX);
426 x86_mov(p->func, countEBP, x86_fn_arg(p->func, 3));
427 x86_cmp(p->func, countEBP, idxEBX);
428 fixup = x86_jcc_forward(p->func, cc_E);
429
430 /* If linear, idx is the current element, otherwise it is a pointer
431 * to the current element.
432 */
433 x86_mov(p->func, idxEBX, x86_fn_arg(p->func, 2));
434
435 /* Initialize destination register.
436 */
437 x86_mov(p->func, vertexECX, x86_fn_arg(p->func, 4));
438
439 /* Move argument 1 (translate_sse pointer) into a reg:
440 */
441 x86_mov(p->func, translateESI, x86_fn_arg(p->func, 1));
442
443
444 /* always load, needed or not:
445 */
446
447 /* Note address for loop jump */
448 label = x86_get_label(p->func);
449
450
451 for (j = 0; j < p->translate.key.nr_elements; j++) {
452 const struct translate_element *a = &p->translate.key.element[j];
453
454 struct x86_reg destEAX = x86_make_disp(vertexECX,
455 a->output_offset);
456
457 /* Figure out source pointer address:
458 */
459 if (linear) {
460 get_src_ptr(p, srcEAX, translateESI, idxEBX, j);
461 }
462 else {
463 get_src_ptr(p, srcEAX, translateESI, x86_deref(idxEBX), j);
464 }
465
466 if (!translate_attr( p, a, x86_deref(srcEAX), destEAX ))
467 return FALSE;
468 }
469
470 /* Next vertex:
471 */
472 x86_lea(p->func, vertexECX, x86_make_disp(vertexECX, p->translate.key.output_stride));
473
474 /* Incr index
475 */ /* Emit code for each of the attributes. Currently routes
476 * everything through SSE registers, even when it might be more
477 * efficient to stick with regular old x86. No optimization or
478 * other tricks - enough new ground to cover here just getting
479 * things working.
480 */
481
482 if (linear) {
483 x86_inc(p->func, idxEBX);
484 }
485 else {
486 x86_lea(p->func, idxEBX, x86_make_disp(idxEBX, 4));
487 }
488
489 /* decr count, loop if not zero
490 */
491 x86_dec(p->func, countEBP);
492 x86_test(p->func, countEBP, countEBP);
493 x86_jcc(p->func, cc_NZ, label);
494
495 /* Exit mmx state?
496 */
497 if (p->func->need_emms)
498 mmx_emms(p->func);
499
500 /* Land forward jump here:
501 */
502 x86_fixup_fwd_jump(p->func, fixup);
503
504 /* Pop regs and return
505 */
506
507 x86_pop(p->func, idxEBX);
508 x86_pop(p->func, translateESI);
509 x86_pop(p->func, countEBP);
510 x86_ret(p->func);
511
512 return TRUE;
513 }
514
515
516
517
518
519
520
521 static void translate_sse_set_buffer( struct translate *translate,
522 unsigned buf,
523 const void *ptr,
524 unsigned stride )
525 {
526 struct translate_sse *p = (struct translate_sse *)translate;
527 unsigned i;
528
529 for (i = 0; i < p->translate.key.nr_elements; i++) {
530 if (p->translate.key.element[i].input_buffer == buf) {
531 p->attrib[i].input_ptr = ((char *)ptr +
532 p->translate.key.element[i].input_offset);
533 p->attrib[i].input_stride = stride;
534 }
535 }
536 }
537
538
539 static void translate_sse_release( struct translate *translate )
540 {
541 struct translate_sse *p = (struct translate_sse *)translate;
542
543 x86_release_func( &p->linear_func );
544 x86_release_func( &p->elt_func );
545
546 FREE(p);
547 }
548
549 static void translate_sse_run_elts( struct translate *translate,
550 const unsigned *elts,
551 unsigned count,
552 void *output_buffer )
553 {
554 struct translate_sse *p = (struct translate_sse *)translate;
555
556 p->gen_run_elts( translate,
557 elts,
558 count,
559 output_buffer );
560 }
561
562 static void translate_sse_run( struct translate *translate,
563 unsigned start,
564 unsigned count,
565 void *output_buffer )
566 {
567 struct translate_sse *p = (struct translate_sse *)translate;
568
569 p->gen_run( translate,
570 start,
571 count,
572 output_buffer );
573 }
574
575
576 struct translate *translate_sse2_create( const struct translate_key *key )
577 {
578 struct translate_sse *p = NULL;
579
580 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
581 goto fail;
582
583 p = CALLOC_STRUCT( translate_sse );
584 if (p == NULL)
585 goto fail;
586
587 p->translate.key = *key;
588 p->translate.release = translate_sse_release;
589 p->translate.set_buffer = translate_sse_set_buffer;
590 p->translate.run_elts = translate_sse_run_elts;
591 p->translate.run = translate_sse_run;
592
593 if (!build_vertex_emit(p, &p->linear_func, TRUE))
594 goto fail;
595
596 if (!build_vertex_emit(p, &p->elt_func, FALSE))
597 goto fail;
598
599 p->gen_run = (run_func)x86_get_func(&p->linear_func);
600 p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
601
602 return &p->translate;
603
604 fail:
605 if (p)
606 translate_sse_release( &p->translate );
607
608 return NULL;
609 }
610
611
612
613 #else
614
/**
 * Fallback used when this file is not built for 32-bit x86: no SSE
 * code generator is available, so creation always fails.
 *
 * Carries the same name and signature as the real implementation in
 * the x86 branch, so callers link against one symbol either way.
 *
 * \param key  translate key (unused)
 * \return always NULL
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   (void) key;
   return NULL;
}
619
620 #endif