/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */

#include "glheader.h"
#include "context.h"
#include "colormac.h"
#include "t_context.h"
#include "t_vertex.h"
#include "simple_list.h"
#include "enums.h"

#if defined(USE_X86_ASM)

#define X 0
#define Y 1
#define Z 2
#define W 3

#define DISASSEM 0

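/* A code-generator operand: either a bare register (mod == mod_REG)
 * or a register-relative memory reference built from a base register
 * plus displacement.
 */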
struct x86_reg {
   GLuint file:3;
   GLuint idx:3;
   GLuint mod:2;		/* mod_REG if this is just a register */
   GLint  disp:24;		/* only +/- 23 bits of offset - should be enough... */
};

struct x86_program {
   GLcontext *ctx;

   GLubyte *store;
   GLubyte *csr;

   GLuint stack_offset;

   GLboolean inputs_safe;
   GLboolean outputs_safe;
   GLboolean have_sse2;
   GLboolean need_emms;

   struct x86_reg identity;
   struct x86_reg chan0;
};


#define X86_TWOB 0x0f

/* There are more but these are all we'll use:
 */
enum x86_reg_file {
   file_REG32,
   file_MMX,
   file_XMM
};

/* Values for mod field of modr/m byte
 */
enum x86_reg_mod {
   mod_INDIRECT,
   mod_DISP8,
   mod_DISP32,
   mod_REG
};

enum x86_reg_name {
   reg_AX,
   reg_CX,
   reg_DX,
   reg_BX,
   reg_SP,
   reg_BP,
   reg_SI,
   reg_DI
};
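
/* These follow the hardware encodings 0..7, so reg.idx can be added
 * directly into opcode bytes (e.g. push is emitted as 0x50 + idx).
 */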

enum x86_cc {
   cc_O,			/* overflow */
   cc_NO,			/* not overflow */
   cc_NAE,			/* not above or equal / carry */
   cc_AE,			/* above or equal / not carry */
   cc_E,			/* equal / zero */
   cc_NE			/* not equal / not zero */
};
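
/* These line up with the Jcc condition encodings: 0x70 + cc for the
 * short form, 0x0f 0x80 + cc for the near form.
 */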

#define cc_Z  cc_E
#define cc_NZ cc_NE


/* Create and manipulate registers and regmem values:
 */
static struct x86_reg make_reg( GLuint file,
				GLuint idx )
{
   struct x86_reg reg;

   reg.file = file;
   reg.idx = idx;
   reg.mod = mod_REG;
   reg.disp = 0;

   return reg;
}

static struct x86_reg make_disp( struct x86_reg reg,
				 GLint disp )
{
   assert(reg.file == file_REG32);

   if (reg.mod == mod_REG)
      reg.disp = disp;
   else
      reg.disp += disp;

   if (reg.disp == 0)
      reg.mod = mod_INDIRECT;
   else if (reg.disp <= 127 && reg.disp >= -128)
      reg.mod = mod_DISP8;
   else
      reg.mod = mod_DISP32;

   return reg;
}

static struct x86_reg deref( struct x86_reg reg )
{
   return make_disp(reg, 0);
}

static struct x86_reg get_base_reg( struct x86_reg reg )
{
   return make_reg( reg.file, reg.idx );
}

/* Retrieve a reference to one of the function arguments, taking into
 * account any push/pop activity:
 */
static struct x86_reg make_fn_arg( struct x86_program *p,
				   GLuint arg )
{
   return make_disp(make_reg(file_REG32, reg_SP),
		    p->stack_offset + arg * 4);	/* arg[n] is at esp + n*4
						 * on entry; stack_offset
						 * tracks our own pushes */
}

static struct x86_reg get_identity( struct x86_program *p )
{
   return p->identity;
}


/* Emit bytes to the instruction stream:
 */
static void emit_1b( struct x86_program *p, GLbyte b0 )
{
   *(GLbyte *)(p->csr++) = b0;
}

static void emit_1i( struct x86_program *p, GLint i0 )
{
   *(GLint *)(p->csr) = i0;
   p->csr += 4;
}

static void disassem( struct x86_program *p, const char *fn )
{
#if DISASSEM
   static const char *last_fn;
   if (fn && fn != last_fn) {
      _mesa_printf("%p: %s\n", (void *) p->csr, fn);
      last_fn = fn;
   }
#endif
}

static void emit_1ub_fn( struct x86_program *p, GLubyte b0, const char *fn )
{
   disassem(p, fn);
   *(p->csr++) = b0;
}

static void emit_2ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, const char *fn )
{
   disassem(p, fn);
   *(p->csr++) = b0;
   *(p->csr++) = b1;
}

static void emit_3ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, GLubyte b2, const char *fn )
{
   disassem(p, fn);
   *(p->csr++) = b0;
   *(p->csr++) = b1;
   *(p->csr++) = b2;
}

#define emit_1ub(p, b0)         emit_1ub_fn(p, b0, __FUNCTION__)
#define emit_2ub(p, b0, b1)     emit_2ub_fn(p, b0, b1, __FUNCTION__)
#define emit_3ub(p, b0, b1, b2) emit_3ub_fn(p, b0, b1, b2, __FUNCTION__)


/* Labels, jumps and fixup:
 */
static GLubyte *get_label( struct x86_program *p )
{
   return p->csr;
}

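/* Jump displacements are relative to the end of the jump instruction
 * itself, hence the +2 (opcode + rel8) and +6 (0x0f prefix + opcode +
 * rel32) adjustments below.
 */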
static void x86_jcc( struct x86_program *p,
		     GLuint cc,
		     GLubyte *label )
{
   GLint offset = label - (get_label(p) + 2);

   if (offset <= 127 && offset >= -128) {
      emit_1ub(p, 0x70 + cc);
      emit_1b(p, (GLbyte) offset);
   }
   else {
      offset = label - (get_label(p) + 6);
      emit_2ub(p, 0x0f, 0x80 + cc);
      emit_1i(p, offset);
   }
}

/* Always use a 32bit offset for forward jumps:
 */
static GLubyte *x86_jcc_forward( struct x86_program *p,
				 GLuint cc )
{
   emit_2ub(p, 0x0f, 0x80 + cc);
   emit_1i(p, 0);
   return get_label(p);
}

/* Fixup offset from forward jump:
 */
static void do_fixup( struct x86_program *p,
		      GLubyte *fixup )
{
   *(int *)(fixup - 4) = get_label(p) - fixup;
}

static void x86_push( struct x86_program *p,
		      struct x86_reg reg )
{
   assert(reg.mod == mod_REG);
   emit_1ub(p, 0x50 + reg.idx);
   p->stack_offset += 4;
}

static void x86_pop( struct x86_program *p,
		     struct x86_reg reg )
{
   assert(reg.mod == mod_REG);
   emit_1ub(p, 0x58 + reg.idx);
   p->stack_offset -= 4;
}

static void x86_inc( struct x86_program *p,
		     struct x86_reg reg )
{
   assert(reg.mod == mod_REG);
   emit_1ub(p, 0x40 + reg.idx);
}

static void x86_dec( struct x86_program *p,
		     struct x86_reg reg )
{
   assert(reg.mod == mod_REG);
   emit_1ub(p, 0x48 + reg.idx);
}

static void x86_ret( struct x86_program *p )
{
   emit_1ub(p, 0xc3);
}

static void mmx_emms( struct x86_program *p )
{
   assert(p->need_emms);
   emit_2ub(p, 0x0f, 0x77);
   p->need_emms = 0;
}




/* Build a modRM byte + possible displacement.  No treatment of SIB
 * indexing.  BZZT - no way to encode an absolute address.
 */
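/* ModRM byte layout: mod (bits 7..6), reg (bits 5..3), r/m (bits
 * 2..0).  When mod != mod_REG, the r/m field names the base register
 * and mod selects the size of the displacement that follows (none,
 * 8-bit or 32-bit).
 */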
static void emit_modrm( struct x86_program *p,
			struct x86_reg reg,
			struct x86_reg regmem )
{
   GLubyte val = 0;

   assert(reg.mod == mod_REG);

   val |= regmem.mod << 6;	/* mod field */
   val |= reg.idx << 3;		/* reg field */
   val |= regmem.idx;		/* r/m field */

   emit_1ub_fn(p, val, 0);

   /* Oh-oh we've stumbled into the SIB thing.
    */
   if (regmem.idx == reg_SP) {
      emit_1ub_fn(p, 0x24, 0);		/* simplistic! */
   }

   switch (regmem.mod) {
   case mod_REG:
   case mod_INDIRECT:
      break;
   case mod_DISP8:
      emit_1b(p, regmem.disp);
      break;
   case mod_DISP32:
      emit_1i(p, regmem.disp);
      break;
   default:
      _mesa_printf("unknown regmem.mod %d\n", regmem.mod);
      abort();
      break;
   }
}

/* Many x86 instructions have two opcodes to cope with the situations
 * where the destination is a register or memory reference
 * respectively.  This function selects the correct opcode based on
 * the arguments presented.
 */
static void emit_op_modrm( struct x86_program *p,
			   GLubyte op_dst_is_reg,
			   GLubyte op_dst_is_mem,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   switch (dst.mod) {
   case mod_REG:
      emit_1ub_fn(p, op_dst_is_reg, 0);
      emit_modrm(p, dst, src);
      break;
   case mod_INDIRECT:
   case mod_DISP32:
   case mod_DISP8:
      assert(src.mod == mod_REG);
      emit_1ub_fn(p, op_dst_is_mem, 0);
      emit_modrm(p, src, dst);
      break;
   default:
      _mesa_printf("unknown dst.mod %d\n", dst.mod);
      abort();
      break;
   }
}

static void x86_mov( struct x86_program *p,
		     struct x86_reg dst,
		     struct x86_reg src )
{
   emit_op_modrm( p, 0x8b, 0x89, dst, src );
}

static void x86_xor( struct x86_program *p,
		     struct x86_reg dst,
		     struct x86_reg src )
{
   emit_op_modrm( p, 0x33, 0x31, dst, src );
}

static void x86_cmp( struct x86_program *p,
		     struct x86_reg dst,
		     struct x86_reg src )
{
   emit_op_modrm( p, 0x3b, 0x39, dst, src );
}

static void sse2_movd( struct x86_program *p,
		       struct x86_reg dst,
		       struct x86_reg src )
{
   assert(p->have_sse2);
   emit_2ub(p, 0x66, X86_TWOB);
   emit_op_modrm( p, 0x6e, 0x7e, dst, src );
}

static void mmx_movd( struct x86_program *p,
		      struct x86_reg dst,
		      struct x86_reg src )
{
   p->need_emms = 1;
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x6e, 0x7e, dst, src );
}

static void mmx_movq( struct x86_program *p,
		      struct x86_reg dst,
		      struct x86_reg src )
{
   p->need_emms = 1;
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x6f, 0x7f, dst, src );
}


static void sse_movss( struct x86_program *p,
		       struct x86_reg dst,
		       struct x86_reg src )
{
   emit_2ub(p, 0xF3, X86_TWOB);
   emit_op_modrm( p, 0x10, 0x11, dst, src );
}

static void sse_movaps( struct x86_program *p,
			struct x86_reg dst,
			struct x86_reg src )
{
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x28, 0x29, dst, src );
}

static void sse_movups( struct x86_program *p,
			struct x86_reg dst,
			struct x86_reg src )
{
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x10, 0x11, dst, src );
}

static void sse_movhps( struct x86_program *p,
			struct x86_reg dst,
			struct x86_reg src )
{
   assert(dst.mod != mod_REG || src.mod != mod_REG);
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */
}

static void sse_movlps( struct x86_program *p,
			struct x86_reg dst,
			struct x86_reg src )
{
   assert(dst.mod != mod_REG || src.mod != mod_REG);
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */
}

/* SSE operations often only have one format, with dest constrained to
 * be a register:
 */
static void sse_mulps( struct x86_program *p,
		       struct x86_reg dst,
		       struct x86_reg src )
{
   emit_2ub(p, X86_TWOB, 0x59);
   emit_modrm( p, dst, src );
}

static void sse_addps( struct x86_program *p,
		       struct x86_reg dst,
		       struct x86_reg src )
{
   emit_2ub(p, X86_TWOB, 0x58);
   emit_modrm( p, dst, src );
}

static void sse_movhlps( struct x86_program *p,
			 struct x86_reg dst,
			 struct x86_reg src )
{
   assert(dst.mod == mod_REG && src.mod == mod_REG);
   emit_2ub(p, X86_TWOB, 0x12);
   emit_modrm( p, dst, src );
}

static void sse_movlhps( struct x86_program *p,
			 struct x86_reg dst,
			 struct x86_reg src )
{
   assert(dst.mod == mod_REG && src.mod == mod_REG);
   emit_2ub(p, X86_TWOB, 0x16);
   emit_modrm( p, dst, src );
}

static void sse2_cvtps2dq( struct x86_program *p,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x5B);
   emit_modrm( p, dst, src );
}

static void sse2_packssdw( struct x86_program *p,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x6B);
   emit_modrm( p, dst, src );
}

static void sse2_packsswb( struct x86_program *p,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x63);
   emit_modrm( p, dst, src );
}

static void sse2_packuswb( struct x86_program *p,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x67);
   emit_modrm( p, dst, src );
}

static void sse_cvtps2pi( struct x86_program *p,
			  struct x86_reg dst,
			  struct x86_reg src )
{
   assert(dst.file == file_MMX &&
	  (src.file == file_XMM || src.mod != mod_REG));

   p->need_emms = 1;

   emit_2ub(p, X86_TWOB, 0x2d);
   emit_modrm( p, dst, src );
}

static void mmx_packssdw( struct x86_program *p,
			  struct x86_reg dst,
			  struct x86_reg src )
{
   assert(dst.file == file_MMX &&
	  (src.file == file_MMX || src.mod != mod_REG));

   p->need_emms = 1;

   emit_2ub(p, X86_TWOB, 0x6b);
   emit_modrm( p, dst, src );
}

static void mmx_packuswb( struct x86_program *p,
			  struct x86_reg dst,
			  struct x86_reg src )
{
   assert(dst.file == file_MMX &&
	  (src.file == file_MMX || src.mod != mod_REG));

   p->need_emms = 1;

   emit_2ub(p, X86_TWOB, 0x67);
   emit_modrm( p, dst, src );
}


/* Load effective address:
 */
static void x86_lea( struct x86_program *p,
		     struct x86_reg dst,
		     struct x86_reg src )
{
   emit_1ub(p, 0x8d);
   emit_modrm( p, dst, src );
}

static void x86_test( struct x86_program *p,
		      struct x86_reg dst,
		      struct x86_reg src )
{
   emit_1ub(p, 0x85);
   emit_modrm( p, dst, src );
}




/**
 * Perform a reduced swizzle:
 */
static void sse2_pshufd( struct x86_program *p,
			 struct x86_reg dest,
			 struct x86_reg arg0,
			 GLubyte x,
			 GLubyte y,
			 GLubyte z,
			 GLubyte w)
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x70);
   emit_modrm(p, dest, arg0);
   emit_1ub(p, (x|(y<<2)|(z<<4)|(w<<6)));
}


/* Shufps can also be used to implement a reduced swizzle when dest ==
 * arg0.
 */
static void sse_shufps( struct x86_program *p,
			struct x86_reg dest,
			struct x86_reg arg0,
			GLubyte x,
			GLubyte y,
			GLubyte z,
			GLubyte w)
{
   emit_2ub(p, X86_TWOB, 0xC6);
   emit_modrm(p, dest, arg0);
   emit_1ub(p, (x|(y<<2)|(z<<4)|(w<<6)));
}
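
/* The shuffle immediate packs one 2-bit source selector per output
 * channel, low bits first: x | (y<<2) | (z<<4) | (w<<6).  For example
 * SHUF(X,Y,Z,W) gives 0 | (1<<2) | (2<<4) | (3<<6) = 0xE4, the
 * identity shuffle.
 */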

static void emit_load4f_4( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movups(p, dest, arg0);
}

static void emit_load4f_3( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0
    * c 0 0 1
    * 0 0 c 1
    * a b c 1
    */
   sse_movss(p, dest, make_disp(arg0, 8));
   sse_shufps(p, dest, get_identity(p), X,Y,Z,W );
   sse_shufps(p, dest, dest, Y,Z,X,W );
   sse_movlps(p, dest, arg0);
}

static void emit_load4f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Initialize from identity, then pull in low two words:
    */
   sse_movups(p, dest, get_identity(p));
   sse_movlps(p, dest, arg0);
}

static void emit_load4f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Pull in low word, then swizzle in identity */
   sse_movss(p, dest, arg0);
   sse_shufps(p, dest, get_identity(p), X,Y,Z,W );
}



static void emit_load3f_3( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Over-reads by 1 dword - potential SEGV if input is a vertex
    * array.
    */
   if (p->inputs_safe) {
      sse_movups(p, dest, arg0);
   }
   else {
      /* c 0 0 0
       * c c c c
       * a b c c
       */
      sse_movss(p, dest, make_disp(arg0, 8));
      sse_shufps(p, dest, dest, X,X,X,X);
      sse_movlps(p, dest, arg0);
   }
}

static void emit_load3f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_2(p, dest, arg0);
}

static void emit_load3f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_1(p, dest, arg0);
}

static void emit_load2f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movlps(p, dest, arg0);
}

static void emit_load2f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_1(p, dest, arg0);
}

static void emit_load1f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movss(p, dest, arg0);
}

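/* Dispatch table, indexed as load[dest_sz-1][src_sz-1].  Where the
 * source provides fewer components than the destination needs, the
 * missing ones are filled in from vtx->identity (normally 0,0,0,1).
 */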
static void (*load[4][4])( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 ) = {
   { emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1 },

   { emit_load2f_1,
     emit_load2f_2,
     emit_load2f_2,
     emit_load2f_2 },

   { emit_load3f_1,
     emit_load3f_2,
     emit_load3f_3,
     emit_load3f_3 },

   { emit_load4f_1,
     emit_load4f_2,
     emit_load4f_3,
     emit_load4f_4 }
};

static void emit_load( struct x86_program *p,
		       struct x86_reg dest,
		       GLuint sz,
		       struct x86_reg src,
		       GLuint src_sz)
{
   if (DISASSEM)
      _mesa_printf("load %d/%d\n", sz, src_sz);

   load[sz-1][src_sz-1](p, dest, src);
}

static void emit_store4f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movups(p, dest, arg0);
}

static void emit_store3f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   if (p->outputs_safe) {
      /* Emit the extra dword anyway.  This may hurt writecombining,
       * may cause other problems.
       */
      sse_movups(p, dest, arg0);
   }
   else {
      /* Alternate strategy - emit two, shuffle, emit one.
       */
      sse_movlps(p, dest, arg0);
      sse_shufps(p, arg0, arg0, Z, Z, Z, Z ); /* NOTE! destructive */
      sse_movss(p, make_disp(dest,8), arg0);
   }
}

static void emit_store2f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movlps(p, dest, arg0);
}

static void emit_store1f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movss(p, dest, arg0);
}


static void (*store[4])( struct x86_program *p,
			 struct x86_reg dest,
			 struct x86_reg arg0 ) =
{
   emit_store1f,
   emit_store2f,
   emit_store3f,
   emit_store4f
};

static void emit_store( struct x86_program *p,
			struct x86_reg dest,
			GLuint sz,
			struct x86_reg temp )
{
   if (DISASSEM)
      _mesa_printf("store %d\n", sz);
   store[sz-1](p, dest, temp);
}

static void emit_pack_store_4ub( struct x86_program *p,
				 struct x86_reg dest,
				 struct x86_reg temp )
{
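   /* Pipeline: scale the four floats up to 0..255, convert to
    * integers, then saturating-pack dwords -> words -> bytes and
    * store the result with a single 32-bit write.
    */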
   /* Scale by 255.0
    */
   sse_mulps(p, temp, p->chan0);

   if (p->have_sse2) {
      sse2_cvtps2dq(p, temp, temp);
      sse2_packssdw(p, temp, temp);
      sse2_packuswb(p, temp, temp);
      sse_movss(p, dest, temp);
   }
   else {
      struct x86_reg mmx0 = make_reg(file_MMX, 0);
      struct x86_reg mmx1 = make_reg(file_MMX, 1);
      sse_cvtps2pi(p, mmx0, temp);
      sse_movhlps(p, temp, temp);
      sse_cvtps2pi(p, mmx1, temp);
      mmx_packssdw(p, mmx0, mmx1);
      mmx_packuswb(p, mmx0, mmx0);
      mmx_movd(p, dest, mmx0);
   }
}

static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}

/* Not much happens here.  Eventually use this function to try and
 * avoid saving/reloading the source pointers each vertex (if some of
 * them can fit in registers).
 */
static void get_src_ptr( struct x86_program *p,
			 struct x86_reg srcREG,
			 struct x86_reg vtxREG,
			 struct tnl_clipspace_attr *a )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
   struct x86_reg ptr_to_src = make_disp(vtxREG, get_offset(vtx, &a->inputptr));

   /* Load current a[j].inputptr
    */
   x86_mov(p, srcREG, ptr_to_src);
}

static void update_src_ptr( struct x86_program *p,
			    struct x86_reg srcREG,
			    struct x86_reg vtxREG,
			    struct tnl_clipspace_attr *a )
{
   if (a->inputstride) {
      struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
      struct x86_reg ptr_to_src = make_disp(vtxREG, get_offset(vtx, &a->inputptr));

      /* add a[j].inputstride (hardcoded value - could just as easily
       * pull the stride value from memory each time).
       */
      x86_lea(p, srcREG, make_disp(srcREG, a->inputstride));

      /* save new value of a[j].inputptr
       */
      x86_mov(p, ptr_to_src, srcREG);
   }
}


/* Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute's input data
 *
 */
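/* Additionally: EBP holds the remaining vertex count, ESI the
 * TNLcontext pointer (clipspace state is addressed via displacements
 * from it), and XMM6/XMM7 are loaded with the identity and
 * chan_scale constants.
 */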
static GLboolean build_vertex_emit( struct x86_program *p )
{
   GLcontext *ctx = p->ctx;
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   GLuint j = 0;

   struct x86_reg vertexEAX = make_reg(file_REG32, reg_AX);
   struct x86_reg srcECX = make_reg(file_REG32, reg_CX);
   struct x86_reg countEBP = make_reg(file_REG32, reg_BP);
   struct x86_reg vtxESI = make_reg(file_REG32, reg_SI);
   struct x86_reg temp = make_reg(file_XMM, 0);
   struct x86_reg vp0 = make_reg(file_XMM, 1);
   struct x86_reg vp1 = make_reg(file_XMM, 2);
   GLubyte *fixup, *label;

   p->csr = p->store;

   /* Push a few regs?
    */
/*    x86_push(p, srcECX); */
   x86_push(p, countEBP);
   x86_push(p, vtxESI);


   /* Get vertex count, compare to zero
    */
   x86_xor(p, srcECX, srcECX);
   x86_mov(p, countEBP, make_fn_arg(p, 2));
   x86_cmp(p, countEBP, srcECX);
   fixup = x86_jcc_forward(p, cc_E);

   /* Initialize destination register.
    */
   x86_mov(p, vertexEAX, make_fn_arg(p, 3));

   /* Dereference ctx to get tnl, then vtx:
    */
   x86_mov(p, vtxESI, make_fn_arg(p, 1));
   x86_mov(p, vtxESI, make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
   vtxESI = make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));


   /* Possibly load vp0, vp1 for viewport calcs:
    */
   if (vtx->need_viewport) {
      sse_movups(p, vp0, make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0])));
      sse_movups(p, vp1, make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
   }

   /* always load, needed or not:
    */
   sse_movups(p, p->chan0, make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
   sse_movups(p, p->identity, make_disp(vtxESI, get_offset(vtx, &vtx->identity[0])));

   /* Note address for loop jump */
   label = get_label(p);

   /* Emit code for each of the attributes.  Currently routes
    * everything through SSE registers, even when it might be more
    * efficient to stick with regular old x86.  No optimization or
    * other tricks - enough new ground to cover here just getting
    * things working.
    */
   while (j < vtx->attr_count) {
      struct tnl_clipspace_attr *a = &vtx->attr[j];
      struct x86_reg dest = make_disp(vertexEAX, a->vertoffset);

      /* Now, load an XMM reg from src, perhaps transform, then save.
       * Could be shortcircuited in specific cases:
       */
      switch (a->format) {
      case EMIT_1F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 1, deref(srcECX), a->inputsize);
	 emit_store(p, dest, 1, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_2F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 2, deref(srcECX), a->inputsize);
	 emit_store(p, dest, 2, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F:
	 /* Potentially the worst case - hardcode 2+1 copying:
	  */
	 if (0) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, deref(srcECX), a->inputsize);
	    emit_store(p, dest, 3, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 else {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 2, deref(srcECX), a->inputsize);
	    emit_store(p, dest, 2, temp);
	    if (a->inputsize > 2) {
	       emit_load(p, temp, 1, make_disp(srcECX, 8), 1);
	       emit_store(p, make_disp(dest,8), 1, temp);
	    }
	    else {
	       sse_movss(p, make_disp(dest,8), get_identity(p));
	    }
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 break;
      case EMIT_4F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 emit_store(p, dest, 4, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_2F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 2, deref(srcECX), a->inputsize);
	 sse_mulps(p, temp, vp0);
	 sse_addps(p, temp, vp1);
	 emit_store(p, dest, 2, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 3, deref(srcECX), a->inputsize);
	 sse_mulps(p, temp, vp0);
	 sse_addps(p, temp, vp1);
	 emit_store(p, dest, 3, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_mulps(p, temp, vp0);
	 sse_addps(p, temp, vp1);
	 emit_store(p, dest, 4, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F_XYW:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_shufps(p, temp, temp, X, Y, W, Z);
	 emit_store(p, dest, 3, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;

      case EMIT_1UB_1F:
	 /* Test for PAD3 + 1UB:
	  */
	 if (j > 0 &&
	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
	 {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 1, deref(srcECX), a->inputsize);
	    sse_shufps(p, temp, temp, X, X, X, X);
	    emit_pack_store_4ub(p, make_disp(dest, -3), temp); /* overkill! */
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 else {
	    _mesa_printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
	    return GL_FALSE;
	 }
	 break;
      case EMIT_3UB_3F_RGB:
      case EMIT_3UB_3F_BGR:
	 /* Test for 3UB + PAD1:
	  */
	 if (j == vtx->attr_count - 1 ||
	     a[1].vertoffset >= a->vertoffset + 4) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, deref(srcECX), a->inputsize);
	    if (a->format == EMIT_3UB_3F_BGR)
	       sse_shufps(p, temp, temp, Z, Y, X, W);
	    emit_pack_store_4ub(p, dest, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 /* Test for 3UB + 1UB:
	  */
	 else if (j < vtx->attr_count - 1 &&
		  a[1].format == EMIT_1UB_1F &&
		  a[1].vertoffset == a->vertoffset + 3) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, deref(srcECX), a->inputsize);
	    update_src_ptr(p, srcECX, vtxESI, a);

	    /* Make room for incoming value:
	     */
	    sse_shufps(p, temp, temp, W, X, Y, Z);

	    get_src_ptr(p, srcECX, vtxESI, &a[1]);
	    emit_load(p, temp, 1, deref(srcECX), a[1].inputsize);
	    update_src_ptr(p, srcECX, vtxESI, &a[1]);

	    /* Rearrange and possibly do BGR conversion:
	     */
	    if (a->format == EMIT_3UB_3F_BGR)
	       sse_shufps(p, temp, temp, W, Z, Y, X);
	    else
	       sse_shufps(p, temp, temp, Y, Z, W, X);

	    emit_pack_store_4ub(p, dest, temp);
	    j++;		/* NOTE: two attrs consumed */
	 }
	 else {
	    _mesa_printf("Can't emit 3ub\n");
	 }
	 return GL_FALSE;	/* add this later */
	 break;

      case EMIT_4UB_4F_RGBA:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_BGRA:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_shufps(p, temp, temp, Z, Y, X, W);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_ARGB:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_shufps(p, temp, temp, W, X, Y, Z);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_ABGR:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_shufps(p, temp, temp, W, Z, Y, X);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4CHAN_4F_RGBA:
	 switch (CHAN_TYPE) {
	 case GL_UNSIGNED_BYTE:
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	    emit_pack_store_4ub(p, dest, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	    break;
	 case GL_FLOAT:
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	    emit_store(p, dest, 4, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	    break;
	 case GL_UNSIGNED_SHORT:
	 default:
	    _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
	    return GL_FALSE;
	 }
	 break;
      default:
	 _mesa_printf("unknown a[%d].format %d\n", j, a->format);
	 return GL_FALSE;	/* catch any new opcodes */
      }

      /* Increment j by at least 1 - may have been incremented above also:
       */
      j++;
   }

   /* Next vertex:
    */
   x86_lea(p, vertexEAX, make_disp(vertexEAX, vtx->vertex_size));

   /* decr count, loop if not zero
    */
   x86_dec(p, countEBP);
   x86_test(p, countEBP, countEBP);
   x86_jcc(p, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->need_emms)
      mmx_emms(p);

   /* Land forward jump here:
    */
   do_fixup(p, fixup);

   /* Pop regs and return
    */
   x86_pop(p, get_base_reg(vtxESI));
   x86_pop(p, countEBP);
/*    x86_pop(p, srcECX); */
   x86_ret(p);

   vtx->emit = (tnl_emit_func)p->store;
   return GL_TRUE;
}
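
/* The generated code is now callable through vtx->emit( ctx, count,
 * dest ) - the (ctx, count, dest) argument order matches the
 * make_fn_arg() fetches above.
 */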

#include "x86/common_x86_asm.h"


void _tnl_generate_sse_emit( GLcontext *ctx )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   struct x86_program p;

   if (!cpu_has_xmm) {
      vtx->codegen_emit = NULL;
      return;
   }

   memset(&p, 0, sizeof(p));
   p.ctx = ctx;
   p.store = MALLOC(1024);
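   /* Note: code is emitted into this fixed 1024-byte buffer with no
    * overflow checking as p.csr advances.
    */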

   p.inputs_safe = 0;		/* for now */
   p.outputs_safe = 1;		/* for now */
   p.have_sse2 = cpu_has_xmm2;
   p.identity = make_reg(file_XMM, 6);
   p.chan0 = make_reg(file_XMM, 7);

   if (build_vertex_emit(&p)) {
      _tnl_register_fastpath( vtx, GL_TRUE );
      if (DISASSEM)
	 _mesa_printf("disassemble %p %p\n", (void *) p.store, (void *) p.csr);
   }
   else {
      /* Note the failure so that we don't keep trying to codegen an
       * impossible state:
       */
      _tnl_register_fastpath( vtx, GL_FALSE );
      FREE(p.store);
   }

   (void)sse2_movd;
   (void)x86_inc;
   (void)x86_xor;
   (void)mmx_movq;
   (void)sse_movlhps;
   (void)sse_movhps;
   (void)sse_movaps;
   (void)sse2_packsswb;
   (void)sse2_pshufd;
}

#else

void _tnl_generate_sse_emit( GLcontext *ctx )
{
   /* Dummy version for when USE_X86_ASM not defined */
}

#endif