[mesa.git] / src / mesa / tnl / t_vertex_sse.c
/*
 * Copyright 2003 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@vmware.com>
 */

#include <stdio.h>

#include "main/glheader.h"
#include "main/context.h"
#include "util/simple_list.h"
#include "main/enums.h"
#include "swrast/s_chan.h"
#include "t_context.h"
#include "t_vertex.h"

#if defined(USE_SSE_ASM)

#include "x86/rtasm/x86sse.h"
#include "x86/common_x86_asm.h"


/**
 * Number of bytes to allocate for generated SSE functions
 */
#define MAX_SSE_CODE_SIZE 1024
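/* build_vertex_emit() asserts, once emission is complete, that the
 * generated code actually fits within this limit.
 */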


#define X    0
#define Y    1
#define Z    2
#define W    3


struct x86_program {
   struct x86_function func;

   struct gl_context *ctx;
   GLboolean inputs_safe;
   GLboolean outputs_safe;
   GLboolean have_sse2;

   struct x86_reg identity;
   struct x86_reg chan0;
};
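
/* identity and chan0 name the XMM registers (XMM6/XMM7, see
 * _tnl_generate_sse_emit) that build_vertex_emit() loads once per call
 * with vtx->identity and vtx->chan_scale, so the per-vertex code can
 * use them without reloading.
 */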


static struct x86_reg get_identity( struct x86_program *p )
{
   return p->identity;
}

static void emit_load4f_4( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   sse_movups(&p->func, dest, arg0);
}

static void emit_load4f_3( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   /* Have to jump through some hoops.  Register contents after each
    * instruction below:
    *
    *   c 0 0 0     movss   (load the third float, upper bits cleared)
    *   c 0 0 1     shufps  (pull the 1.0 for W from the identity vector)
    *   0 0 c 1     shufps  (rotate c into the Z slot)
    *   a b c 1     movlps  (fill in the low two floats)
    */
   sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
   sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
   sse_movlps(&p->func, dest, arg0);
}

static void emit_load4f_2( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   /* Initialize from identity, then pull in low two words:
    */
   sse_movups(&p->func, dest, get_identity(p));
   sse_movlps(&p->func, dest, arg0);
}

static void emit_load4f_1( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   /* Pull in low word, then swizzle in identity */
   sse_movss(&p->func, dest, arg0);
   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
}



static void emit_load3f_3( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   /* Over-reads by 1 dword - potential SEGV if input is a vertex
    * array.
    */
   if (p->inputs_safe) {
      sse_movups(&p->func, dest, arg0);
   }
   else {
      /* c 0 0 0
       * c c c c
       * a b c c
       */
      sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
      sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
      sse_movlps(&p->func, dest, arg0);
   }
}

static void emit_load3f_2( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   emit_load4f_2(p, dest, arg0);
}

static void emit_load3f_1( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   /* Loading from memory erases the upper bits. */
   sse_movss(&p->func, dest, arg0);
}

static void emit_load2f_2( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   sse_movlps(&p->func, dest, arg0);
}

static void emit_load2f_1( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   /* Loading from memory erases the upper bits. */
   sse_movss(&p->func, dest, arg0);
}

static void emit_load1f_1( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}

static void (*load[4][4])( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 ) = {
   { emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1 },

   { emit_load2f_1,
     emit_load2f_2,
     emit_load2f_2,
     emit_load2f_2 },

   { emit_load3f_1,
     emit_load3f_2,
     emit_load3f_3,
     emit_load3f_3 },

   { emit_load4f_1,
     emit_load4f_2,
     emit_load4f_3,
     emit_load4f_4 }
};

static void emit_load( struct x86_program *p,
                       struct x86_reg dest,
                       GLuint sz,
                       struct x86_reg src,
                       GLuint src_sz)
{
   load[sz-1][src_sz-1](p, dest, src);
}
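
/* For example, emit_load(p, temp, 4, src, 2) dispatches to emit_load4f_2:
 * the two floats present in the source fill X/Y and the missing Z/W
 * components come from the identity vector.
 */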

static void emit_store4f( struct x86_program *p,
                          struct x86_reg dest,
                          struct x86_reg arg0 )
{
   sse_movups(&p->func, dest, arg0);
}

static void emit_store3f( struct x86_program *p,
                          struct x86_reg dest,
                          struct x86_reg arg0 )
{
   if (p->outputs_safe) {
      /* Emit the extra dword anyway.  This may hurt writecombining,
       * may cause other problems.
       */
      sse_movups(&p->func, dest, arg0);
   }
   else {
      /* Alternate strategy - emit two, shuffle, emit one.
       */
      sse_movlps(&p->func, dest, arg0);
      sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
      sse_movss(&p->func, x86_make_disp(dest,8), arg0);
   }
}

static void emit_store2f( struct x86_program *p,
                          struct x86_reg dest,
                          struct x86_reg arg0 )
{
   sse_movlps(&p->func, dest, arg0);
}

static void emit_store1f( struct x86_program *p,
                          struct x86_reg dest,
                          struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}


static void (*store[4])( struct x86_program *p,
                         struct x86_reg dest,
                         struct x86_reg arg0 ) =
{
   emit_store1f,
   emit_store2f,
   emit_store3f,
   emit_store4f
};

static void emit_store( struct x86_program *p,
                        struct x86_reg dest,
                        GLuint sz,
                        struct x86_reg temp )
{
   store[sz-1](p, dest, temp);
}

static void emit_pack_store_4ub( struct x86_program *p,
                                 struct x86_reg dest,
                                 struct x86_reg temp )
{
   /* Scale by 255.0
    */
   sse_mulps(&p->func, temp, p->chan0);

   if (p->have_sse2) {
      sse2_cvtps2dq(&p->func, temp, temp);
      sse2_packssdw(&p->func, temp, temp);
      sse2_packuswb(&p->func, temp, temp);
      sse_movss(&p->func, dest, temp);
   }
   else {
      struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
      struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
      sse_cvtps2pi(&p->func, mmx0, temp);
      sse_movhlps(&p->func, temp, temp);
      sse_cvtps2pi(&p->func, mmx1, temp);
      mmx_packssdw(&p->func, mmx0, mmx1);
      mmx_packuswb(&p->func, mmx0, mmx0);
      mmx_movd(&p->func, dest, mmx0);
   }
}
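
/* Roughly the scalar equivalent of the emitted sequence, assuming
 * chan_scale holds {255, 255, 255, 255}: scale each float, convert to
 * integer, and let the pack instructions saturate to 0..255 before the
 * four bytes are written as a single dword:
 *
 *    dest[i] = (GLubyte) CLAMP((GLint) (temp[i] * 255.0F), 0, 255);
 */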

static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}

/* Not much happens here.  Eventually use this function to try and
 * avoid saving/reloading the source pointers each vertex (if some of
 * them can fit in registers).
 */
static void get_src_ptr( struct x86_program *p,
                         struct x86_reg srcREG,
                         struct x86_reg vtxREG,
                         struct tnl_clipspace_attr *a )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
   struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));

   /* Load current a[j].inputptr
    */
   x86_mov(&p->func, srcREG, ptr_to_src);
}

static void update_src_ptr( struct x86_program *p,
                            struct x86_reg srcREG,
                            struct x86_reg vtxREG,
                            struct tnl_clipspace_attr *a )
{
   if (a->inputstride) {
      struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
      struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));

      /* add a[j].inputstride (hardcoded value - could just as easily
       * pull the stride value from memory each time).
       */
      x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));

      /* save new value of a[j].inputptr
       */
      x86_mov(&p->func, ptr_to_src, srcREG);
   }
}


/* Lots of hardcoding.  Register usage in the generated code:
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to the current attribute's source data
 * EBP -- remaining vertex count
 * ESI -- pointer to the tnl_clipspace state
 *
 */
static GLboolean build_vertex_emit( struct x86_program *p )
{
   struct gl_context *ctx = p->ctx;
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   GLuint j = 0;

   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
   struct x86_reg vtxESI = x86_make_reg(file_REG32, reg_SI);
   struct x86_reg temp = x86_make_reg(file_XMM, 0);
   struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
   struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
   struct x86_reg temp2 = x86_make_reg(file_XMM, 3);
   GLubyte *fixup, *label;

   /* Push a few regs?
    */
   x86_push(&p->func, countEBP);
   x86_push(&p->func, vtxESI);


   /* Get vertex count, compare to zero
    */
   x86_xor(&p->func, srcECX, srcECX);
   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
   x86_cmp(&p->func, countEBP, srcECX);
   fixup = x86_jcc_forward(&p->func, cc_E);

   /* Initialize destination register.
    */
   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));

   /* Dereference ctx to get tnl, then vtx:
    */
   x86_mov(&p->func, vtxESI, x86_fn_arg(&p->func, 1));
   x86_mov(&p->func, vtxESI, x86_make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
   vtxESI = x86_make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));


   /* Possibly load vp0, vp1 for viewport calcs:
    */
   if (vtx->need_viewport) {
      sse_movups(&p->func, vp0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0])));
      sse_movups(&p->func, vp1, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
   }

   /* always load, needed or not:
    */
   sse_movups(&p->func, p->chan0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
   sse_movups(&p->func, p->identity, x86_make_disp(vtxESI, get_offset(vtx, &vtx->identity[0])));

   /* Note address for loop jump */
   label = x86_get_label(&p->func);

   /* Emit code for each of the attributes.  Currently routes
    * everything through SSE registers, even when it might be more
    * efficient to stick with regular old x86.  No optimization or
    * other tricks - enough new ground to cover here just getting
    * things working.
    */
   while (j < vtx->attr_count) {
      struct tnl_clipspace_attr *a = &vtx->attr[j];
      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);

      /* Now, load an XMM reg from src, perhaps transform, then save.
       * Could be shortcircuited in specific cases:
       */
      switch (a->format) {
      case EMIT_1F:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
         emit_store(p, dest, 1, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_2F:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
         emit_store(p, dest, 2, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_3F:
         /* Potentially the worst case - hardcode 2+1 copying:
          */
         if (0) {
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
            emit_store(p, dest, 3, temp);
            update_src_ptr(p, srcECX, vtxESI, a);
         }
         else {
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
            emit_store(p, dest, 2, temp);
            if (a->inputsize > 2) {
               emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
               emit_store(p, x86_make_disp(dest,8), 1, temp);
            }
            else {
               sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
            }
            update_src_ptr(p, srcECX, vtxESI, a);
         }
         break;
      case EMIT_4F:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         emit_store(p, dest, 4, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_2F_VIEWPORT:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
         sse_mulps(&p->func, temp, vp0);
         sse_addps(&p->func, temp, vp1);
         emit_store(p, dest, 2, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_3F_VIEWPORT:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
         sse_mulps(&p->func, temp, vp0);
         sse_addps(&p->func, temp, vp1);
         emit_store(p, dest, 3, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_4F_VIEWPORT:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         sse_mulps(&p->func, temp, vp0);
         sse_addps(&p->func, temp, vp1);
         emit_store(p, dest, 4, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_3F_XYW:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
         emit_store(p, dest, 3, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;

      case EMIT_1UB_1F:
         /* Test for PAD3 + 1UB:
          */
         if (j > 0 &&
             a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
         {
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
            sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
            emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
            update_src_ptr(p, srcECX, vtxESI, a);
         }
         else {
            printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
            return GL_FALSE;
         }
         break;
      case EMIT_3UB_3F_RGB:
      case EMIT_3UB_3F_BGR:
         /* Test for 3UB + PAD1:
          */
         if (j == vtx->attr_count - 1 ||
             a[1].vertoffset >= a->vertoffset + 4) {
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
            if (a->format == EMIT_3UB_3F_BGR)
               sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
            emit_pack_store_4ub(p, dest, temp);
            update_src_ptr(p, srcECX, vtxESI, a);
         }
         /* Test for 3UB + 1UB:
          */
         else if (j < vtx->attr_count - 1 &&
                  a[1].format == EMIT_1UB_1F &&
                  a[1].vertoffset == a->vertoffset + 3) {
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
            update_src_ptr(p, srcECX, vtxESI, a);

            /* Make room for incoming value:
             */
            sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));

            get_src_ptr(p, srcECX, vtxESI, &a[1]);
            emit_load(p, temp2, 1, x86_deref(srcECX), a[1].inputsize);
            sse_movss(&p->func, temp, temp2);
            update_src_ptr(p, srcECX, vtxESI, &a[1]);

            /* Rearrange and possibly do BGR conversion:
             */
            if (a->format == EMIT_3UB_3F_BGR)
               sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
            else
               sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));

            emit_pack_store_4ub(p, dest, temp);
            j++;            /* NOTE: two attrs consumed */
         }
         else {
            printf("Can't emit 3ub\n");
            return GL_FALSE;   /* add this later */
         }
         break;

      case EMIT_4UB_4F_RGBA:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         emit_pack_store_4ub(p, dest, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_4UB_4F_BGRA:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
         emit_pack_store_4ub(p, dest, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_4UB_4F_ARGB:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
         emit_pack_store_4ub(p, dest, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_4UB_4F_ABGR:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
         emit_pack_store_4ub(p, dest, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_4CHAN_4F_RGBA:
         switch (CHAN_TYPE) {
         case GL_UNSIGNED_BYTE:
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
            emit_pack_store_4ub(p, dest, temp);
            update_src_ptr(p, srcECX, vtxESI, a);
            break;
         case GL_FLOAT:
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
            emit_store(p, dest, 4, temp);
            update_src_ptr(p, srcECX, vtxESI, a);
            break;
         case GL_UNSIGNED_SHORT:
         default:
            printf("unknown CHAN_TYPE %s\n", _mesa_enum_to_string(CHAN_TYPE));
            return GL_FALSE;
         }
         break;
      default:
         printf("unknown a[%d].format %d\n", j, a->format);
         return GL_FALSE;   /* catch any new opcodes */
      }

      /* Increment j by at least 1 - may have been incremented above also:
       */
      j++;
   }

   /* Next vertex:
    */
   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vtx->vertex_size));

   /* decr count, loop if not zero
    */
   x86_dec(&p->func, countEBP);
   x86_test(&p->func, countEBP, countEBP);
   x86_jcc(&p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func.need_emms)
      mmx_emms(&p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(&p->func, fixup);

   /* Pop regs and return
    */
   x86_pop(&p->func, x86_get_base_reg(vtxESI));
   x86_pop(&p->func, countEBP);
   x86_ret(&p->func);

   assert(!vtx->emit);
   vtx->emit = (tnl_emit_func)x86_get_func(&p->func);

   assert( (char *) p->func.csr - (char *) p->func.store <= MAX_SSE_CODE_SIZE );
   return GL_TRUE;
}
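
/* The generated code is installed as vtx->emit above.  Judging from the
 * x86_fn_arg() loads at the top of build_vertex_emit(), the emitted
 * function takes (ctx, count, dest) in the tnl_emit_func convention,
 * i.e. roughly:
 *
 *    void emit( struct gl_context *ctx, GLuint count, GLubyte *dest );
 */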



void _tnl_generate_sse_emit( struct gl_context *ctx )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   struct x86_program p;

   if (!cpu_has_xmm) {
      vtx->codegen_emit = NULL;
      return;
   }

   memset(&p, 0, sizeof(p));

   p.ctx = ctx;
   p.inputs_safe = 0;      /* for now */
   p.outputs_safe = 0;     /* for now */
   p.have_sse2 = cpu_has_xmm2;
   p.identity = x86_make_reg(file_XMM, 6);
   p.chan0 = x86_make_reg(file_XMM, 7);

   if (!x86_init_func_size(&p.func, MAX_SSE_CODE_SIZE)) {
      vtx->emit = NULL;
      return;
   }

   if (build_vertex_emit(&p)) {
      _tnl_register_fastpath( vtx, GL_TRUE );
   }
   else {
      /* Note the failure so that we don't keep trying to codegen an
       * impossible state:
       */
      _tnl_register_fastpath( vtx, GL_FALSE );
      x86_release_func(&p.func);
   }
}

#else

void _tnl_generate_sse_emit( struct gl_context *ctx )
{
   /* Dummy version for when USE_SSE_ASM not defined */
}

#endif