i965: checkpoint commit: VS constant buffers
src/mesa/drivers/dri/i965/brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
42 {
43 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
44
45 if (++c->last_tmp > c->prog_data.total_grf)
46 c->prog_data.total_grf = c->last_tmp;
47
48 return tmp;
49 }
50
51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
52 {
53 if (tmp.nr == c->last_tmp-1)
54 c->last_tmp--;
55 }
56
57 static void release_tmps( struct brw_vs_compile *c )
58 {
59 c->last_tmp = c->first_tmp;
60 }
61
62
63 /**
64  * Preallocate GRF registers before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
66 * ahead of time.
67 */
68 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
69 {
70 GLuint i, reg = 0, mrf;
71 GLuint nr_params;
72
73 #if 0
74 if (c->vp->program.Base.Parameters->NumParameters >= 6)
75 c->use_const_buffer = 1;
76 else
77 #endif
78 c->use_const_buffer = GL_FALSE;
79 /*printf("use_const_buffer = %d\n", c->use_const_buffer);*/
80
81 /* r0 -- reserved as usual
82 */
83 c->r0 = brw_vec8_grf(reg, 0);
84 reg++;
85
86 /* User clip planes from curbe:
87 */
88 if (c->key.nr_userclip) {
89 for (i = 0; i < c->key.nr_userclip; i++) {
90 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
91 }
92
93 /* Deal with curbe alignment:
94 */
95 reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
96 }
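   /* The planes are read two per register starting at GRF reg+3 (see the
    * loop above).  The expression above just advances reg past the aligned
    * curbe block: e.g. nr_userclip == 2 gives ((6 + 2 + 3) / 4) * 2 == 4
    * registers.
    */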
97
98 /* Vertex program parameters from curbe:
99 */
100 if (c->use_const_buffer) {
101 /* get constants from a real constant buffer */
102 c->prog_data.curb_read_length = 0;
103 }
104 else {
105 /* use a section of the GRF for constants */
106 nr_params = c->vp->program.Base.Parameters->NumParameters;
107 for (i = 0; i < nr_params; i++) {
108 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
109 }
110 reg += (nr_params + 1) / 2;
111 c->prog_data.curb_read_length = reg - 1;
112 }
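      /* Two vec4 parameters are packed into each GRF (one per 16-byte
       * half), so reg advances by nr_params / 2 rounded up.
       * curb_read_length then counts every curbe register allocated so
       * far (clip planes included), excluding r0.
       */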
113
114 /* Allocate input regs:
115 */
116 c->nr_inputs = 0;
117 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
118 if (c->prog_data.inputs_read & (1 << i)) {
119 c->nr_inputs++;
120 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
121 reg++;
122 }
123 }
124
125 /* Allocate outputs: TODO: could organize the non-position outputs
126 * to go straight into message regs.
127 */
128 c->nr_outputs = 0;
129 c->first_output = reg;
130 mrf = 4;
131 for (i = 0; i < VERT_RESULT_MAX; i++) {
132 if (c->prog_data.outputs_written & (1 << i)) {
133 c->nr_outputs++;
134 if (i == VERT_RESULT_HPOS) {
135 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
136 reg++;
137 }
138 else if (i == VERT_RESULT_PSIZ) {
139 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
140 reg++;
141 mrf++; /* just a placeholder? XXX fix later stages & remove this */
142 }
143 else {
144 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
145 mrf++;
146 }
147 }
148 }
149
150 /* Allocate program temporaries:
151 */
152 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
153 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
154 reg++;
155 }
156
157 /* Address reg(s). Don't try to use the internal address reg until
158 * deref time.
159 */
160 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
161 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
162 reg,
163 0,
164 BRW_REGISTER_TYPE_D,
165 BRW_VERTICAL_STRIDE_8,
166 BRW_WIDTH_8,
167 BRW_HORIZONTAL_STRIDE_1,
168 BRW_SWIZZLE_XXXX,
169 WRITEMASK_X);
170 reg++;
171 }
172
173 for (i = 0; i < 128; i++) {
174 if (c->output_regs[i].used_in_src) {
175 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
176 reg++;
177 }
178 }
179
180 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
181 reg += 2;
182
183 /* Some opcodes need an internal temporary:
184 */
185 c->first_tmp = reg;
186 c->last_tmp = reg; /* for allocation purposes */
187
188 /* Each input reg holds data from two vertices. The
189 * urb_read_length is the number of registers read from *each*
190 * vertex urb, so is half the amount:
191 */
192 c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
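   /* E.g. five active inputs occupy five GRFs (each holding one attribute
    * for both vertices), but only (5 + 1) / 2 == 3 registers are read from
    * each vertex's URB entry.
    */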
193
194 c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
195 c->prog_data.total_grf = reg;
196
197 if (c->use_const_buffer) {
198 for (i = 0; i < 3; i++) {
199 c->current_const[i].index = -1;
200 c->current_const[i].reg = get_tmp(c);
201 }
202 }
203
204 if (INTEL_DEBUG & DEBUG_VS) {
205 _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
206 _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
207 _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
208 }
209 }
210
211
212 /**
213 * If an instruction uses a temp reg both as a src and the dest, we
214 * sometimes need to allocate an intermediate temporary.
215 */
216 static void unalias1( struct brw_vs_compile *c,
217 struct brw_reg dst,
218 struct brw_reg arg0,
219 void (*func)( struct brw_vs_compile *,
220 struct brw_reg,
221 struct brw_reg ))
222 {
223 if (dst.file == arg0.file && dst.nr == arg0.nr) {
224 struct brw_compile *p = &c->func;
225 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
226 func(c, tmp, arg0);
227 brw_MOV(p, dst, tmp);
228 release_tmp(c, tmp);
229 }
230 else {
231 func(c, dst, arg0);
232 }
233 }
234
235 /**
236  * \sa unalias1
237  * Checks if a 2-operand instruction needs an intermediate temporary.
238 */
239 static void unalias2( struct brw_vs_compile *c,
240 struct brw_reg dst,
241 struct brw_reg arg0,
242 struct brw_reg arg1,
243 void (*func)( struct brw_vs_compile *,
244 struct brw_reg,
245 struct brw_reg,
246 struct brw_reg ))
247 {
248 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
249 (dst.file == arg1.file && dst.nr == arg1.nr)) {
250 struct brw_compile *p = &c->func;
251 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
252 func(c, tmp, arg0, arg1);
253 brw_MOV(p, dst, tmp);
254 release_tmp(c, tmp);
255 }
256 else {
257 func(c, dst, arg0, arg1);
258 }
259 }
260
261 /**
262 * \sa unalias2
263  * Checks if a 3-operand instruction needs an intermediate temporary.
264 */
265 static void unalias3( struct brw_vs_compile *c,
266 struct brw_reg dst,
267 struct brw_reg arg0,
268 struct brw_reg arg1,
269 struct brw_reg arg2,
270 void (*func)( struct brw_vs_compile *,
271 struct brw_reg,
272 struct brw_reg,
273 struct brw_reg,
274 struct brw_reg ))
275 {
276 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
277 (dst.file == arg1.file && dst.nr == arg1.nr) ||
278 (dst.file == arg2.file && dst.nr == arg2.nr)) {
279 struct brw_compile *p = &c->func;
280 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
281 func(c, tmp, arg0, arg1, arg2);
282 brw_MOV(p, dst, tmp);
283 release_tmp(c, tmp);
284 }
285 else {
286 func(c, dst, arg0, arg1, arg2);
287 }
288 }
289
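/* Common helper for the SEQ/SNE/SLT/SLE/SGT/SGE opcodes below: write 0.0,
 * then do a CMP whose only effect is to set the flag register, then emit a
 * MOV of 1.0 that is predicated on that flag, so only the channels which
 * passed the comparison end up as 1.0.  The final call restores the flag
 * value so later instructions run unpredicated.
 */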
290 static void emit_sop( struct brw_compile *p,
291 struct brw_reg dst,
292 struct brw_reg arg0,
293 struct brw_reg arg1,
294 GLuint cond)
295 {
296 brw_MOV(p, dst, brw_imm_f(0.0f));
297 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
298 brw_MOV(p, dst, brw_imm_f(1.0f));
299 brw_set_predicate_control_flag_value(p, 0xff);
300 }
301
302 static void emit_seq( struct brw_compile *p,
303 struct brw_reg dst,
304 struct brw_reg arg0,
305 struct brw_reg arg1 )
306 {
307 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
308 }
309
310 static void emit_sne( struct brw_compile *p,
311 struct brw_reg dst,
312 struct brw_reg arg0,
313 struct brw_reg arg1 )
314 {
315 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
316 }
317 static void emit_slt( struct brw_compile *p,
318 struct brw_reg dst,
319 struct brw_reg arg0,
320 struct brw_reg arg1 )
321 {
322 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
323 }
324
325 static void emit_sle( struct brw_compile *p,
326 struct brw_reg dst,
327 struct brw_reg arg0,
328 struct brw_reg arg1 )
329 {
330 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
331 }
332
333 static void emit_sgt( struct brw_compile *p,
334 struct brw_reg dst,
335 struct brw_reg arg0,
336 struct brw_reg arg1 )
337 {
338 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
339 }
340
341 static void emit_sge( struct brw_compile *p,
342 struct brw_reg dst,
343 struct brw_reg arg0,
344 struct brw_reg arg1 )
345 {
346 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
347 }
348
349 static void emit_max( struct brw_compile *p,
350 struct brw_reg dst,
351 struct brw_reg arg0,
352 struct brw_reg arg1 )
353 {
354 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
355 brw_SEL(p, dst, arg1, arg0);
356 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
357 }
358
359 static void emit_min( struct brw_compile *p,
360 struct brw_reg dst,
361 struct brw_reg arg0,
362 struct brw_reg arg1 )
363 {
364 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
365 brw_SEL(p, dst, arg0, arg1);
366 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
367 }
368
369
370 static void emit_math1( struct brw_vs_compile *c,
371 GLuint function,
372 struct brw_reg dst,
373 struct brw_reg arg0,
374 GLuint precision)
375 {
376 /* There are various odd behaviours with SEND on the simulator. In
377 * addition there are documented issues with the fact that the GEN4
378 * processor doesn't do dependency control properly on SEND
379 * results. So, on balance, this kludge to get around failures
380 * with writemasked math results looks like it might be necessary
381 * whether that turns out to be a simulator bug or not:
382 */
383 struct brw_compile *p = &c->func;
384 struct brw_reg tmp = dst;
385 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
386 dst.file != BRW_GENERAL_REGISTER_FILE);
387
388 if (need_tmp)
389 tmp = get_tmp(c);
390
391 brw_math(p,
392 tmp,
393 function,
394 BRW_MATH_SATURATE_NONE,
395 2,
396 arg0,
397 BRW_MATH_DATA_SCALAR,
398 precision);
399
400 if (need_tmp) {
401 brw_MOV(p, dst, tmp);
402 release_tmp(c, tmp);
403 }
404 }
405
406
407 static void emit_math2( struct brw_vs_compile *c,
408 GLuint function,
409 struct brw_reg dst,
410 struct brw_reg arg0,
411 struct brw_reg arg1,
412 GLuint precision)
413 {
414 struct brw_compile *p = &c->func;
415 struct brw_reg tmp = dst;
416 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
417 dst.file != BRW_GENERAL_REGISTER_FILE);
418
419 if (need_tmp)
420 tmp = get_tmp(c);
421
422 brw_MOV(p, brw_message_reg(3), arg1);
423
424 brw_math(p,
425 tmp,
426 function,
427 BRW_MATH_SATURATE_NONE,
428 2,
429 arg0,
430 BRW_MATH_DATA_SCALAR,
431 precision);
432
433 if (need_tmp) {
434 brw_MOV(p, dst, tmp);
435 release_tmp(c, tmp);
436 }
437 }
438
439
440 static void emit_exp_noalias( struct brw_vs_compile *c,
441 struct brw_reg dst,
442 struct brw_reg arg0 )
443 {
444 struct brw_compile *p = &c->func;
445
446
447 if (dst.dw1.bits.writemask & WRITEMASK_X) {
448 struct brw_reg tmp = get_tmp(c);
449 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
450
451 /* tmp_d = floor(arg0.x) */
452 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
453
454 /* result[0] = 2.0 ^ tmp */
455
456 /* Adjust exponent for floating point:
457 * exp += 127
458 */
459 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
460
461 /* Install exponent and sign.
462 * Excess drops off the edge:
463 */
464 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
465 tmp_d, brw_imm_d(23));
466
467 release_tmp(c, tmp);
468 }
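      /* This is the usual IEEE-754 trick: for an integer n within the
       * normal exponent range, (n + 127) << 23 reinterpreted as a float
       * is exactly 2^n.  Illustrative sketch (hypothetical snippet, not
       * driver code):
       *
       *    union { GLint i; GLfloat f; } u;
       *    u.i = (n + 127) << 23;
       *    assert(u.f == powf(2.0f, (GLfloat)n));
       */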
469
470 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
471 /* result[1] = arg0.x - floor(arg0.x) */
472 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
473 }
474
475 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
476 /* As with the LOG instruction, we might be better off just
477        * doing a Taylor expansion here, seeing as we have to do all
478 * the prep work.
479 *
480 * If mathbox partial precision is too low, consider also:
481 * result[3] = result[0] * EXP(result[1])
482 */
483 emit_math1(c,
484 BRW_MATH_FUNCTION_EXP,
485 brw_writemask(dst, WRITEMASK_Z),
486 brw_swizzle1(arg0, 0),
487 BRW_MATH_PRECISION_FULL);
488 }
489
490 if (dst.dw1.bits.writemask & WRITEMASK_W) {
491 /* result[3] = 1.0; */
492 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
493 }
494 }
495
496
497 static void emit_log_noalias( struct brw_vs_compile *c,
498 struct brw_reg dst,
499 struct brw_reg arg0 )
500 {
501 struct brw_compile *p = &c->func;
502 struct brw_reg tmp = dst;
503 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
504 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
505 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
506 dst.file != BRW_GENERAL_REGISTER_FILE);
507
508 if (need_tmp) {
509 tmp = get_tmp(c);
510 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
511 }
512
513    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
514     * according to spec:
515     *
516     * These almost look like they could be joined up, but not really
517     * practical:
518     *
519     *   result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
520     *   result[1].i = (x.i & ((1<<23)-1)) | (127<<23)
521 */
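   /* Illustrative C sketch of that decomposition (hypothetical snippet,
    * not driver code):
    *
    *    union { GLuint i; GLfloat f; } u, m;
    *    u.f = x;                                   (assumes x > 0)
    *    GLfloat exponent = (GLfloat)((u.i >> 23) & 0xff) - 127.0f;
    *    m.i = (u.i & ((1u << 23) - 1)) | (127u << 23);
    *    GLfloat mantissa = m.f;                    (value in [1, 2))
    */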
522 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
523 brw_AND(p,
524 brw_writemask(tmp_ud, WRITEMASK_X),
525 brw_swizzle1(arg0_ud, 0),
526 brw_imm_ud((1U<<31)-1));
527
528 brw_SHR(p,
529 brw_writemask(tmp_ud, WRITEMASK_X),
530 tmp_ud,
531 brw_imm_ud(23));
532
533 brw_ADD(p,
534 brw_writemask(tmp, WRITEMASK_X),
535 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
536 brw_imm_d(-127));
537 }
538
539 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
540 brw_AND(p,
541 brw_writemask(tmp_ud, WRITEMASK_Y),
542 brw_swizzle1(arg0_ud, 0),
543 brw_imm_ud((1<<23)-1));
544
545 brw_OR(p,
546 brw_writemask(tmp_ud, WRITEMASK_Y),
547 tmp_ud,
548 brw_imm_ud(127<<23));
549 }
550
551 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
552 /* result[2] = result[0] + LOG2(result[1]); */
553
554       /* Why bother?  The above is just a hint at how to do this with a
555        * Taylor series.  Maybe we *should* use a Taylor series, as by
556 * the time all the above has been done it's almost certainly
557 * quicker than calling the mathbox, even with low precision.
558 *
559 * Options are:
560 * - result[0] + mathbox.LOG2(result[1])
561 * - mathbox.LOG2(arg0.x)
562 * - result[0] + inline_taylor_approx(result[1])
563 */
564 emit_math1(c,
565 BRW_MATH_FUNCTION_LOG,
566 brw_writemask(tmp, WRITEMASK_Z),
567 brw_swizzle1(tmp, 1),
568 BRW_MATH_PRECISION_FULL);
569
570 brw_ADD(p,
571 brw_writemask(tmp, WRITEMASK_Z),
572 brw_swizzle1(tmp, 2),
573 brw_swizzle1(tmp, 0));
574 }
575
576 if (dst.dw1.bits.writemask & WRITEMASK_W) {
577 /* result[3] = 1.0; */
578 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
579 }
580
581 if (need_tmp) {
582 brw_MOV(p, dst, tmp);
583 release_tmp(c, tmp);
584 }
585 }
586
587
588 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
589 */
590 static void emit_dst_noalias( struct brw_vs_compile *c,
591 struct brw_reg dst,
592 struct brw_reg arg0,
593 struct brw_reg arg1)
594 {
595 struct brw_compile *p = &c->func;
596
597 /* There must be a better way to do this:
598 */
599 if (dst.dw1.bits.writemask & WRITEMASK_X)
600 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
601 if (dst.dw1.bits.writemask & WRITEMASK_Y)
602 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
603 if (dst.dw1.bits.writemask & WRITEMASK_Z)
604 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
605 if (dst.dw1.bits.writemask & WRITEMASK_W)
606 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
607 }
608
609
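/* Cross product: dst = t x u, i.e.
 *
 *    dst.x = t.y*u.z - t.z*u.y
 *    dst.y = t.z*u.x - t.x*u.z
 *    dst.z = t.x*u.y - t.y*u.x
 *
 * The MUL (to the null register, which still updates the accumulator)
 * computes t.yzxw * u.zxyw; the MAC then adds -t.zxyw * u.yzxw and writes
 * the sum to dst.
 */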
610 static void emit_xpd( struct brw_compile *p,
611 struct brw_reg dst,
612 struct brw_reg t,
613 struct brw_reg u)
614 {
615 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
616 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
617 }
618
619
620 static void emit_lit_noalias( struct brw_vs_compile *c,
621 struct brw_reg dst,
622 struct brw_reg arg0 )
623 {
624 struct brw_compile *p = &c->func;
625 struct brw_instruction *if_insn;
626 struct brw_reg tmp = dst;
627 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
628
629 if (need_tmp)
630 tmp = get_tmp(c);
631
632 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
633 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
634
635 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
636 * to get all channels active inside the IF. In the clipping code
637 * we run with NoMask, so it's not an option and we can use
638     * BRW_EXECUTE_1 for all comparisons.
639 */
640 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
641 if_insn = brw_IF(p, BRW_EXECUTE_8);
642 {
643 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
644
645 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
646 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
647 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
648
649 emit_math2(c,
650 BRW_MATH_FUNCTION_POW,
651 brw_writemask(dst, WRITEMASK_Z),
652 brw_swizzle1(tmp, 2),
653 brw_swizzle1(arg0, 3),
654 BRW_MATH_PRECISION_PARTIAL);
655 }
656
657 brw_ENDIF(p, if_insn);
658 }
659
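/* LRP dst, a, b, c  ==  a*b + (1 - a)*c, emitted as:
 *
 *    dst = 1 - a                 (ADD with negated a)
 *    acc = dst * c               (MUL to the null reg updates the accumulator)
 *    dst = a*b + acc             (MAC)
 *
 * The unalias3() wrapper guarantees dst doesn't alias any of the sources.
 */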
660 static void emit_lrp_noalias(struct brw_vs_compile *c,
661 struct brw_reg dst,
662 struct brw_reg arg0,
663 struct brw_reg arg1,
664 struct brw_reg arg2)
665 {
666 struct brw_compile *p = &c->func;
667
668 brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
669 brw_MUL(p, brw_null_reg(), dst, arg2);
670 brw_MAC(p, dst, arg0, arg1);
671 }
672
673 /** 3 or 4-component vector normalization */
674 static void emit_nrm( struct brw_vs_compile *c,
675 struct brw_reg dst,
676 struct brw_reg arg0,
677 int num_comps)
678 {
679 struct brw_compile *p = &c->func;
680 struct brw_reg tmp = get_tmp(c);
681
682 /* tmp = dot(arg0, arg0) */
683 if (num_comps == 3)
684 brw_DP3(p, tmp, arg0, arg0);
685 else
686 brw_DP4(p, tmp, arg0, arg0);
687
688 /* tmp = 1 / sqrt(tmp) */
689 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
690
691 /* dst = arg0 * tmp */
692 brw_MUL(p, dst, arg0, tmp);
693
694 release_tmp(c, tmp);
695 }
696
697
698 static struct brw_reg
699 get_constant(struct brw_vs_compile *c,
700 const struct prog_instruction *inst,
701 GLuint argIndex)
702 {
703 const struct prog_src_register *src = &inst->SrcReg[argIndex];
704 struct brw_compile *p = &c->func;
705 struct brw_reg const_reg;
706
707 if (c->current_const[argIndex].index != src->Index) {
708 struct brw_reg src_reg = get_tmp(c);
709 struct brw_reg t = get_tmp(c);
710
711 c->current_const[argIndex].index = src->Index;
712
713 brw_MOV(p, t, brw_vec8_grf(0, 0));/*SAVE*/
714
715 #if 0
716 printf(" fetch const[%d] for arg %d into reg %d\n",
717 src->Index, argIndex, c->current_const[argIndex].reg.nr);
718 #endif
719
720 /* need to fetch the constant now */
721 brw_dp_READ_4_vs(p,
722 c->current_const[argIndex].reg, /* writeback dest */
723 src_reg, /* src reg */
724 1, /* msg_reg */
725 src->RelAddr, /* relative indexing? */
726 16 * src->Index, /* byte offset */
727 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
728 );
729
730 brw_MOV(p, brw_vec8_grf(0, 0), t);/*RESTORE*/
731 release_tmp(c, src_reg);
732 release_tmp(c, t);
733 }
734
735 /* replicate lower four floats into upper four floats (to get XYZWXYZW) */
736 const_reg = c->current_const[argIndex].reg;
737 const_reg = stride(const_reg, 0, 4, 0);
738 const_reg.subnr = 0;
739
740 return const_reg;
741 }
742
743
744
745 /* TODO: relative addressing!
746 */
747 static struct brw_reg get_reg( struct brw_vs_compile *c,
748 gl_register_file file,
749 GLuint index )
750 {
751 switch (file) {
752 case PROGRAM_TEMPORARY:
753 case PROGRAM_INPUT:
754 case PROGRAM_OUTPUT:
755 assert(c->regs[file][index].nr != 0);
756 return c->regs[file][index];
757 case PROGRAM_STATE_VAR:
758 case PROGRAM_CONSTANT:
759 case PROGRAM_UNIFORM:
760 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
761 return c->regs[PROGRAM_STATE_VAR][index];
762 case PROGRAM_ADDRESS:
763 assert(index == 0);
764 return c->regs[file][index];
765
766 case PROGRAM_UNDEFINED: /* undef values */
767 return brw_null_reg();
768
769 case PROGRAM_LOCAL_PARAM:
770 case PROGRAM_ENV_PARAM:
771 case PROGRAM_WRITE_ONLY:
772 default:
773 assert(0);
774 return brw_null_reg();
775 }
776 }
777
778
779 /**
780 * Get brw reg corresponding to the instruction's [argIndex] src reg.
781 * TODO: relative addressing!
782 */
783 static struct brw_reg
784 get_src_reg( struct brw_vs_compile *c,
785 const struct prog_instruction *inst,
786 GLuint argIndex )
787 {
788 const GLuint file = inst->SrcReg[argIndex].File;
789 const GLint index = inst->SrcReg[argIndex].Index;
790
791 switch (file) {
792 case PROGRAM_TEMPORARY:
793 case PROGRAM_INPUT:
794 case PROGRAM_OUTPUT:
795 assert(c->regs[file][index].nr != 0);
796 return c->regs[file][index];
797 case PROGRAM_STATE_VAR:
798 case PROGRAM_CONSTANT:
799 case PROGRAM_UNIFORM:
800 if (c->use_const_buffer) {
801 return get_constant(c, inst, argIndex);
802 }
803 else {
804 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
805 return c->regs[PROGRAM_STATE_VAR][index];
806 }
807 case PROGRAM_ADDRESS:
808 assert(index == 0);
809 return c->regs[file][index];
810
811 case PROGRAM_UNDEFINED:
812 /* this is a normal case since we loop over all three src args */
813 return brw_null_reg();
814
815 case PROGRAM_LOCAL_PARAM:
816 case PROGRAM_ENV_PARAM:
817 case PROGRAM_WRITE_ONLY:
818 default:
819 assert(0);
820 return brw_null_reg();
821 }
822 }
823
824
825 /**
826 * Indirect addressing: get reg[[arg] + offset].
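 * The GRF is addressed in bytes here: a register is 32 bytes and a vec4
 * slot is 16 bytes, so the static part of the address is
 * arg.nr*32 + arg.subnr + offset*16, and the dynamic part comes from the
 * address register written by emit_arl() below.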
827 */
828 static struct brw_reg deref( struct brw_vs_compile *c,
829 struct brw_reg arg,
830 GLint offset)
831 {
832 struct brw_compile *p = &c->func;
833 struct brw_reg tmp = vec4(get_tmp(c));
834 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
835 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
836 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
837 struct brw_reg indirect = brw_vec4_indirect(0,0);
838
839 {
840 brw_push_insn_state(p);
841 brw_set_access_mode(p, BRW_ALIGN_1);
842
843 /* This is pretty clunky - load the address register twice and
844 * fetch each 4-dword value in turn. There must be a way to do
845 * this in a single pass, but I couldn't get it to work.
846 */
847 brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
848 brw_MOV(p, tmp, indirect);
849
850 brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
851 brw_MOV(p, suboffset(tmp, 4), indirect);
852
853 brw_pop_insn_state(p);
854 }
855
856 return vec8(tmp);
857 }
858
859
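/* ARL: dst = floor(arg0) * 16.  The scale by 16 turns the element index
 * into the byte offset that deref() above expects (one vec4 every 16
 * bytes).
 */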
860 static void emit_arl( struct brw_vs_compile *c,
861 struct brw_reg dst,
862 struct brw_reg arg0 )
863 {
864 struct brw_compile *p = &c->func;
865 struct brw_reg tmp = dst;
866 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
867
868 if (need_tmp)
869 tmp = get_tmp(c);
870
871 brw_RNDD(p, tmp, arg0);
872 brw_MUL(p, dst, tmp, brw_imm_d(16));
873
874 if (need_tmp)
875 release_tmp(c, tmp);
876 }
877
878
879 /**
880 * Return the brw reg for the given instruction's src argument.
881 * Will return mangled results for SWZ op. The emit_swz() function
882 * ignores this result and recalculates taking extended swizzles into
883  * ignores this result and recalculates it, taking extended swizzles
884  * into account.
885 static struct brw_reg get_arg( struct brw_vs_compile *c,
886 const struct prog_instruction *inst,
887 GLuint argIndex )
888 {
889 const struct prog_src_register *src = &inst->SrcReg[argIndex];
890 struct brw_reg reg;
891
892 if (src->File == PROGRAM_UNDEFINED)
893 return brw_null_reg();
894
895 if (src->RelAddr) {
896 /* XXX fix */
897 reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
898 }
899 else {
900 reg = get_src_reg(c, inst, argIndex);
901 }
902
903 /* Convert 3-bit swizzle to 2-bit.
904 */
905 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
906 GET_SWZ(src->Swizzle, 1),
907 GET_SWZ(src->Swizzle, 2),
908 GET_SWZ(src->Swizzle, 3));
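   /* (Mesa stores three bits per swizzle component so it can also encode
    * ZERO and ONE; the hardware swizzle is two bits per component and can
    * only select X/Y/Z/W.  Extended swizzles are handled by emit_swz().)
    */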
909
910 /* Note this is ok for non-swizzle instructions:
911 */
912 reg.negate = src->NegateBase ? 1 : 0;
913
914 return reg;
915 }
916
917
918 /**
919 * Get brw register for the given program dest register.
920 */
921 static struct brw_reg get_dst( struct brw_vs_compile *c,
922 struct prog_dst_register dst )
923 {
924 struct brw_reg reg;
925
926 switch (dst.File) {
927 case PROGRAM_TEMPORARY:
928 case PROGRAM_OUTPUT:
929 assert(c->regs[dst.File][dst.Index].nr != 0);
930 reg = c->regs[dst.File][dst.Index];
931 break;
932 case PROGRAM_UNDEFINED:
933 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
934 reg = brw_null_reg();
935 break;
936 default:
937 assert(0);
938 reg = brw_null_reg();
939 }
940
941 reg.dw1.bits.writemask = dst.WriteMask;
942
943 return reg;
944 }
945
946
947 static void emit_swz( struct brw_vs_compile *c,
948 struct brw_reg dst,
949 const struct prog_instruction *inst)
950 {
951 const GLuint argIndex = 0;
952 const struct prog_src_register src = inst->SrcReg[argIndex];
953 struct brw_compile *p = &c->func;
954 GLuint zeros_mask = 0;
955 GLuint ones_mask = 0;
956 GLuint src_mask = 0;
957 GLubyte src_swz[4];
958 GLboolean need_tmp = (src.NegateBase &&
959 dst.file != BRW_GENERAL_REGISTER_FILE);
960 struct brw_reg tmp = dst;
961 GLuint i;
962
963 if (need_tmp)
964 tmp = get_tmp(c);
965
966 for (i = 0; i < 4; i++) {
967 if (dst.dw1.bits.writemask & (1<<i)) {
968 GLubyte s = GET_SWZ(src.Swizzle, i);
969 switch (s) {
970 case SWIZZLE_X:
971 case SWIZZLE_Y:
972 case SWIZZLE_Z:
973 case SWIZZLE_W:
974 src_mask |= 1<<i;
975 src_swz[i] = s;
976 break;
977 case SWIZZLE_ZERO:
978 zeros_mask |= 1<<i;
979 break;
980 case SWIZZLE_ONE:
981 ones_mask |= 1<<i;
982 break;
983 }
984 }
985 }
986
987 /* Do src first, in case dst aliases src:
988 */
989 if (src_mask) {
990 struct brw_reg arg0;
991
992 if (src.RelAddr)
993 arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
994 else
995 arg0 = get_src_reg(c, inst, argIndex);
996
997 arg0 = brw_swizzle(arg0,
998 src_swz[0], src_swz[1],
999 src_swz[2], src_swz[3]);
1000
1001 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1002 }
1003
1004 if (zeros_mask)
1005 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1006
1007 if (ones_mask)
1008 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1009
1010 if (src.NegateBase)
1011 brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
1012
1013 if (need_tmp) {
1014 brw_MOV(p, dst, tmp);
1015 release_tmp(c, tmp);
1016 }
1017 }
1018
1019
1020 /**
1021 * Post-vertex-program processing. Send the results to the URB.
1022 */
1023 static void emit_vertex_write( struct brw_vs_compile *c)
1024 {
1025 struct brw_compile *p = &c->func;
1026 struct brw_reg m0 = brw_message_reg(0);
1027 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1028 struct brw_reg ndc;
1029
1030 if (c->key.copy_edgeflag) {
1031 brw_MOV(p,
1032 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1033 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1034 }
1035
1036 /* Build ndc coords */
1037 ndc = get_tmp(c);
1038 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1039 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1040
1041 /* Update the header for point size, user clipping flags, and -ve rhw
1042 * workaround.
1043 */
1044 if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
1045 c->key.nr_userclip || !BRW_IS_G4X(p->brw))
1046 {
1047 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1048 GLuint i;
1049
1050 brw_MOV(p, header1, brw_imm_ud(0));
1051
1052 brw_set_access_mode(p, BRW_ALIGN_16);
1053
1054 if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
1055 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1056 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1057 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1058 }
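         /* The MUL/AND pair scales the float point size by 2^11 and keeps
          * an 11-bit fixed-point result in bits 8..18 of the header's .w
          * channel, which is where the fixed-function units expect the
          * point width.
          */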
1059
1060 for (i = 0; i < c->key.nr_userclip; i++) {
1061 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1062 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1063 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1064 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1065 }
1066
1067 /* i965 clipping workaround:
1068 * 1) Test for -ve rhw
1069 * 2) If set,
1070 * set ndc = (0,0,0,0)
1071 * set ucp[6] = 1
1072 *
1073 * Later, clipping will detect ucp[6] and ensure the primitive is
1074 * clipped against all fixed planes.
1075 */
1076 if (!BRW_IS_G4X(p->brw)) {
1077 brw_CMP(p,
1078 vec8(brw_null_reg()),
1079 BRW_CONDITIONAL_L,
1080 brw_swizzle1(ndc, 3),
1081 brw_imm_f(0));
1082
1083 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1084 brw_MOV(p, ndc, brw_imm_f(0));
1085 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1086 }
1087
1088 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1089 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1090 brw_set_access_mode(p, BRW_ALIGN_16);
1091
1092 release_tmp(c, header1);
1093 }
1094 else {
1095 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1096 }
1097
1098 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1099 * of zeros followed by two sets of NDC coordinates:
1100 */
1101 brw_set_access_mode(p, BRW_ALIGN_1);
1102 brw_MOV(p, offset(m0, 2), ndc);
1103 brw_MOV(p, offset(m0, 3), pos);
1104
1105 brw_urb_WRITE(p,
1106 brw_null_reg(), /* dest */
1107 0, /* starting mrf reg nr */
1108 c->r0, /* src */
1109 0, /* allocate */
1110 1, /* used */
1111 c->nr_outputs + 3, /* msg len */
1112 0, /* response len */
1113 1, /* eot */
1114 1, /* writes complete */
1115 0, /* urb destination offset */
1116 BRW_URB_SWIZZLE_INTERLEAVE);
1117 }
1118
1119
1120 /**
1121 * Called after code generation to resolve subroutine calls and the
1122 * END instruction.
1123 * \param end_inst points to brw code for END instruction
1124 * \param last_inst points to last instruction emitted before vertex write
1125 */
1126 static void
1127 post_vs_emit( struct brw_vs_compile *c,
1128 struct brw_instruction *end_inst,
1129 struct brw_instruction *last_inst )
1130 {
1131 GLint offset;
1132
1133 brw_resolve_cals(&c->func);
1134
1135 /* patch up the END code to jump past subroutines, etc */
1136 offset = last_inst - end_inst;
1137 brw_set_src1(end_inst, brw_imm_d(offset * 16));
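   /* (offset is an instruction count; each native instruction is 16 bytes
    * and IP-relative jumps are expressed in bytes, hence the * 16.)
    */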
1138 }
1139
1140
1141 /* Emit the vertex program instructions here.
1142 */
1143 void brw_vs_emit(struct brw_vs_compile *c )
1144 {
1145 #define MAX_IFSN 32
1146 struct brw_compile *p = &c->func;
1147 GLuint nr_insns = c->vp->program.Base.NumInstructions;
1148 GLuint insn, if_insn = 0;
1149 GLuint end_offset = 0;
1150 struct brw_instruction *end_inst, *last_inst;
1151 struct brw_instruction *if_inst[MAX_IFSN];
1152 struct brw_indirect stack_index = brw_indirect(0, 0);
1153
1154 GLuint index;
1155 GLuint file;
1156
1157 if (INTEL_DEBUG & DEBUG_VS) {
1158 _mesa_printf("vs-emit:\n");
1159 _mesa_print_program(&c->vp->program.Base);
1160 _mesa_printf("\n");
1161 }
1162
1163 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1164 brw_set_access_mode(p, BRW_ALIGN_16);
1165
1166    /* Message registers can't be read, so copy outputs into GRF registers
1167     * if they are used as source registers. */
1168 for (insn = 0; insn < nr_insns; insn++) {
1169 GLuint i;
1170 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1171 for (i = 0; i < 3; i++) {
1172 struct prog_src_register *src = &inst->SrcReg[i];
1173 GLuint index = src->Index;
1174 GLuint file = src->File;
1175 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1176 c->output_regs[index].used_in_src = GL_TRUE;
1177 }
1178 }
1179
1180 /* Static register allocation
1181 */
1182 brw_vs_alloc_regs(c);
1183 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1184
1185 for (insn = 0; insn < nr_insns; insn++) {
1186
1187 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1188 struct brw_reg args[3], dst;
1189 GLuint i;
1190
1191 /* Get argument regs. SWZ is special and does this itself.
1192 */
1193 if (inst->Opcode != OPCODE_SWZ)
1194 for (i = 0; i < 3; i++) {
1195 struct prog_src_register *src = &inst->SrcReg[i];
1196 index = src->Index;
1197 file = src->File;
1198 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1199 args[i] = c->output_regs[index].reg;
1200 else
1201 args[i] = get_arg(c, inst, i);
1202 }
1203
1204 /* Get dest regs. Note that it is possible for a reg to be both
1205 * dst and arg, given the static allocation of registers. So
1206 * care needs to be taken emitting multi-operation instructions.
1207 */
1208 index = inst->DstReg.Index;
1209 file = inst->DstReg.File;
1210 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1211 dst = c->output_regs[index].reg;
1212 else
1213 dst = get_dst(c, inst->DstReg);
1214
1215 if (inst->SaturateMode != SATURATE_OFF) {
1216 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1217 inst->SaturateMode);
1218 }
1219
1220 switch (inst->Opcode) {
1221 case OPCODE_ABS:
1222 brw_MOV(p, dst, brw_abs(args[0]));
1223 break;
1224 case OPCODE_ADD:
1225 brw_ADD(p, dst, args[0], args[1]);
1226 break;
1227 case OPCODE_COS:
1228 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1229 break;
1230 case OPCODE_DP3:
1231 brw_DP3(p, dst, args[0], args[1]);
1232 break;
1233 case OPCODE_DP4:
1234 brw_DP4(p, dst, args[0], args[1]);
1235 break;
1236 case OPCODE_DPH:
1237 brw_DPH(p, dst, args[0], args[1]);
1238 break;
1239 case OPCODE_NRM3:
1240 emit_nrm(c, dst, args[0], 3);
1241 break;
1242 case OPCODE_NRM4:
1243 emit_nrm(c, dst, args[0], 4);
1244 break;
1245 case OPCODE_DST:
1246 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1247 break;
1248 case OPCODE_EXP:
1249 unalias1(c, dst, args[0], emit_exp_noalias);
1250 break;
1251 case OPCODE_EX2:
1252 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1253 break;
1254 case OPCODE_ARL:
1255 emit_arl(c, dst, args[0]);
1256 break;
1257 case OPCODE_FLR:
1258 brw_RNDD(p, dst, args[0]);
1259 break;
1260 case OPCODE_FRC:
1261 brw_FRC(p, dst, args[0]);
1262 break;
1263 case OPCODE_LOG:
1264 unalias1(c, dst, args[0], emit_log_noalias);
1265 break;
1266 case OPCODE_LG2:
1267 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1268 break;
1269 case OPCODE_LIT:
1270 unalias1(c, dst, args[0], emit_lit_noalias);
1271 break;
1272 case OPCODE_LRP:
1273 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1274 break;
1275 case OPCODE_MAD:
1276 brw_MOV(p, brw_acc_reg(), args[2]);
1277 brw_MAC(p, dst, args[0], args[1]);
1278 break;
1279 case OPCODE_MAX:
1280 emit_max(p, dst, args[0], args[1]);
1281 break;
1282 case OPCODE_MIN:
1283 emit_min(p, dst, args[0], args[1]);
1284 break;
1285 case OPCODE_MOV:
1286 brw_MOV(p, dst, args[0]);
1287 break;
1288 case OPCODE_MUL:
1289 brw_MUL(p, dst, args[0], args[1]);
1290 break;
1291 case OPCODE_POW:
1292 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1293 break;
1294 case OPCODE_RCP:
1295 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1296 break;
1297 case OPCODE_RSQ:
1298 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1299 break;
1300
1301 case OPCODE_SEQ:
1302 emit_seq(p, dst, args[0], args[1]);
1303 break;
1304 case OPCODE_SIN:
1305 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1306 break;
1307 case OPCODE_SNE:
1308 emit_sne(p, dst, args[0], args[1]);
1309 break;
1310 case OPCODE_SGE:
1311 emit_sge(p, dst, args[0], args[1]);
1312 break;
1313 case OPCODE_SGT:
1314 emit_sgt(p, dst, args[0], args[1]);
1315 break;
1316 case OPCODE_SLT:
1317 emit_slt(p, dst, args[0], args[1]);
1318 break;
1319 case OPCODE_SLE:
1320 emit_sle(p, dst, args[0], args[1]);
1321 break;
1322 case OPCODE_SUB:
1323 brw_ADD(p, dst, args[0], negate(args[1]));
1324 break;
1325 case OPCODE_SWZ:
1326 /* The args[0] value can't be used here as it won't have
1327 * correctly encoded the full swizzle:
1328 */
1329 emit_swz(c, dst, inst);
1330 break;
1331 case OPCODE_TRUNC:
1332 /* round toward zero */
1333 brw_RNDZ(p, dst, args[0]);
1334 break;
1335 case OPCODE_XPD:
1336 emit_xpd(p, dst, args[0], args[1]);
1337 break;
1338 case OPCODE_IF:
1339 assert(if_insn < MAX_IFSN);
1340 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
1341 break;
1342 case OPCODE_ELSE:
1343 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
1344 break;
1345 case OPCODE_ENDIF:
1346 assert(if_insn > 0);
1347 brw_ENDIF(p, if_inst[--if_insn]);
1348 break;
1349 case OPCODE_BRA:
1350 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1351 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1352 brw_set_predicate_control_flag_value(p, 0xff);
1353 break;
1354 case OPCODE_CAL:
1355 brw_set_access_mode(p, BRW_ALIGN_1);
1356 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1357 brw_set_access_mode(p, BRW_ALIGN_16);
1358 brw_ADD(p, get_addr_reg(stack_index),
1359 get_addr_reg(stack_index), brw_imm_d(4));
1360 brw_save_call(p, inst->Comment, p->nr_insn);
1361 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1362 break;
1363 case OPCODE_RET:
1364 brw_ADD(p, get_addr_reg(stack_index),
1365 get_addr_reg(stack_index), brw_imm_d(-4));
1366 brw_set_access_mode(p, BRW_ALIGN_1);
1367 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1368 brw_set_access_mode(p, BRW_ALIGN_16);
1369 break;
1370 case OPCODE_END:
1371 end_offset = p->nr_insn;
1372 /* this instruction will get patched later to jump past subroutine
1373 * code, etc.
1374 */
1375 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1376 break;
1377 case OPCODE_PRINT:
1378 /* no-op */
1379 break;
1380 case OPCODE_BGNSUB:
1381 brw_save_label(p, inst->Comment, p->nr_insn);
1382 break;
1383 case OPCODE_ENDSUB:
1384 /* no-op */
1385 break;
1386 default:
1387 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1388 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1389 _mesa_opcode_string(inst->Opcode) :
1390 "unknown");
1391 }
1392
1393 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1394 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1395 && c->output_regs[inst->DstReg.Index].used_in_src) {
1396 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1397 }
1398
1399 /* Result color clamping.
1400 *
1401     * When the destination register is an output register and
1402     * it's a primary/secondary front/back color, we have to clamp
1403 * the result to [0,1]. This is done by enabling the
1404 * saturation bit for the last instruction.
1405 *
1406 * We don't use brw_set_saturate() as it modifies
1407 * p->current->header.saturate, which affects all the subsequent
1408 * instructions. Instead, we directly modify the header
1409 * of the last (already stored) instruction.
1410 */
1411 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1412 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1413 || (inst->DstReg.Index == VERT_RESULT_COL1)
1414 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1415 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1416 p->store[p->nr_insn-1].header.saturate = 1;
1417 }
1418 }
1419
1420 release_tmps(c);
1421 }
1422
1423 end_inst = &p->store[end_offset];
1424 last_inst = &p->store[p->nr_insn];
1425
1426 /* The END instruction will be patched to jump to this code */
1427 emit_vertex_write(c);
1428
1429 post_vs_emit(c, end_inst, last_inst);
1430 }