i965: Even if no VS inputs are set, still load some amount of URB as required.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
42 {
43 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
44
45 if (++c->last_tmp > c->prog_data.total_grf)
46 c->prog_data.total_grf = c->last_tmp;
47
48 return tmp;
49 }
50
51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
52 {
53 if (tmp.nr == c->last_tmp-1)
54 c->last_tmp--;
55 }
56
57 static void release_tmps( struct brw_vs_compile *c )
58 {
59 c->last_tmp = c->first_tmp;
60 }
61
62
63 /**
64 * Preallocate GRF register before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
66 * ahead of time.
67 */
68 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
69 {
70 GLuint i, reg = 0, mrf;
71 int attributes_in_vue;
72
73 #if 0
74 if (c->vp->program.Base.Parameters->NumParameters >= 6)
75 c->vp->use_const_buffer = 1;
76 else
77 #endif
78 c->vp->use_const_buffer = GL_FALSE;
79 /*printf("use_const_buffer = %d\n", c->use_const_buffer);*/
80
81 /* r0 -- reserved as usual
82 */
83 c->r0 = brw_vec8_grf(reg, 0);
84 reg++;
85
86 /* User clip planes from curbe:
87 */
88 if (c->key.nr_userclip) {
89 for (i = 0; i < c->key.nr_userclip; i++) {
90 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
91 }
92
93 /* Deal with curbe alignment:
94 */
95 reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
96 }
97
98 /* Vertex program parameters from curbe:
99 */
100 if (c->vp->use_const_buffer) {
101 /* get constants from a real constant buffer */
102 c->prog_data.curb_read_length = 0;
103 c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
104 }
105 else {
106 /* use a section of the GRF for constants */
107 GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
108 for (i = 0; i < nr_params; i++) {
109 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
110 }
111 reg += (nr_params + 1) / 2;
112 c->prog_data.curb_read_length = reg - 1;
113
114 c->prog_data.nr_params = nr_params * 4;
115 }
116
117 /* Allocate input regs:
118 */
119 c->nr_inputs = 0;
120 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
121 if (c->prog_data.inputs_read & (1 << i)) {
122 c->nr_inputs++;
123 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
124 reg++;
125 }
126 }
127 /* If there are no inputs, we'll still be reading one attribute's worth
128 * because it's required -- see urb_read_length setting.
129 */
130 if (c->nr_inputs == 0)
131 reg++;
132
133 /* Allocate outputs: TODO: could organize the non-position outputs
134 * to go straight into message regs.
135 */
136 c->nr_outputs = 0;
137 c->first_output = reg;
138 mrf = 4;
139 for (i = 0; i < VERT_RESULT_MAX; i++) {
140 if (c->prog_data.outputs_written & (1 << i)) {
141 c->nr_outputs++;
142 if (i == VERT_RESULT_HPOS) {
143 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
144 reg++;
145 }
146 else if (i == VERT_RESULT_PSIZ) {
147 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
148 reg++;
149 mrf++; /* just a placeholder? XXX fix later stages & remove this */
150 }
151 else {
152 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
153 mrf++;
154 }
155 }
156 }
157
158 /* Allocate program temporaries:
159 */
160 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
161 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
162 reg++;
163 }
164
165 /* Address reg(s). Don't try to use the internal address reg until
166 * deref time.
167 */
168 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
169 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
170 reg,
171 0,
172 BRW_REGISTER_TYPE_D,
173 BRW_VERTICAL_STRIDE_8,
174 BRW_WIDTH_8,
175 BRW_HORIZONTAL_STRIDE_1,
176 BRW_SWIZZLE_XXXX,
177 WRITEMASK_X);
178 reg++;
179 }
180
181 if (c->vp->use_const_buffer) {
182 for (i = 0; i < 3; i++) {
183 c->current_const[i].index = -1;
184 c->current_const[i].reg = brw_vec8_grf(reg, 0);
185 reg++;
186 }
187 }
188
189 for (i = 0; i < 128; i++) {
190 if (c->output_regs[i].used_in_src) {
191 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
192 reg++;
193 }
194 }
195
196 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
197 reg += 2;
198
199 /* Some opcodes need an internal temporary:
200 */
201 c->first_tmp = reg;
202 c->last_tmp = reg; /* for allocation purposes */
203
204 /* Each input reg holds data from two vertices. The
205 * urb_read_length is the number of registers read from *each*
206 * vertex urb, so is half the amount:
207 */
208 c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
209 /* Setting this field to 0 leads to undefined behavior according to the
210 * the VS_STATE docs. Our VUEs will always have at least one attribute
211 * sitting in them, even if it's padding.
212 */
213 if (c->prog_data.urb_read_length == 0)
214 c->prog_data.urb_read_length = 1;
215
216 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
217 * them to fit the biggest thing they need to.
218 */
219 attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
220
221 c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
222
223 c->prog_data.total_grf = reg;
224
225 if (INTEL_DEBUG & DEBUG_VS) {
226 _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
227 _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
228 _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
229 }
230 }
231
232
233 /**
234 * If an instruction uses a temp reg both as a src and the dest, we
235 * sometimes need to allocate an intermediate temporary.
236 */
237 static void unalias1( struct brw_vs_compile *c,
238 struct brw_reg dst,
239 struct brw_reg arg0,
240 void (*func)( struct brw_vs_compile *,
241 struct brw_reg,
242 struct brw_reg ))
243 {
244 if (dst.file == arg0.file && dst.nr == arg0.nr) {
245 struct brw_compile *p = &c->func;
246 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
247 func(c, tmp, arg0);
248 brw_MOV(p, dst, tmp);
249 release_tmp(c, tmp);
250 }
251 else {
252 func(c, dst, arg0);
253 }
254 }
255
256 /**
257 * \sa unalias2
258 * Checkes if 2-operand instruction needs an intermediate temporary.
259 */
260 static void unalias2( struct brw_vs_compile *c,
261 struct brw_reg dst,
262 struct brw_reg arg0,
263 struct brw_reg arg1,
264 void (*func)( struct brw_vs_compile *,
265 struct brw_reg,
266 struct brw_reg,
267 struct brw_reg ))
268 {
269 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
270 (dst.file == arg1.file && dst.nr == arg1.nr)) {
271 struct brw_compile *p = &c->func;
272 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
273 func(c, tmp, arg0, arg1);
274 brw_MOV(p, dst, tmp);
275 release_tmp(c, tmp);
276 }
277 else {
278 func(c, dst, arg0, arg1);
279 }
280 }
281
282 /**
283 * \sa unalias2
284 * Checkes if 3-operand instruction needs an intermediate temporary.
285 */
286 static void unalias3( struct brw_vs_compile *c,
287 struct brw_reg dst,
288 struct brw_reg arg0,
289 struct brw_reg arg1,
290 struct brw_reg arg2,
291 void (*func)( struct brw_vs_compile *,
292 struct brw_reg,
293 struct brw_reg,
294 struct brw_reg,
295 struct brw_reg ))
296 {
297 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
298 (dst.file == arg1.file && dst.nr == arg1.nr) ||
299 (dst.file == arg2.file && dst.nr == arg2.nr)) {
300 struct brw_compile *p = &c->func;
301 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
302 func(c, tmp, arg0, arg1, arg2);
303 brw_MOV(p, dst, tmp);
304 release_tmp(c, tmp);
305 }
306 else {
307 func(c, dst, arg0, arg1, arg2);
308 }
309 }
310
311 static void emit_sop( struct brw_compile *p,
312 struct brw_reg dst,
313 struct brw_reg arg0,
314 struct brw_reg arg1,
315 GLuint cond)
316 {
317 brw_MOV(p, dst, brw_imm_f(0.0f));
318 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
319 brw_MOV(p, dst, brw_imm_f(1.0f));
320 brw_set_predicate_control_flag_value(p, 0xff);
321 }
322
323 static void emit_seq( struct brw_compile *p,
324 struct brw_reg dst,
325 struct brw_reg arg0,
326 struct brw_reg arg1 )
327 {
328 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
329 }
330
331 static void emit_sne( struct brw_compile *p,
332 struct brw_reg dst,
333 struct brw_reg arg0,
334 struct brw_reg arg1 )
335 {
336 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
337 }
338 static void emit_slt( struct brw_compile *p,
339 struct brw_reg dst,
340 struct brw_reg arg0,
341 struct brw_reg arg1 )
342 {
343 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
344 }
345
346 static void emit_sle( struct brw_compile *p,
347 struct brw_reg dst,
348 struct brw_reg arg0,
349 struct brw_reg arg1 )
350 {
351 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
352 }
353
354 static void emit_sgt( struct brw_compile *p,
355 struct brw_reg dst,
356 struct brw_reg arg0,
357 struct brw_reg arg1 )
358 {
359 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
360 }
361
362 static void emit_sge( struct brw_compile *p,
363 struct brw_reg dst,
364 struct brw_reg arg0,
365 struct brw_reg arg1 )
366 {
367 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
368 }
369
370 static void emit_max( struct brw_compile *p,
371 struct brw_reg dst,
372 struct brw_reg arg0,
373 struct brw_reg arg1 )
374 {
375 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
376 brw_SEL(p, dst, arg1, arg0);
377 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
378 }
379
380 static void emit_min( struct brw_compile *p,
381 struct brw_reg dst,
382 struct brw_reg arg0,
383 struct brw_reg arg1 )
384 {
385 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
386 brw_SEL(p, dst, arg0, arg1);
387 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
388 }
389
390
391 static void emit_math1( struct brw_vs_compile *c,
392 GLuint function,
393 struct brw_reg dst,
394 struct brw_reg arg0,
395 GLuint precision)
396 {
397 /* There are various odd behaviours with SEND on the simulator. In
398 * addition there are documented issues with the fact that the GEN4
399 * processor doesn't do dependency control properly on SEND
400 * results. So, on balance, this kludge to get around failures
401 * with writemasked math results looks like it might be necessary
402 * whether that turns out to be a simulator bug or not:
403 */
404 struct brw_compile *p = &c->func;
405 struct brw_reg tmp = dst;
406 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
407 dst.file != BRW_GENERAL_REGISTER_FILE);
408
409 if (need_tmp)
410 tmp = get_tmp(c);
411
412 brw_math(p,
413 tmp,
414 function,
415 BRW_MATH_SATURATE_NONE,
416 2,
417 arg0,
418 BRW_MATH_DATA_SCALAR,
419 precision);
420
421 if (need_tmp) {
422 brw_MOV(p, dst, tmp);
423 release_tmp(c, tmp);
424 }
425 }
426
427
428 static void emit_math2( struct brw_vs_compile *c,
429 GLuint function,
430 struct brw_reg dst,
431 struct brw_reg arg0,
432 struct brw_reg arg1,
433 GLuint precision)
434 {
435 struct brw_compile *p = &c->func;
436 struct brw_reg tmp = dst;
437 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
438 dst.file != BRW_GENERAL_REGISTER_FILE);
439
440 if (need_tmp)
441 tmp = get_tmp(c);
442
443 brw_MOV(p, brw_message_reg(3), arg1);
444
445 brw_math(p,
446 tmp,
447 function,
448 BRW_MATH_SATURATE_NONE,
449 2,
450 arg0,
451 BRW_MATH_DATA_SCALAR,
452 precision);
453
454 if (need_tmp) {
455 brw_MOV(p, dst, tmp);
456 release_tmp(c, tmp);
457 }
458 }
459
460
461 static void emit_exp_noalias( struct brw_vs_compile *c,
462 struct brw_reg dst,
463 struct brw_reg arg0 )
464 {
465 struct brw_compile *p = &c->func;
466
467
468 if (dst.dw1.bits.writemask & WRITEMASK_X) {
469 struct brw_reg tmp = get_tmp(c);
470 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
471
472 /* tmp_d = floor(arg0.x) */
473 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
474
475 /* result[0] = 2.0 ^ tmp */
476
477 /* Adjust exponent for floating point:
478 * exp += 127
479 */
480 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
481
482 /* Install exponent and sign.
483 * Excess drops off the edge:
484 */
485 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
486 tmp_d, brw_imm_d(23));
487
488 release_tmp(c, tmp);
489 }
490
491 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
492 /* result[1] = arg0.x - floor(arg0.x) */
493 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
494 }
495
496 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
497 /* As with the LOG instruction, we might be better off just
498 * doing a taylor expansion here, seeing as we have to do all
499 * the prep work.
500 *
501 * If mathbox partial precision is too low, consider also:
502 * result[3] = result[0] * EXP(result[1])
503 */
504 emit_math1(c,
505 BRW_MATH_FUNCTION_EXP,
506 brw_writemask(dst, WRITEMASK_Z),
507 brw_swizzle1(arg0, 0),
508 BRW_MATH_PRECISION_FULL);
509 }
510
511 if (dst.dw1.bits.writemask & WRITEMASK_W) {
512 /* result[3] = 1.0; */
513 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
514 }
515 }
516
517
518 static void emit_log_noalias( struct brw_vs_compile *c,
519 struct brw_reg dst,
520 struct brw_reg arg0 )
521 {
522 struct brw_compile *p = &c->func;
523 struct brw_reg tmp = dst;
524 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
525 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
526 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
527 dst.file != BRW_GENERAL_REGISTER_FILE);
528
529 if (need_tmp) {
530 tmp = get_tmp(c);
531 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
532 }
533
534 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
535 * according to spec:
536 *
537 * These almost look likey they could be joined up, but not really
538 * practical:
539 *
540 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
541 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
542 */
543 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
544 brw_AND(p,
545 brw_writemask(tmp_ud, WRITEMASK_X),
546 brw_swizzle1(arg0_ud, 0),
547 brw_imm_ud((1U<<31)-1));
548
549 brw_SHR(p,
550 brw_writemask(tmp_ud, WRITEMASK_X),
551 tmp_ud,
552 brw_imm_ud(23));
553
554 brw_ADD(p,
555 brw_writemask(tmp, WRITEMASK_X),
556 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
557 brw_imm_d(-127));
558 }
559
560 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
561 brw_AND(p,
562 brw_writemask(tmp_ud, WRITEMASK_Y),
563 brw_swizzle1(arg0_ud, 0),
564 brw_imm_ud((1<<23)-1));
565
566 brw_OR(p,
567 brw_writemask(tmp_ud, WRITEMASK_Y),
568 tmp_ud,
569 brw_imm_ud(127<<23));
570 }
571
572 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
573 /* result[2] = result[0] + LOG2(result[1]); */
574
575 /* Why bother? The above is just a hint how to do this with a
576 * taylor series. Maybe we *should* use a taylor series as by
577 * the time all the above has been done it's almost certainly
578 * quicker than calling the mathbox, even with low precision.
579 *
580 * Options are:
581 * - result[0] + mathbox.LOG2(result[1])
582 * - mathbox.LOG2(arg0.x)
583 * - result[0] + inline_taylor_approx(result[1])
584 */
585 emit_math1(c,
586 BRW_MATH_FUNCTION_LOG,
587 brw_writemask(tmp, WRITEMASK_Z),
588 brw_swizzle1(tmp, 1),
589 BRW_MATH_PRECISION_FULL);
590
591 brw_ADD(p,
592 brw_writemask(tmp, WRITEMASK_Z),
593 brw_swizzle1(tmp, 2),
594 brw_swizzle1(tmp, 0));
595 }
596
597 if (dst.dw1.bits.writemask & WRITEMASK_W) {
598 /* result[3] = 1.0; */
599 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
600 }
601
602 if (need_tmp) {
603 brw_MOV(p, dst, tmp);
604 release_tmp(c, tmp);
605 }
606 }
607
608
609 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
610 */
611 static void emit_dst_noalias( struct brw_vs_compile *c,
612 struct brw_reg dst,
613 struct brw_reg arg0,
614 struct brw_reg arg1)
615 {
616 struct brw_compile *p = &c->func;
617
618 /* There must be a better way to do this:
619 */
620 if (dst.dw1.bits.writemask & WRITEMASK_X)
621 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
622 if (dst.dw1.bits.writemask & WRITEMASK_Y)
623 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
624 if (dst.dw1.bits.writemask & WRITEMASK_Z)
625 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
626 if (dst.dw1.bits.writemask & WRITEMASK_W)
627 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
628 }
629
630
631 static void emit_xpd( struct brw_compile *p,
632 struct brw_reg dst,
633 struct brw_reg t,
634 struct brw_reg u)
635 {
636 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
637 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
638 }
639
640
641 static void emit_lit_noalias( struct brw_vs_compile *c,
642 struct brw_reg dst,
643 struct brw_reg arg0 )
644 {
645 struct brw_compile *p = &c->func;
646 struct brw_instruction *if_insn;
647 struct brw_reg tmp = dst;
648 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
649
650 if (need_tmp)
651 tmp = get_tmp(c);
652
653 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
654 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
655
656 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
657 * to get all channels active inside the IF. In the clipping code
658 * we run with NoMask, so it's not an option and we can use
659 * BRW_EXECUTE_1 for all comparisions.
660 */
661 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
662 if_insn = brw_IF(p, BRW_EXECUTE_8);
663 {
664 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
665
666 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
667 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
668 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
669
670 emit_math2(c,
671 BRW_MATH_FUNCTION_POW,
672 brw_writemask(dst, WRITEMASK_Z),
673 brw_swizzle1(tmp, 2),
674 brw_swizzle1(arg0, 3),
675 BRW_MATH_PRECISION_PARTIAL);
676 }
677
678 brw_ENDIF(p, if_insn);
679
680 release_tmp(c, tmp);
681 }
682
683 static void emit_lrp_noalias(struct brw_vs_compile *c,
684 struct brw_reg dst,
685 struct brw_reg arg0,
686 struct brw_reg arg1,
687 struct brw_reg arg2)
688 {
689 struct brw_compile *p = &c->func;
690
691 brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
692 brw_MUL(p, brw_null_reg(), dst, arg2);
693 brw_MAC(p, dst, arg0, arg1);
694 }
695
696 /** 3 or 4-component vector normalization */
697 static void emit_nrm( struct brw_vs_compile *c,
698 struct brw_reg dst,
699 struct brw_reg arg0,
700 int num_comps)
701 {
702 struct brw_compile *p = &c->func;
703 struct brw_reg tmp = get_tmp(c);
704
705 /* tmp = dot(arg0, arg0) */
706 if (num_comps == 3)
707 brw_DP3(p, tmp, arg0, arg0);
708 else
709 brw_DP4(p, tmp, arg0, arg0);
710
711 /* tmp = 1 / sqrt(tmp) */
712 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
713
714 /* dst = arg0 * tmp */
715 brw_MUL(p, dst, arg0, tmp);
716
717 release_tmp(c, tmp);
718 }
719
720
721 static struct brw_reg
722 get_constant(struct brw_vs_compile *c,
723 const struct prog_instruction *inst,
724 GLuint argIndex)
725 {
726 const struct prog_src_register *src = &inst->SrcReg[argIndex];
727 struct brw_compile *p = &c->func;
728 struct brw_reg const_reg;
729 struct brw_reg const2_reg;
730
731 assert(argIndex < 3);
732
733 if (c->current_const[argIndex].index != src->Index || src->RelAddr) {
734 struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
735
736 c->current_const[argIndex].index = src->Index;
737
738 #if 0
739 printf(" fetch const[%d] for arg %d into reg %d\n",
740 src->Index, argIndex, c->current_const[argIndex].reg.nr);
741 #endif
742 /* need to fetch the constant now */
743 brw_dp_READ_4_vs(p,
744 c->current_const[argIndex].reg,/* writeback dest */
745 0, /* oword */
746 src->RelAddr, /* relative indexing? */
747 addrReg, /* address register */
748 16 * src->Index, /* byte offset */
749 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
750 );
751
752 if (src->RelAddr) {
753 /* second read */
754 const2_reg = get_tmp(c);
755
756 /* use upper half of address reg for second read */
757 addrReg = stride(addrReg, 0, 4, 0);
758 addrReg.subnr = 16;
759
760 brw_dp_READ_4_vs(p,
761 const2_reg, /* writeback dest */
762 1, /* oword */
763 src->RelAddr, /* relative indexing? */
764 addrReg, /* address register */
765 16 * src->Index, /* byte offset */
766 SURF_INDEX_VERT_CONST_BUFFER
767 );
768 }
769 }
770
771 const_reg = c->current_const[argIndex].reg;
772
773 if (src->RelAddr) {
774 /* merge the two Owords into the constant register */
775 /* const_reg[7..4] = const2_reg[7..4] */
776 brw_MOV(p,
777 suboffset(stride(const_reg, 0, 4, 1), 4),
778 suboffset(stride(const2_reg, 0, 4, 1), 4));
779 release_tmp(c, const2_reg);
780 }
781 else {
782 /* replicate lower four floats into upper half (to get XYZWXYZW) */
783 const_reg = stride(const_reg, 0, 4, 0);
784 const_reg.subnr = 0;
785 }
786
787 return const_reg;
788 }
789
790
791
792 /* TODO: relative addressing!
793 */
794 static struct brw_reg get_reg( struct brw_vs_compile *c,
795 gl_register_file file,
796 GLuint index )
797 {
798 switch (file) {
799 case PROGRAM_TEMPORARY:
800 case PROGRAM_INPUT:
801 case PROGRAM_OUTPUT:
802 assert(c->regs[file][index].nr != 0);
803 return c->regs[file][index];
804 case PROGRAM_STATE_VAR:
805 case PROGRAM_CONSTANT:
806 case PROGRAM_UNIFORM:
807 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
808 return c->regs[PROGRAM_STATE_VAR][index];
809 case PROGRAM_ADDRESS:
810 assert(index == 0);
811 return c->regs[file][index];
812
813 case PROGRAM_UNDEFINED: /* undef values */
814 return brw_null_reg();
815
816 case PROGRAM_LOCAL_PARAM:
817 case PROGRAM_ENV_PARAM:
818 case PROGRAM_WRITE_ONLY:
819 default:
820 assert(0);
821 return brw_null_reg();
822 }
823 }
824
825
826 /**
827 * Indirect addressing: get reg[[arg] + offset].
828 */
829 static struct brw_reg deref( struct brw_vs_compile *c,
830 struct brw_reg arg,
831 GLint offset)
832 {
833 struct brw_compile *p = &c->func;
834 struct brw_reg tmp = vec4(get_tmp(c));
835 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
836 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
837 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
838 struct brw_reg indirect = brw_vec4_indirect(0,0);
839
840 {
841 brw_push_insn_state(p);
842 brw_set_access_mode(p, BRW_ALIGN_1);
843
844 /* This is pretty clunky - load the address register twice and
845 * fetch each 4-dword value in turn. There must be a way to do
846 * this in a single pass, but I couldn't get it to work.
847 */
848 brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
849 brw_MOV(p, tmp, indirect);
850
851 brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
852 brw_MOV(p, suboffset(tmp, 4), indirect);
853
854 brw_pop_insn_state(p);
855 }
856
857 /* NOTE: tmp not released */
858 return vec8(tmp);
859 }
860
861
862 /**
863 * Get brw reg corresponding to the instruction's [argIndex] src reg.
864 * TODO: relative addressing!
865 */
866 static struct brw_reg
867 get_src_reg( struct brw_vs_compile *c,
868 const struct prog_instruction *inst,
869 GLuint argIndex )
870 {
871 const GLuint file = inst->SrcReg[argIndex].File;
872 const GLint index = inst->SrcReg[argIndex].Index;
873 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
874
875 switch (file) {
876 case PROGRAM_TEMPORARY:
877 case PROGRAM_INPUT:
878 case PROGRAM_OUTPUT:
879 if (relAddr) {
880 return deref(c, c->regs[file][0], index);
881 }
882 else {
883 assert(c->regs[file][index].nr != 0);
884 return c->regs[file][index];
885 }
886
887 case PROGRAM_STATE_VAR:
888 case PROGRAM_CONSTANT:
889 case PROGRAM_UNIFORM:
890 if (c->vp->use_const_buffer) {
891 return get_constant(c, inst, argIndex);
892 }
893 else if (relAddr) {
894 return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
895 }
896 else {
897 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
898 return c->regs[PROGRAM_STATE_VAR][index];
899 }
900 case PROGRAM_ADDRESS:
901 assert(index == 0);
902 return c->regs[file][index];
903
904 case PROGRAM_UNDEFINED:
905 /* this is a normal case since we loop over all three src args */
906 return brw_null_reg();
907
908 case PROGRAM_LOCAL_PARAM:
909 case PROGRAM_ENV_PARAM:
910 case PROGRAM_WRITE_ONLY:
911 default:
912 assert(0);
913 return brw_null_reg();
914 }
915 }
916
917
918 static void emit_arl( struct brw_vs_compile *c,
919 struct brw_reg dst,
920 struct brw_reg arg0 )
921 {
922 struct brw_compile *p = &c->func;
923 struct brw_reg tmp = dst;
924 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
925
926 if (need_tmp)
927 tmp = get_tmp(c);
928
929 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
930 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
931
932 if (need_tmp)
933 release_tmp(c, tmp);
934 }
935
936
937 /**
938 * Return the brw reg for the given instruction's src argument.
939 * Will return mangled results for SWZ op. The emit_swz() function
940 * ignores this result and recalculates taking extended swizzles into
941 * account.
942 */
943 static struct brw_reg get_arg( struct brw_vs_compile *c,
944 const struct prog_instruction *inst,
945 GLuint argIndex )
946 {
947 const struct prog_src_register *src = &inst->SrcReg[argIndex];
948 struct brw_reg reg;
949
950 if (src->File == PROGRAM_UNDEFINED)
951 return brw_null_reg();
952
953 reg = get_src_reg(c, inst, argIndex);
954
955 /* Convert 3-bit swizzle to 2-bit.
956 */
957 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
958 GET_SWZ(src->Swizzle, 1),
959 GET_SWZ(src->Swizzle, 2),
960 GET_SWZ(src->Swizzle, 3));
961
962 /* Note this is ok for non-swizzle instructions:
963 */
964 reg.negate = src->Negate ? 1 : 0;
965
966 return reg;
967 }
968
969
970 /**
971 * Get brw register for the given program dest register.
972 */
973 static struct brw_reg get_dst( struct brw_vs_compile *c,
974 struct prog_dst_register dst )
975 {
976 struct brw_reg reg;
977
978 switch (dst.File) {
979 case PROGRAM_TEMPORARY:
980 case PROGRAM_OUTPUT:
981 assert(c->regs[dst.File][dst.Index].nr != 0);
982 reg = c->regs[dst.File][dst.Index];
983 break;
984 case PROGRAM_ADDRESS:
985 assert(dst.Index == 0);
986 reg = c->regs[dst.File][dst.Index];
987 break;
988 case PROGRAM_UNDEFINED:
989 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
990 reg = brw_null_reg();
991 break;
992 default:
993 assert(0);
994 reg = brw_null_reg();
995 }
996
997 reg.dw1.bits.writemask = dst.WriteMask;
998
999 return reg;
1000 }
1001
1002
1003 static void emit_swz( struct brw_vs_compile *c,
1004 struct brw_reg dst,
1005 const struct prog_instruction *inst)
1006 {
1007 const GLuint argIndex = 0;
1008 const struct prog_src_register src = inst->SrcReg[argIndex];
1009 struct brw_compile *p = &c->func;
1010 GLuint zeros_mask = 0;
1011 GLuint ones_mask = 0;
1012 GLuint src_mask = 0;
1013 GLubyte src_swz[4];
1014 GLboolean need_tmp = (src.Negate &&
1015 dst.file != BRW_GENERAL_REGISTER_FILE);
1016 struct brw_reg tmp = dst;
1017 GLuint i;
1018
1019 if (need_tmp)
1020 tmp = get_tmp(c);
1021
1022 for (i = 0; i < 4; i++) {
1023 if (dst.dw1.bits.writemask & (1<<i)) {
1024 GLubyte s = GET_SWZ(src.Swizzle, i);
1025 switch (s) {
1026 case SWIZZLE_X:
1027 case SWIZZLE_Y:
1028 case SWIZZLE_Z:
1029 case SWIZZLE_W:
1030 src_mask |= 1<<i;
1031 src_swz[i] = s;
1032 break;
1033 case SWIZZLE_ZERO:
1034 zeros_mask |= 1<<i;
1035 break;
1036 case SWIZZLE_ONE:
1037 ones_mask |= 1<<i;
1038 break;
1039 }
1040 }
1041 }
1042
1043 /* Do src first, in case dst aliases src:
1044 */
1045 if (src_mask) {
1046 struct brw_reg arg0;
1047
1048 arg0 = get_src_reg(c, inst, argIndex);
1049
1050 arg0 = brw_swizzle(arg0,
1051 src_swz[0], src_swz[1],
1052 src_swz[2], src_swz[3]);
1053
1054 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1055 }
1056
1057 if (zeros_mask)
1058 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1059
1060 if (ones_mask)
1061 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1062
1063 if (src.Negate)
1064 brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1065
1066 if (need_tmp) {
1067 brw_MOV(p, dst, tmp);
1068 release_tmp(c, tmp);
1069 }
1070 }
1071
1072
1073 /**
1074 * Post-vertex-program processing. Send the results to the URB.
1075 */
1076 static void emit_vertex_write( struct brw_vs_compile *c)
1077 {
1078 struct brw_compile *p = &c->func;
1079 struct brw_reg m0 = brw_message_reg(0);
1080 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1081 struct brw_reg ndc;
1082
1083 if (c->key.copy_edgeflag) {
1084 brw_MOV(p,
1085 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1086 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1087 }
1088
1089 /* Build ndc coords */
1090 ndc = get_tmp(c);
1091 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1092 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1093
1094 /* Update the header for point size, user clipping flags, and -ve rhw
1095 * workaround.
1096 */
1097 if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
1098 c->key.nr_userclip || !BRW_IS_G4X(p->brw))
1099 {
1100 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1101 GLuint i;
1102
1103 brw_MOV(p, header1, brw_imm_ud(0));
1104
1105 brw_set_access_mode(p, BRW_ALIGN_16);
1106
1107 if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
1108 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1109 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1110 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1111 }
1112
1113 for (i = 0; i < c->key.nr_userclip; i++) {
1114 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1115 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1116 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1117 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1118 }
1119
1120 /* i965 clipping workaround:
1121 * 1) Test for -ve rhw
1122 * 2) If set,
1123 * set ndc = (0,0,0,0)
1124 * set ucp[6] = 1
1125 *
1126 * Later, clipping will detect ucp[6] and ensure the primitive is
1127 * clipped against all fixed planes.
1128 */
1129 if (!BRW_IS_G4X(p->brw)) {
1130 brw_CMP(p,
1131 vec8(brw_null_reg()),
1132 BRW_CONDITIONAL_L,
1133 brw_swizzle1(ndc, 3),
1134 brw_imm_f(0));
1135
1136 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1137 brw_MOV(p, ndc, brw_imm_f(0));
1138 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1139 }
1140
1141 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1142 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1143 brw_set_access_mode(p, BRW_ALIGN_16);
1144
1145 release_tmp(c, header1);
1146 }
1147 else {
1148 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1149 }
1150
1151 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1152 * of zeros followed by two sets of NDC coordinates:
1153 */
1154 brw_set_access_mode(p, BRW_ALIGN_1);
1155 brw_MOV(p, offset(m0, 2), ndc);
1156 brw_MOV(p, offset(m0, 3), pos);
1157
1158 brw_urb_WRITE(p,
1159 brw_null_reg(), /* dest */
1160 0, /* starting mrf reg nr */
1161 c->r0, /* src */
1162 0, /* allocate */
1163 1, /* used */
1164 c->nr_outputs + 3, /* msg len */
1165 0, /* response len */
1166 1, /* eot */
1167 1, /* writes complete */
1168 0, /* urb destination offset */
1169 BRW_URB_SWIZZLE_INTERLEAVE);
1170 }
1171
1172
1173 /**
1174 * Called after code generation to resolve subroutine calls and the
1175 * END instruction.
1176 * \param end_inst points to brw code for END instruction
1177 * \param last_inst points to last instruction emitted before vertex write
1178 */
1179 static void
1180 post_vs_emit( struct brw_vs_compile *c,
1181 struct brw_instruction *end_inst,
1182 struct brw_instruction *last_inst )
1183 {
1184 GLint offset;
1185
1186 brw_resolve_cals(&c->func);
1187
1188 /* patch up the END code to jump past subroutines, etc */
1189 offset = last_inst - end_inst;
1190 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1191 }
1192
1193
1194 /* Emit the vertex program instructions here.
1195 */
1196 void brw_vs_emit(struct brw_vs_compile *c )
1197 {
1198 #define MAX_IFSN 32
1199 struct brw_compile *p = &c->func;
1200 GLuint nr_insns = c->vp->program.Base.NumInstructions;
1201 GLuint insn, if_insn = 0;
1202 GLuint end_offset = 0;
1203 struct brw_instruction *end_inst, *last_inst;
1204 struct brw_instruction *if_inst[MAX_IFSN];
1205 struct brw_indirect stack_index = brw_indirect(0, 0);
1206
1207 GLuint index;
1208 GLuint file;
1209
1210 if (INTEL_DEBUG & DEBUG_VS) {
1211 _mesa_printf("vs-emit:\n");
1212 _mesa_print_program(&c->vp->program.Base);
1213 _mesa_printf("\n");
1214 }
1215
1216 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1217 brw_set_access_mode(p, BRW_ALIGN_16);
1218
1219 /* Message registers can't be read, so copy the output into GRF register
1220 if they are used in source registers */
1221 for (insn = 0; insn < nr_insns; insn++) {
1222 GLuint i;
1223 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1224 for (i = 0; i < 3; i++) {
1225 struct prog_src_register *src = &inst->SrcReg[i];
1226 GLuint index = src->Index;
1227 GLuint file = src->File;
1228 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1229 c->output_regs[index].used_in_src = GL_TRUE;
1230 }
1231 }
1232
1233 /* Static register allocation
1234 */
1235 brw_vs_alloc_regs(c);
1236 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1237
1238 for (insn = 0; insn < nr_insns; insn++) {
1239
1240 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1241 struct brw_reg args[3], dst;
1242 GLuint i;
1243
1244 #if 0
1245 printf("%d: ", insn);
1246 _mesa_print_instruction(inst);
1247 #endif
1248
1249 /* Get argument regs. SWZ is special and does this itself.
1250 */
1251 if (inst->Opcode != OPCODE_SWZ)
1252 for (i = 0; i < 3; i++) {
1253 struct prog_src_register *src = &inst->SrcReg[i];
1254 index = src->Index;
1255 file = src->File;
1256 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1257 args[i] = c->output_regs[index].reg;
1258 else
1259 args[i] = get_arg(c, inst, i);
1260 }
1261
1262 /* Get dest regs. Note that it is possible for a reg to be both
1263 * dst and arg, given the static allocation of registers. So
1264 * care needs to be taken emitting multi-operation instructions.
1265 */
1266 index = inst->DstReg.Index;
1267 file = inst->DstReg.File;
1268 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1269 dst = c->output_regs[index].reg;
1270 else
1271 dst = get_dst(c, inst->DstReg);
1272
1273 if (inst->SaturateMode != SATURATE_OFF) {
1274 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1275 inst->SaturateMode);
1276 }
1277
1278 switch (inst->Opcode) {
1279 case OPCODE_ABS:
1280 brw_MOV(p, dst, brw_abs(args[0]));
1281 break;
1282 case OPCODE_ADD:
1283 brw_ADD(p, dst, args[0], args[1]);
1284 break;
1285 case OPCODE_COS:
1286 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1287 break;
1288 case OPCODE_DP3:
1289 brw_DP3(p, dst, args[0], args[1]);
1290 break;
1291 case OPCODE_DP4:
1292 brw_DP4(p, dst, args[0], args[1]);
1293 break;
1294 case OPCODE_DPH:
1295 brw_DPH(p, dst, args[0], args[1]);
1296 break;
1297 case OPCODE_NRM3:
1298 emit_nrm(c, dst, args[0], 3);
1299 break;
1300 case OPCODE_NRM4:
1301 emit_nrm(c, dst, args[0], 4);
1302 break;
1303 case OPCODE_DST:
1304 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1305 break;
1306 case OPCODE_EXP:
1307 unalias1(c, dst, args[0], emit_exp_noalias);
1308 break;
1309 case OPCODE_EX2:
1310 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1311 break;
1312 case OPCODE_ARL:
1313 emit_arl(c, dst, args[0]);
1314 break;
1315 case OPCODE_FLR:
1316 brw_RNDD(p, dst, args[0]);
1317 break;
1318 case OPCODE_FRC:
1319 brw_FRC(p, dst, args[0]);
1320 break;
1321 case OPCODE_LOG:
1322 unalias1(c, dst, args[0], emit_log_noalias);
1323 break;
1324 case OPCODE_LG2:
1325 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1326 break;
1327 case OPCODE_LIT:
1328 unalias1(c, dst, args[0], emit_lit_noalias);
1329 break;
1330 case OPCODE_LRP:
1331 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1332 break;
1333 case OPCODE_MAD:
1334 brw_MOV(p, brw_acc_reg(), args[2]);
1335 brw_MAC(p, dst, args[0], args[1]);
1336 break;
1337 case OPCODE_MAX:
1338 emit_max(p, dst, args[0], args[1]);
1339 break;
1340 case OPCODE_MIN:
1341 emit_min(p, dst, args[0], args[1]);
1342 break;
1343 case OPCODE_MOV:
1344 brw_MOV(p, dst, args[0]);
1345 break;
1346 case OPCODE_MUL:
1347 brw_MUL(p, dst, args[0], args[1]);
1348 break;
1349 case OPCODE_POW:
1350 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1351 break;
1352 case OPCODE_RCP:
1353 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1354 break;
1355 case OPCODE_RSQ:
1356 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1357 break;
1358
1359 case OPCODE_SEQ:
1360 emit_seq(p, dst, args[0], args[1]);
1361 break;
1362 case OPCODE_SIN:
1363 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1364 break;
1365 case OPCODE_SNE:
1366 emit_sne(p, dst, args[0], args[1]);
1367 break;
1368 case OPCODE_SGE:
1369 emit_sge(p, dst, args[0], args[1]);
1370 break;
1371 case OPCODE_SGT:
1372 emit_sgt(p, dst, args[0], args[1]);
1373 break;
1374 case OPCODE_SLT:
1375 emit_slt(p, dst, args[0], args[1]);
1376 break;
1377 case OPCODE_SLE:
1378 emit_sle(p, dst, args[0], args[1]);
1379 break;
1380 case OPCODE_SUB:
1381 brw_ADD(p, dst, args[0], negate(args[1]));
1382 break;
1383 case OPCODE_SWZ:
1384 /* The args[0] value can't be used here as it won't have
1385 * correctly encoded the full swizzle:
1386 */
1387 emit_swz(c, dst, inst);
1388 break;
1389 case OPCODE_TRUNC:
1390 /* round toward zero */
1391 brw_RNDZ(p, dst, args[0]);
1392 break;
1393 case OPCODE_XPD:
1394 emit_xpd(p, dst, args[0], args[1]);
1395 break;
1396 case OPCODE_IF:
1397 assert(if_insn < MAX_IFSN);
1398 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
1399 break;
1400 case OPCODE_ELSE:
1401 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
1402 break;
1403 case OPCODE_ENDIF:
1404 assert(if_insn > 0);
1405 brw_ENDIF(p, if_inst[--if_insn]);
1406 break;
1407 case OPCODE_BRA:
1408 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1409 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1410 brw_set_predicate_control_flag_value(p, 0xff);
1411 break;
1412 case OPCODE_CAL:
1413 brw_set_access_mode(p, BRW_ALIGN_1);
1414 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1415 brw_set_access_mode(p, BRW_ALIGN_16);
1416 brw_ADD(p, get_addr_reg(stack_index),
1417 get_addr_reg(stack_index), brw_imm_d(4));
1418 brw_save_call(p, inst->Comment, p->nr_insn);
1419 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1420 break;
1421 case OPCODE_RET:
1422 brw_ADD(p, get_addr_reg(stack_index),
1423 get_addr_reg(stack_index), brw_imm_d(-4));
1424 brw_set_access_mode(p, BRW_ALIGN_1);
1425 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1426 brw_set_access_mode(p, BRW_ALIGN_16);
1427 break;
1428 case OPCODE_END:
1429 end_offset = p->nr_insn;
1430 /* this instruction will get patched later to jump past subroutine
1431 * code, etc.
1432 */
1433 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1434 break;
1435 case OPCODE_PRINT:
1436 /* no-op */
1437 break;
1438 case OPCODE_BGNSUB:
1439 brw_save_label(p, inst->Comment, p->nr_insn);
1440 break;
1441 case OPCODE_ENDSUB:
1442 /* no-op */
1443 break;
1444 default:
1445 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1446 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1447 _mesa_opcode_string(inst->Opcode) :
1448 "unknown");
1449 }
1450
1451 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1452 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1453 && c->output_regs[inst->DstReg.Index].used_in_src) {
1454 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1455 }
1456
1457 /* Result color clamping.
1458 *
1459 * When destination register is an output register and
1460 * it's primary/secondary front/back color, we have to clamp
1461 * the result to [0,1]. This is done by enabling the
1462 * saturation bit for the last instruction.
1463 *
1464 * We don't use brw_set_saturate() as it modifies
1465 * p->current->header.saturate, which affects all the subsequent
1466 * instructions. Instead, we directly modify the header
1467 * of the last (already stored) instruction.
1468 */
1469 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1470 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1471 || (inst->DstReg.Index == VERT_RESULT_COL1)
1472 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1473 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1474 p->store[p->nr_insn-1].header.saturate = 1;
1475 }
1476 }
1477
1478 release_tmps(c);
1479 }
1480
1481 end_inst = &p->store[end_offset];
1482 last_inst = &p->store[p->nr_insn];
1483
1484 /* The END instruction will be patched to jump to this code */
1485 emit_vertex_write(c);
1486
1487 post_vs_emit(c, end_inst, last_inst);
1488 }