514f15d5e3ac1c083ae531de63cfb741c5484a85
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
42 {
43 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
44
45 if (++c->last_tmp > c->prog_data.total_grf)
46 c->prog_data.total_grf = c->last_tmp;
47
48 return tmp;
49 }
50
51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
52 {
53 if (tmp.nr == c->last_tmp-1)
54 c->last_tmp--;
55 }
56
57 static void release_tmps( struct brw_vs_compile *c )
58 {
59 c->last_tmp = c->first_tmp;
60 }
61
62
63 /**
64 * Preallocate GRF register before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
66 * ahead of time.
67 */
68 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
69 {
70 GLuint i, reg = 0, mrf;
71
72 /* Determine whether to use a real constant buffer or use a block
73 * of GRF registers for constants. The later is faster but only
74 * works if everything fits in the GRF.
75 * XXX this heuristic/check may need some fine tuning...
76 */
77 if (c->vp->program.Base.Parameters->NumParameters +
78 c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
79 c->vp->use_const_buffer = GL_TRUE;
80 else
81 c->vp->use_const_buffer = GL_FALSE;
82
83 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
84
85 /* r0 -- reserved as usual
86 */
87 c->r0 = brw_vec8_grf(reg, 0);
88 reg++;
89
90 /* User clip planes from curbe:
91 */
92 if (c->key.nr_userclip) {
93 for (i = 0; i < c->key.nr_userclip; i++) {
94 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
95 }
96
97 /* Deal with curbe alignment:
98 */
99 reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
100 }
101
102 /* Vertex program parameters from curbe:
103 */
104 if (c->vp->use_const_buffer) {
105 /* get constants from a real constant buffer */
106 c->prog_data.curb_read_length = 0;
107 c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
108 }
109 else {
110 /* use a section of the GRF for constants */
111 GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
112 for (i = 0; i < nr_params; i++) {
113 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
114 }
115 reg += (nr_params + 1) / 2;
116 c->prog_data.curb_read_length = reg - 1;
117
118 c->prog_data.nr_params = nr_params * 4;
119 }
120
121 /* Allocate input regs:
122 */
123 c->nr_inputs = 0;
124 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
125 if (c->prog_data.inputs_read & (1 << i)) {
126 c->nr_inputs++;
127 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
128 reg++;
129 }
130 }
131
132 /* Allocate outputs. The non-position outputs go straight into message regs.
133 */
134 c->nr_outputs = 0;
135 c->first_output = reg;
136 c->first_overflow_output = 0;
137
138 if (BRW_IS_IGDNG(c->func.brw))
139 mrf = 8;
140 else
141 mrf = 4;
142
143 for (i = 0; i < VERT_RESULT_MAX; i++) {
144 if (c->prog_data.outputs_written & (1 << i)) {
145 c->nr_outputs++;
146 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
147 if (i == VERT_RESULT_HPOS) {
148 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
149 reg++;
150 }
151 else if (i == VERT_RESULT_PSIZ) {
152 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
153 reg++;
154 mrf++; /* just a placeholder? XXX fix later stages & remove this */
155 }
156 else {
157 if (mrf < 16) {
158 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
159 mrf++;
160 }
161 else {
162 /* too many vertex results to fit in MRF, use GRF for overflow */
163 if (!c->first_overflow_output)
164 c->first_overflow_output = i;
165 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
166 reg++;
167 }
168 }
169 }
170 }
171
172 /* Allocate program temporaries:
173 */
174 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
175 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
176 reg++;
177 }
178
179 /* Address reg(s). Don't try to use the internal address reg until
180 * deref time.
181 */
182 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
183 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
184 reg,
185 0,
186 BRW_REGISTER_TYPE_D,
187 BRW_VERTICAL_STRIDE_8,
188 BRW_WIDTH_8,
189 BRW_HORIZONTAL_STRIDE_1,
190 BRW_SWIZZLE_XXXX,
191 WRITEMASK_X);
192 reg++;
193 }
194
195 if (c->vp->use_const_buffer) {
196 for (i = 0; i < 3; i++) {
197 c->current_const[i].index = -1;
198 c->current_const[i].reg = brw_vec8_grf(reg, 0);
199 reg++;
200 }
201 }
202
203 for (i = 0; i < 128; i++) {
204 if (c->output_regs[i].used_in_src) {
205 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
206 reg++;
207 }
208 }
209
210 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
211 reg += 2;
212
213 /* Some opcodes need an internal temporary:
214 */
215 c->first_tmp = reg;
216 c->last_tmp = reg; /* for allocation purposes */
217
218 /* Each input reg holds data from two vertices. The
219 * urb_read_length is the number of registers read from *each*
220 * vertex urb, so is half the amount:
221 */
222 c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
223
224 if (BRW_IS_IGDNG(c->func.brw))
225 c->prog_data.urb_entry_size = (c->nr_outputs + 6 + 3) / 4;
226 else
227 c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
228
229 c->prog_data.total_grf = reg;
230
231 if (INTEL_DEBUG & DEBUG_VS) {
232 _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
233 _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
234 _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
235 }
236 }
237
238
239 /**
240 * If an instruction uses a temp reg both as a src and the dest, we
241 * sometimes need to allocate an intermediate temporary.
242 */
243 static void unalias1( struct brw_vs_compile *c,
244 struct brw_reg dst,
245 struct brw_reg arg0,
246 void (*func)( struct brw_vs_compile *,
247 struct brw_reg,
248 struct brw_reg ))
249 {
250 if (dst.file == arg0.file && dst.nr == arg0.nr) {
251 struct brw_compile *p = &c->func;
252 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
253 func(c, tmp, arg0);
254 brw_MOV(p, dst, tmp);
255 release_tmp(c, tmp);
256 }
257 else {
258 func(c, dst, arg0);
259 }
260 }
261
262 /**
263 * \sa unalias2
264 * Checkes if 2-operand instruction needs an intermediate temporary.
265 */
266 static void unalias2( struct brw_vs_compile *c,
267 struct brw_reg dst,
268 struct brw_reg arg0,
269 struct brw_reg arg1,
270 void (*func)( struct brw_vs_compile *,
271 struct brw_reg,
272 struct brw_reg,
273 struct brw_reg ))
274 {
275 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
276 (dst.file == arg1.file && dst.nr == arg1.nr)) {
277 struct brw_compile *p = &c->func;
278 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
279 func(c, tmp, arg0, arg1);
280 brw_MOV(p, dst, tmp);
281 release_tmp(c, tmp);
282 }
283 else {
284 func(c, dst, arg0, arg1);
285 }
286 }
287
288 /**
289 * \sa unalias2
290 * Checkes if 3-operand instruction needs an intermediate temporary.
291 */
292 static void unalias3( struct brw_vs_compile *c,
293 struct brw_reg dst,
294 struct brw_reg arg0,
295 struct brw_reg arg1,
296 struct brw_reg arg2,
297 void (*func)( struct brw_vs_compile *,
298 struct brw_reg,
299 struct brw_reg,
300 struct brw_reg,
301 struct brw_reg ))
302 {
303 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
304 (dst.file == arg1.file && dst.nr == arg1.nr) ||
305 (dst.file == arg2.file && dst.nr == arg2.nr)) {
306 struct brw_compile *p = &c->func;
307 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
308 func(c, tmp, arg0, arg1, arg2);
309 brw_MOV(p, dst, tmp);
310 release_tmp(c, tmp);
311 }
312 else {
313 func(c, dst, arg0, arg1, arg2);
314 }
315 }
316
317 static void emit_sop( struct brw_compile *p,
318 struct brw_reg dst,
319 struct brw_reg arg0,
320 struct brw_reg arg1,
321 GLuint cond)
322 {
323 brw_MOV(p, dst, brw_imm_f(0.0f));
324 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
325 brw_MOV(p, dst, brw_imm_f(1.0f));
326 brw_set_predicate_control_flag_value(p, 0xff);
327 }
328
329 static void emit_seq( struct brw_compile *p,
330 struct brw_reg dst,
331 struct brw_reg arg0,
332 struct brw_reg arg1 )
333 {
334 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
335 }
336
337 static void emit_sne( struct brw_compile *p,
338 struct brw_reg dst,
339 struct brw_reg arg0,
340 struct brw_reg arg1 )
341 {
342 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
343 }
344 static void emit_slt( struct brw_compile *p,
345 struct brw_reg dst,
346 struct brw_reg arg0,
347 struct brw_reg arg1 )
348 {
349 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
350 }
351
352 static void emit_sle( struct brw_compile *p,
353 struct brw_reg dst,
354 struct brw_reg arg0,
355 struct brw_reg arg1 )
356 {
357 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
358 }
359
360 static void emit_sgt( struct brw_compile *p,
361 struct brw_reg dst,
362 struct brw_reg arg0,
363 struct brw_reg arg1 )
364 {
365 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
366 }
367
368 static void emit_sge( struct brw_compile *p,
369 struct brw_reg dst,
370 struct brw_reg arg0,
371 struct brw_reg arg1 )
372 {
373 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
374 }
375
376 static void emit_max( struct brw_compile *p,
377 struct brw_reg dst,
378 struct brw_reg arg0,
379 struct brw_reg arg1 )
380 {
381 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
382 brw_SEL(p, dst, arg1, arg0);
383 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
384 }
385
386 static void emit_min( struct brw_compile *p,
387 struct brw_reg dst,
388 struct brw_reg arg0,
389 struct brw_reg arg1 )
390 {
391 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
392 brw_SEL(p, dst, arg0, arg1);
393 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
394 }
395
396
397 static void emit_math1( struct brw_vs_compile *c,
398 GLuint function,
399 struct brw_reg dst,
400 struct brw_reg arg0,
401 GLuint precision)
402 {
403 /* There are various odd behaviours with SEND on the simulator. In
404 * addition there are documented issues with the fact that the GEN4
405 * processor doesn't do dependency control properly on SEND
406 * results. So, on balance, this kludge to get around failures
407 * with writemasked math results looks like it might be necessary
408 * whether that turns out to be a simulator bug or not:
409 */
410 struct brw_compile *p = &c->func;
411 struct brw_reg tmp = dst;
412 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
413 dst.file != BRW_GENERAL_REGISTER_FILE);
414
415 if (need_tmp)
416 tmp = get_tmp(c);
417
418 brw_math(p,
419 tmp,
420 function,
421 BRW_MATH_SATURATE_NONE,
422 2,
423 arg0,
424 BRW_MATH_DATA_SCALAR,
425 precision);
426
427 if (need_tmp) {
428 brw_MOV(p, dst, tmp);
429 release_tmp(c, tmp);
430 }
431 }
432
433
434 static void emit_math2( struct brw_vs_compile *c,
435 GLuint function,
436 struct brw_reg dst,
437 struct brw_reg arg0,
438 struct brw_reg arg1,
439 GLuint precision)
440 {
441 struct brw_compile *p = &c->func;
442 struct brw_reg tmp = dst;
443 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
444 dst.file != BRW_GENERAL_REGISTER_FILE);
445
446 if (need_tmp)
447 tmp = get_tmp(c);
448
449 brw_MOV(p, brw_message_reg(3), arg1);
450
451 brw_math(p,
452 tmp,
453 function,
454 BRW_MATH_SATURATE_NONE,
455 2,
456 arg0,
457 BRW_MATH_DATA_SCALAR,
458 precision);
459
460 if (need_tmp) {
461 brw_MOV(p, dst, tmp);
462 release_tmp(c, tmp);
463 }
464 }
465
466
467 static void emit_exp_noalias( struct brw_vs_compile *c,
468 struct brw_reg dst,
469 struct brw_reg arg0 )
470 {
471 struct brw_compile *p = &c->func;
472
473
474 if (dst.dw1.bits.writemask & WRITEMASK_X) {
475 struct brw_reg tmp = get_tmp(c);
476 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
477
478 /* tmp_d = floor(arg0.x) */
479 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
480
481 /* result[0] = 2.0 ^ tmp */
482
483 /* Adjust exponent for floating point:
484 * exp += 127
485 */
486 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
487
488 /* Install exponent and sign.
489 * Excess drops off the edge:
490 */
491 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
492 tmp_d, brw_imm_d(23));
493
494 release_tmp(c, tmp);
495 }
496
497 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
498 /* result[1] = arg0.x - floor(arg0.x) */
499 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
500 }
501
502 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
503 /* As with the LOG instruction, we might be better off just
504 * doing a taylor expansion here, seeing as we have to do all
505 * the prep work.
506 *
507 * If mathbox partial precision is too low, consider also:
508 * result[3] = result[0] * EXP(result[1])
509 */
510 emit_math1(c,
511 BRW_MATH_FUNCTION_EXP,
512 brw_writemask(dst, WRITEMASK_Z),
513 brw_swizzle1(arg0, 0),
514 BRW_MATH_PRECISION_FULL);
515 }
516
517 if (dst.dw1.bits.writemask & WRITEMASK_W) {
518 /* result[3] = 1.0; */
519 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
520 }
521 }
522
523
524 static void emit_log_noalias( struct brw_vs_compile *c,
525 struct brw_reg dst,
526 struct brw_reg arg0 )
527 {
528 struct brw_compile *p = &c->func;
529 struct brw_reg tmp = dst;
530 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
531 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
532 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
533 dst.file != BRW_GENERAL_REGISTER_FILE);
534
535 if (need_tmp) {
536 tmp = get_tmp(c);
537 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
538 }
539
540 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
541 * according to spec:
542 *
543 * These almost look likey they could be joined up, but not really
544 * practical:
545 *
546 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
547 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
548 */
549 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
550 brw_AND(p,
551 brw_writemask(tmp_ud, WRITEMASK_X),
552 brw_swizzle1(arg0_ud, 0),
553 brw_imm_ud((1U<<31)-1));
554
555 brw_SHR(p,
556 brw_writemask(tmp_ud, WRITEMASK_X),
557 tmp_ud,
558 brw_imm_ud(23));
559
560 brw_ADD(p,
561 brw_writemask(tmp, WRITEMASK_X),
562 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
563 brw_imm_d(-127));
564 }
565
566 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
567 brw_AND(p,
568 brw_writemask(tmp_ud, WRITEMASK_Y),
569 brw_swizzle1(arg0_ud, 0),
570 brw_imm_ud((1<<23)-1));
571
572 brw_OR(p,
573 brw_writemask(tmp_ud, WRITEMASK_Y),
574 tmp_ud,
575 brw_imm_ud(127<<23));
576 }
577
578 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
579 /* result[2] = result[0] + LOG2(result[1]); */
580
581 /* Why bother? The above is just a hint how to do this with a
582 * taylor series. Maybe we *should* use a taylor series as by
583 * the time all the above has been done it's almost certainly
584 * quicker than calling the mathbox, even with low precision.
585 *
586 * Options are:
587 * - result[0] + mathbox.LOG2(result[1])
588 * - mathbox.LOG2(arg0.x)
589 * - result[0] + inline_taylor_approx(result[1])
590 */
591 emit_math1(c,
592 BRW_MATH_FUNCTION_LOG,
593 brw_writemask(tmp, WRITEMASK_Z),
594 brw_swizzle1(tmp, 1),
595 BRW_MATH_PRECISION_FULL);
596
597 brw_ADD(p,
598 brw_writemask(tmp, WRITEMASK_Z),
599 brw_swizzle1(tmp, 2),
600 brw_swizzle1(tmp, 0));
601 }
602
603 if (dst.dw1.bits.writemask & WRITEMASK_W) {
604 /* result[3] = 1.0; */
605 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
606 }
607
608 if (need_tmp) {
609 brw_MOV(p, dst, tmp);
610 release_tmp(c, tmp);
611 }
612 }
613
614
615 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
616 */
617 static void emit_dst_noalias( struct brw_vs_compile *c,
618 struct brw_reg dst,
619 struct brw_reg arg0,
620 struct brw_reg arg1)
621 {
622 struct brw_compile *p = &c->func;
623
624 /* There must be a better way to do this:
625 */
626 if (dst.dw1.bits.writemask & WRITEMASK_X)
627 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
628 if (dst.dw1.bits.writemask & WRITEMASK_Y)
629 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
630 if (dst.dw1.bits.writemask & WRITEMASK_Z)
631 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
632 if (dst.dw1.bits.writemask & WRITEMASK_W)
633 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
634 }
635
636
637 static void emit_xpd( struct brw_compile *p,
638 struct brw_reg dst,
639 struct brw_reg t,
640 struct brw_reg u)
641 {
642 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
643 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
644 }
645
646
647 static void emit_lit_noalias( struct brw_vs_compile *c,
648 struct brw_reg dst,
649 struct brw_reg arg0 )
650 {
651 struct brw_compile *p = &c->func;
652 struct brw_instruction *if_insn;
653 struct brw_reg tmp = dst;
654 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
655
656 if (need_tmp)
657 tmp = get_tmp(c);
658
659 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
660 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
661
662 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
663 * to get all channels active inside the IF. In the clipping code
664 * we run with NoMask, so it's not an option and we can use
665 * BRW_EXECUTE_1 for all comparisions.
666 */
667 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
668 if_insn = brw_IF(p, BRW_EXECUTE_8);
669 {
670 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
671
672 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
673 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
674 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
675
676 emit_math2(c,
677 BRW_MATH_FUNCTION_POW,
678 brw_writemask(dst, WRITEMASK_Z),
679 brw_swizzle1(tmp, 2),
680 brw_swizzle1(arg0, 3),
681 BRW_MATH_PRECISION_PARTIAL);
682 }
683
684 brw_ENDIF(p, if_insn);
685
686 release_tmp(c, tmp);
687 }
688
689 static void emit_lrp_noalias(struct brw_vs_compile *c,
690 struct brw_reg dst,
691 struct brw_reg arg0,
692 struct brw_reg arg1,
693 struct brw_reg arg2)
694 {
695 struct brw_compile *p = &c->func;
696
697 brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
698 brw_MUL(p, brw_null_reg(), dst, arg2);
699 brw_MAC(p, dst, arg0, arg1);
700 }
701
702 /** 3 or 4-component vector normalization */
703 static void emit_nrm( struct brw_vs_compile *c,
704 struct brw_reg dst,
705 struct brw_reg arg0,
706 int num_comps)
707 {
708 struct brw_compile *p = &c->func;
709 struct brw_reg tmp = get_tmp(c);
710
711 /* tmp = dot(arg0, arg0) */
712 if (num_comps == 3)
713 brw_DP3(p, tmp, arg0, arg0);
714 else
715 brw_DP4(p, tmp, arg0, arg0);
716
717 /* tmp = 1 / sqrt(tmp) */
718 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
719
720 /* dst = arg0 * tmp */
721 brw_MUL(p, dst, arg0, tmp);
722
723 release_tmp(c, tmp);
724 }
725
726
727 static struct brw_reg
728 get_constant(struct brw_vs_compile *c,
729 const struct prog_instruction *inst,
730 GLuint argIndex)
731 {
732 const struct prog_src_register *src = &inst->SrcReg[argIndex];
733 struct brw_compile *p = &c->func;
734 struct brw_reg const_reg;
735 struct brw_reg const2_reg;
736 const GLboolean relAddr = src->RelAddr;
737
738 assert(argIndex < 3);
739
740 if (c->current_const[argIndex].index != src->Index || relAddr) {
741 struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
742
743 c->current_const[argIndex].index = src->Index;
744
745 #if 0
746 printf(" fetch const[%d] for arg %d into reg %d\n",
747 src->Index, argIndex, c->current_const[argIndex].reg.nr);
748 #endif
749 /* need to fetch the constant now */
750 brw_dp_READ_4_vs(p,
751 c->current_const[argIndex].reg,/* writeback dest */
752 0, /* oword */
753 relAddr, /* relative indexing? */
754 addrReg, /* address register */
755 16 * src->Index, /* byte offset */
756 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
757 );
758
759 if (relAddr) {
760 /* second read */
761 const2_reg = get_tmp(c);
762
763 /* use upper half of address reg for second read */
764 addrReg = stride(addrReg, 0, 4, 0);
765 addrReg.subnr = 16;
766
767 brw_dp_READ_4_vs(p,
768 const2_reg, /* writeback dest */
769 1, /* oword */
770 relAddr, /* relative indexing? */
771 addrReg, /* address register */
772 16 * src->Index, /* byte offset */
773 SURF_INDEX_VERT_CONST_BUFFER
774 );
775 }
776 }
777
778 const_reg = c->current_const[argIndex].reg;
779
780 if (relAddr) {
781 /* merge the two Owords into the constant register */
782 /* const_reg[7..4] = const2_reg[7..4] */
783 brw_MOV(p,
784 suboffset(stride(const_reg, 0, 4, 1), 4),
785 suboffset(stride(const2_reg, 0, 4, 1), 4));
786 release_tmp(c, const2_reg);
787 }
788 else {
789 /* replicate lower four floats into upper half (to get XYZWXYZW) */
790 const_reg = stride(const_reg, 0, 4, 0);
791 const_reg.subnr = 0;
792 }
793
794 return const_reg;
795 }
796
797
798
799 /* TODO: relative addressing!
800 */
801 static struct brw_reg get_reg( struct brw_vs_compile *c,
802 gl_register_file file,
803 GLuint index )
804 {
805 switch (file) {
806 case PROGRAM_TEMPORARY:
807 case PROGRAM_INPUT:
808 case PROGRAM_OUTPUT:
809 assert(c->regs[file][index].nr != 0);
810 return c->regs[file][index];
811 case PROGRAM_STATE_VAR:
812 case PROGRAM_CONSTANT:
813 case PROGRAM_UNIFORM:
814 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
815 return c->regs[PROGRAM_STATE_VAR][index];
816 case PROGRAM_ADDRESS:
817 assert(index == 0);
818 return c->regs[file][index];
819
820 case PROGRAM_UNDEFINED: /* undef values */
821 return brw_null_reg();
822
823 case PROGRAM_LOCAL_PARAM:
824 case PROGRAM_ENV_PARAM:
825 case PROGRAM_WRITE_ONLY:
826 default:
827 assert(0);
828 return brw_null_reg();
829 }
830 }
831
832
833 /**
834 * Indirect addressing: get reg[[arg] + offset].
835 */
836 static struct brw_reg deref( struct brw_vs_compile *c,
837 struct brw_reg arg,
838 GLint offset)
839 {
840 struct brw_compile *p = &c->func;
841 struct brw_reg tmp = vec4(get_tmp(c));
842 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
843 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
844 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
845 struct brw_reg indirect = brw_vec4_indirect(0,0);
846
847 {
848 brw_push_insn_state(p);
849 brw_set_access_mode(p, BRW_ALIGN_1);
850
851 /* This is pretty clunky - load the address register twice and
852 * fetch each 4-dword value in turn. There must be a way to do
853 * this in a single pass, but I couldn't get it to work.
854 */
855 brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
856 brw_MOV(p, tmp, indirect);
857
858 brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
859 brw_MOV(p, suboffset(tmp, 4), indirect);
860
861 brw_pop_insn_state(p);
862 }
863
864 /* NOTE: tmp not released */
865 return vec8(tmp);
866 }
867
868
869 /**
870 * Get brw reg corresponding to the instruction's [argIndex] src reg.
871 * TODO: relative addressing!
872 */
873 static struct brw_reg
874 get_src_reg( struct brw_vs_compile *c,
875 const struct prog_instruction *inst,
876 GLuint argIndex )
877 {
878 const GLuint file = inst->SrcReg[argIndex].File;
879 const GLint index = inst->SrcReg[argIndex].Index;
880 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
881
882 switch (file) {
883 case PROGRAM_TEMPORARY:
884 case PROGRAM_INPUT:
885 case PROGRAM_OUTPUT:
886 if (relAddr) {
887 return deref(c, c->regs[file][0], index);
888 }
889 else {
890 assert(c->regs[file][index].nr != 0);
891 return c->regs[file][index];
892 }
893
894 case PROGRAM_STATE_VAR:
895 case PROGRAM_CONSTANT:
896 case PROGRAM_UNIFORM:
897 if (c->vp->use_const_buffer) {
898 return get_constant(c, inst, argIndex);
899 }
900 else if (relAddr) {
901 return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
902 }
903 else {
904 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
905 return c->regs[PROGRAM_STATE_VAR][index];
906 }
907 case PROGRAM_ADDRESS:
908 assert(index == 0);
909 return c->regs[file][index];
910
911 case PROGRAM_UNDEFINED:
912 /* this is a normal case since we loop over all three src args */
913 return brw_null_reg();
914
915 case PROGRAM_LOCAL_PARAM:
916 case PROGRAM_ENV_PARAM:
917 case PROGRAM_WRITE_ONLY:
918 default:
919 assert(0);
920 return brw_null_reg();
921 }
922 }
923
924
925 static void emit_arl( struct brw_vs_compile *c,
926 struct brw_reg dst,
927 struct brw_reg arg0 )
928 {
929 struct brw_compile *p = &c->func;
930 struct brw_reg tmp = dst;
931 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
932
933 if (need_tmp)
934 tmp = get_tmp(c);
935
936 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
937 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
938
939 if (need_tmp)
940 release_tmp(c, tmp);
941 }
942
943
944 /**
945 * Return the brw reg for the given instruction's src argument.
946 * Will return mangled results for SWZ op. The emit_swz() function
947 * ignores this result and recalculates taking extended swizzles into
948 * account.
949 */
950 static struct brw_reg get_arg( struct brw_vs_compile *c,
951 const struct prog_instruction *inst,
952 GLuint argIndex )
953 {
954 const struct prog_src_register *src = &inst->SrcReg[argIndex];
955 struct brw_reg reg;
956
957 if (src->File == PROGRAM_UNDEFINED)
958 return brw_null_reg();
959
960 reg = get_src_reg(c, inst, argIndex);
961
962 /* Convert 3-bit swizzle to 2-bit.
963 */
964 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
965 GET_SWZ(src->Swizzle, 1),
966 GET_SWZ(src->Swizzle, 2),
967 GET_SWZ(src->Swizzle, 3));
968
969 /* Note this is ok for non-swizzle instructions:
970 */
971 reg.negate = src->Negate ? 1 : 0;
972
973 return reg;
974 }
975
976
977 /**
978 * Get brw register for the given program dest register.
979 */
980 static struct brw_reg get_dst( struct brw_vs_compile *c,
981 struct prog_dst_register dst )
982 {
983 struct brw_reg reg;
984
985 switch (dst.File) {
986 case PROGRAM_TEMPORARY:
987 case PROGRAM_OUTPUT:
988 assert(c->regs[dst.File][dst.Index].nr != 0);
989 reg = c->regs[dst.File][dst.Index];
990 break;
991 case PROGRAM_ADDRESS:
992 assert(dst.Index == 0);
993 reg = c->regs[dst.File][dst.Index];
994 break;
995 case PROGRAM_UNDEFINED:
996 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
997 reg = brw_null_reg();
998 break;
999 default:
1000 assert(0);
1001 reg = brw_null_reg();
1002 }
1003
1004 reg.dw1.bits.writemask = dst.WriteMask;
1005
1006 return reg;
1007 }
1008
1009
1010 static void emit_swz( struct brw_vs_compile *c,
1011 struct brw_reg dst,
1012 const struct prog_instruction *inst)
1013 {
1014 const GLuint argIndex = 0;
1015 const struct prog_src_register src = inst->SrcReg[argIndex];
1016 struct brw_compile *p = &c->func;
1017 GLuint zeros_mask = 0;
1018 GLuint ones_mask = 0;
1019 GLuint src_mask = 0;
1020 GLubyte src_swz[4];
1021 GLboolean need_tmp = (src.Negate &&
1022 dst.file != BRW_GENERAL_REGISTER_FILE);
1023 struct brw_reg tmp = dst;
1024 GLuint i;
1025
1026 if (need_tmp)
1027 tmp = get_tmp(c);
1028
1029 for (i = 0; i < 4; i++) {
1030 if (dst.dw1.bits.writemask & (1<<i)) {
1031 GLubyte s = GET_SWZ(src.Swizzle, i);
1032 switch (s) {
1033 case SWIZZLE_X:
1034 case SWIZZLE_Y:
1035 case SWIZZLE_Z:
1036 case SWIZZLE_W:
1037 src_mask |= 1<<i;
1038 src_swz[i] = s;
1039 break;
1040 case SWIZZLE_ZERO:
1041 zeros_mask |= 1<<i;
1042 break;
1043 case SWIZZLE_ONE:
1044 ones_mask |= 1<<i;
1045 break;
1046 }
1047 }
1048 }
1049
1050 /* Do src first, in case dst aliases src:
1051 */
1052 if (src_mask) {
1053 struct brw_reg arg0;
1054
1055 arg0 = get_src_reg(c, inst, argIndex);
1056
1057 arg0 = brw_swizzle(arg0,
1058 src_swz[0], src_swz[1],
1059 src_swz[2], src_swz[3]);
1060
1061 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1062 }
1063
1064 if (zeros_mask)
1065 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1066
1067 if (ones_mask)
1068 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1069
1070 if (src.Negate)
1071 brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1072
1073 if (need_tmp) {
1074 brw_MOV(p, dst, tmp);
1075 release_tmp(c, tmp);
1076 }
1077 }
1078
1079
1080 /**
1081 * Post-vertex-program processing. Send the results to the URB.
1082 */
1083 static void emit_vertex_write( struct brw_vs_compile *c)
1084 {
1085 struct brw_compile *p = &c->func;
1086 struct brw_reg m0 = brw_message_reg(0);
1087 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1088 struct brw_reg ndc;
1089 int eot;
1090 GLuint len_vertext_header = 2;
1091
1092 if (c->key.copy_edgeflag) {
1093 brw_MOV(p,
1094 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1095 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1096 }
1097
1098 /* Build ndc coords */
1099 ndc = get_tmp(c);
1100 /* ndc = 1.0 / pos.w */
1101 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1102 /* ndc.xyz = pos * ndc */
1103 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1104
1105 /* Update the header for point size, user clipping flags, and -ve rhw
1106 * workaround.
1107 */
1108 if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
1109 c->key.nr_userclip || BRW_IS_965(p->brw))
1110 {
1111 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1112 GLuint i;
1113
1114 brw_MOV(p, header1, brw_imm_ud(0));
1115
1116 brw_set_access_mode(p, BRW_ALIGN_16);
1117
1118 if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
1119 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1120 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1121 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1122 }
1123
1124 for (i = 0; i < c->key.nr_userclip; i++) {
1125 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1126 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1127 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1128 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1129 }
1130
1131 /* i965 clipping workaround:
1132 * 1) Test for -ve rhw
1133 * 2) If set,
1134 * set ndc = (0,0,0,0)
1135 * set ucp[6] = 1
1136 *
1137 * Later, clipping will detect ucp[6] and ensure the primitive is
1138 * clipped against all fixed planes.
1139 */
1140 if (BRW_IS_965(p->brw)) {
1141 brw_CMP(p,
1142 vec8(brw_null_reg()),
1143 BRW_CONDITIONAL_L,
1144 brw_swizzle1(ndc, 3),
1145 brw_imm_f(0));
1146
1147 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1148 brw_MOV(p, ndc, brw_imm_f(0));
1149 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1150 }
1151
1152 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1153 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1154 brw_set_access_mode(p, BRW_ALIGN_16);
1155
1156 release_tmp(c, header1);
1157 }
1158 else {
1159 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1160 }
1161
1162 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1163 * of zeros followed by two sets of NDC coordinates:
1164 */
1165 brw_set_access_mode(p, BRW_ALIGN_1);
1166 brw_MOV(p, offset(m0, 2), ndc);
1167
1168 if (BRW_IS_IGDNG(p->brw)) {
1169 /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
1170 brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
1171 /* m4, m5 contain the distances from vertex to the user clip planeXXX.
1172 * Seems it is useless for us.
1173 * m6 is used for aligning, so that the remainder of vertex element is
1174 * reg-aligned.
1175 */
1176 brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
1177 len_vertext_header = 6;
1178 } else {
1179 brw_MOV(p, offset(m0, 3), pos);
1180 len_vertext_header = 2;
1181 }
1182
1183 eot = (c->first_overflow_output == 0);
1184
1185 brw_urb_WRITE(p,
1186 brw_null_reg(), /* dest */
1187 0, /* starting mrf reg nr */
1188 c->r0, /* src */
1189 0, /* allocate */
1190 1, /* used */
1191 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
1192 0, /* response len */
1193 eot, /* eot */
1194 1, /* writes complete */
1195 0, /* urb destination offset */
1196 BRW_URB_SWIZZLE_INTERLEAVE);
1197
1198 if (c->first_overflow_output > 0) {
1199 /* Not all of the vertex outputs/results fit into the MRF.
1200 * Move the overflowed attributes from the GRF to the MRF and
1201 * issue another brw_urb_WRITE().
1202 */
1203 /* XXX I'm not 100% sure about which MRF regs to use here. Starting
1204 * at mrf[4] atm...
1205 */
1206 GLuint i, mrf = 0;
1207 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1208 if (c->prog_data.outputs_written & (1 << i)) {
1209 /* move from GRF to MRF */
1210 brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
1211 mrf++;
1212 }
1213 }
1214
1215 brw_urb_WRITE(p,
1216 brw_null_reg(), /* dest */
1217 4, /* starting mrf reg nr */
1218 c->r0, /* src */
1219 0, /* allocate */
1220 1, /* used */
1221 mrf+1, /* msg len */
1222 0, /* response len */
1223 1, /* eot */
1224 1, /* writes complete */
1225 BRW_MAX_MRF-1, /* urb destination offset */
1226 BRW_URB_SWIZZLE_INTERLEAVE);
1227 }
1228 }
1229
1230
1231 /**
1232 * Called after code generation to resolve subroutine calls and the
1233 * END instruction.
1234 * \param end_inst points to brw code for END instruction
1235 * \param last_inst points to last instruction emitted before vertex write
1236 */
1237 static void
1238 post_vs_emit( struct brw_vs_compile *c,
1239 struct brw_instruction *end_inst,
1240 struct brw_instruction *last_inst )
1241 {
1242 GLint offset;
1243
1244 brw_resolve_cals(&c->func);
1245
1246 /* patch up the END code to jump past subroutines, etc */
1247 offset = last_inst - end_inst;
1248 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1249 }
1250
1251
1252 /* Emit the vertex program instructions here.
1253 */
1254 void brw_vs_emit(struct brw_vs_compile *c )
1255 {
1256 #define MAX_IF_DEPTH 32
1257 #define MAX_LOOP_DEPTH 32
1258 struct brw_compile *p = &c->func;
1259 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1260 GLuint insn, if_depth = 0, loop_depth = 0;
1261 GLuint end_offset = 0;
1262 struct brw_instruction *end_inst, *last_inst;
1263 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1264 const struct brw_indirect stack_index = brw_indirect(0, 0);
1265 GLuint index;
1266 GLuint file;
1267
1268 if (INTEL_DEBUG & DEBUG_VS) {
1269 _mesa_printf("vs-emit:\n");
1270 _mesa_print_program(&c->vp->program.Base);
1271 _mesa_printf("\n");
1272 }
1273
1274 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1275 brw_set_access_mode(p, BRW_ALIGN_16);
1276
1277 /* Message registers can't be read, so copy the output into GRF register
1278 if they are used in source registers */
1279 for (insn = 0; insn < nr_insns; insn++) {
1280 GLuint i;
1281 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1282 for (i = 0; i < 3; i++) {
1283 struct prog_src_register *src = &inst->SrcReg[i];
1284 GLuint index = src->Index;
1285 GLuint file = src->File;
1286 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1287 c->output_regs[index].used_in_src = GL_TRUE;
1288 }
1289 }
1290
1291 /* Static register allocation
1292 */
1293 brw_vs_alloc_regs(c);
1294 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1295
1296 for (insn = 0; insn < nr_insns; insn++) {
1297
1298 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1299 struct brw_reg args[3], dst;
1300 GLuint i;
1301
1302 #if 0
1303 printf("%d: ", insn);
1304 _mesa_print_instruction(inst);
1305 #endif
1306
1307 /* Get argument regs. SWZ is special and does this itself.
1308 */
1309 if (inst->Opcode != OPCODE_SWZ)
1310 for (i = 0; i < 3; i++) {
1311 const struct prog_src_register *src = &inst->SrcReg[i];
1312 index = src->Index;
1313 file = src->File;
1314 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1315 args[i] = c->output_regs[index].reg;
1316 else
1317 args[i] = get_arg(c, inst, i);
1318 }
1319
1320 /* Get dest regs. Note that it is possible for a reg to be both
1321 * dst and arg, given the static allocation of registers. So
1322 * care needs to be taken emitting multi-operation instructions.
1323 */
1324 index = inst->DstReg.Index;
1325 file = inst->DstReg.File;
1326 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1327 dst = c->output_regs[index].reg;
1328 else
1329 dst = get_dst(c, inst->DstReg);
1330
1331 if (inst->SaturateMode != SATURATE_OFF) {
1332 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1333 inst->SaturateMode);
1334 }
1335
1336 switch (inst->Opcode) {
1337 case OPCODE_ABS:
1338 brw_MOV(p, dst, brw_abs(args[0]));
1339 break;
1340 case OPCODE_ADD:
1341 brw_ADD(p, dst, args[0], args[1]);
1342 break;
1343 case OPCODE_COS:
1344 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1345 break;
1346 case OPCODE_DP3:
1347 brw_DP3(p, dst, args[0], args[1]);
1348 break;
1349 case OPCODE_DP4:
1350 brw_DP4(p, dst, args[0], args[1]);
1351 break;
1352 case OPCODE_DPH:
1353 brw_DPH(p, dst, args[0], args[1]);
1354 break;
1355 case OPCODE_NRM3:
1356 emit_nrm(c, dst, args[0], 3);
1357 break;
1358 case OPCODE_NRM4:
1359 emit_nrm(c, dst, args[0], 4);
1360 break;
1361 case OPCODE_DST:
1362 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1363 break;
1364 case OPCODE_EXP:
1365 unalias1(c, dst, args[0], emit_exp_noalias);
1366 break;
1367 case OPCODE_EX2:
1368 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1369 break;
1370 case OPCODE_ARL:
1371 emit_arl(c, dst, args[0]);
1372 break;
1373 case OPCODE_FLR:
1374 brw_RNDD(p, dst, args[0]);
1375 break;
1376 case OPCODE_FRC:
1377 brw_FRC(p, dst, args[0]);
1378 break;
1379 case OPCODE_LOG:
1380 unalias1(c, dst, args[0], emit_log_noalias);
1381 break;
1382 case OPCODE_LG2:
1383 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1384 break;
1385 case OPCODE_LIT:
1386 unalias1(c, dst, args[0], emit_lit_noalias);
1387 break;
1388 case OPCODE_LRP:
1389 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1390 break;
1391 case OPCODE_MAD:
1392 brw_MOV(p, brw_acc_reg(), args[2]);
1393 brw_MAC(p, dst, args[0], args[1]);
1394 break;
1395 case OPCODE_MAX:
1396 emit_max(p, dst, args[0], args[1]);
1397 break;
1398 case OPCODE_MIN:
1399 emit_min(p, dst, args[0], args[1]);
1400 break;
1401 case OPCODE_MOV:
1402 brw_MOV(p, dst, args[0]);
1403 break;
1404 case OPCODE_MUL:
1405 brw_MUL(p, dst, args[0], args[1]);
1406 break;
1407 case OPCODE_POW:
1408 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1409 break;
1410 case OPCODE_RCP:
1411 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1412 break;
1413 case OPCODE_RSQ:
1414 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1415 break;
1416
1417 case OPCODE_SEQ:
1418 emit_seq(p, dst, args[0], args[1]);
1419 break;
1420 case OPCODE_SIN:
1421 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1422 break;
1423 case OPCODE_SNE:
1424 emit_sne(p, dst, args[0], args[1]);
1425 break;
1426 case OPCODE_SGE:
1427 emit_sge(p, dst, args[0], args[1]);
1428 break;
1429 case OPCODE_SGT:
1430 emit_sgt(p, dst, args[0], args[1]);
1431 break;
1432 case OPCODE_SLT:
1433 emit_slt(p, dst, args[0], args[1]);
1434 break;
1435 case OPCODE_SLE:
1436 emit_sle(p, dst, args[0], args[1]);
1437 break;
1438 case OPCODE_SUB:
1439 brw_ADD(p, dst, args[0], negate(args[1]));
1440 break;
1441 case OPCODE_SWZ:
1442 /* The args[0] value can't be used here as it won't have
1443 * correctly encoded the full swizzle:
1444 */
1445 emit_swz(c, dst, inst);
1446 break;
1447 case OPCODE_TRUNC:
1448 /* round toward zero */
1449 brw_RNDZ(p, dst, args[0]);
1450 break;
1451 case OPCODE_XPD:
1452 emit_xpd(p, dst, args[0], args[1]);
1453 break;
1454 case OPCODE_IF:
1455 assert(if_depth < MAX_IF_DEPTH);
1456 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
1457 break;
1458 case OPCODE_ELSE:
1459 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1460 break;
1461 case OPCODE_ENDIF:
1462 assert(if_depth > 0);
1463 brw_ENDIF(p, if_inst[--if_depth]);
1464 break;
1465 #if 0
1466 case OPCODE_BGNLOOP:
1467 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1468 break;
1469 case OPCODE_BRK:
1470 brw_BREAK(p);
1471 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1472 break;
1473 case OPCODE_CONT:
1474 brw_CONT(p);
1475 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1476 break;
1477 case OPCODE_ENDLOOP:
1478 {
1479 struct brw_instruction *inst0, *inst1;
1480 loop_depth--;
1481 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1482 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1483 while (inst0 > loop_inst[loop_depth]) {
1484 inst0--;
1485 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
1486 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
1487 inst0->bits3.if_else.pop_count = 0;
1488 }
1489 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
1490 inst0->bits3.if_else.jump_count = inst1 - inst0;
1491 inst0->bits3.if_else.pop_count = 0;
1492 }
1493 }
1494 }
1495 break;
1496 #else
1497 (void) loop_inst;
1498 (void) loop_depth;
1499 #endif
1500 case OPCODE_BRA:
1501 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1502 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1503 brw_set_predicate_control_flag_value(p, 0xff);
1504 break;
1505 case OPCODE_CAL:
1506 brw_set_access_mode(p, BRW_ALIGN_1);
1507 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1508 brw_set_access_mode(p, BRW_ALIGN_16);
1509 brw_ADD(p, get_addr_reg(stack_index),
1510 get_addr_reg(stack_index), brw_imm_d(4));
1511 brw_save_call(p, inst->Comment, p->nr_insn);
1512 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1513 break;
1514 case OPCODE_RET:
1515 brw_ADD(p, get_addr_reg(stack_index),
1516 get_addr_reg(stack_index), brw_imm_d(-4));
1517 brw_set_access_mode(p, BRW_ALIGN_1);
1518 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1519 brw_set_access_mode(p, BRW_ALIGN_16);
1520 break;
1521 case OPCODE_END:
1522 end_offset = p->nr_insn;
1523 /* this instruction will get patched later to jump past subroutine
1524 * code, etc.
1525 */
1526 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1527 break;
1528 case OPCODE_PRINT:
1529 /* no-op */
1530 break;
1531 case OPCODE_BGNSUB:
1532 brw_save_label(p, inst->Comment, p->nr_insn);
1533 break;
1534 case OPCODE_ENDSUB:
1535 /* no-op */
1536 break;
1537 default:
1538 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1539 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1540 _mesa_opcode_string(inst->Opcode) :
1541 "unknown");
1542 }
1543
1544 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1545 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1546 && c->output_regs[inst->DstReg.Index].used_in_src) {
1547 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1548 }
1549
1550 /* Result color clamping.
1551 *
1552 * When destination register is an output register and
1553 * it's primary/secondary front/back color, we have to clamp
1554 * the result to [0,1]. This is done by enabling the
1555 * saturation bit for the last instruction.
1556 *
1557 * We don't use brw_set_saturate() as it modifies
1558 * p->current->header.saturate, which affects all the subsequent
1559 * instructions. Instead, we directly modify the header
1560 * of the last (already stored) instruction.
1561 */
1562 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1563 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1564 || (inst->DstReg.Index == VERT_RESULT_COL1)
1565 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1566 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1567 p->store[p->nr_insn-1].header.saturate = 1;
1568 }
1569 }
1570
1571 release_tmps(c);
1572 }
1573
1574 end_inst = &p->store[end_offset];
1575 last_inst = &p->store[p->nr_insn];
1576
1577 /* The END instruction will be patched to jump to this code */
1578 emit_vertex_write(c);
1579
1580 post_vs_emit(c, end_inst, last_inst);
1581 }