f7b07266362fefecbba6fa516c5c865871da22ae
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
42 {
43 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
44
45 if (++c->last_tmp > c->prog_data.total_grf)
46 c->prog_data.total_grf = c->last_tmp;
47
48 return tmp;
49 }
50
51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
52 {
53 if (tmp.nr == c->last_tmp-1)
54 c->last_tmp--;
55 }
56
57 static void release_tmps( struct brw_vs_compile *c )
58 {
59 c->last_tmp = c->first_tmp;
60 }
61
62
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Allocation order (each advances 'reg'): r0, user clip planes (curbe),
 * shader constants (curbe, unless a real constant buffer is used), vertex
 * inputs, outputs that overflow the MRFs, program temporaries, address
 * reg(s), constant-buffer staging regs, shadow copies of outputs read as
 * sources, the subroutine stack, and finally the scratch area handed out
 * by get_tmp().
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      /* Two planes are packed per register (4 floats each). */
      for (i = 0; i < c->key.nr_userclip; i++) {
         c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      /* get constants from a real constant buffer */
      c->prog_data.curb_read_length = 0;
      c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
         /* two vec4 params per register, same packing as clip planes */
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
         c->nr_inputs++;
         c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First free MRF differs per chipset — IGDNG has a larger vertex
    * header in the URB write message.
    */
   if (BRW_IS_IGDNG(c->func.brw))
      mrf = 8;
   else
      mrf = 4;

   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & (1 << i)) {
         c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
         if (i == VERT_RESULT_HPOS) {
            /* position needs post-processing (NDC), so keep it in a GRF */
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
         }
         else if (i == VERT_RESULT_PSIZ) {
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
            mrf++;      /* just a placeholder?  XXX fix later stages & remove this */
         }
         else {
            if (mrf < 16) {
               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
               mrf++;
            }
            else {
               /* too many vertex results to fit in MRF, use GRF for overflow */
               if (!c->first_overflow_output)
                  c->first_overflow_output = i;
               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
               reg++;
            }
         }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
                                             reg,
                                             0,
                                             BRW_REGISTER_TYPE_D,
                                             BRW_VERTICAL_STRIDE_8,
                                             BRW_WIDTH_8,
                                             BRW_HORIZONTAL_STRIDE_1,
                                             BRW_SWIZZLE_XXXX,
                                             WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* Staging registers used by get_constant() — one per source arg. */
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Shadow copies for outputs that are also read as sources. */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Two registers for the subroutine call/return stack. */
   c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
   reg += 2;

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;      /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   if (BRW_IS_IGDNG(c->func.brw))
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (INTEL_DEBUG & DEBUG_VS) {
      _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
254
255
256 /**
257 * If an instruction uses a temp reg both as a src and the dest, we
258 * sometimes need to allocate an intermediate temporary.
259 */
260 static void unalias1( struct brw_vs_compile *c,
261 struct brw_reg dst,
262 struct brw_reg arg0,
263 void (*func)( struct brw_vs_compile *,
264 struct brw_reg,
265 struct brw_reg ))
266 {
267 if (dst.file == arg0.file && dst.nr == arg0.nr) {
268 struct brw_compile *p = &c->func;
269 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
270 func(c, tmp, arg0);
271 brw_MOV(p, dst, tmp);
272 release_tmp(c, tmp);
273 }
274 else {
275 func(c, dst, arg0);
276 }
277 }
278
279 /**
280 * \sa unalias2
281 * Checkes if 2-operand instruction needs an intermediate temporary.
282 */
283 static void unalias2( struct brw_vs_compile *c,
284 struct brw_reg dst,
285 struct brw_reg arg0,
286 struct brw_reg arg1,
287 void (*func)( struct brw_vs_compile *,
288 struct brw_reg,
289 struct brw_reg,
290 struct brw_reg ))
291 {
292 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
293 (dst.file == arg1.file && dst.nr == arg1.nr)) {
294 struct brw_compile *p = &c->func;
295 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
296 func(c, tmp, arg0, arg1);
297 brw_MOV(p, dst, tmp);
298 release_tmp(c, tmp);
299 }
300 else {
301 func(c, dst, arg0, arg1);
302 }
303 }
304
305 /**
306 * \sa unalias2
307 * Checkes if 3-operand instruction needs an intermediate temporary.
308 */
309 static void unalias3( struct brw_vs_compile *c,
310 struct brw_reg dst,
311 struct brw_reg arg0,
312 struct brw_reg arg1,
313 struct brw_reg arg2,
314 void (*func)( struct brw_vs_compile *,
315 struct brw_reg,
316 struct brw_reg,
317 struct brw_reg,
318 struct brw_reg ))
319 {
320 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
321 (dst.file == arg1.file && dst.nr == arg1.nr) ||
322 (dst.file == arg2.file && dst.nr == arg2.nr)) {
323 struct brw_compile *p = &c->func;
324 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
325 func(c, tmp, arg0, arg1, arg2);
326 brw_MOV(p, dst, tmp);
327 release_tmp(c, tmp);
328 }
329 else {
330 func(c, dst, arg0, arg1, arg2);
331 }
332 }
333
/**
 * Shared body for the SLT/SLE/SGT/SGE/SEQ/SNE opcodes: each enabled
 * channel of dst becomes 1.0 where (arg0 <cond> arg1) holds, 0.0
 * elsewhere.
 *
 * Statement order is significant: dst is cleared to 0.0, the CMP
 * establishes the flag register, and the MOV of 1.0 is presumably
 * predicated on those flag bits — confirm against brw_CMP()'s
 * predication side effects in brw_eu_emit.c.
 * NOTE(review): the trailing call records an all-ones flag value,
 * apparently to reset predication state for subsequent instructions.
 */
static void emit_sop( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
347
348 static void emit_seq( struct brw_vs_compile *c,
349 struct brw_reg dst,
350 struct brw_reg arg0,
351 struct brw_reg arg1 )
352 {
353 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
354 }
355
356 static void emit_sne( struct brw_vs_compile *c,
357 struct brw_reg dst,
358 struct brw_reg arg0,
359 struct brw_reg arg1 )
360 {
361 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
362 }
363 static void emit_slt( struct brw_vs_compile *c,
364 struct brw_reg dst,
365 struct brw_reg arg0,
366 struct brw_reg arg1 )
367 {
368 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
369 }
370
371 static void emit_sle( struct brw_vs_compile *c,
372 struct brw_reg dst,
373 struct brw_reg arg0,
374 struct brw_reg arg1 )
375 {
376 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
377 }
378
379 static void emit_sgt( struct brw_vs_compile *c,
380 struct brw_reg dst,
381 struct brw_reg arg0,
382 struct brw_reg arg1 )
383 {
384 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
385 }
386
387 static void emit_sge( struct brw_vs_compile *c,
388 struct brw_reg dst,
389 struct brw_reg arg0,
390 struct brw_reg arg1 )
391 {
392 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
393 }
394
/**
 * MAX: dst = componentwise max(arg0, arg1).
 * The CMP sets the flag register where arg0 < arg1; the SEL then yields
 * arg1 on those channels (SEL is presumably predicated by the preceding
 * CMP — confirm in brw_eu_emit.c).  Predication is cleared afterwards so
 * later instructions are unaffected.
 */
static void emit_max( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg1, arg0);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
404
/**
 * MIN: dst = componentwise min(arg0, arg1).
 * Same CMP/SEL pattern as emit_max() but with the SEL operand order
 * swapped, so channels where arg0 < arg1 keep arg0.
 */
static void emit_min( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
414
415
/**
 * Emit a one-operand math-box operation (EXP, LOG, RSQ, INV, ...).
 *
 * \param function   one of the BRW_MATH_FUNCTION_* codes
 * \param precision  BRW_MATH_PRECISION_FULL or _PARTIAL
 */
static void emit_math1( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   /* Stage through a full-writemask GRF when the real dest is partially
    * masked or not a GRF (see kludge note above).
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,          /* NOTE(review): presumably the message reg number — confirm in brw_math() */
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
451
452
/**
 * Emit a two-operand math-box operation (e.g. POW).  The second operand
 * is loaded into message reg 3 before the math SEND.
 * NOTE(review): the hard-coded m3 assumes the math message payload
 * starts at m2 — confirm against the chipset's math message layout.
 * Same writemask/GRF staging kludge as emit_math1().
 */
static void emit_math2( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        struct brw_reg arg1,
                        GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* second operand travels in the message payload */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
484
485
/**
 * EXP opcode, per channel of the writemask:
 *   result.x = 2^floor(arg0.x)   (built by bit manipulation)
 *   result.y = arg0.x - floor(arg0.x)
 *   result.z = 2^arg0.x          (math box)
 *   result.w = 1.0
 * "noalias": dst must not overlap arg0 — callers go through unalias1().
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       * (shifting the biased exponent into bits 30:23 of an IEEE float)
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
              tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_EXP,
                 brw_writemask(dst, WRITEMASK_Z),
                 brw_swizzle1(arg0, 0),
                 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
541
542
/**
 * LOG opcode:
 *   result.x = unbiased exponent of |arg0.x|  (~floor(log2))
 *   result.y = mantissa of |arg0.x| mapped into [1, 2)
 *   result.z = log2(|arg0.x|)  (math box, plus the exponent term)
 *   result.w = 1.0
 * "noalias": dst must not overlap arg0 — callers go through unalias1().
 */
static void emit_log_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* The Z result reads the X/Y intermediates via swizzles, so stage
    * through a full GRF when the dest is masked or not a GRF.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* mask off the sign bit, then shift the exponent field down */
      brw_AND(p,
              brw_writemask(tmp_ud, WRITEMASK_X),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
              brw_writemask(tmp_ud, WRITEMASK_X),
              tmp_ud,
              brw_imm_ud(23));

      /* remove the IEEE exponent bias */
      brw_ADD(p,
              brw_writemask(tmp, WRITEMASK_X),
              retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
              brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* keep the mantissa bits, then force an exponent of 0 (bias 127) */
      brw_AND(p,
              brw_writemask(tmp_ud, WRITEMASK_Y),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1<<23)-1));

      brw_OR(p,
             brw_writemask(tmp_ud, WRITEMASK_Y),
             tmp_ud,
             brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_LOG,
                 brw_writemask(tmp, WRITEMASK_Z),
                 brw_swizzle1(tmp, 1),
                 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
              brw_writemask(tmp, WRITEMASK_Z),
              brw_swizzle1(tmp, 2),
              brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
632
633
634 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
635 */
636 static void emit_dst_noalias( struct brw_vs_compile *c,
637 struct brw_reg dst,
638 struct brw_reg arg0,
639 struct brw_reg arg1)
640 {
641 struct brw_compile *p = &c->func;
642
643 /* There must be a better way to do this:
644 */
645 if (dst.dw1.bits.writemask & WRITEMASK_X)
646 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
647 if (dst.dw1.bits.writemask & WRITEMASK_Y)
648 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
649 if (dst.dw1.bits.writemask & WRITEMASK_Z)
650 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
651 if (dst.dw1.bits.writemask & WRITEMASK_W)
652 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
653 }
654
655
/**
 * XPD: dst = cross(t, u), using the implicit accumulator:
 *   acc = t.yzxw * u.zxyw           (MUL writes the accumulator)
 *   dst = acc - t.zxyw * u.yzxw     (MAC adds its product to acc)
 * The MUL/MAC pair must stay adjacent — the accumulator carries the
 * intermediate product between them.
 */
static void emit_xpd( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg t,
                      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
664
665
666 static void emit_lit_noalias( struct brw_vs_compile *c,
667 struct brw_reg dst,
668 struct brw_reg arg0 )
669 {
670 struct brw_compile *p = &c->func;
671 struct brw_instruction *if_insn;
672 struct brw_reg tmp = dst;
673 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
674
675 if (need_tmp)
676 tmp = get_tmp(c);
677
678 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
679 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
680
681 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
682 * to get all channels active inside the IF. In the clipping code
683 * we run with NoMask, so it's not an option and we can use
684 * BRW_EXECUTE_1 for all comparisions.
685 */
686 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
687 if_insn = brw_IF(p, BRW_EXECUTE_8);
688 {
689 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
690
691 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
692 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
693 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
694
695 emit_math2(c,
696 BRW_MATH_FUNCTION_POW,
697 brw_writemask(dst, WRITEMASK_Z),
698 brw_swizzle1(tmp, 2),
699 brw_swizzle1(arg0, 3),
700 BRW_MATH_PRECISION_PARTIAL);
701 }
702
703 brw_ENDIF(p, if_insn);
704
705 release_tmp(c, tmp);
706 }
707
/**
 * LRP: dst = arg0 * arg1 + (1 - arg0) * arg2, assuming dst aliases no
 * source (callers go through unalias3).
 * Uses the implicit accumulator: the ADD leaves (1 - arg0) in dst, the
 * MUL puts (1 - arg0) * arg2 in the accumulator, and the MAC adds
 * arg0 * arg1 on top.  The three instructions must stay in this order.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
                             struct brw_reg dst,
                             struct brw_reg arg0,
                             struct brw_reg arg1,
                             struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
720
721 /** 3 or 4-component vector normalization */
722 static void emit_nrm( struct brw_vs_compile *c,
723 struct brw_reg dst,
724 struct brw_reg arg0,
725 int num_comps)
726 {
727 struct brw_compile *p = &c->func;
728 struct brw_reg tmp = get_tmp(c);
729
730 /* tmp = dot(arg0, arg0) */
731 if (num_comps == 3)
732 brw_DP3(p, tmp, arg0, arg0);
733 else
734 brw_DP4(p, tmp, arg0, arg0);
735
736 /* tmp = 1 / sqrt(tmp) */
737 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
738
739 /* dst = arg0 * tmp */
740 brw_MUL(p, dst, arg0, tmp);
741
742 release_tmp(c, tmp);
743 }
744
745
/**
 * Fetch a program constant from the constant buffer into the staging
 * register reserved for source argument argIndex, emitting the dataport
 * read only when the cached index doesn't already match (or when the
 * access is relative-addressed and so can never be cached).
 *
 * For relative addressing two owords are fetched — the second through
 * the upper half of the address register — and merged so the register
 * holds two potentially different constants; otherwise the single oword
 * is replicated to give XYZWXYZW for both vertices.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg;
   struct brw_reg const2_reg;
   const GLboolean relAddr = src->RelAddr;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index || relAddr) {
      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];

      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       c->current_const[argIndex].reg,/* writeback dest */
                       0,                             /* oword */
                       relAddr,                       /* relative indexing? */
                       addrReg,                       /* address register */
                       16 * src->Index,               /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );

      if (relAddr) {
         /* second read */
         const2_reg = get_tmp(c);

         /* use upper half of address reg for second read */
         addrReg = stride(addrReg, 0, 4, 0);
         addrReg.subnr = 16;

         brw_dp_READ_4_vs(p,
                          const2_reg,              /* writeback dest */
                          1,                       /* oword */
                          relAddr,                 /* relative indexing? */
                          addrReg,                 /* address register */
                          16 * src->Index,         /* byte offset */
                          SURF_INDEX_VERT_CONST_BUFFER
                          );
      }
   }

   const_reg = c->current_const[argIndex].reg;

   if (relAddr) {
      /* merge the two Owords into the constant register */
      /* const_reg[7..4] = const2_reg[7..4] */
      brw_MOV(p,
              suboffset(stride(const_reg, 0, 4, 1), 4),
              suboffset(stride(const2_reg, 0, 4, 1), 4));
      release_tmp(c, const2_reg);
   }
   else {
      /* replicate lower four floats into upper half (to get XYZWXYZW) */
      const_reg = stride(const_reg, 0, 4, 0);
      const_reg.subnr = 0;
   }

   return const_reg;
}
815
816
817
818 /* TODO: relative addressing!
819 */
820 static struct brw_reg get_reg( struct brw_vs_compile *c,
821 gl_register_file file,
822 GLuint index )
823 {
824 switch (file) {
825 case PROGRAM_TEMPORARY:
826 case PROGRAM_INPUT:
827 case PROGRAM_OUTPUT:
828 assert(c->regs[file][index].nr != 0);
829 return c->regs[file][index];
830 case PROGRAM_STATE_VAR:
831 case PROGRAM_CONSTANT:
832 case PROGRAM_UNIFORM:
833 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
834 return c->regs[PROGRAM_STATE_VAR][index];
835 case PROGRAM_ADDRESS:
836 assert(index == 0);
837 return c->regs[file][index];
838
839 case PROGRAM_UNDEFINED: /* undef values */
840 return brw_null_reg();
841
842 case PROGRAM_LOCAL_PARAM:
843 case PROGRAM_ENV_PARAM:
844 case PROGRAM_WRITE_ONLY:
845 default:
846 assert(0);
847 return brw_null_reg();
848 }
849 }
850
851
/**
 * Indirect addressing: get reg[[arg] + offset].
 * The register-file byte offset is computed from arg's GRF location
 * (32 bytes per GRF, 16 bytes per vec4 slot for 'offset'), then the
 * hardware address register a0 is loaded with the vertex-program
 * address-reg value plus that offset and dereferenced twice — once per
 * vertex half of the SIMD8 pair.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
                             struct brw_reg arg,
                             GLint offset)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = vec4(get_tmp(c));
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
   struct brw_reg indirect = brw_vec4_indirect(0,0);

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* This is pretty clunky - load the address register twice and
       * fetch each 4-dword value in turn.  There must be a way to do
       * this in a single pass, but I couldn't get it to work.
       */
      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
      brw_MOV(p, tmp, indirect);

      /* second vertex: the address value lives 8 words further in */
      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
      brw_MOV(p, suboffset(tmp, 4), indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return vec8(tmp);
}
886
887
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 * Handles relative addressing for temporaries/inputs/outputs and for
 * GRF-resident parameters; constant-buffer parameters go through
 * get_constant().
 * TODO: relative addressing!
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         /* base of the file's register block + indirect index */
         return deref(c, c->regs[file][0], index);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
         /* constants live in a real buffer — fetch via dataport */
         return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
      }
      else {
         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
942
943
944 static void emit_arl( struct brw_vs_compile *c,
945 struct brw_reg dst,
946 struct brw_reg arg0 )
947 {
948 struct brw_compile *p = &c->func;
949 struct brw_reg tmp = dst;
950 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
951
952 if (need_tmp)
953 tmp = get_tmp(c);
954
955 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
956 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
957
958 if (need_tmp)
959 release_tmp(c, tmp);
960 }
961
962
963 /**
964 * Return the brw reg for the given instruction's src argument.
965 * Will return mangled results for SWZ op. The emit_swz() function
966 * ignores this result and recalculates taking extended swizzles into
967 * account.
968 */
969 static struct brw_reg get_arg( struct brw_vs_compile *c,
970 const struct prog_instruction *inst,
971 GLuint argIndex )
972 {
973 const struct prog_src_register *src = &inst->SrcReg[argIndex];
974 struct brw_reg reg;
975
976 if (src->File == PROGRAM_UNDEFINED)
977 return brw_null_reg();
978
979 reg = get_src_reg(c, inst, argIndex);
980
981 /* Convert 3-bit swizzle to 2-bit.
982 */
983 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
984 GET_SWZ(src->Swizzle, 1),
985 GET_SWZ(src->Swizzle, 2),
986 GET_SWZ(src->Swizzle, 3));
987
988 /* Note this is ok for non-swizzle instructions:
989 */
990 reg.negate = src->Negate ? 1 : 0;
991
992 return reg;
993 }
994
995
996 /**
997 * Get brw register for the given program dest register.
998 */
999 static struct brw_reg get_dst( struct brw_vs_compile *c,
1000 struct prog_dst_register dst )
1001 {
1002 struct brw_reg reg;
1003
1004 switch (dst.File) {
1005 case PROGRAM_TEMPORARY:
1006 case PROGRAM_OUTPUT:
1007 assert(c->regs[dst.File][dst.Index].nr != 0);
1008 reg = c->regs[dst.File][dst.Index];
1009 break;
1010 case PROGRAM_ADDRESS:
1011 assert(dst.Index == 0);
1012 reg = c->regs[dst.File][dst.Index];
1013 break;
1014 case PROGRAM_UNDEFINED:
1015 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1016 reg = brw_null_reg();
1017 break;
1018 default:
1019 assert(0);
1020 reg = brw_null_reg();
1021 }
1022
1023 reg.dw1.bits.writemask = dst.WriteMask;
1024
1025 return reg;
1026 }
1027
1028
/**
 * SWZ: extended swizzle.  Partitions the destination channels by what
 * their source selector asks for — a real source component, the
 * constant 0, or the constant 1 — and emits one masked MOV per
 * partition, plus a final negation pass over the channels flagged in
 * src.Negate (a per-channel mask for this opcode).
 */
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* the negate MOV reads dst, which only works from a GRF */
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* classify each enabled channel by its extended-swizzle selector */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is used directly as a channel writemask here */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1097
1098
1099 /**
1100 * Post-vertex-program processing. Send the results to the URB.
1101 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_reg m0 = brw_message_reg(0);
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   /* Number of message regs of vertex header emitted below: 2 on original
    * gen4, 6 on IGDNG.  (Name kept as-is; "vertext" is a pre-existing typo.)
    */
   GLuint len_vertext_header = 2;

   /* Copy the edge-flag vertex attribute straight through to the
    * edge-flag output when the compile key requests it.
    */
   if (c->key.copy_edgeflag) {
      brw_MOV(p,
	      get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
	      get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   /* Build ndc coords */
   ndc = get_tmp(c);
   /* ndc = 1.0 / pos.w */
   emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
   /* ndc.xyz = pos * ndc */
   brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
       c->key.nr_userclip || BRW_IS_965(p->brw))
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
	 /* Point size goes into header1.w as an 11-bit fixed-point field:
	  * scale by 1<<11, then mask to bits 8..18 of the dword.
	  */
	 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
	 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
      }

      /* One clip flag bit per user clip plane: DP4 against the plane sets
       * the flag register (conditional mod L), and the OR merging bit i is
       * predicated on that result before predication is reset.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (BRW_IS_965(p->brw)) {
	 /* CMP sets the flag register for ndc.w < 0; the following OR and
	  * MOV are applied under that predicate, then predication is reset.
	  */
	 brw_CMP(p,
		 vec8(brw_null_reg()),
		 BRW_CONDITIONAL_L,
		 brw_swizzle1(ndc, 3),
		 brw_imm_f(0));

	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
	 brw_MOV(p, ndc, brw_imm_f(0));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1);	/* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      /* No point size / user clip / workaround needed: header dword is 0. */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, offset(m0, 2), ndc);

   if (BRW_IS_IGDNG(p->brw)) {
       /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
       brw_MOV(p, offset(m0, 3), pos);  /* a portion of vertex header */
       /* m4, m5 contain the distances from vertex to the user clip planeXXX.
        * Seems it is useless for us.
        * m6 is used for aligning, so that the remainder of vertex element is
        * reg-aligned.
        */
      brw_MOV(p, offset(m0, 7), pos);  /* the remainder of vertex element */
      len_vertext_header = 6;
   } else {
      brw_MOV(p, offset(m0, 3), pos);
      len_vertext_header = 2;
   }

   /* Terminate the thread with this write only if everything fits in one
    * message; otherwise the overflow write below carries the EOT.
    */
   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
		 brw_null_reg(), /* dest */
		 0,		/* starting mrf reg nr */
		 c->r0,		/* src */
		 0,		/* allocate */
		 1,		/* used */
		 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
		 0,		/* response len */
		 eot, 		/* eot */
		 eot, 		/* writes complete */
		 0, 		/* urb destination offset */
		 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
       * at mrf[4] atm...
       */
      GLuint i, mrf = 0;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
         if (c->prog_data.outputs_written & (1 << i)) {
            /* move from GRF to MRF */
            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
            mrf++;
         }
      }

      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    4,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    mrf+1,          /* msg len */
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    BRW_MAX_MRF-1,  /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1248
1249
1250 /**
1251 * Called after code generation to resolve subroutine calls and the
1252 * END instruction.
1253 * \param end_inst points to brw code for END instruction
1254 * \param last_inst points to last instruction emitted before vertex write
1255 */
1256 static void
1257 post_vs_emit( struct brw_vs_compile *c,
1258 struct brw_instruction *end_inst,
1259 struct brw_instruction *last_inst )
1260 {
1261 GLint offset;
1262
1263 brw_resolve_cals(&c->func);
1264
1265 /* patch up the END code to jump past subroutines, etc */
1266 offset = last_inst - end_inst;
1267 if (offset > 1) {
1268 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1269 } else {
1270 end_inst->header.opcode = BRW_OPCODE_NOP;
1271 }
1272 }
1273
1274 static GLboolean
1275 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1276 {
1277 struct brw_compile *p = &c->func;
1278 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1279
1280 if (p->nr_insn == 0)
1281 return GL_FALSE;
1282
1283 if (val.address_mode != BRW_ADDRESS_DIRECT)
1284 return GL_FALSE;
1285
1286 switch (prev_insn->header.opcode) {
1287 case BRW_OPCODE_MOV:
1288 case BRW_OPCODE_MAC:
1289 case BRW_OPCODE_MUL:
1290 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1291 prev_insn->header.execution_size == val.width &&
1292 prev_insn->bits1.da1.dest_reg_file == val.file &&
1293 prev_insn->bits1.da1.dest_reg_type == val.type &&
1294 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1295 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1296 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1297 prev_insn->bits1.da16.dest_writemask == 0xf)
1298 return GL_TRUE;
1299 else
1300 return GL_FALSE;
1301 default:
1302 return GL_FALSE;
1303 }
1304 }
1305
1306 static uint32_t
1307 get_predicate(const struct prog_instruction *inst)
1308 {
1309 if (inst->DstReg.CondMask == COND_TR)
1310 return BRW_PREDICATE_NONE;
1311
1312 /* All of GLSL only produces predicates for COND_NE and one channel per
1313 * vector. Fail badly if someone starts doing something else, as it might
1314 * mean infinite looping or something.
1315 *
1316 * We'd like to support all the condition codes, but our hardware doesn't
1317 * quite match the Mesa IR, which is modeled after the NV extensions. For
1318 * those, the instruction may update the condition codes or not, then any
1319 * later instruction may use one of those condition codes. For gen4, the
1320 * instruction may update the flags register based on one of the condition
1321 * codes output by the instruction, and then further instructions may
1322 * predicate on that. We can probably support this, but it won't
1323 * necessarily be easy.
1324 */
1325 assert(inst->DstReg.CondMask == COND_NE);
1326
1327 switch (inst->DstReg.CondSwizzle) {
1328 case SWIZZLE_XXXX:
1329 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1330 case SWIZZLE_YYYY:
1331 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1332 case SWIZZLE_ZZZZ:
1333 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1334 case SWIZZLE_WWWW:
1335 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1336 default:
1337 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1338 inst->DstReg.CondMask);
1339 return BRW_PREDICATE_NORMAL;
1340 }
1341 }
1342
1343 /* Emit the vertex program instructions here.
1344 */
1345 void brw_vs_emit(struct brw_vs_compile *c )
1346 {
1347 #define MAX_IF_DEPTH 32
1348 #define MAX_LOOP_DEPTH 32
1349 struct brw_compile *p = &c->func;
1350 struct brw_context *brw = p->brw;
1351 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1352 GLuint insn, if_depth = 0, loop_depth = 0;
1353 GLuint end_offset = 0;
1354 struct brw_instruction *end_inst, *last_inst;
1355 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1356 const struct brw_indirect stack_index = brw_indirect(0, 0);
1357 GLuint index;
1358 GLuint file;
1359
1360 if (INTEL_DEBUG & DEBUG_VS) {
1361 _mesa_printf("vs-mesa:\n");
1362 _mesa_print_program(&c->vp->program.Base);
1363 _mesa_printf("\n");
1364 }
1365
1366 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1367 brw_set_access_mode(p, BRW_ALIGN_16);
1368
1369 /* Message registers can't be read, so copy the output into GRF register
1370 if they are used in source registers */
1371 for (insn = 0; insn < nr_insns; insn++) {
1372 GLuint i;
1373 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1374 for (i = 0; i < 3; i++) {
1375 struct prog_src_register *src = &inst->SrcReg[i];
1376 GLuint index = src->Index;
1377 GLuint file = src->File;
1378 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1379 c->output_regs[index].used_in_src = GL_TRUE;
1380 }
1381 }
1382
1383 /* Static register allocation
1384 */
1385 brw_vs_alloc_regs(c);
1386 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1387
1388 for (insn = 0; insn < nr_insns; insn++) {
1389
1390 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1391 struct brw_reg args[3], dst;
1392 GLuint i;
1393
1394 #if 0
1395 printf("%d: ", insn);
1396 _mesa_print_instruction(inst);
1397 #endif
1398
1399 /* Get argument regs. SWZ is special and does this itself.
1400 */
1401 if (inst->Opcode != OPCODE_SWZ)
1402 for (i = 0; i < 3; i++) {
1403 const struct prog_src_register *src = &inst->SrcReg[i];
1404 index = src->Index;
1405 file = src->File;
1406 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1407 args[i] = c->output_regs[index].reg;
1408 else
1409 args[i] = get_arg(c, inst, i);
1410 }
1411
1412 /* Get dest regs. Note that it is possible for a reg to be both
1413 * dst and arg, given the static allocation of registers. So
1414 * care needs to be taken emitting multi-operation instructions.
1415 */
1416 index = inst->DstReg.Index;
1417 file = inst->DstReg.File;
1418 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1419 dst = c->output_regs[index].reg;
1420 else
1421 dst = get_dst(c, inst->DstReg);
1422
1423 if (inst->SaturateMode != SATURATE_OFF) {
1424 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1425 inst->SaturateMode);
1426 }
1427
1428 switch (inst->Opcode) {
1429 case OPCODE_ABS:
1430 brw_MOV(p, dst, brw_abs(args[0]));
1431 break;
1432 case OPCODE_ADD:
1433 brw_ADD(p, dst, args[0], args[1]);
1434 break;
1435 case OPCODE_COS:
1436 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1437 break;
1438 case OPCODE_DP3:
1439 brw_DP3(p, dst, args[0], args[1]);
1440 break;
1441 case OPCODE_DP4:
1442 brw_DP4(p, dst, args[0], args[1]);
1443 break;
1444 case OPCODE_DPH:
1445 brw_DPH(p, dst, args[0], args[1]);
1446 break;
1447 case OPCODE_NRM3:
1448 emit_nrm(c, dst, args[0], 3);
1449 break;
1450 case OPCODE_NRM4:
1451 emit_nrm(c, dst, args[0], 4);
1452 break;
1453 case OPCODE_DST:
1454 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1455 break;
1456 case OPCODE_EXP:
1457 unalias1(c, dst, args[0], emit_exp_noalias);
1458 break;
1459 case OPCODE_EX2:
1460 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1461 break;
1462 case OPCODE_ARL:
1463 emit_arl(c, dst, args[0]);
1464 break;
1465 case OPCODE_FLR:
1466 brw_RNDD(p, dst, args[0]);
1467 break;
1468 case OPCODE_FRC:
1469 brw_FRC(p, dst, args[0]);
1470 break;
1471 case OPCODE_LOG:
1472 unalias1(c, dst, args[0], emit_log_noalias);
1473 break;
1474 case OPCODE_LG2:
1475 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1476 break;
1477 case OPCODE_LIT:
1478 unalias1(c, dst, args[0], emit_lit_noalias);
1479 break;
1480 case OPCODE_LRP:
1481 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1482 break;
1483 case OPCODE_MAD:
1484 if (!accumulator_contains(c, args[2]))
1485 brw_MOV(p, brw_acc_reg(), args[2]);
1486 brw_MAC(p, dst, args[0], args[1]);
1487 break;
1488 case OPCODE_MAX:
1489 emit_max(p, dst, args[0], args[1]);
1490 break;
1491 case OPCODE_MIN:
1492 emit_min(p, dst, args[0], args[1]);
1493 break;
1494 case OPCODE_MOV:
1495 brw_MOV(p, dst, args[0]);
1496 break;
1497 case OPCODE_MUL:
1498 brw_MUL(p, dst, args[0], args[1]);
1499 break;
1500 case OPCODE_POW:
1501 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1502 break;
1503 case OPCODE_RCP:
1504 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1505 break;
1506 case OPCODE_RSQ:
1507 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1508 break;
1509
1510 case OPCODE_SEQ:
1511 unalias2(c, dst, args[0], args[1], emit_seq);
1512 break;
1513 case OPCODE_SIN:
1514 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1515 break;
1516 case OPCODE_SNE:
1517 unalias2(c, dst, args[0], args[1], emit_sne);
1518 break;
1519 case OPCODE_SGE:
1520 unalias2(c, dst, args[0], args[1], emit_sge);
1521 break;
1522 case OPCODE_SGT:
1523 unalias2(c, dst, args[0], args[1], emit_sgt);
1524 break;
1525 case OPCODE_SLT:
1526 unalias2(c, dst, args[0], args[1], emit_slt);
1527 break;
1528 case OPCODE_SLE:
1529 unalias2(c, dst, args[0], args[1], emit_sle);
1530 break;
1531 case OPCODE_SUB:
1532 brw_ADD(p, dst, args[0], negate(args[1]));
1533 break;
1534 case OPCODE_SWZ:
1535 /* The args[0] value can't be used here as it won't have
1536 * correctly encoded the full swizzle:
1537 */
1538 emit_swz(c, dst, inst);
1539 break;
1540 case OPCODE_TRUNC:
1541 /* round toward zero */
1542 brw_RNDZ(p, dst, args[0]);
1543 break;
1544 case OPCODE_XPD:
1545 emit_xpd(p, dst, args[0], args[1]);
1546 break;
1547 case OPCODE_IF:
1548 assert(if_depth < MAX_IF_DEPTH);
1549 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1550 /* Note that brw_IF smashes the predicate_control field. */
1551 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1552 if_depth++;
1553 break;
1554 case OPCODE_ELSE:
1555 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1556 break;
1557 case OPCODE_ENDIF:
1558 assert(if_depth > 0);
1559 brw_ENDIF(p, if_inst[--if_depth]);
1560 break;
1561 case OPCODE_BGNLOOP:
1562 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1563 break;
1564 case OPCODE_BRK:
1565 brw_set_predicate_control(p, get_predicate(inst));
1566 brw_BREAK(p);
1567 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1568 break;
1569 case OPCODE_CONT:
1570 brw_set_predicate_control(p, get_predicate(inst));
1571 brw_CONT(p);
1572 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1573 break;
1574 case OPCODE_ENDLOOP:
1575 {
1576 struct brw_instruction *inst0, *inst1;
1577 GLuint br = 1;
1578
1579 loop_depth--;
1580
1581 if (BRW_IS_IGDNG(brw))
1582 br = 2;
1583
1584 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1585 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1586 while (inst0 > loop_inst[loop_depth]) {
1587 inst0--;
1588 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
1589 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1590 inst0->bits3.if_else.pop_count = 0;
1591 }
1592 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
1593 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1594 inst0->bits3.if_else.pop_count = 0;
1595 }
1596 }
1597 }
1598 break;
1599 case OPCODE_BRA:
1600 brw_set_predicate_control(p, get_predicate(inst));
1601 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1602 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1603 break;
1604 case OPCODE_CAL:
1605 brw_set_access_mode(p, BRW_ALIGN_1);
1606 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1607 brw_set_access_mode(p, BRW_ALIGN_16);
1608 brw_ADD(p, get_addr_reg(stack_index),
1609 get_addr_reg(stack_index), brw_imm_d(4));
1610 brw_save_call(p, inst->Comment, p->nr_insn);
1611 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1612 break;
1613 case OPCODE_RET:
1614 brw_ADD(p, get_addr_reg(stack_index),
1615 get_addr_reg(stack_index), brw_imm_d(-4));
1616 brw_set_access_mode(p, BRW_ALIGN_1);
1617 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1618 brw_set_access_mode(p, BRW_ALIGN_16);
1619 break;
1620 case OPCODE_END:
1621 end_offset = p->nr_insn;
1622 /* this instruction will get patched later to jump past subroutine
1623 * code, etc.
1624 */
1625 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1626 break;
1627 case OPCODE_PRINT:
1628 /* no-op */
1629 break;
1630 case OPCODE_BGNSUB:
1631 brw_save_label(p, inst->Comment, p->nr_insn);
1632 break;
1633 case OPCODE_ENDSUB:
1634 /* no-op */
1635 break;
1636 default:
1637 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1638 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1639 _mesa_opcode_string(inst->Opcode) :
1640 "unknown");
1641 }
1642
1643 /* Set the predication update on the last instruction of the native
1644 * instruction sequence.
1645 *
1646 * This would be problematic if it was set on a math instruction,
1647 * but that shouldn't be the case with the current GLSL compiler.
1648 */
1649 if (inst->CondUpdate) {
1650 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1651
1652 assert(hw_insn->header.destreg__conditionalmod == 0);
1653 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1654 }
1655
1656 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1657 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1658 && c->output_regs[inst->DstReg.Index].used_in_src) {
1659 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1660 }
1661
1662 /* Result color clamping.
1663 *
1664 * When destination register is an output register and
1665 * it's primary/secondary front/back color, we have to clamp
1666 * the result to [0,1]. This is done by enabling the
1667 * saturation bit for the last instruction.
1668 *
1669 * We don't use brw_set_saturate() as it modifies
1670 * p->current->header.saturate, which affects all the subsequent
1671 * instructions. Instead, we directly modify the header
1672 * of the last (already stored) instruction.
1673 */
1674 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1675 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1676 || (inst->DstReg.Index == VERT_RESULT_COL1)
1677 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1678 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1679 p->store[p->nr_insn-1].header.saturate = 1;
1680 }
1681 }
1682
1683 release_tmps(c);
1684 }
1685
1686 end_inst = &p->store[end_offset];
1687 last_inst = &p->store[p->nr_insn];
1688
1689 /* The END instruction will be patched to jump to this code */
1690 emit_vertex_write(c);
1691
1692 post_vs_emit(c, end_inst, last_inst);
1693
1694 if (INTEL_DEBUG & DEBUG_VS) {
1695 int i;
1696
1697 _mesa_printf("vs-native:\n");
1698 for (i = 0; i < p->nr_insn; i++)
1699 brw_disasm(stderr, &p->store[i]);
1700 _mesa_printf("\n");
1701 }
1702 }