1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
42 {
43 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
44
45 if (++c->last_tmp > c->prog_data.total_grf)
46 c->prog_data.total_grf = c->last_tmp;
47
48 return tmp;
49 }
50
51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
52 {
53 if (tmp.nr == c->last_tmp-1)
54 c->last_tmp--;
55 }
56
57 static void release_tmps( struct brw_vs_compile *c )
58 {
59 c->last_tmp = c->first_tmp;
60 }
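/* Illustrative sketch (not part of the driver): the three helpers above
 * implement a tiny stack-style scratch allocator.  The struct and the
 * model_* names below are invented for illustration; only the logic
 * mirrors get_tmp()/release_tmp()/release_tmps().
 */
#if 0
struct tmp_alloc {
   int first_tmp;    /* first GRF reserved for per-instruction temps */
   int last_tmp;     /* next free temp (bump pointer)                */
   int total_grf;    /* high-water mark of GRF usage                 */
};

static int model_get_tmp(struct tmp_alloc *a)
{
   int reg = a->last_tmp;
   if (++a->last_tmp > a->total_grf)
      a->total_grf = a->last_tmp;      /* grow the high-water mark */
   return reg;
}

static void model_release_tmp(struct tmp_alloc *a, int reg)
{
   if (reg == a->last_tmp - 1)         /* only the most recent temp pops */
      a->last_tmp--;
}

static void model_release_tmps(struct tmp_alloc *a)
{
   a->last_tmp = a->first_tmp;         /* wholesale reset, done per instruction */
}
#endif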
61
62
63 /**
64 * Preallocate GRF registers before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
66 * ahead of time.
67 */
68 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
69 {
70 GLuint i, reg = 0, mrf;
71
72 /* Determine whether to use a real constant buffer or use a block
73 * of GRF registers for constants. The latter is faster but only
74 * works if everything fits in the GRF.
75 * XXX this heuristic/check may need some fine tuning...
76 */
77 if (c->vp->program.Base.Parameters->NumParameters +
78 c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
79 c->vp->use_const_buffer = GL_TRUE;
80 else
81 c->vp->use_const_buffer = GL_FALSE;
82
83 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
84
85 /* r0 -- reserved as usual
86 */
87 c->r0 = brw_vec8_grf(reg, 0);
88 reg++;
89
90 /* User clip planes from curbe:
91 */
92 if (c->key.nr_userclip) {
93 for (i = 0; i < c->key.nr_userclip; i++) {
94 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
95 }
96
97 /* Deal with curbe alignment:
98 */
99 reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
100 }
101
102 /* Vertex program parameters from curbe:
103 */
104 if (c->vp->use_const_buffer) {
105 /* get constants from a real constant buffer */
106 c->prog_data.curb_read_length = 0;
107 c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
108 }
109 else {
110 /* use a section of the GRF for constants */
111 GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
112 for (i = 0; i < nr_params; i++) {
113 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
114 }
115 reg += (nr_params + 1) / 2;
116 c->prog_data.curb_read_length = reg - 1;
117
118 c->prog_data.nr_params = nr_params * 4;
119 }
120
121 /* Allocate input regs:
122 */
123 c->nr_inputs = 0;
124 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
125 if (c->prog_data.inputs_read & (1 << i)) {
126 c->nr_inputs++;
127 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
128 reg++;
129 }
130 }
131
132 /* Allocate outputs. The non-position outputs go straight into message regs.
133 */
134 c->nr_outputs = 0;
135 c->first_output = reg;
136 c->first_overflow_output = 0;
137 mrf = 4;
138 for (i = 0; i < VERT_RESULT_MAX; i++) {
139 if (c->prog_data.outputs_written & (1 << i)) {
140 c->nr_outputs++;
141 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
142 if (i == VERT_RESULT_HPOS) {
143 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
144 reg++;
145 }
146 else if (i == VERT_RESULT_PSIZ) {
147 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
148 reg++;
149 mrf++; /* just a placeholder? XXX fix later stages & remove this */
150 }
151 else {
152 if (mrf < 16) {
153 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
154 mrf++;
155 }
156 else {
157 /* too many vertex results to fit in MRF, use GRF for overflow */
158 if (!c->first_overflow_output)
159 c->first_overflow_output = i;
160 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
161 reg++;
162 }
163 }
164 }
165 }
166
167 /* Allocate program temporaries:
168 */
169 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
170 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
171 reg++;
172 }
173
174 /* Address reg(s). Don't try to use the internal address reg until
175 * deref time.
176 */
177 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
178 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
179 reg,
180 0,
181 BRW_REGISTER_TYPE_D,
182 BRW_VERTICAL_STRIDE_8,
183 BRW_WIDTH_8,
184 BRW_HORIZONTAL_STRIDE_1,
185 BRW_SWIZZLE_XXXX,
186 WRITEMASK_X);
187 reg++;
188 }
189
190 if (c->vp->use_const_buffer) {
191 for (i = 0; i < 3; i++) {
192 c->current_const[i].index = -1;
193 c->current_const[i].reg = brw_vec8_grf(reg, 0);
194 reg++;
195 }
196 }
197
198 for (i = 0; i < 128; i++) {
199 if (c->output_regs[i].used_in_src) {
200 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
201 reg++;
202 }
203 }
204
205 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
206 reg += 2;
207
208 /* Some opcodes need an internal temporary:
209 */
210 c->first_tmp = reg;
211 c->last_tmp = reg; /* for allocation purposes */
212
213 /* Each input reg holds data from two vertices. The
214 * urb_read_length is the number of registers read from *each*
215 * vertex urb, so is half the amount:
216 */
217 c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
218
219 c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
220 c->prog_data.total_grf = reg;
221
222 if (INTEL_DEBUG & DEBUG_VS) {
223 _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
224 _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
225 _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
226 }
227 }
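/* Worked example of the allocation arithmetic above, with assumed values
 * (two user clip planes, five enabled vertex inputs) chosen purely for
 * illustration:
 *
 *   clip planes:     reg += ((6 + 2 + 3) / 4) * 2 = 4, keeping the curbe
 *                    allocation aligned to a pair of registers;
 *   urb_read_length: (5 + 1) / 2 = 3, since each GRF carries the same
 *                    attribute for two vertices, so the per-vertex read
 *                    length is half the register count, rounded up.
 */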
228
229
230 /**
231 * If an instruction uses a temp reg both as a src and the dest, we
232 * sometimes need to allocate an intermediate temporary.
233 */
234 static void unalias1( struct brw_vs_compile *c,
235 struct brw_reg dst,
236 struct brw_reg arg0,
237 void (*func)( struct brw_vs_compile *,
238 struct brw_reg,
239 struct brw_reg ))
240 {
241 if (dst.file == arg0.file && dst.nr == arg0.nr) {
242 struct brw_compile *p = &c->func;
243 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
244 func(c, tmp, arg0);
245 brw_MOV(p, dst, tmp);
246 release_tmp(c, tmp);
247 }
248 else {
249 func(c, dst, arg0);
250 }
251 }
252
253 /**
254 * \sa unalias1
255 * Checks if a 2-operand instruction needs an intermediate temporary.
256 */
257 static void unalias2( struct brw_vs_compile *c,
258 struct brw_reg dst,
259 struct brw_reg arg0,
260 struct brw_reg arg1,
261 void (*func)( struct brw_vs_compile *,
262 struct brw_reg,
263 struct brw_reg,
264 struct brw_reg ))
265 {
266 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
267 (dst.file == arg1.file && dst.nr == arg1.nr)) {
268 struct brw_compile *p = &c->func;
269 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
270 func(c, tmp, arg0, arg1);
271 brw_MOV(p, dst, tmp);
272 release_tmp(c, tmp);
273 }
274 else {
275 func(c, dst, arg0, arg1);
276 }
277 }
278
279 /**
280 * \sa unalias2
281 * Checks if a 3-operand instruction needs an intermediate temporary.
282 */
283 static void unalias3( struct brw_vs_compile *c,
284 struct brw_reg dst,
285 struct brw_reg arg0,
286 struct brw_reg arg1,
287 struct brw_reg arg2,
288 void (*func)( struct brw_vs_compile *,
289 struct brw_reg,
290 struct brw_reg,
291 struct brw_reg,
292 struct brw_reg ))
293 {
294 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
295 (dst.file == arg1.file && dst.nr == arg1.nr) ||
296 (dst.file == arg2.file && dst.nr == arg2.nr)) {
297 struct brw_compile *p = &c->func;
298 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
299 func(c, tmp, arg0, arg1, arg2);
300 brw_MOV(p, dst, tmp);
301 release_tmp(c, tmp);
302 }
303 else {
304 func(c, dst, arg0, arg1, arg2);
305 }
306 }
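/* Illustrative sketch (not part of the driver) of the hazard the unalias
 * helpers avoid: a virtual opcode that expands to several hardware
 * instructions can clobber a source channel before a later instruction
 * reads it when dst aliases that source (e.g. "r0 = DST r0.xxxx r1").
 * The model_* functions are invented for illustration only.
 */
#if 0
#include <string.h>

static void model_two_step_op(float dst[4], const float a[4], const float b[4])
{
   dst[0] = 1.0f;              /* if dst aliases a, a[0] is overwritten here    */
   dst[1] = a[0] * b[1];       /* ... so this reads 1.0 instead of the old a[0] */
}

static void model_unaliased(float dst[4], const float a[4], const float b[4])
{
   float tmp[4];
   model_two_step_op(tmp, a, b);   /* emit into a temporary first           */
   memcpy(dst, tmp, sizeof tmp);   /* then one MOV back, as unalias*() does */
}
#endif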
307
308 static void emit_sop( struct brw_compile *p,
309 struct brw_reg dst,
310 struct brw_reg arg0,
311 struct brw_reg arg1,
312 GLuint cond)
313 {
314 brw_MOV(p, dst, brw_imm_f(0.0f));
315 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
316 brw_MOV(p, dst, brw_imm_f(1.0f));
317 brw_set_predicate_control_flag_value(p, 0xff);
318 }
319
320 static void emit_seq( struct brw_compile *p,
321 struct brw_reg dst,
322 struct brw_reg arg0,
323 struct brw_reg arg1 )
324 {
325 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
326 }
327
328 static void emit_sne( struct brw_compile *p,
329 struct brw_reg dst,
330 struct brw_reg arg0,
331 struct brw_reg arg1 )
332 {
333 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
334 }
335 static void emit_slt( struct brw_compile *p,
336 struct brw_reg dst,
337 struct brw_reg arg0,
338 struct brw_reg arg1 )
339 {
340 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
341 }
342
343 static void emit_sle( struct brw_compile *p,
344 struct brw_reg dst,
345 struct brw_reg arg0,
346 struct brw_reg arg1 )
347 {
348 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
349 }
350
351 static void emit_sgt( struct brw_compile *p,
352 struct brw_reg dst,
353 struct brw_reg arg0,
354 struct brw_reg arg1 )
355 {
356 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
357 }
358
359 static void emit_sge( struct brw_compile *p,
360 struct brw_reg dst,
361 struct brw_reg arg0,
362 struct brw_reg arg1 )
363 {
364 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
365 }
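/* Per-channel reference for the emit_sop() expansion used by SEQ..SGE
 * above (scalar sketch, illustration only; model_slt is an invented
 * name): the first MOV writes 0.0 unconditionally, the CMP loads the
 * flag register, and the second MOV is predicated so 1.0 lands only in
 * channels where the comparison held.
 */
#if 0
static float model_slt(float a, float b)
{
   float dst = 0.0f;           /* brw_MOV(dst, 0.0)                      */
   int flag = (a < b);         /* brw_CMP(null, BRW_CONDITIONAL_L, a, b) */
   if (flag)
      dst = 1.0f;              /* predicated brw_MOV(dst, 1.0)           */
   return dst;
}
#endif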
366
367 static void emit_max( struct brw_compile *p,
368 struct brw_reg dst,
369 struct brw_reg arg0,
370 struct brw_reg arg1 )
371 {
372 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
373 brw_SEL(p, dst, arg1, arg0);
374 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
375 }
376
377 static void emit_min( struct brw_compile *p,
378 struct brw_reg dst,
379 struct brw_reg arg0,
380 struct brw_reg arg1 )
381 {
382 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
383 brw_SEL(p, dst, arg0, arg1);
384 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
385 }
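/* Scalar model of emit_min()/emit_max() above (illustration only): the
 * CMP sets the flag on arg0 < arg1 and the predicated SEL then picks an
 * operand; emit_max() simply swaps the SEL arguments.
 */
#if 0
static float model_min(float a, float b) { return (a < b) ? a : b; }
static float model_max(float a, float b) { return (a < b) ? b : a; }
#endif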
386
387
388 static void emit_math1( struct brw_vs_compile *c,
389 GLuint function,
390 struct brw_reg dst,
391 struct brw_reg arg0,
392 GLuint precision)
393 {
394 /* There are various odd behaviours with SEND on the simulator. In
395 * addition there are documented issues with the fact that the GEN4
396 * processor doesn't do dependency control properly on SEND
397 * results. So, on balance, this kludge to get around failures
398 * with writemasked math results looks like it might be necessary
399 * whether that turns out to be a simulator bug or not:
400 */
401 struct brw_compile *p = &c->func;
402 struct brw_reg tmp = dst;
403 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
404 dst.file != BRW_GENERAL_REGISTER_FILE);
405
406 if (need_tmp)
407 tmp = get_tmp(c);
408
409 brw_math(p,
410 tmp,
411 function,
412 BRW_MATH_SATURATE_NONE,
413 2,
414 arg0,
415 BRW_MATH_DATA_SCALAR,
416 precision);
417
418 if (need_tmp) {
419 brw_MOV(p, dst, tmp);
420 release_tmp(c, tmp);
421 }
422 }
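/* The workaround pattern used by emit_math1()/emit_math2() above, in
 * miniature (illustration only; model_masked_copy is an invented name):
 * when the destination is writemasked or not a GRF, the math result
 * lands in a scratch GRF first and a plain MOV then applies the
 * writemask, which the SEND result path does not honour reliably.
 */
#if 0
static void model_masked_copy(float dst[4], unsigned writemask,
                              const float result[4])
{
   unsigned i;
   for (i = 0; i < 4; i++)
      if (writemask & (1u << i))    /* the trailing MOV applies the mask */
         dst[i] = result[i];
}
#endif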
423
424
425 static void emit_math2( struct brw_vs_compile *c,
426 GLuint function,
427 struct brw_reg dst,
428 struct brw_reg arg0,
429 struct brw_reg arg1,
430 GLuint precision)
431 {
432 struct brw_compile *p = &c->func;
433 struct brw_reg tmp = dst;
434 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
435 dst.file != BRW_GENERAL_REGISTER_FILE);
436
437 if (need_tmp)
438 tmp = get_tmp(c);
439
440 brw_MOV(p, brw_message_reg(3), arg1);
441
442 brw_math(p,
443 tmp,
444 function,
445 BRW_MATH_SATURATE_NONE,
446 2,
447 arg0,
448 BRW_MATH_DATA_SCALAR,
449 precision);
450
451 if (need_tmp) {
452 brw_MOV(p, dst, tmp);
453 release_tmp(c, tmp);
454 }
455 }
456
457
458 static void emit_exp_noalias( struct brw_vs_compile *c,
459 struct brw_reg dst,
460 struct brw_reg arg0 )
461 {
462 struct brw_compile *p = &c->func;
463
464
465 if (dst.dw1.bits.writemask & WRITEMASK_X) {
466 struct brw_reg tmp = get_tmp(c);
467 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
468
469 /* tmp_d = floor(arg0.x) */
470 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
471
472 /* result[0] = 2.0 ^ tmp */
473
474 /* Adjust exponent for floating point:
475 * exp += 127
476 */
477 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
478
479 /* Install exponent and sign.
480 * Excess drops off the edge:
481 */
482 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
483 tmp_d, brw_imm_d(23));
484
485 release_tmp(c, tmp);
486 }
487
488 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
489 /* result[1] = arg0.x - floor(arg0.x) */
490 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
491 }
492
493 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
494 /* As with the LOG instruction, we might be better off just
495 * doing a Taylor expansion here, seeing as we have to do all
496 * the prep work.
497 *
498 * If mathbox partial precision is too low, consider also:
499 * result[3] = result[0] * EXP(result[1])
500 */
501 emit_math1(c,
502 BRW_MATH_FUNCTION_EXP,
503 brw_writemask(dst, WRITEMASK_Z),
504 brw_swizzle1(arg0, 0),
505 BRW_MATH_PRECISION_FULL);
506 }
507
508 if (dst.dw1.bits.writemask & WRITEMASK_W) {
509 /* result[3] = 1.0; */
510 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
511 }
512 }
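/* Reference semantics of the EXP opcode expansion above (standalone
 * sketch, illustration only; model_opcode_exp is an invented name):
 * x is 2^floor(src.x), built by installing floor(src.x)+127 in the
 * exponent field; y is the fractional part; z is 2^src.x from the
 * mathbox; w is 1.0.
 */
#if 0
#include <math.h>

static void model_opcode_exp(float result[4], float x)
{
   union { float f; int i; } u;

   u.i = ((int) floorf(x) + 127) << 23;   /* exponent install, as the SHL does */
   result[0] = u.f;                       /* 2^floor(x) */
   result[1] = x - floorf(x);             /* frac(x)    */
   result[2] = exp2f(x);                  /* full 2^x   */
   result[3] = 1.0f;
}
#endif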
513
514
515 static void emit_log_noalias( struct brw_vs_compile *c,
516 struct brw_reg dst,
517 struct brw_reg arg0 )
518 {
519 struct brw_compile *p = &c->func;
520 struct brw_reg tmp = dst;
521 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
522 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
523 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
524 dst.file != BRW_GENERAL_REGISTER_FILE);
525
526 if (need_tmp) {
527 tmp = get_tmp(c);
528 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
529 }
530
531 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
532 * according to spec:
533 *
534 * These almost look like they could be joined up, but it's not really
535 * practical:
536 *
537 * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
538 * result[1].i = (x.i & ((1<<23)-1)) | (127<<23)
539 */
540 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
541 brw_AND(p,
542 brw_writemask(tmp_ud, WRITEMASK_X),
543 brw_swizzle1(arg0_ud, 0),
544 brw_imm_ud((1U<<31)-1));
545
546 brw_SHR(p,
547 brw_writemask(tmp_ud, WRITEMASK_X),
548 tmp_ud,
549 brw_imm_ud(23));
550
551 brw_ADD(p,
552 brw_writemask(tmp, WRITEMASK_X),
553 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
554 brw_imm_d(-127));
555 }
556
557 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
558 brw_AND(p,
559 brw_writemask(tmp_ud, WRITEMASK_Y),
560 brw_swizzle1(arg0_ud, 0),
561 brw_imm_ud((1<<23)-1));
562
563 brw_OR(p,
564 brw_writemask(tmp_ud, WRITEMASK_Y),
565 tmp_ud,
566 brw_imm_ud(127<<23));
567 }
568
569 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
570 /* result[2] = result[0] + LOG2(result[1]); */
571
572 /* Why bother? The above is just a hint at how to do this with a
573 * Taylor series. Maybe we *should* use a Taylor series, as by
574 * the time all the above has been done it's almost certainly
575 * quicker than calling the mathbox, even with low precision.
576 *
577 * Options are:
578 * - result[0] + mathbox.LOG2(result[1])
579 * - mathbox.LOG2(arg0.x)
580 * - result[0] + inline_taylor_approx(result[1])
581 */
582 emit_math1(c,
583 BRW_MATH_FUNCTION_LOG,
584 brw_writemask(tmp, WRITEMASK_Z),
585 brw_swizzle1(tmp, 1),
586 BRW_MATH_PRECISION_FULL);
587
588 brw_ADD(p,
589 brw_writemask(tmp, WRITEMASK_Z),
590 brw_swizzle1(tmp, 2),
591 brw_swizzle1(tmp, 0));
592 }
593
594 if (dst.dw1.bits.writemask & WRITEMASK_W) {
595 /* result[3] = 1.0; */
596 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
597 }
598
599 if (need_tmp) {
600 brw_MOV(p, dst, tmp);
601 release_tmp(c, tmp);
602 }
603 }
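/* Reference semantics of the LOG opcode expansion above (standalone
 * sketch, illustration only; model_opcode_log is an invented name):
 * the exponent and mantissa of |src.x| are peeled apart with the same
 * masks and shifts as the AND/SHR/OR sequence, then z is rebuilt as
 * exponent + log2(mantissa).
 */
#if 0
#include <math.h>

static void model_opcode_log(float result[4], float x)
{
   union { float f; unsigned u; } in, mant;

   in.f = x;
   in.u &= (1u << 31) - 1;                            /* |x|, as the AND does   */
   result[0] = (float) ((int) (in.u >> 23) - 127);    /* unbiased exponent      */
   mant.u = (in.u & ((1u << 23) - 1)) | (127u << 23); /* mantissa in [1.0, 2.0) */
   result[1] = mant.f;
   result[2] = result[0] + log2f(result[1]);          /* LOG2 via the mathbox   */
   result[3] = 1.0f;
}
#endif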
604
605
606 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
607 */
608 static void emit_dst_noalias( struct brw_vs_compile *c,
609 struct brw_reg dst,
610 struct brw_reg arg0,
611 struct brw_reg arg1)
612 {
613 struct brw_compile *p = &c->func;
614
615 /* There must be a better way to do this:
616 */
617 if (dst.dw1.bits.writemask & WRITEMASK_X)
618 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
619 if (dst.dw1.bits.writemask & WRITEMASK_Y)
620 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
621 if (dst.dw1.bits.writemask & WRITEMASK_Z)
622 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
623 if (dst.dw1.bits.writemask & WRITEMASK_W)
624 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
625 }
626
627
628 static void emit_xpd( struct brw_compile *p,
629 struct brw_reg dst,
630 struct brw_reg t,
631 struct brw_reg u)
632 {
633 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
634 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
635 }
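/* What the two swizzled instructions above compute (illustration only;
 * model_xpd is an invented name): the MUL accumulates t.yzx * u.zxy and
 * the MAC adds -t.zxy * u.yzx, which is the standard cross product.
 */
#if 0
static void model_xpd(float dst[3], const float t[3], const float u[3])
{
   dst[0] = t[1] * u[2] - t[2] * u[1];
   dst[1] = t[2] * u[0] - t[0] * u[2];
   dst[2] = t[0] * u[1] - t[1] * u[0];
}
#endif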
636
637
638 static void emit_lit_noalias( struct brw_vs_compile *c,
639 struct brw_reg dst,
640 struct brw_reg arg0 )
641 {
642 struct brw_compile *p = &c->func;
643 struct brw_instruction *if_insn;
644 struct brw_reg tmp = dst;
645 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
646
647 if (need_tmp)
648 tmp = get_tmp(c);
649
650 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
651 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
652
653 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
654 * to get all channels active inside the IF. In the clipping code
655 * we run with NoMask, so it's not an option and we can use
656 * BRW_EXECUTE_1 for all comparisons.
657 */
658 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
659 if_insn = brw_IF(p, BRW_EXECUTE_8);
660 {
661 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
662
663 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
664 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
665 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
666
667 emit_math2(c,
668 BRW_MATH_FUNCTION_POW,
669 brw_writemask(dst, WRITEMASK_Z),
670 brw_swizzle1(tmp, 2),
671 brw_swizzle1(arg0, 3),
672 BRW_MATH_PRECISION_PARTIAL);
673 }
674
675 brw_ENDIF(p, if_insn);
676
677 release_tmp(c, tmp);
678 }
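/* Rough reference for the LIT expansion above (scalar sketch,
 * illustration only, ignoring the exponent-clamping edge cases of the
 * spec; model_opcode_lit is an invented name): src.x is the diffuse dot
 * product, src.y the specular dot product, src.w the shininess exponent.
 */
#if 0
#include <math.h>

static void model_opcode_lit(float result[4], const float src[4])
{
   result[0] = 1.0f;
   result[1] = 0.0f;
   result[2] = 0.0f;
   result[3] = 1.0f;
   if (src[0] > 0.0f) {                               /* the outer IF above    */
      result[1] = src[0];
      result[2] = powf(src[1] > 0.0f ? src[1] : 0.0f, /* clamp done by the CMP */
                       src[3]);
   }
}
#endif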
679
680 static void emit_lrp_noalias(struct brw_vs_compile *c,
681 struct brw_reg dst,
682 struct brw_reg arg0,
683 struct brw_reg arg1,
684 struct brw_reg arg2)
685 {
686 struct brw_compile *p = &c->func;
687
688 brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
689 brw_MUL(p, brw_null_reg(), dst, arg2);
690 brw_MAC(p, dst, arg0, arg1);
691 }
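/* Scalar model of the LRP expansion above (illustration only; model_lrp
 * is an invented name): ADD computes 1 - a, MUL stages (1 - a) * c in
 * the accumulator, and MAC finishes with + a * b, i.e. the usual
 * a*b + (1-a)*c blend.
 */
#if 0
static float model_lrp(float a, float b, float c)
{
   float dst = 1.0f - a;       /* ADD  dst, -a, 1.0      */
   float acc = dst * c;        /* MUL  null(acc), dst, c */
   return acc + a * b;         /* MAC  dst, a, b         */
}
#endif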
692
693 /** 3 or 4-component vector normalization */
694 static void emit_nrm( struct brw_vs_compile *c,
695 struct brw_reg dst,
696 struct brw_reg arg0,
697 int num_comps)
698 {
699 struct brw_compile *p = &c->func;
700 struct brw_reg tmp = get_tmp(c);
701
702 /* tmp = dot(arg0, arg0) */
703 if (num_comps == 3)
704 brw_DP3(p, tmp, arg0, arg0);
705 else
706 brw_DP4(p, tmp, arg0, arg0);
707
708 /* tmp = 1 / sqrt(tmp) */
709 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
710
711 /* dst = arg0 * tmp */
712 brw_MUL(p, dst, arg0, tmp);
713
714 release_tmp(c, tmp);
715 }
716
717
718 static struct brw_reg
719 get_constant(struct brw_vs_compile *c,
720 const struct prog_instruction *inst,
721 GLuint argIndex)
722 {
723 const struct prog_src_register *src = &inst->SrcReg[argIndex];
724 struct brw_compile *p = &c->func;
725 struct brw_reg const_reg;
726 struct brw_reg const2_reg;
727 const GLboolean relAddr = src->RelAddr;
728
729 assert(argIndex < 3);
730
731 if (c->current_const[argIndex].index != src->Index || relAddr) {
732 struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
733
734 c->current_const[argIndex].index = src->Index;
735
736 #if 0
737 printf(" fetch const[%d] for arg %d into reg %d\n",
738 src->Index, argIndex, c->current_const[argIndex].reg.nr);
739 #endif
740 /* need to fetch the constant now */
741 brw_dp_READ_4_vs(p,
742 c->current_const[argIndex].reg,/* writeback dest */
743 0, /* oword */
744 relAddr, /* relative indexing? */
745 addrReg, /* address register */
746 16 * src->Index, /* byte offset */
747 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
748 );
749
750 if (relAddr) {
751 /* second read */
752 const2_reg = get_tmp(c);
753
754 /* use upper half of address reg for second read */
755 addrReg = stride(addrReg, 0, 4, 0);
756 addrReg.subnr = 16;
757
758 brw_dp_READ_4_vs(p,
759 const2_reg, /* writeback dest */
760 1, /* oword */
761 relAddr, /* relative indexing? */
762 addrReg, /* address register */
763 16 * src->Index, /* byte offset */
764 SURF_INDEX_VERT_CONST_BUFFER
765 );
766 }
767 }
768
769 const_reg = c->current_const[argIndex].reg;
770
771 if (relAddr) {
772 /* merge the two Owords into the constant register */
773 /* const_reg[7..4] = const2_reg[7..4] */
774 brw_MOV(p,
775 suboffset(stride(const_reg, 0, 4, 1), 4),
776 suboffset(stride(const2_reg, 0, 4, 1), 4));
777 release_tmp(c, const2_reg);
778 }
779 else {
780 /* replicate lower four floats into upper half (to get XYZWXYZW) */
781 const_reg = stride(const_reg, 0, 4, 0);
782 const_reg.subnr = 0;
783 }
784
785 return const_reg;
786 }
787
788
789
790 /* TODO: relative addressing!
791 */
792 static struct brw_reg get_reg( struct brw_vs_compile *c,
793 gl_register_file file,
794 GLuint index )
795 {
796 switch (file) {
797 case PROGRAM_TEMPORARY:
798 case PROGRAM_INPUT:
799 case PROGRAM_OUTPUT:
800 assert(c->regs[file][index].nr != 0);
801 return c->regs[file][index];
802 case PROGRAM_STATE_VAR:
803 case PROGRAM_CONSTANT:
804 case PROGRAM_UNIFORM:
805 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
806 return c->regs[PROGRAM_STATE_VAR][index];
807 case PROGRAM_ADDRESS:
808 assert(index == 0);
809 return c->regs[file][index];
810
811 case PROGRAM_UNDEFINED: /* undef values */
812 return brw_null_reg();
813
814 case PROGRAM_LOCAL_PARAM:
815 case PROGRAM_ENV_PARAM:
816 case PROGRAM_WRITE_ONLY:
817 default:
818 assert(0);
819 return brw_null_reg();
820 }
821 }
822
823
824 /**
825 * Indirect addressing: get reg[[arg] + offset].
826 */
827 static struct brw_reg deref( struct brw_vs_compile *c,
828 struct brw_reg arg,
829 GLint offset)
830 {
831 struct brw_compile *p = &c->func;
832 struct brw_reg tmp = vec4(get_tmp(c));
833 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
834 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
835 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
836 struct brw_reg indirect = brw_vec4_indirect(0,0);
837
838 {
839 brw_push_insn_state(p);
840 brw_set_access_mode(p, BRW_ALIGN_1);
841
842 /* This is pretty clunky - load the address register twice and
843 * fetch each 4-dword value in turn. There must be a way to do
844 * this in a single pass, but I couldn't get it to work.
845 */
846 brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
847 brw_MOV(p, tmp, indirect);
848
849 brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
850 brw_MOV(p, suboffset(tmp, 4), indirect);
851
852 brw_pop_insn_state(p);
853 }
854
855 /* NOTE: tmp not released */
856 return vec8(tmp);
857 }
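/* How deref() turns a register-file index plus relative offset into the
 * byte offset loaded into the address register (illustration only;
 * model_deref_byte_offset is an invented name): a GRF is 32 bytes and a
 * vec4 element is 16 bytes, matching the byte_offset expression above.
 */
#if 0
static unsigned model_deref_byte_offset(unsigned base_reg_nr,
                                        unsigned base_subnr,
                                        int reladdr_offset)
{
   return base_reg_nr * 32        /* 32 bytes per GRF          */
        + base_subnr              /* sub-register byte offset  */
        + reladdr_offset * 16;    /* 16 bytes per vec4 element */
}
#endif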
858
859
860 /**
861 * Get brw reg corresponding to the instruction's [argIndex] src reg.
862 * TODO: relative addressing!
863 */
864 static struct brw_reg
865 get_src_reg( struct brw_vs_compile *c,
866 const struct prog_instruction *inst,
867 GLuint argIndex )
868 {
869 const GLuint file = inst->SrcReg[argIndex].File;
870 const GLint index = inst->SrcReg[argIndex].Index;
871 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
872
873 switch (file) {
874 case PROGRAM_TEMPORARY:
875 case PROGRAM_INPUT:
876 case PROGRAM_OUTPUT:
877 if (relAddr) {
878 return deref(c, c->regs[file][0], index);
879 }
880 else {
881 assert(c->regs[file][index].nr != 0);
882 return c->regs[file][index];
883 }
884
885 case PROGRAM_STATE_VAR:
886 case PROGRAM_CONSTANT:
887 case PROGRAM_UNIFORM:
888 if (c->vp->use_const_buffer) {
889 return get_constant(c, inst, argIndex);
890 }
891 else if (relAddr) {
892 return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
893 }
894 else {
895 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
896 return c->regs[PROGRAM_STATE_VAR][index];
897 }
898 case PROGRAM_ADDRESS:
899 assert(index == 0);
900 return c->regs[file][index];
901
902 case PROGRAM_UNDEFINED:
903 /* this is a normal case since we loop over all three src args */
904 return brw_null_reg();
905
906 case PROGRAM_LOCAL_PARAM:
907 case PROGRAM_ENV_PARAM:
908 case PROGRAM_WRITE_ONLY:
909 default:
910 assert(0);
911 return brw_null_reg();
912 }
913 }
914
915
916 static void emit_arl( struct brw_vs_compile *c,
917 struct brw_reg dst,
918 struct brw_reg arg0 )
919 {
920 struct brw_compile *p = &c->func;
921 struct brw_reg tmp = dst;
922 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
923
924 if (need_tmp)
925 tmp = get_tmp(c);
926
927 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
928 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
929
930 if (need_tmp)
931 release_tmp(c, tmp);
932 }
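/* Scalar model of emit_arl() above (illustration only; model_arl is an
 * invented name): the address register ends up holding a byte offset,
 * so the rounded-down index is scaled by 16 (the size of one vec4 slot)
 * to match the "offset * 16" term used by deref().
 */
#if 0
#include <math.h>

static int model_arl(float x)
{
   return (int) floorf(x) * 16;   /* RNDD rounds toward -inf, then scale to bytes */
}
#endif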
933
934
935 /**
936 * Return the brw reg for the given instruction's src argument.
937 * Will return mangled results for SWZ op. The emit_swz() function
938 * ignores this result and recalculates taking extended swizzles into
939 * account.
940 */
941 static struct brw_reg get_arg( struct brw_vs_compile *c,
942 const struct prog_instruction *inst,
943 GLuint argIndex )
944 {
945 const struct prog_src_register *src = &inst->SrcReg[argIndex];
946 struct brw_reg reg;
947
948 if (src->File == PROGRAM_UNDEFINED)
949 return brw_null_reg();
950
951 reg = get_src_reg(c, inst, argIndex);
952
953 /* Convert 3-bit swizzle to 2-bit.
954 */
955 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
956 GET_SWZ(src->Swizzle, 1),
957 GET_SWZ(src->Swizzle, 2),
958 GET_SWZ(src->Swizzle, 3));
959
960 /* Note this is ok for non-swizzle instructions:
961 */
962 reg.negate = src->Negate ? 1 : 0;
963
964 return reg;
965 }
966
967
968 /**
969 * Get brw register for the given program dest register.
970 */
971 static struct brw_reg get_dst( struct brw_vs_compile *c,
972 struct prog_dst_register dst )
973 {
974 struct brw_reg reg;
975
976 switch (dst.File) {
977 case PROGRAM_TEMPORARY:
978 case PROGRAM_OUTPUT:
979 assert(c->regs[dst.File][dst.Index].nr != 0);
980 reg = c->regs[dst.File][dst.Index];
981 break;
982 case PROGRAM_ADDRESS:
983 assert(dst.Index == 0);
984 reg = c->regs[dst.File][dst.Index];
985 break;
986 case PROGRAM_UNDEFINED:
987 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
988 reg = brw_null_reg();
989 break;
990 default:
991 assert(0);
992 reg = brw_null_reg();
993 }
994
995 reg.dw1.bits.writemask = dst.WriteMask;
996
997 return reg;
998 }
999
1000
1001 static void emit_swz( struct brw_vs_compile *c,
1002 struct brw_reg dst,
1003 const struct prog_instruction *inst)
1004 {
1005 const GLuint argIndex = 0;
1006 const struct prog_src_register src = inst->SrcReg[argIndex];
1007 struct brw_compile *p = &c->func;
1008 GLuint zeros_mask = 0;
1009 GLuint ones_mask = 0;
1010 GLuint src_mask = 0;
1011 GLubyte src_swz[4];
1012 GLboolean need_tmp = (src.Negate &&
1013 dst.file != BRW_GENERAL_REGISTER_FILE);
1014 struct brw_reg tmp = dst;
1015 GLuint i;
1016
1017 if (need_tmp)
1018 tmp = get_tmp(c);
1019
1020 for (i = 0; i < 4; i++) {
1021 if (dst.dw1.bits.writemask & (1<<i)) {
1022 GLubyte s = GET_SWZ(src.Swizzle, i);
1023 switch (s) {
1024 case SWIZZLE_X:
1025 case SWIZZLE_Y:
1026 case SWIZZLE_Z:
1027 case SWIZZLE_W:
1028 src_mask |= 1<<i;
1029 src_swz[i] = s;
1030 break;
1031 case SWIZZLE_ZERO:
1032 zeros_mask |= 1<<i;
1033 break;
1034 case SWIZZLE_ONE:
1035 ones_mask |= 1<<i;
1036 break;
1037 }
1038 }
1039 }
1040
1041 /* Do src first, in case dst aliases src:
1042 */
1043 if (src_mask) {
1044 struct brw_reg arg0;
1045
1046 arg0 = get_src_reg(c, inst, argIndex);
1047
1048 arg0 = brw_swizzle(arg0,
1049 src_swz[0], src_swz[1],
1050 src_swz[2], src_swz[3]);
1051
1052 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1053 }
1054
1055 if (zeros_mask)
1056 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1057
1058 if (ones_mask)
1059 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1060
1061 if (src.Negate)
1062 brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1063
1064 if (need_tmp) {
1065 brw_MOV(p, dst, tmp);
1066 release_tmp(c, tmp);
1067 }
1068 }
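/* Per-channel reference for the extended swizzle handled above (scalar
 * sketch, illustration only; model_swz_channel is an invented name and
 * the numeric selectors assume Mesa's SWIZZLE_ZERO/SWIZZLE_ONE
 * encodings): each destination channel selects a source channel, the
 * constant 0, or the constant 1, and may then be negated; the code
 * above batches those cases into the writemasked MOVs.
 */
#if 0
static float model_swz_channel(const float src[4], int sel, int negate_bit)
{
   float v;

   if (sel <= 3)              /* SWIZZLE_X .. SWIZZLE_W */
      v = src[sel];
   else if (sel == 4)         /* SWIZZLE_ZERO */
      v = 0.0f;
   else                       /* SWIZZLE_ONE */
      v = 1.0f;

   return negate_bit ? -v : v;
}
#endif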
1069
1070
1071 /**
1072 * Post-vertex-program processing. Send the results to the URB.
1073 */
1074 static void emit_vertex_write( struct brw_vs_compile *c)
1075 {
1076 struct brw_compile *p = &c->func;
1077 struct brw_reg m0 = brw_message_reg(0);
1078 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1079 struct brw_reg ndc;
1080 int eot;
1081
1082 if (c->key.copy_edgeflag) {
1083 brw_MOV(p,
1084 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1085 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1086 }
1087
1088 /* Build ndc coords */
1089 ndc = get_tmp(c);
1090 /* ndc = 1.0 / pos.w */
1091 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1092 /* ndc.xyz = pos * ndc */
1093 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1094
1095 /* Update the header for point size, user clipping flags, and -ve rhw
1096 * workaround.
1097 */
1098 if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
1099 c->key.nr_userclip || !BRW_IS_G4X(p->brw))
1100 {
1101 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1102 GLuint i;
1103
1104 brw_MOV(p, header1, brw_imm_ud(0));
1105
1106 brw_set_access_mode(p, BRW_ALIGN_16);
1107
1108 if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
1109 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1110 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1111 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1112 }
1113
1114 for (i = 0; i < c->key.nr_userclip; i++) {
1115 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1116 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1117 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1118 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1119 }
1120
1121 /* i965 clipping workaround:
1122 * 1) Test for -ve rhw
1123 * 2) If set,
1124 * set ndc = (0,0,0,0)
1125 * set ucp[6] = 1
1126 *
1127 * Later, clipping will detect ucp[6] and ensure the primitive is
1128 * clipped against all fixed planes.
1129 */
1130 if (!BRW_IS_G4X(p->brw)) {
1131 brw_CMP(p,
1132 vec8(brw_null_reg()),
1133 BRW_CONDITIONAL_L,
1134 brw_swizzle1(ndc, 3),
1135 brw_imm_f(0));
1136
1137 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1138 brw_MOV(p, ndc, brw_imm_f(0));
1139 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1140 }
1141
1142 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1143 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1144 brw_set_access_mode(p, BRW_ALIGN_16);
1145
1146 release_tmp(c, header1);
1147 }
1148 else {
1149 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1150 }
1151
1152 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1153 * of zeros followed by two sets of NDC coordinates:
1154 */
1155 brw_set_access_mode(p, BRW_ALIGN_1);
1156 brw_MOV(p, offset(m0, 2), ndc);
1157 brw_MOV(p, offset(m0, 3), pos);
1158
1159 eot = (c->first_overflow_output == 0);
1160
1161 brw_urb_WRITE(p,
1162 brw_null_reg(), /* dest */
1163 0, /* starting mrf reg nr */
1164 c->r0, /* src */
1165 0, /* allocate */
1166 1, /* used */
1167 MIN2(c->nr_outputs + 3, (BRW_MAX_MRF-1)), /* msg len */
1168 0, /* response len */
1169 eot, /* eot */
1170 1, /* writes complete */
1171 0, /* urb destination offset */
1172 BRW_URB_SWIZZLE_INTERLEAVE);
1173
1174 if (c->first_overflow_output > 0) {
1175 /* Not all of the vertex outputs/results fit into the MRF.
1176 * Move the overflowed attributes from the GRF to the MRF and
1177 * issue another brw_urb_WRITE().
1178 */
1179 /* XXX I'm not 100% sure about which MRF regs to use here. Starting
1180 * at mrf[4] atm...
1181 */
1182 GLuint i, mrf = 0;
1183 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1184 if (c->prog_data.outputs_written & (1 << i)) {
1185 /* move from GRF to MRF */
1186 brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
1187 mrf++;
1188 }
1189 }
1190
1191 brw_urb_WRITE(p,
1192 brw_null_reg(), /* dest */
1193 4, /* starting mrf reg nr */
1194 c->r0, /* src */
1195 0, /* allocate */
1196 1, /* used */
1197 mrf+1, /* msg len */
1198 0, /* response len */
1199 1, /* eot */
1200 1, /* writes complete */
1201 BRW_MAX_MRF-1, /* urb destination offset */
1202 BRW_URB_SWIZZLE_INTERLEAVE);
1203 }
1204 }
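/* Model of the header1 dword assembled in emit_vertex_write() above
 * (illustration only; model_vertex_header1 is an invented name and the
 * bit positions are read off the shifts and masks in the code): the
 * user clip flags occupy the low bits, bit 6 is the negative-rhw flag,
 * and the point size is packed above bit 8 by the multiply by 1<<11
 * followed by the 0x7ff<<8 mask.
 */
#if 0
static unsigned model_vertex_header1(float psiz, unsigned clip_flags,
                                     int negative_rhw)
{
   unsigned header = 0;

   header |= ((unsigned) (psiz * (1 << 11))) & (0x7ff << 8);
   header |= clip_flags;            /* one bit per user clip plane      */
   if (negative_rhw)
      header |= 1u << 6;            /* clip against all planes later on */
   return header;
}
#endif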
1205
1206
1207 /**
1208 * Called after code generation to resolve subroutine calls and the
1209 * END instruction.
1210 * \param end_inst points to brw code for END instruction
1211 * \param last_inst points to last instruction emitted before vertex write
1212 */
1213 static void
1214 post_vs_emit( struct brw_vs_compile *c,
1215 struct brw_instruction *end_inst,
1216 struct brw_instruction *last_inst )
1217 {
1218 GLint offset;
1219
1220 brw_resolve_cals(&c->func);
1221
1222 /* patch up the END code to jump past subroutines, etc */
1223 offset = last_inst - end_inst;
1224 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1225 }
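/* What the patch in post_vs_emit() encodes (illustration only;
 * model_end_jump_immediate is an invented name): the END instruction is
 * an ADD to the IP register, GEN4 instructions are 16 bytes each, and
 * the src1 immediate is a byte offset, hence the "offset * 16".
 */
#if 0
static int model_end_jump_immediate(int end_insn_index, int last_insn_index)
{
   return (last_insn_index - end_insn_index) * 16;   /* instructions -> bytes */
}
#endif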
1226
1227
1228 /* Emit the vertex program instructions here.
1229 */
1230 void brw_vs_emit(struct brw_vs_compile *c )
1231 {
1232 #define MAX_IF_DEPTH 32
1233 #define MAX_LOOP_DEPTH 32
1234 struct brw_compile *p = &c->func;
1235 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1236 GLuint insn, if_depth = 0, loop_depth = 0;
1237 GLuint end_offset = 0;
1238 struct brw_instruction *end_inst, *last_inst;
1239 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1240 const struct brw_indirect stack_index = brw_indirect(0, 0);
1241 GLuint index;
1242 GLuint file;
1243
1244 if (INTEL_DEBUG & DEBUG_VS) {
1245 _mesa_printf("vs-emit:\n");
1246 _mesa_print_program(&c->vp->program.Base);
1247 _mesa_printf("\n");
1248 }
1249
1250 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1251 brw_set_access_mode(p, BRW_ALIGN_16);
1252
1253 /* Message registers can't be read, so copy an output into a GRF register
1254 if it is used as a source register */
1255 for (insn = 0; insn < nr_insns; insn++) {
1256 GLuint i;
1257 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1258 for (i = 0; i < 3; i++) {
1259 struct prog_src_register *src = &inst->SrcReg[i];
1260 GLuint index = src->Index;
1261 GLuint file = src->File;
1262 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1263 c->output_regs[index].used_in_src = GL_TRUE;
1264 }
1265 }
1266
1267 /* Static register allocation
1268 */
1269 brw_vs_alloc_regs(c);
1270 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1271
1272 for (insn = 0; insn < nr_insns; insn++) {
1273
1274 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1275 struct brw_reg args[3], dst;
1276 GLuint i;
1277
1278 #if 0
1279 printf("%d: ", insn);
1280 _mesa_print_instruction(inst);
1281 #endif
1282
1283 /* Get argument regs. SWZ is special and does this itself.
1284 */
1285 if (inst->Opcode != OPCODE_SWZ)
1286 for (i = 0; i < 3; i++) {
1287 const struct prog_src_register *src = &inst->SrcReg[i];
1288 index = src->Index;
1289 file = src->File;
1290 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1291 args[i] = c->output_regs[index].reg;
1292 else
1293 args[i] = get_arg(c, inst, i);
1294 }
1295
1296 /* Get dest regs. Note that it is possible for a reg to be both
1297 * dst and arg, given the static allocation of registers. So
1298 * care needs to be taken emitting multi-operation instructions.
1299 */
1300 index = inst->DstReg.Index;
1301 file = inst->DstReg.File;
1302 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1303 dst = c->output_regs[index].reg;
1304 else
1305 dst = get_dst(c, inst->DstReg);
1306
1307 if (inst->SaturateMode != SATURATE_OFF) {
1308 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1309 inst->SaturateMode);
1310 }
1311
1312 switch (inst->Opcode) {
1313 case OPCODE_ABS:
1314 brw_MOV(p, dst, brw_abs(args[0]));
1315 break;
1316 case OPCODE_ADD:
1317 brw_ADD(p, dst, args[0], args[1]);
1318 break;
1319 case OPCODE_COS:
1320 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1321 break;
1322 case OPCODE_DP3:
1323 brw_DP3(p, dst, args[0], args[1]);
1324 break;
1325 case OPCODE_DP4:
1326 brw_DP4(p, dst, args[0], args[1]);
1327 break;
1328 case OPCODE_DPH:
1329 brw_DPH(p, dst, args[0], args[1]);
1330 break;
1331 case OPCODE_NRM3:
1332 emit_nrm(c, dst, args[0], 3);
1333 break;
1334 case OPCODE_NRM4:
1335 emit_nrm(c, dst, args[0], 4);
1336 break;
1337 case OPCODE_DST:
1338 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1339 break;
1340 case OPCODE_EXP:
1341 unalias1(c, dst, args[0], emit_exp_noalias);
1342 break;
1343 case OPCODE_EX2:
1344 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1345 break;
1346 case OPCODE_ARL:
1347 emit_arl(c, dst, args[0]);
1348 break;
1349 case OPCODE_FLR:
1350 brw_RNDD(p, dst, args[0]);
1351 break;
1352 case OPCODE_FRC:
1353 brw_FRC(p, dst, args[0]);
1354 break;
1355 case OPCODE_LOG:
1356 unalias1(c, dst, args[0], emit_log_noalias);
1357 break;
1358 case OPCODE_LG2:
1359 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1360 break;
1361 case OPCODE_LIT:
1362 unalias1(c, dst, args[0], emit_lit_noalias);
1363 break;
1364 case OPCODE_LRP:
1365 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1366 break;
1367 case OPCODE_MAD:
1368 brw_MOV(p, brw_acc_reg(), args[2]);
1369 brw_MAC(p, dst, args[0], args[1]);
1370 break;
1371 case OPCODE_MAX:
1372 emit_max(p, dst, args[0], args[1]);
1373 break;
1374 case OPCODE_MIN:
1375 emit_min(p, dst, args[0], args[1]);
1376 break;
1377 case OPCODE_MOV:
1378 brw_MOV(p, dst, args[0]);
1379 break;
1380 case OPCODE_MUL:
1381 brw_MUL(p, dst, args[0], args[1]);
1382 break;
1383 case OPCODE_POW:
1384 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1385 break;
1386 case OPCODE_RCP:
1387 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1388 break;
1389 case OPCODE_RSQ:
1390 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1391 break;
1392
1393 case OPCODE_SEQ:
1394 emit_seq(p, dst, args[0], args[1]);
1395 break;
1396 case OPCODE_SIN:
1397 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1398 break;
1399 case OPCODE_SNE:
1400 emit_sne(p, dst, args[0], args[1]);
1401 break;
1402 case OPCODE_SGE:
1403 emit_sge(p, dst, args[0], args[1]);
1404 break;
1405 case OPCODE_SGT:
1406 emit_sgt(p, dst, args[0], args[1]);
1407 break;
1408 case OPCODE_SLT:
1409 emit_slt(p, dst, args[0], args[1]);
1410 break;
1411 case OPCODE_SLE:
1412 emit_sle(p, dst, args[0], args[1]);
1413 break;
1414 case OPCODE_SUB:
1415 brw_ADD(p, dst, args[0], negate(args[1]));
1416 break;
1417 case OPCODE_SWZ:
1418 /* The args[0] value can't be used here as it won't have
1419 * correctly encoded the full swizzle:
1420 */
1421 emit_swz(c, dst, inst);
1422 break;
1423 case OPCODE_TRUNC:
1424 /* round toward zero */
1425 brw_RNDZ(p, dst, args[0]);
1426 break;
1427 case OPCODE_XPD:
1428 emit_xpd(p, dst, args[0], args[1]);
1429 break;
1430 case OPCODE_IF:
1431 assert(if_depth < MAX_IF_DEPTH);
1432 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
1433 break;
1434 case OPCODE_ELSE:
1435 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1436 break;
1437 case OPCODE_ENDIF:
1438 assert(if_depth > 0);
1439 brw_ENDIF(p, if_inst[--if_depth]);
1440 break;
1441 #if 0
1442 case OPCODE_BGNLOOP:
1443 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1444 break;
1445 case OPCODE_BRK:
1446 brw_BREAK(p);
1447 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1448 break;
1449 case OPCODE_CONT:
1450 brw_CONT(p);
1451 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1452 break;
1453 case OPCODE_ENDLOOP:
1454 {
1455 struct brw_instruction *inst0, *inst1;
1456 loop_depth--;
1457 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1458 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1459 while (inst0 > loop_inst[loop_depth]) {
1460 inst0--;
1461 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
1462 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
1463 inst0->bits3.if_else.pop_count = 0;
1464 }
1465 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
1466 inst0->bits3.if_else.jump_count = inst1 - inst0;
1467 inst0->bits3.if_else.pop_count = 0;
1468 }
1469 }
1470 }
1471 break;
1472 #else
1473 (void) loop_inst;
1474 (void) loop_depth;
1475 #endif
1476 case OPCODE_BRA:
1477 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1478 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1479 brw_set_predicate_control_flag_value(p, 0xff);
1480 break;
1481 case OPCODE_CAL:
1482 brw_set_access_mode(p, BRW_ALIGN_1);
1483 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1484 brw_set_access_mode(p, BRW_ALIGN_16);
1485 brw_ADD(p, get_addr_reg(stack_index),
1486 get_addr_reg(stack_index), brw_imm_d(4));
1487 brw_save_call(p, inst->Comment, p->nr_insn);
1488 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1489 break;
1490 case OPCODE_RET:
1491 brw_ADD(p, get_addr_reg(stack_index),
1492 get_addr_reg(stack_index), brw_imm_d(-4));
1493 brw_set_access_mode(p, BRW_ALIGN_1);
1494 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1495 brw_set_access_mode(p, BRW_ALIGN_16);
1496 break;
1497 case OPCODE_END:
1498 end_offset = p->nr_insn;
1499 /* this instruction will get patched later to jump past subroutine
1500 * code, etc.
1501 */
1502 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1503 break;
1504 case OPCODE_PRINT:
1505 /* no-op */
1506 break;
1507 case OPCODE_BGNSUB:
1508 brw_save_label(p, inst->Comment, p->nr_insn);
1509 break;
1510 case OPCODE_ENDSUB:
1511 /* no-op */
1512 break;
1513 default:
1514 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1515 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1516 _mesa_opcode_string(inst->Opcode) :
1517 "unknown");
1518 }
1519
1520 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1521 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1522 && c->output_regs[inst->DstReg.Index].used_in_src) {
1523 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1524 }
1525
1526 /* Result color clamping.
1527 *
1528 * When destination register is an output register and
1529 * it's primary/secondary front/back color, we have to clamp
1530 * the result to [0,1]. This is done by enabling the
1531 * saturation bit for the last instruction.
1532 *
1533 * We don't use brw_set_saturate() as it modifies
1534 * p->current->header.saturate, which affects all the subsequent
1535 * instructions. Instead, we directly modify the header
1536 * of the last (already stored) instruction.
1537 */
1538 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1539 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1540 || (inst->DstReg.Index == VERT_RESULT_COL1)
1541 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1542 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1543 p->store[p->nr_insn-1].header.saturate = 1;
1544 }
1545 }
1546
1547 release_tmps(c);
1548 }
1549
1550 end_inst = &p->store[end_offset];
1551 last_inst = &p->store[p->nr_insn];
1552
1553 /* The END instruction will be patched to jump to this code */
1554 emit_vertex_write(c);
1555
1556 post_vs_emit(c, end_inst, last_inst);
1557 }