i965: comments and a new assertion
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
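/* Trivial bump allocator for temporary GRF registers: get_tmp() hands out
 * the next free GRF and grows total_grf as needed; release_tmp() only
 * reclaims a register if it was the most recently allocated one;
 * release_tmps() frees everything back to first_tmp (done after each
 * instruction has been emitted).
 */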
41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
42 {
43 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
44
45 if (++c->last_tmp > c->prog_data.total_grf)
46 c->prog_data.total_grf = c->last_tmp;
47
48 return tmp;
49 }
50
51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
52 {
53 if (tmp.nr == c->last_tmp-1)
54 c->last_tmp--;
55 }
56
57 static void release_tmps( struct brw_vs_compile *c )
58 {
59 c->last_tmp = c->first_tmp;
60 }
61
62
63 /**
64 * Preallocate GRF register before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
66 * ahead of time.
67 */
68 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
69 {
70 GLuint i, reg = 0, mrf;
71
72 /* Determine whether to use a real constant buffer or use a block
73 * of GRF registers for constants. The latter is faster but only
74 * works if everything fits in the GRF.
75 * XXX this heuristic/check may need some fine tuning...
76 */
77 if (c->vp->program.Base.Parameters->NumParameters +
78 c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
79 c->vp->use_const_buffer = GL_TRUE;
80 else
81 c->vp->use_const_buffer = GL_FALSE;
82
83 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
84
85 /* r0 -- reserved as usual
86 */
87 c->r0 = brw_vec8_grf(reg, 0);
88 reg++;
89
90 /* User clip planes from curbe:
91 */
92 if (c->key.nr_userclip) {
93 for (i = 0; i < c->key.nr_userclip; i++) {
94 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
95 }
96
97 /* Deal with curbe alignment:
98 */
99 reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
100 }
101
102 /* Vertex program parameters from curbe:
103 */
104 if (c->vp->use_const_buffer) {
105 /* get constants from a real constant buffer */
106 c->prog_data.curb_read_length = 0;
107 c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
108 }
109 else {
110 /* use a section of the GRF for constants */
111 GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
112 for (i = 0; i < nr_params; i++) {
113 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
114 }
115 reg += (nr_params + 1) / 2;
116 c->prog_data.curb_read_length = reg - 1;
117
118 c->prog_data.nr_params = nr_params * 4;
119 }
120
121 /* Allocate input regs:
122 */
123 c->nr_inputs = 0;
124 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
125 if (c->prog_data.inputs_read & (1 << i)) {
126 c->nr_inputs++;
127 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
128 reg++;
129 }
130 }
131
132 /* Allocate outputs. The non-position outputs go straight into message regs.
133 */
134 c->nr_outputs = 0;
135 c->first_output = reg;
136 mrf = 4;
137 for (i = 0; i < VERT_RESULT_MAX; i++) {
138 if (c->prog_data.outputs_written & (1 << i)) {
139 c->nr_outputs++;
140 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
141 if (i == VERT_RESULT_HPOS) {
142 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
143 reg++;
144 }
145 else if (i == VERT_RESULT_PSIZ) {
146 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
147 reg++;
148 mrf++; /* just a placeholder? XXX fix later stages & remove this */
149 }
150 else {
151 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
152 mrf++;
153 }
154 }
155 }
156
157 /* Allocate program temporaries:
158 */
159 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
160 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
161 reg++;
162 }
163
164 /* Address reg(s). Don't try to use the internal address reg until
165 * deref time.
166 */
167 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
168 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
169 reg,
170 0,
171 BRW_REGISTER_TYPE_D,
172 BRW_VERTICAL_STRIDE_8,
173 BRW_WIDTH_8,
174 BRW_HORIZONTAL_STRIDE_1,
175 BRW_SWIZZLE_XXXX,
176 WRITEMASK_X);
177 reg++;
178 }
179
180 if (c->vp->use_const_buffer) {
181 for (i = 0; i < 3; i++) {
182 c->current_const[i].index = -1;
183 c->current_const[i].reg = brw_vec8_grf(reg, 0);
184 reg++;
185 }
186 }
187
188 for (i = 0; i < 128; i++) {
189 if (c->output_regs[i].used_in_src) {
190 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
191 reg++;
192 }
193 }
194
195 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
196 reg += 2;
197
198 /* Some opcodes need an internal temporary:
199 */
200 c->first_tmp = reg;
201 c->last_tmp = reg; /* for allocation purposes */
202
203 /* Each input reg holds data from two vertices. The
204 * urb_read_length is the number of registers read from *each*
205 * vertex urb, so it is half that amount:
206 */
207 c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
208
209 c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
210 c->prog_data.total_grf = reg;
211
212 if (INTEL_DEBUG & DEBUG_VS) {
213 _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
214 _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
215 _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
216 }
217 }
218
219
220 /**
221 * If an instruction uses a temp reg both as a src and the dest, we
222 * sometimes need to allocate an intermediate temporary.
223 */
224 static void unalias1( struct brw_vs_compile *c,
225 struct brw_reg dst,
226 struct brw_reg arg0,
227 void (*func)( struct brw_vs_compile *,
228 struct brw_reg,
229 struct brw_reg ))
230 {
231 if (dst.file == arg0.file && dst.nr == arg0.nr) {
232 struct brw_compile *p = &c->func;
233 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
234 func(c, tmp, arg0);
235 brw_MOV(p, dst, tmp);
236 release_tmp(c, tmp);
237 }
238 else {
239 func(c, dst, arg0);
240 }
241 }
242
243 /**
244 * \sa unalias1
245 * Checks whether a 2-operand instruction needs an intermediate temporary.
246 */
247 static void unalias2( struct brw_vs_compile *c,
248 struct brw_reg dst,
249 struct brw_reg arg0,
250 struct brw_reg arg1,
251 void (*func)( struct brw_vs_compile *,
252 struct brw_reg,
253 struct brw_reg,
254 struct brw_reg ))
255 {
256 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
257 (dst.file == arg1.file && dst.nr == arg1.nr)) {
258 struct brw_compile *p = &c->func;
259 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
260 func(c, tmp, arg0, arg1);
261 brw_MOV(p, dst, tmp);
262 release_tmp(c, tmp);
263 }
264 else {
265 func(c, dst, arg0, arg1);
266 }
267 }
268
269 /**
270 * \sa unalias2
271 * Checks whether a 3-operand instruction needs an intermediate temporary.
272 */
273 static void unalias3( struct brw_vs_compile *c,
274 struct brw_reg dst,
275 struct brw_reg arg0,
276 struct brw_reg arg1,
277 struct brw_reg arg2,
278 void (*func)( struct brw_vs_compile *,
279 struct brw_reg,
280 struct brw_reg,
281 struct brw_reg,
282 struct brw_reg ))
283 {
284 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
285 (dst.file == arg1.file && dst.nr == arg1.nr) ||
286 (dst.file == arg2.file && dst.nr == arg2.nr)) {
287 struct brw_compile *p = &c->func;
288 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
289 func(c, tmp, arg0, arg1, arg2);
290 brw_MOV(p, dst, tmp);
291 release_tmp(c, tmp);
292 }
293 else {
294 func(c, dst, arg0, arg1, arg2);
295 }
296 }
297
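/* Emit code for the SLT/SGE-style "set on condition" opcodes:
 * dst = (arg0 <cond> arg1) ? 1.0 : 0.0 per channel.  The CMP with a null
 * destination sets the flag register and (in this driver) also enables
 * predication for the following MOV, so only the channels that pass the
 * comparison are overwritten with 1.0; the final call switches predication
 * back off for subsequent instructions.
 */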
298 static void emit_sop( struct brw_compile *p,
299 struct brw_reg dst,
300 struct brw_reg arg0,
301 struct brw_reg arg1,
302 GLuint cond)
303 {
304 brw_MOV(p, dst, brw_imm_f(0.0f));
305 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
306 brw_MOV(p, dst, brw_imm_f(1.0f));
307 brw_set_predicate_control_flag_value(p, 0xff);
308 }
309
310 static void emit_seq( struct brw_compile *p,
311 struct brw_reg dst,
312 struct brw_reg arg0,
313 struct brw_reg arg1 )
314 {
315 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
316 }
317
318 static void emit_sne( struct brw_compile *p,
319 struct brw_reg dst,
320 struct brw_reg arg0,
321 struct brw_reg arg1 )
322 {
323 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
324 }
325 static void emit_slt( struct brw_compile *p,
326 struct brw_reg dst,
327 struct brw_reg arg0,
328 struct brw_reg arg1 )
329 {
330 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
331 }
332
333 static void emit_sle( struct brw_compile *p,
334 struct brw_reg dst,
335 struct brw_reg arg0,
336 struct brw_reg arg1 )
337 {
338 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
339 }
340
341 static void emit_sgt( struct brw_compile *p,
342 struct brw_reg dst,
343 struct brw_reg arg0,
344 struct brw_reg arg1 )
345 {
346 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
347 }
348
349 static void emit_sge( struct brw_compile *p,
350 struct brw_reg dst,
351 struct brw_reg arg0,
352 struct brw_reg arg1 )
353 {
354 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
355 }
356
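/* MAX: the CMP (null destination, so it also turns on predication) tests
 * arg0 < arg1; the predicated SEL then picks arg1 where the test passed and
 * arg0 elsewhere.  emit_min() below is identical with the SEL operands
 * swapped.
 */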
357 static void emit_max( struct brw_compile *p,
358 struct brw_reg dst,
359 struct brw_reg arg0,
360 struct brw_reg arg1 )
361 {
362 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
363 brw_SEL(p, dst, arg1, arg0);
364 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
365 }
366
367 static void emit_min( struct brw_compile *p,
368 struct brw_reg dst,
369 struct brw_reg arg0,
370 struct brw_reg arg1 )
371 {
372 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
373 brw_SEL(p, dst, arg0, arg1);
374 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
375 }
376
377
378 static void emit_math1( struct brw_vs_compile *c,
379 GLuint function,
380 struct brw_reg dst,
381 struct brw_reg arg0,
382 GLuint precision)
383 {
384 /* There are various odd behaviours with SEND on the simulator. In
385 * addition there are documented issues with the fact that the GEN4
386 * processor doesn't do dependency control properly on SEND
387 * results. So, on balance, this kludge to get around failures
388 * with writemasked math results looks like it might be necessary
389 * whether that turns out to be a simulator bug or not:
390 */
391 struct brw_compile *p = &c->func;
392 struct brw_reg tmp = dst;
393 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
394 dst.file != BRW_GENERAL_REGISTER_FILE);
395
396 if (need_tmp)
397 tmp = get_tmp(c);
398
399 brw_math(p,
400 tmp,
401 function,
402 BRW_MATH_SATURATE_NONE,
403 2,
404 arg0,
405 BRW_MATH_DATA_SCALAR,
406 precision);
407
408 if (need_tmp) {
409 brw_MOV(p, dst, tmp);
410 release_tmp(c, tmp);
411 }
412 }
413
414
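/* Two-operand math function (currently only used for POW).  The second
 * operand has to be placed in message reg m3 by hand; the first goes out
 * with the math SEND itself, whose payload starts at m2.
 */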
415 static void emit_math2( struct brw_vs_compile *c,
416 GLuint function,
417 struct brw_reg dst,
418 struct brw_reg arg0,
419 struct brw_reg arg1,
420 GLuint precision)
421 {
422 struct brw_compile *p = &c->func;
423 struct brw_reg tmp = dst;
424 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
425 dst.file != BRW_GENERAL_REGISTER_FILE);
426
427 if (need_tmp)
428 tmp = get_tmp(c);
429
430 brw_MOV(p, brw_message_reg(3), arg1);
431
432 brw_math(p,
433 tmp,
434 function,
435 BRW_MATH_SATURATE_NONE,
436 2,
437 arg0,
438 BRW_MATH_DATA_SCALAR,
439 precision);
440
441 if (need_tmp) {
442 brw_MOV(p, dst, tmp);
443 release_tmp(c, tmp);
444 }
445 }
446
447
448 static void emit_exp_noalias( struct brw_vs_compile *c,
449 struct brw_reg dst,
450 struct brw_reg arg0 )
451 {
452 struct brw_compile *p = &c->func;
453
454
455 if (dst.dw1.bits.writemask & WRITEMASK_X) {
456 struct brw_reg tmp = get_tmp(c);
457 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
458
459 /* tmp_d = floor(arg0.x) */
460 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
461
462 /* result[0] = 2.0 ^ tmp */
463
464 /* Adjust exponent for floating point:
465 * exp += 127
466 */
467 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
468
469 /* Install exponent and sign.
470 * Excess drops off the edge:
471 */
472 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
473 tmp_d, brw_imm_d(23));
474
475 release_tmp(c, tmp);
476 }
477
478 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
479 /* result[1] = arg0.x - floor(arg0.x) */
480 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
481 }
482
483 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
484 /* As with the LOG instruction, we might be better off just
485 * doing a Taylor expansion here, seeing as we have to do all
486 * the prep work.
487 *
488 * If mathbox partial precision is too low, consider also:
489 * result[2] = result[0] * EXP(result[1])
490 */
491 emit_math1(c,
492 BRW_MATH_FUNCTION_EXP,
493 brw_writemask(dst, WRITEMASK_Z),
494 brw_swizzle1(arg0, 0),
495 BRW_MATH_PRECISION_FULL);
496 }
497
498 if (dst.dw1.bits.writemask & WRITEMASK_W) {
499 /* result[3] = 1.0; */
500 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
501 }
502 }
503
504
505 static void emit_log_noalias( struct brw_vs_compile *c,
506 struct brw_reg dst,
507 struct brw_reg arg0 )
508 {
509 struct brw_compile *p = &c->func;
510 struct brw_reg tmp = dst;
511 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
512 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
513 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
514 dst.file != BRW_GENERAL_REGISTER_FILE);
515
516 if (need_tmp) {
517 tmp = get_tmp(c);
518 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
519 }
520
521 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
522 * according to the spec:
523 *
524 * These almost look like they could be joined up, but not really
525 * practical:
526 *
527 * result[0].f = ((x.i & ((1U<<31)-1)) >> 23) - 127
528 * result[1].i = (x.i & ((1<<23)-1)) | (127<<23)
529 */
530 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
531 brw_AND(p,
532 brw_writemask(tmp_ud, WRITEMASK_X),
533 brw_swizzle1(arg0_ud, 0),
534 brw_imm_ud((1U<<31)-1));
535
536 brw_SHR(p,
537 brw_writemask(tmp_ud, WRITEMASK_X),
538 tmp_ud,
539 brw_imm_ud(23));
540
541 brw_ADD(p,
542 brw_writemask(tmp, WRITEMASK_X),
543 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
544 brw_imm_d(-127));
545 }
546
547 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
548 brw_AND(p,
549 brw_writemask(tmp_ud, WRITEMASK_Y),
550 brw_swizzle1(arg0_ud, 0),
551 brw_imm_ud((1<<23)-1));
552
553 brw_OR(p,
554 brw_writemask(tmp_ud, WRITEMASK_Y),
555 tmp_ud,
556 brw_imm_ud(127<<23));
557 }
558
559 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
560 /* result[2] = result[0] + LOG2(result[1]); */
561
562 /* Why bother? The above is just a hint at how to do this with a
563 * Taylor series. Maybe we *should* use a Taylor series, as by
564 * the time all the above has been done it's almost certainly
565 * quicker than calling the mathbox, even with low precision.
566 *
567 * Options are:
568 * - result[0] + mathbox.LOG2(result[1])
569 * - mathbox.LOG2(arg0.x)
570 * - result[0] + inline_taylor_approx(result[1])
571 */
572 emit_math1(c,
573 BRW_MATH_FUNCTION_LOG,
574 brw_writemask(tmp, WRITEMASK_Z),
575 brw_swizzle1(tmp, 1),
576 BRW_MATH_PRECISION_FULL);
577
578 brw_ADD(p,
579 brw_writemask(tmp, WRITEMASK_Z),
580 brw_swizzle1(tmp, 2),
581 brw_swizzle1(tmp, 0));
582 }
583
584 if (dst.dw1.bits.writemask & WRITEMASK_W) {
585 /* result[3] = 1.0; */
586 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
587 }
588
589 if (need_tmp) {
590 brw_MOV(p, dst, tmp);
591 release_tmp(c, tmp);
592 }
593 }
594
595
596 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
597 */
598 static void emit_dst_noalias( struct brw_vs_compile *c,
599 struct brw_reg dst,
600 struct brw_reg arg0,
601 struct brw_reg arg1)
602 {
603 struct brw_compile *p = &c->func;
604
605 /* There must be a better way to do this:
606 */
607 if (dst.dw1.bits.writemask & WRITEMASK_X)
608 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
609 if (dst.dw1.bits.writemask & WRITEMASK_Y)
610 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
611 if (dst.dw1.bits.writemask & WRITEMASK_Z)
612 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
613 if (dst.dw1.bits.writemask & WRITEMASK_W)
614 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
615 }
616
617
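/* Cross product: dst = t cross u.  The MUL's destination is null, so its
 * result only lands in the (implicitly updated) accumulator; the MAC then
 * adds the negated second partial product:
 *   dst = t.yzx * u.zxy - t.zxy * u.yzx
 */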
618 static void emit_xpd( struct brw_compile *p,
619 struct brw_reg dst,
620 struct brw_reg t,
621 struct brw_reg u)
622 {
623 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
624 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
625 }
626
627
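/* LIT: dst = (1, max(src.x,0), (src.x > 0) ? pow(max(src.y,0), src.w) : 0, 1).
 * Note that the exponent clamp from the spec is not applied here.
 */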
628 static void emit_lit_noalias( struct brw_vs_compile *c,
629 struct brw_reg dst,
630 struct brw_reg arg0 )
631 {
632 struct brw_compile *p = &c->func;
633 struct brw_instruction *if_insn;
634 struct brw_reg tmp = dst;
635 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
636
637 if (need_tmp)
638 tmp = get_tmp(c);
639
640 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
641 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
642
643 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
644 * to get all channels active inside the IF. In the clipping code
645 * we run with NoMask, so it's not an option and we can use
646 * BRW_EXECUTE_1 for all comparisons.
647 */
648 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
649 if_insn = brw_IF(p, BRW_EXECUTE_8);
650 {
651 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
652
653 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
654 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
655 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
656
657 emit_math2(c,
658 BRW_MATH_FUNCTION_POW,
659 brw_writemask(dst, WRITEMASK_Z),
660 brw_swizzle1(tmp, 2),
661 brw_swizzle1(arg0, 3),
662 BRW_MATH_PRECISION_PARTIAL);
663 }
664
665 brw_ENDIF(p, if_insn);
666
667 release_tmp(c, tmp);
668 }
669
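/* LRP: dst = arg0 * arg1 + (1 - arg0) * arg2.  (1 - arg0) is built in dst,
 * the MUL puts (1 - arg0) * arg2 into the accumulator (null destination),
 * and the MAC adds arg0 * arg1 on top.
 */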
670 static void emit_lrp_noalias(struct brw_vs_compile *c,
671 struct brw_reg dst,
672 struct brw_reg arg0,
673 struct brw_reg arg1,
674 struct brw_reg arg2)
675 {
676 struct brw_compile *p = &c->func;
677
678 brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
679 brw_MUL(p, brw_null_reg(), dst, arg2);
680 brw_MAC(p, dst, arg0, arg1);
681 }
682
683 /** 3 or 4-component vector normalization */
684 static void emit_nrm( struct brw_vs_compile *c,
685 struct brw_reg dst,
686 struct brw_reg arg0,
687 int num_comps)
688 {
689 struct brw_compile *p = &c->func;
690 struct brw_reg tmp = get_tmp(c);
691
692 /* tmp = dot(arg0, arg0) */
693 if (num_comps == 3)
694 brw_DP3(p, tmp, arg0, arg0);
695 else
696 brw_DP4(p, tmp, arg0, arg0);
697
698 /* tmp = 1 / sqrt(tmp) */
699 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
700
701 /* dst = arg0 * tmp */
702 brw_MUL(p, dst, arg0, tmp);
703
704 release_tmp(c, tmp);
705 }
706
707
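/* Fetch a constant for source argument 'argIndex' from the constant buffer
 * into the GRF register reserved for it in brw_vs_alloc_regs().  The last
 * index fetched for each argument slot is cached, so a new read is only
 * issued when the index changes.  With relative addressing two owords are
 * read (presumably one per vertex, since the two vertices in the thread may
 * compute different addresses) and merged; otherwise the single oword is
 * replicated into both halves of the register.
 */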
708 static struct brw_reg
709 get_constant(struct brw_vs_compile *c,
710 const struct prog_instruction *inst,
711 GLuint argIndex)
712 {
713 const struct prog_src_register *src = &inst->SrcReg[argIndex];
714 struct brw_compile *p = &c->func;
715 struct brw_reg const_reg;
716 struct brw_reg const2_reg;
717 const GLboolean relAddr = src->RelAddr;
718
719 assert(argIndex < 3);
720
721 if (c->current_const[argIndex].index != src->Index || relAddr) {
722 struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
723
724 c->current_const[argIndex].index = src->Index;
725
726 #if 0
727 printf(" fetch const[%d] for arg %d into reg %d\n",
728 src->Index, argIndex, c->current_const[argIndex].reg.nr);
729 #endif
730 /* need to fetch the constant now */
731 brw_dp_READ_4_vs(p,
732 c->current_const[argIndex].reg,/* writeback dest */
733 0, /* oword */
734 relAddr, /* relative indexing? */
735 addrReg, /* address register */
736 16 * src->Index, /* byte offset */
737 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
738 );
739
740 if (relAddr) {
741 /* second read */
742 const2_reg = get_tmp(c);
743
744 /* use upper half of address reg for second read */
745 addrReg = stride(addrReg, 0, 4, 0);
746 addrReg.subnr = 16;
747
748 brw_dp_READ_4_vs(p,
749 const2_reg, /* writeback dest */
750 1, /* oword */
751 relAddr, /* relative indexing? */
752 addrReg, /* address register */
753 16 * src->Index, /* byte offset */
754 SURF_INDEX_VERT_CONST_BUFFER
755 );
756 }
757 }
758
759 const_reg = c->current_const[argIndex].reg;
760
761 if (relAddr) {
762 /* merge the two Owords into the constant register */
763 /* const_reg[7..4] = const2_reg[7..4] */
764 brw_MOV(p,
765 suboffset(stride(const_reg, 0, 4, 1), 4),
766 suboffset(stride(const2_reg, 0, 4, 1), 4));
767 release_tmp(c, const2_reg);
768 }
769 else {
770 /* replicate lower four floats into upper half (to get XYZWXYZW) */
771 const_reg = stride(const_reg, 0, 4, 0);
772 const_reg.subnr = 0;
773 }
774
775 return const_reg;
776 }
777
778
779
780 /* TODO: relative addressing!
781 */
782 static struct brw_reg get_reg( struct brw_vs_compile *c,
783 gl_register_file file,
784 GLuint index )
785 {
786 switch (file) {
787 case PROGRAM_TEMPORARY:
788 case PROGRAM_INPUT:
789 case PROGRAM_OUTPUT:
790 assert(c->regs[file][index].nr != 0);
791 return c->regs[file][index];
792 case PROGRAM_STATE_VAR:
793 case PROGRAM_CONSTANT:
794 case PROGRAM_UNIFORM:
795 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
796 return c->regs[PROGRAM_STATE_VAR][index];
797 case PROGRAM_ADDRESS:
798 assert(index == 0);
799 return c->regs[file][index];
800
801 case PROGRAM_UNDEFINED: /* undef values */
802 return brw_null_reg();
803
804 case PROGRAM_LOCAL_PARAM:
805 case PROGRAM_ENV_PARAM:
806 case PROGRAM_WRITE_ONLY:
807 default:
808 assert(0);
809 return brw_null_reg();
810 }
811 }
812
813
814 /**
815 * Indirect addressing: get reg[[arg] + offset].
816 */
817 static struct brw_reg deref( struct brw_vs_compile *c,
818 struct brw_reg arg,
819 GLint offset)
820 {
821 struct brw_compile *p = &c->func;
822 struct brw_reg tmp = vec4(get_tmp(c));
823 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
824 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
825 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
826 struct brw_reg indirect = brw_vec4_indirect(0,0);
827
828 {
829 brw_push_insn_state(p);
830 brw_set_access_mode(p, BRW_ALIGN_1);
831
832 /* This is pretty clunky - load the address register twice and
833 * fetch each 4-dword value in turn. There must be a way to do
834 * this in a single pass, but I couldn't get it to work.
835 */
836 brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
837 brw_MOV(p, tmp, indirect);
838
839 brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
840 brw_MOV(p, suboffset(tmp, 4), indirect);
841
842 brw_pop_insn_state(p);
843 }
844
845 /* NOTE: tmp not released */
846 return vec8(tmp);
847 }
848
849
850 /**
851 * Get brw reg corresponding to the instruction's [argIndex] src reg.
852 * TODO: relative addressing!
853 */
854 static struct brw_reg
855 get_src_reg( struct brw_vs_compile *c,
856 const struct prog_instruction *inst,
857 GLuint argIndex )
858 {
859 const GLuint file = inst->SrcReg[argIndex].File;
860 const GLint index = inst->SrcReg[argIndex].Index;
861 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
862
863 switch (file) {
864 case PROGRAM_TEMPORARY:
865 case PROGRAM_INPUT:
866 case PROGRAM_OUTPUT:
867 if (relAddr) {
868 return deref(c, c->regs[file][0], index);
869 }
870 else {
871 assert(c->regs[file][index].nr != 0);
872 return c->regs[file][index];
873 }
874
875 case PROGRAM_STATE_VAR:
876 case PROGRAM_CONSTANT:
877 case PROGRAM_UNIFORM:
878 if (c->vp->use_const_buffer) {
879 return get_constant(c, inst, argIndex);
880 }
881 else if (relAddr) {
882 return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
883 }
884 else {
885 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
886 return c->regs[PROGRAM_STATE_VAR][index];
887 }
888 case PROGRAM_ADDRESS:
889 assert(index == 0);
890 return c->regs[file][index];
891
892 case PROGRAM_UNDEFINED:
893 /* this is a normal case since we loop over all three src args */
894 return brw_null_reg();
895
896 case PROGRAM_LOCAL_PARAM:
897 case PROGRAM_ENV_PARAM:
898 case PROGRAM_WRITE_ONLY:
899 default:
900 assert(0);
901 return brw_null_reg();
902 }
903 }
904
905
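/* ARL: the address register is used as a byte offset by the indirect
 * (deref) reads, so round the float source to an integer and scale it by
 * 16 (one vec4 = 16 bytes).
 */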
906 static void emit_arl( struct brw_vs_compile *c,
907 struct brw_reg dst,
908 struct brw_reg arg0 )
909 {
910 struct brw_compile *p = &c->func;
911 struct brw_reg tmp = dst;
912 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
913
914 if (need_tmp)
915 tmp = get_tmp(c);
916
917 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
918 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
919
920 if (need_tmp)
921 release_tmp(c, tmp);
922 }
923
924
925 /**
926 * Return the brw reg for the given instruction's src argument.
927 * Will return mangled results for SWZ op. The emit_swz() function
928 * ignores this result and recalculates taking extended swizzles into
929 * account.
930 */
931 static struct brw_reg get_arg( struct brw_vs_compile *c,
932 const struct prog_instruction *inst,
933 GLuint argIndex )
934 {
935 const struct prog_src_register *src = &inst->SrcReg[argIndex];
936 struct brw_reg reg;
937
938 if (src->File == PROGRAM_UNDEFINED)
939 return brw_null_reg();
940
941 reg = get_src_reg(c, inst, argIndex);
942
943 /* Convert 3-bit swizzle to 2-bit.
944 */
945 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
946 GET_SWZ(src->Swizzle, 1),
947 GET_SWZ(src->Swizzle, 2),
948 GET_SWZ(src->Swizzle, 3));
949
950 /* Note this is ok for non-swizzle instructions:
951 */
952 reg.negate = src->Negate ? 1 : 0;
953
954 return reg;
955 }
956
957
958 /**
959 * Get brw register for the given program dest register.
960 */
961 static struct brw_reg get_dst( struct brw_vs_compile *c,
962 struct prog_dst_register dst )
963 {
964 struct brw_reg reg;
965
966 switch (dst.File) {
967 case PROGRAM_TEMPORARY:
968 case PROGRAM_OUTPUT:
969 assert(c->regs[dst.File][dst.Index].nr != 0);
970 reg = c->regs[dst.File][dst.Index];
971 break;
972 case PROGRAM_ADDRESS:
973 assert(dst.Index == 0);
974 reg = c->regs[dst.File][dst.Index];
975 break;
976 case PROGRAM_UNDEFINED:
977 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
978 reg = brw_null_reg();
979 break;
980 default:
981 assert(0);
982 reg = brw_null_reg();
983 }
984
985 reg.dw1.bits.writemask = dst.WriteMask;
986
987 return reg;
988 }
989
990
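/* Extended SWZ: components selecting X/Y/Z/W are gathered with a single
 * swizzled MOV, ZERO and ONE components are filled with immediate MOVs,
 * and src.Negate (a per-component mask for SWZ) is applied with a final
 * negating MOV over just those components.
 */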
991 static void emit_swz( struct brw_vs_compile *c,
992 struct brw_reg dst,
993 const struct prog_instruction *inst)
994 {
995 const GLuint argIndex = 0;
996 const struct prog_src_register src = inst->SrcReg[argIndex];
997 struct brw_compile *p = &c->func;
998 GLuint zeros_mask = 0;
999 GLuint ones_mask = 0;
1000 GLuint src_mask = 0;
1001 GLubyte src_swz[4];
1002 GLboolean need_tmp = (src.Negate &&
1003 dst.file != BRW_GENERAL_REGISTER_FILE);
1004 struct brw_reg tmp = dst;
1005 GLuint i;
1006
1007 if (need_tmp)
1008 tmp = get_tmp(c);
1009
1010 for (i = 0; i < 4; i++) {
1011 if (dst.dw1.bits.writemask & (1<<i)) {
1012 GLubyte s = GET_SWZ(src.Swizzle, i);
1013 switch (s) {
1014 case SWIZZLE_X:
1015 case SWIZZLE_Y:
1016 case SWIZZLE_Z:
1017 case SWIZZLE_W:
1018 src_mask |= 1<<i;
1019 src_swz[i] = s;
1020 break;
1021 case SWIZZLE_ZERO:
1022 zeros_mask |= 1<<i;
1023 break;
1024 case SWIZZLE_ONE:
1025 ones_mask |= 1<<i;
1026 break;
1027 }
1028 }
1029 }
1030
1031 /* Do src first, in case dst aliases src:
1032 */
1033 if (src_mask) {
1034 struct brw_reg arg0;
1035
1036 arg0 = get_src_reg(c, inst, argIndex);
1037
1038 arg0 = brw_swizzle(arg0,
1039 src_swz[0], src_swz[1],
1040 src_swz[2], src_swz[3]);
1041
1042 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1043 }
1044
1045 if (zeros_mask)
1046 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1047
1048 if (ones_mask)
1049 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1050
1051 if (src.Negate)
1052 brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1053
1054 if (need_tmp) {
1055 brw_MOV(p, dst, tmp);
1056 release_tmp(c, tmp);
1057 }
1058 }
1059
1060
1061 /**
1062 * Post-vertex-program processing. Send the results to the URB.
1063 */
1064 static void emit_vertex_write( struct brw_vs_compile *c)
1065 {
1066 struct brw_compile *p = &c->func;
1067 struct brw_reg m0 = brw_message_reg(0);
1068 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1069 struct brw_reg ndc;
1070
1071 if (c->key.copy_edgeflag) {
1072 brw_MOV(p,
1073 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1074 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1075 }
1076
1077 /* Build ndc coords */
1078 ndc = get_tmp(c);
1079 /* ndc = 1.0 / pos.w */
1080 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1081 /* ndc.xyz = pos * ndc */
1082 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1083
1084 /* Update the header for point size, user clipping flags, and -ve rhw
1085 * workaround.
1086 */
1087 if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
1088 c->key.nr_userclip || !BRW_IS_G4X(p->brw))
1089 {
1090 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1091 GLuint i;
1092
1093 brw_MOV(p, header1, brw_imm_ud(0));
1094
1095 brw_set_access_mode(p, BRW_ALIGN_16);
1096
1097 if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
1098 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1099 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1100 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1101 }
1102
1103 for (i = 0; i < c->key.nr_userclip; i++) {
1104 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1105 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1106 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1107 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1108 }
1109
1110 /* i965 clipping workaround:
1111 * 1) Test for -ve rhw
1112 * 2) If set,
1113 * set ndc = (0,0,0,0)
1114 * set ucp[6] = 1
1115 *
1116 * Later, clipping will detect ucp[6] and ensure the primitive is
1117 * clipped against all fixed planes.
1118 */
1119 if (!BRW_IS_G4X(p->brw)) {
1120 brw_CMP(p,
1121 vec8(brw_null_reg()),
1122 BRW_CONDITIONAL_L,
1123 brw_swizzle1(ndc, 3),
1124 brw_imm_f(0));
1125
1126 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1127 brw_MOV(p, ndc, brw_imm_f(0));
1128 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1129 }
1130
1131 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1132 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1133 brw_set_access_mode(p, BRW_ALIGN_16);
1134
1135 release_tmp(c, header1);
1136 }
1137 else {
1138 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1139 }
1140
1141 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1142 * of zeros followed by two sets of NDC coordinates:
1143 */
1144 brw_set_access_mode(p, BRW_ALIGN_1);
1145 brw_MOV(p, offset(m0, 2), ndc);
1146 brw_MOV(p, offset(m0, 3), pos);
1147
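/* Message layout for the URB write: m0 is the thread header (copied from
 * r0), m1 holds the point size / clip flag word written above, m2/m3 are
 * the NDC and clip-space positions, and the remaining outputs were already
 * written to m4 and up by the program itself.
 */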
1148 brw_urb_WRITE(p,
1149 brw_null_reg(), /* dest */
1150 0, /* starting mrf reg nr */
1151 c->r0, /* src */
1152 0, /* allocate */
1153 1, /* used */
1154 c->nr_outputs + 3, /* msg len */
1155 0, /* response len */
1156 1, /* eot */
1157 1, /* writes complete */
1158 0, /* urb destination offset */
1159 BRW_URB_SWIZZLE_INTERLEAVE);
1160 }
1161
1162
1163 /**
1164 * Called after code generation to resolve subroutine calls and the
1165 * END instruction.
1166 * \param end_inst points to brw code for END instruction
1167 * \param last_inst points to last instruction emitted before vertex write
1168 */
1169 static void
1170 post_vs_emit( struct brw_vs_compile *c,
1171 struct brw_instruction *end_inst,
1172 struct brw_instruction *last_inst )
1173 {
1174 GLint offset;
1175
1176 brw_resolve_cals(&c->func);
1177
1178 /* patch up the END code to jump past subroutines, etc */
1179 offset = last_inst - end_inst;
1180 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1181 }
1182
1183
1184 /* Emit the vertex program instructions here.
1185 */
1186 void brw_vs_emit(struct brw_vs_compile *c )
1187 {
1188 #define MAX_IF_DEPTH 32
1189 #define MAX_LOOP_DEPTH 32
1190 struct brw_compile *p = &c->func;
1191 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1192 GLuint insn, if_depth = 0, loop_depth = 0;
1193 GLuint end_offset = 0;
1194 struct brw_instruction *end_inst, *last_inst;
1195 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1196 const struct brw_indirect stack_index = brw_indirect(0, 0);
1197 GLuint index;
1198 GLuint file;
1199
1200 if (INTEL_DEBUG & DEBUG_VS) {
1201 _mesa_printf("vs-emit:\n");
1202 _mesa_print_program(&c->vp->program.Base);
1203 _mesa_printf("\n");
1204 }
1205
1206 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1207 brw_set_access_mode(p, BRW_ALIGN_16);
1208
1209 /* Message registers can't be read, so copy outputs that are also
1210 * used as source registers into GRF registers. */
1211 for (insn = 0; insn < nr_insns; insn++) {
1212 GLuint i;
1213 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1214 for (i = 0; i < 3; i++) {
1215 struct prog_src_register *src = &inst->SrcReg[i];
1216 GLuint index = src->Index;
1217 GLuint file = src->File;
1218 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1219 c->output_regs[index].used_in_src = GL_TRUE;
1220 }
1221 }
1222
1223 /* Static register allocation
1224 */
1225 brw_vs_alloc_regs(c);
1226 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1227
1228 for (insn = 0; insn < nr_insns; insn++) {
1229
1230 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1231 struct brw_reg args[3], dst;
1232 GLuint i;
1233
1234 #if 0
1235 printf("%d: ", insn);
1236 _mesa_print_instruction(inst);
1237 #endif
1238
1239 /* Get argument regs. SWZ is special and does this itself.
1240 */
1241 if (inst->Opcode != OPCODE_SWZ)
1242 for (i = 0; i < 3; i++) {
1243 const struct prog_src_register *src = &inst->SrcReg[i];
1244 index = src->Index;
1245 file = src->File;
1246 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1247 args[i] = c->output_regs[index].reg;
1248 else
1249 args[i] = get_arg(c, inst, i);
1250 }
1251
1252 /* Get dest regs. Note that it is possible for a reg to be both
1253 * dst and arg, given the static allocation of registers. So
1254 * care needs to be taken emitting multi-operation instructions.
1255 */
1256 index = inst->DstReg.Index;
1257 file = inst->DstReg.File;
1258 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1259 dst = c->output_regs[index].reg;
1260 else
1261 dst = get_dst(c, inst->DstReg);
1262
1263 if (inst->SaturateMode != SATURATE_OFF) {
1264 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1265 inst->SaturateMode);
1266 }
1267
1268 switch (inst->Opcode) {
1269 case OPCODE_ABS:
1270 brw_MOV(p, dst, brw_abs(args[0]));
1271 break;
1272 case OPCODE_ADD:
1273 brw_ADD(p, dst, args[0], args[1]);
1274 break;
1275 case OPCODE_COS:
1276 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1277 break;
1278 case OPCODE_DP3:
1279 brw_DP3(p, dst, args[0], args[1]);
1280 break;
1281 case OPCODE_DP4:
1282 brw_DP4(p, dst, args[0], args[1]);
1283 break;
1284 case OPCODE_DPH:
1285 brw_DPH(p, dst, args[0], args[1]);
1286 break;
1287 case OPCODE_NRM3:
1288 emit_nrm(c, dst, args[0], 3);
1289 break;
1290 case OPCODE_NRM4:
1291 emit_nrm(c, dst, args[0], 4);
1292 break;
1293 case OPCODE_DST:
1294 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1295 break;
1296 case OPCODE_EXP:
1297 unalias1(c, dst, args[0], emit_exp_noalias);
1298 break;
1299 case OPCODE_EX2:
1300 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1301 break;
1302 case OPCODE_ARL:
1303 emit_arl(c, dst, args[0]);
1304 break;
1305 case OPCODE_FLR:
1306 brw_RNDD(p, dst, args[0]);
1307 break;
1308 case OPCODE_FRC:
1309 brw_FRC(p, dst, args[0]);
1310 break;
1311 case OPCODE_LOG:
1312 unalias1(c, dst, args[0], emit_log_noalias);
1313 break;
1314 case OPCODE_LG2:
1315 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1316 break;
1317 case OPCODE_LIT:
1318 unalias1(c, dst, args[0], emit_lit_noalias);
1319 break;
1320 case OPCODE_LRP:
1321 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1322 break;
1323 case OPCODE_MAD:
1324 brw_MOV(p, brw_acc_reg(), args[2]);
1325 brw_MAC(p, dst, args[0], args[1]);
1326 break;
1327 case OPCODE_MAX:
1328 emit_max(p, dst, args[0], args[1]);
1329 break;
1330 case OPCODE_MIN:
1331 emit_min(p, dst, args[0], args[1]);
1332 break;
1333 case OPCODE_MOV:
1334 brw_MOV(p, dst, args[0]);
1335 break;
1336 case OPCODE_MUL:
1337 brw_MUL(p, dst, args[0], args[1]);
1338 break;
1339 case OPCODE_POW:
1340 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1341 break;
1342 case OPCODE_RCP:
1343 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1344 break;
1345 case OPCODE_RSQ:
1346 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1347 break;
1348
1349 case OPCODE_SEQ:
1350 emit_seq(p, dst, args[0], args[1]);
1351 break;
1352 case OPCODE_SIN:
1353 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1354 break;
1355 case OPCODE_SNE:
1356 emit_sne(p, dst, args[0], args[1]);
1357 break;
1358 case OPCODE_SGE:
1359 emit_sge(p, dst, args[0], args[1]);
1360 break;
1361 case OPCODE_SGT:
1362 emit_sgt(p, dst, args[0], args[1]);
1363 break;
1364 case OPCODE_SLT:
1365 emit_slt(p, dst, args[0], args[1]);
1366 break;
1367 case OPCODE_SLE:
1368 emit_sle(p, dst, args[0], args[1]);
1369 break;
1370 case OPCODE_SUB:
1371 brw_ADD(p, dst, args[0], negate(args[1]));
1372 break;
1373 case OPCODE_SWZ:
1374 /* The args[0] value can't be used here as it won't have
1375 * correctly encoded the full swizzle:
1376 */
1377 emit_swz(c, dst, inst);
1378 break;
1379 case OPCODE_TRUNC:
1380 /* round toward zero */
1381 brw_RNDZ(p, dst, args[0]);
1382 break;
1383 case OPCODE_XPD:
1384 emit_xpd(p, dst, args[0], args[1]);
1385 break;
1386 case OPCODE_IF:
1387 assert(if_depth < MAX_IF_DEPTH);
1388 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
1389 break;
1390 case OPCODE_ELSE:
1391 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1392 break;
1393 case OPCODE_ENDIF:
1394 assert(if_depth > 0);
1395 brw_ENDIF(p, if_inst[--if_depth]);
1396 break;
1397 #if 0
1398 case OPCODE_BGNLOOP:
1399 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1400 break;
1401 case OPCODE_BRK:
1402 brw_BREAK(p);
1403 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1404 break;
1405 case OPCODE_CONT:
1406 brw_CONT(p);
1407 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1408 break;
1409 case OPCODE_ENDLOOP:
1410 {
1411 struct brw_instruction *inst0, *inst1;
1412 loop_depth--;
1413 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1414 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1415 while (inst0 > loop_inst[loop_depth]) {
1416 inst0--;
1417 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
1418 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
1419 inst0->bits3.if_else.pop_count = 0;
1420 }
1421 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
1422 inst0->bits3.if_else.jump_count = inst1 - inst0;
1423 inst0->bits3.if_else.pop_count = 0;
1424 }
1425 }
1426 }
1427 break;
1428 #else
1429 (void) loop_inst;
1430 (void) loop_depth;
1431 #endif
1432 case OPCODE_BRA:
1433 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1434 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1435 brw_set_predicate_control_flag_value(p, 0xff);
1436 break;
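/* Subroutine call: store the return IP on the in-GRF stack (the IP of the
 * storing ADD plus three instructions, i.e. the instruction just past the
 * jump below), bump the stack pointer by one dword, then emit a placeholder
 * jump that brw_resolve_cals() later patches to point at the subroutine
 * label.  OPCODE_RET pops the saved address back into the IP register.
 */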
1437 case OPCODE_CAL:
1438 brw_set_access_mode(p, BRW_ALIGN_1);
1439 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1440 brw_set_access_mode(p, BRW_ALIGN_16);
1441 brw_ADD(p, get_addr_reg(stack_index),
1442 get_addr_reg(stack_index), brw_imm_d(4));
1443 brw_save_call(p, inst->Comment, p->nr_insn);
1444 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1445 break;
1446 case OPCODE_RET:
1447 brw_ADD(p, get_addr_reg(stack_index),
1448 get_addr_reg(stack_index), brw_imm_d(-4));
1449 brw_set_access_mode(p, BRW_ALIGN_1);
1450 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1451 brw_set_access_mode(p, BRW_ALIGN_16);
1452 break;
1453 case OPCODE_END:
1454 end_offset = p->nr_insn;
1455 /* this instruction will get patched later to jump past subroutine
1456 * code, etc.
1457 */
1458 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1459 break;
1460 case OPCODE_PRINT:
1461 /* no-op */
1462 break;
1463 case OPCODE_BGNSUB:
1464 brw_save_label(p, inst->Comment, p->nr_insn);
1465 break;
1466 case OPCODE_ENDSUB:
1467 /* no-op */
1468 break;
1469 default:
1470 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1471 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1472 _mesa_opcode_string(inst->Opcode) :
1473 "unknown");
1474 }
1475
1476 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1477 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1478 && c->output_regs[inst->DstReg.Index].used_in_src) {
1479 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1480 }
1481
1482 /* Result color clamping.
1483 *
1484 * When destination register is an output register and
1485 * it's primary/secondary front/back color, we have to clamp
1486 * the result to [0,1]. This is done by enabling the
1487 * saturation bit for the last instruction.
1488 *
1489 * We don't use brw_set_saturate() as it modifies
1490 * p->current->header.saturate, which affects all the subsequent
1491 * instructions. Instead, we directly modify the header
1492 * of the last (already stored) instruction.
1493 */
1494 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1495 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1496 || (inst->DstReg.Index == VERT_RESULT_COL1)
1497 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1498 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1499 p->store[p->nr_insn-1].header.saturate = 1;
1500 }
1501 }
1502
1503 release_tmps(c);
1504 }
1505
1506 end_inst = &p->store[end_offset];
1507 last_inst = &p->store[p->nr_insn];
1508
1509 /* The END instruction will be patched to jump to this code */
1510 emit_vertex_write(c);
1511
1512 post_vs_emit(c, end_inst, last_inst);
1513 }