i965g: tgsi outputs cannot be used as source regs
[mesa.git] / src / gallium / drivers / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32 #include "pipe/p_shader_tokens.h"
33
34 #include "util/u_memory.h"
35 #include "util/u_math.h"
36
37 #include "tgsi/tgsi_parse.h"
38 #include "tgsi/tgsi_dump.h"
39 #include "tgsi/tgsi_info.h"
40
41 #include "brw_context.h"
42 #include "brw_vs.h"
43 #include "brw_debug.h"
44
45
46
47 static struct brw_reg get_tmp( struct brw_vs_compile *c )
48 {
49 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
50
51 if (++c->last_tmp > c->prog_data.total_grf)
52 c->prog_data.total_grf = c->last_tmp;
53
54 return tmp;
55 }
56
57 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
58 {
59 if (tmp.nr == c->last_tmp-1)
60 c->last_tmp--;
61 }
62
63 static void release_tmps( struct brw_vs_compile *c )
64 {
65 c->last_tmp = c->first_tmp;
66 }
67
68
69
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Layout built here (in 'reg' order): r0 header, curbe (user clip
 * planes + constants), vertex inputs, immediates, overflow outputs,
 * temporaries, address regs, const-buffer staging regs, call stack,
 * then scratch temps.  Also derives urb_read_length / urb_entry_size
 * for the URB setup.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   GLuint i, reg = 0, mrf;   /* reg: next free GRF; mrf: next free message reg */
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->info.file_max[TGSI_FILE_CONSTANT] +
       c->vp->info.file_max[TGSI_FILE_IMMEDIATE] +
       c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 21 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else {
      /* XXX: immediates can go elsewhere if necessary:
       */
      assert(c->vp->info.file_max[TGSI_FILE_IMMEDIATE] +
             c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 21 <= BRW_MAX_GRF);

      c->vp->use_const_buffer = GL_FALSE;
   }

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      /* Two clip planes per GRF, packed as vec4 halves. */
      for (i = 0; i < c->key.nr_userclip; i++) {
         c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      /* get constants from a real constant buffer */
      c->prog_data.curb_read_length = 0;
      c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1;
      for (i = 0; i < nr_params; i++) {
         c->regs[TGSI_FILE_CONSTANT][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = c->vp->info.num_inputs;
   for (i = 0; i < c->nr_inputs; i++) {
      c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate a GRF and load immediate values by hand with 4 MOVs!!!
    *
    * XXX: Try to encode float immediates as brw immediates
    * XXX: Put immediates into the CURBE.
    * XXX: Make sure ureg sets minimal immediate size and respect it
    * here.
    */
   for (i = 0; i < c->nr_immediates; i++) {
      struct brw_reg r;
      int j;

      r = brw_vec8_grf(reg, 0);

      /* One MOV per component, selected via the writemask. */
      for (j = 0; j < 4; j++) {
         brw_MOV(&c->func,
                 brw_writemask(r, (1<<j)),
                 brw_imm_f(c->immediate[i][j]));
      }

      reg++;
   }


   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = c->prog_data.nr_outputs;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* IGDNG's longer VUE header pushes the first output MRF up. */
   if (c->chipset.is_igdng)
      mrf = 8;
   else
      mrf = 4;

   /* XXX: need to access vertex output semantics here:
    */
   for (i = 0; i < c->prog_data.nr_outputs; i++) {
      assert(i < Elements(c->regs[TGSI_FILE_OUTPUT]));

      /* XXX: Hardwire position to zero:
       */
      if (i == 0) {
         /* Position goes in a GRF: it is also read back when building
          * the NDC coords and the vertex header.
          */
         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
      /* XXX: disable psiz:
       */
      else if (0) {
         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
         mrf++;   /* just a placeholder?  XXX fix later stages & remove this */
      }
      else if (mrf < 16) {
         c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
         mrf++;
      }
      else {
         /* too many vertex results to fit in MRF, use GRF for overflow */
         if (!c->first_overflow_output)
            c->first_overflow_output = i;
         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Allocate program temporaries:
    */

   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1; i++) {
      c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_ADDRESS]+1; i++) {
      c->regs[TGSI_FILE_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
                                              reg,
                                              0,
                                              BRW_REGISTER_TYPE_D,
                                              BRW_VERTICAL_STRIDE_8,
                                              BRW_WIDTH_8,
                                              BRW_HORIZONTAL_STRIDE_1,
                                              BRW_SWIZZLE_XXXX,
                                              BRW_WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* Staging regs for get_constant(): one per src argument. */
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

#if 0
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
#endif

   /* Call stack for subroutine return addresses (2 GRFs). */
   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
   reg += 2;

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg; /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;

   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* Header size differs per chipset (see the mrf start value above). */
   if (c->chipset.is_igdng)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (BRW_DEBUG & DEBUG_VS) {
      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__,
                   c->vp->info.file_max[TGSI_FILE_ADDRESS]+1);
      debug_printf("%s NumTemps %d\n", __FUNCTION__,
                   c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1);
      debug_printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
296
297
298 /**
299 * If an instruction uses a temp reg both as a src and the dest, we
300 * sometimes need to allocate an intermediate temporary.
301 */
302 static void unalias1( struct brw_vs_compile *c,
303 struct brw_reg dst,
304 struct brw_reg arg0,
305 void (*func)( struct brw_vs_compile *,
306 struct brw_reg,
307 struct brw_reg ))
308 {
309 if (dst.file == arg0.file && dst.nr == arg0.nr) {
310 struct brw_compile *p = &c->func;
311 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
312 func(c, tmp, arg0);
313 brw_MOV(p, dst, tmp);
314 release_tmp(c, tmp);
315 }
316 else {
317 func(c, dst, arg0);
318 }
319 }
320
321 /**
322 * \sa unalias2
323 * Checkes if 2-operand instruction needs an intermediate temporary.
324 */
325 static void unalias2( struct brw_vs_compile *c,
326 struct brw_reg dst,
327 struct brw_reg arg0,
328 struct brw_reg arg1,
329 void (*func)( struct brw_vs_compile *,
330 struct brw_reg,
331 struct brw_reg,
332 struct brw_reg ))
333 {
334 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
335 (dst.file == arg1.file && dst.nr == arg1.nr)) {
336 struct brw_compile *p = &c->func;
337 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
338 func(c, tmp, arg0, arg1);
339 brw_MOV(p, dst, tmp);
340 release_tmp(c, tmp);
341 }
342 else {
343 func(c, dst, arg0, arg1);
344 }
345 }
346
347 /**
348 * \sa unalias2
349 * Checkes if 3-operand instruction needs an intermediate temporary.
350 */
351 static void unalias3( struct brw_vs_compile *c,
352 struct brw_reg dst,
353 struct brw_reg arg0,
354 struct brw_reg arg1,
355 struct brw_reg arg2,
356 void (*func)( struct brw_vs_compile *,
357 struct brw_reg,
358 struct brw_reg,
359 struct brw_reg,
360 struct brw_reg ))
361 {
362 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
363 (dst.file == arg1.file && dst.nr == arg1.nr) ||
364 (dst.file == arg2.file && dst.nr == arg2.nr)) {
365 struct brw_compile *p = &c->func;
366 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
367 func(c, tmp, arg0, arg1, arg2);
368 brw_MOV(p, dst, tmp);
369 release_tmp(c, tmp);
370 }
371 else {
372 func(c, dst, arg0, arg1, arg2);
373 }
374 }
375
/* Generic "set-on-condition" helper for the SEQ/SNE/SLT/... opcodes:
 *   dst = (arg0 <cond> arg1) ? 1.0 : 0.0   (per channel)
 *
 * dst is cleared to 0.0 first; the CMP loads the flag register with the
 * per-channel comparison result, and the following MOV of 1.0 is expected
 * to land only in the channels that passed.  NOTE(review): this relies on
 * brw_CMP leaving predication enabled for the next instruction -- confirm
 * against brw_CMP's implementation.  The flag value is then forced back to
 * all-ones so subsequent instructions are unaffected.
 */
static void emit_sop( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      GLuint cond)
{
   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
387
/* SEQ: dst = (arg0 == arg1) ? 1.0 : 0.0, per channel. */
static void emit_seq( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}
395
/* SNE: dst = (arg0 != arg1) ? 1.0 : 0.0, per channel. */
static void emit_sne( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/* SLT: dst = (arg0 < arg1) ? 1.0 : 0.0, per channel. */
static void emit_slt( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
}
410
/* SLE: dst = (arg0 <= arg1) ? 1.0 : 0.0, per channel. */
static void emit_sle( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}
418
/* SGT: dst = (arg0 > arg1) ? 1.0 : 0.0, per channel. */
static void emit_sgt( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
}
426
/* SGE: dst = (arg0 >= arg1) ? 1.0 : 0.0, per channel. */
static void emit_sge( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
434
/* Componentwise maximum: dst = max(arg0, arg1).
 * The CMP loads the flag register where arg0 < arg1; the SEL then picks
 * arg1 in those channels and arg0 elsewhere.  NOTE(review): relies on
 * brw_CMP leaving the SEL predicated -- confirm against brw_CMP.
 * Predication is explicitly cleared afterwards.
 */
static void emit_max( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg1, arg0);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
444
/* Componentwise minimum: dst = min(arg0, arg1).
 * Same pattern as emit_max but with the SEL operands swapped: where
 * arg0 < arg1 the (predicated) SEL keeps arg0, else arg1.
 */
static void emit_min( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
454
455
/* Emit a one-operand extended-math-unit call (EXP, LOG, RSQ, INV, ...).
 * 'function' selects the math-box operation, 'precision' its
 * full/partial precision mode.
 */
static void emit_math1( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   /* Route the result through a scratch GRF whenever dst is partially
    * writemasked or is not a GRF (see the kludge note above).
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,             /* NOTE(review): presumably the base message reg -- confirm vs brw_math() */
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
491
492
/* Emit a two-operand extended-math-unit call (e.g. POW).
 * arg1 is staged by hand into message reg 3 before the SEND; arg0 goes
 * through brw_math() itself.  NOTE(review): the literal 2 is presumably
 * the base message register, making arg1 the second message phase --
 * confirm against brw_math().  As in emit_math1, the result is routed
 * through a scratch GRF when dst is writemasked or not a GRF.
 */
static void emit_math2( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        struct brw_reg arg1,
                        GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
524
525
/* Emit code for the legacy EXP opcode, assuming dst does not alias arg0:
 *   dst.x = 2 ^ floor(arg0.x)   (exponent bits built by hand, see below)
 *   dst.y = arg0.x - floor(arg0.x)
 *   dst.z = 2 ^ arg0.x          (via the math box)
 *   dst.w = 1.0
 * Each component is only computed when dst's writemask requests it.
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, BRW_WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), BRW_WRITEMASK_X),
              tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_EXP,
                 brw_writemask(dst, BRW_WRITEMASK_Z),
                 brw_swizzle1(arg0, 0),
                 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), brw_imm_f(1));
   }
}
581
582
/* Emit code for the legacy LOG opcode, assuming dst does not alias arg0:
 *   dst.x = exponent of |arg0.x|   (extracted from the IEEE-754 bits)
 *   dst.y = mantissa of |arg0.x|, renormalized into [1, 2)
 *   dst.z = log2(|arg0.x|) = dst.x + log2(dst.y)   (via the math box)
 *   dst.w = 1.0
 * Components are only computed when dst's writemask requests them; x and
 * y are also computed when only z is requested since z depends on both.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* Bit-twiddling on dst needs a real, fully-writable GRF. */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_XZ) {
      /* Mask off the sign bit, then shift the exponent field down. */
      brw_AND(p,
              brw_writemask(tmp_ud, BRW_WRITEMASK_X),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
              brw_writemask(tmp_ud, BRW_WRITEMASK_X),
              tmp_ud,
              brw_imm_ud(23));

      /* Remove the IEEE exponent bias and convert to float. */
      brw_ADD(p,
              brw_writemask(tmp, BRW_WRITEMASK_X),
              retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
              brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_YZ) {
      /* Keep the mantissa bits and install a zero exponent (bias 127),
       * yielding a float in [1, 2).
       */
      brw_AND(p,
              brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1<<23)-1));

      brw_OR(p,
             brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
             tmp_ud,
             brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_LOG,
                 brw_writemask(tmp, BRW_WRITEMASK_Z),
                 brw_swizzle1(tmp, 1),
                 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
              brw_writemask(tmp, BRW_WRITEMASK_Z),
              brw_swizzle1(tmp, 2),
              brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
672
673
674 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
675 */
676 static void emit_dst_noalias( struct brw_vs_compile *c,
677 struct brw_reg dst,
678 struct brw_reg arg0,
679 struct brw_reg arg1)
680 {
681 struct brw_compile *p = &c->func;
682
683 /* There must be a better way to do this:
684 */
685 if (dst.dw1.bits.writemask & BRW_WRITEMASK_X)
686 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_X), brw_imm_f(1.0));
687 if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y)
688 brw_MUL(p, brw_writemask(dst, BRW_WRITEMASK_Y), arg0, arg1);
689 if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z)
690 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Z), arg0);
691 if (dst.dw1.bits.writemask & BRW_WRITEMASK_W)
692 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), arg1);
693 }
694
695
/* Cross product: dst = t X u, computed via the accumulator:
 *   acc = t.yzxw * u.zxyw
 *   dst = acc - t.zxyw * u.yzxw   (MAC with negated first operand)
 * NOTE(review): assumes a MUL with a null destination still writes the
 * accumulator that the MAC then reads -- confirm against the EU docs.
 */
static void emit_xpd( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg t,
                      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
704
705
706 static void emit_lit_noalias( struct brw_vs_compile *c,
707 struct brw_reg dst,
708 struct brw_reg arg0 )
709 {
710 struct brw_compile *p = &c->func;
711 struct brw_instruction *if_insn;
712 struct brw_reg tmp = dst;
713 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
714
715 if (need_tmp)
716 tmp = get_tmp(c);
717
718 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_YZ), brw_imm_f(0));
719 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_XW), brw_imm_f(1));
720
721 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
722 * to get all channels active inside the IF. In the clipping code
723 * we run with NoMask, so it's not an option and we can use
724 * BRW_EXECUTE_1 for all comparisions.
725 */
726 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
727 if_insn = brw_IF(p, BRW_EXECUTE_8);
728 {
729 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0,0));
730
731 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
732 brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_Z), brw_swizzle1(arg0,1));
733 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
734
735 emit_math2(c,
736 BRW_MATH_FUNCTION_POW,
737 brw_writemask(dst, BRW_WRITEMASK_Z),
738 brw_swizzle1(tmp, 2),
739 brw_swizzle1(arg0, 3),
740 BRW_MATH_PRECISION_PARTIAL);
741 }
742
743 brw_ENDIF(p, if_insn);
744
745 release_tmp(c, tmp);
746 }
747
/* LRP: dst = arg0 * arg1 + (1 - arg0) * arg2, assuming dst aliases no
 * source (dst is used as scratch for the intermediate):
 *   dst = 1 - arg0
 *   acc = dst * arg2   (NOTE(review): assumes a MUL with a null dst
 *                       still writes the accumulator -- confirm)
 *   dst = acc + arg0 * arg1   (MAC)
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
                             struct brw_reg dst,
                             struct brw_reg arg0,
                             struct brw_reg arg1,
                             struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
760
761 /** 3 or 4-component vector normalization */
762 static void emit_nrm( struct brw_vs_compile *c,
763 struct brw_reg dst,
764 struct brw_reg arg0,
765 int num_comps)
766 {
767 struct brw_compile *p = &c->func;
768 struct brw_reg tmp = get_tmp(c);
769
770 /* tmp = dot(arg0, arg0) */
771 if (num_comps == 3)
772 brw_DP3(p, tmp, arg0, arg0);
773 else
774 brw_DP4(p, tmp, arg0, arg0);
775
776 /* tmp = 1 / sqrt(tmp) */
777 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
778
779 /* dst = arg0 * tmp */
780 brw_MUL(p, dst, arg0, tmp);
781
782 release_tmp(c, tmp);
783 }
784
785
/**
 * Fetch the float[4] constant at 'index' for src argument 'argIndex'
 * from the real constant buffer (used when constants don't fit in the
 * GRF).  The value is read into the per-argument staging register via
 * a dataport read; a one-entry cache (current_const[argIndex].index)
 * avoids re-fetching when the same index repeats.  Relative-addressed
 * reads can't be cached and need a second oword read through the upper
 * half of the address reg (presumably one read per vertex of the
 * interleaved pair -- NOTE(review): confirm).
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             GLuint argIndex,
             GLuint index,
             GLboolean relAddr)
{
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg;
   struct brw_reg const2_reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != index || relAddr) {
      struct brw_reg addrReg = c->regs[TGSI_FILE_ADDRESS][0];

      c->current_const[argIndex].index = index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src.Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       c->current_const[argIndex].reg,/* writeback dest */
                       0,                             /* oword */
                       relAddr,                       /* relative indexing? */
                       addrReg,                       /* address register */
                       16 * index,                    /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );

      if (relAddr) {
         /* second read */
         const2_reg = get_tmp(c);

         /* use upper half of address reg for second read */
         addrReg = stride(addrReg, 0, 4, 0);
         addrReg.subnr = 16;

         brw_dp_READ_4_vs(p,
                          const2_reg,              /* writeback dest */
                          1,                       /* oword */
                          relAddr,                 /* relative indexing? */
                          addrReg,                 /* address register */
                          16 * index,              /* byte offset */
                          SURF_INDEX_VERT_CONST_BUFFER
                          );
      }
   }

   const_reg = c->current_const[argIndex].reg;

   if (relAddr) {
      /* merge the two Owords into the constant register */
      /* const_reg[7..4] = const2_reg[7..4] */
      brw_MOV(p,
              suboffset(stride(const_reg, 0, 4, 1), 4),
              suboffset(stride(const2_reg, 0, 4, 1), 4));
      release_tmp(c, const2_reg);
   }
   else {
      /* replicate lower four floats into upper half (to get XYZWXYZW) */
      const_reg = stride(const_reg, 0, 4, 0);
      const_reg.subnr = 0;
   }

   return const_reg;
}
854
855
856
857 /* TODO: relative addressing!
858 */
859 static struct brw_reg get_reg( struct brw_vs_compile *c,
860 enum tgsi_file_type file,
861 GLuint index )
862 {
863 switch (file) {
864 case TGSI_FILE_TEMPORARY:
865 case TGSI_FILE_INPUT:
866 case TGSI_FILE_OUTPUT:
867 case TGSI_FILE_CONSTANT:
868 assert(c->regs[file][index].nr != 0);
869 return c->regs[file][index];
870
871 case TGSI_FILE_ADDRESS:
872 assert(index == 0);
873 return c->regs[file][index];
874
875 case TGSI_FILE_NULL: /* undef values */
876 return brw_null_reg();
877
878 default:
879 assert(0);
880 return brw_null_reg();
881 }
882 }
883
884
/**
 * Indirect addressing: get reg[[arg] + offset].
 * 'arg' is the base register of the indexed file; the runtime index
 * lives in the software address reg (TGSI_FILE_ADDRESS[0]), already
 * scaled to bytes by emit_arl().  byte_offset adds the base reg's
 * position (32 bytes per GRF, 16 per vec4 half -- NOTE(review):
 * confirm sizes against the register layout).
 */
static struct brw_reg deref( struct brw_vs_compile *c,
                             struct brw_reg arg,
                             GLint offset)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = vec4(get_tmp(c));
   struct brw_reg addr_reg = c->regs[TGSI_FILE_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
   struct brw_reg indirect = brw_vec4_indirect(0,0);

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* This is pretty clunky - load the address register twice and
       * fetch each 4-dword value in turn.  There must be a way to do
       * this in a single pass, but I couldn't get it to work.
       */
      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
      brw_MOV(p, tmp, indirect);

      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
      brw_MOV(p, suboffset(tmp, 4), indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released -- the caller keeps using the returned reg;
    * it is reclaimed by the per-instruction release_tmps().
    */
   return vec8(tmp);
}
919
920
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 * TODO: relative addressing!
 *
 * NOTE(review): TGSI_FILE_OUTPUT is accepted as a source file here,
 * but outputs may have been allocated to message registers (see
 * brw_vs_alloc_regs), which cannot be read back -- per the commit
 * title, TGSI outputs cannot be used as source regs; confirm how
 * callers avoid this case.
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             GLuint argIndex,
             GLuint file,
             GLint index,
             GLboolean relAddr )
{

   switch (file) {
   case TGSI_FILE_TEMPORARY:
   case TGSI_FILE_INPUT:
   case TGSI_FILE_OUTPUT:
      if (relAddr) {
         return deref(c, c->regs[file][0], index);
      }
      else {
         /* reg nr 0 would mean the slot was never allocated */
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case TGSI_FILE_IMMEDIATE:
      return c->regs[file][index];

   case TGSI_FILE_CONSTANT:
      if (c->vp->use_const_buffer) {
         return get_constant(c, argIndex, index, relAddr);
      }
      else if (relAddr) {
         return deref(c, c->regs[TGSI_FILE_CONSTANT][0], index);
      }
      else {
         assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
         return c->regs[TGSI_FILE_CONSTANT][index];
      }
   case TGSI_FILE_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case TGSI_FILE_NULL:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   default:
      assert(0);
      return brw_null_reg();
   }
}
972
973
974 static void emit_arl( struct brw_vs_compile *c,
975 struct brw_reg dst,
976 struct brw_reg arg0 )
977 {
978 struct brw_compile *p = &c->func;
979 struct brw_reg tmp = dst;
980 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
981
982 if (need_tmp)
983 tmp = get_tmp(c);
984
985 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
986 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
987
988 if (need_tmp)
989 release_tmp(c, tmp);
990 }
991
992
993 /**
994 * Return the brw reg for the given instruction's src argument.
995 */
996 static struct brw_reg get_arg( struct brw_vs_compile *c,
997 const struct tgsi_full_src_register *src,
998 GLuint argIndex )
999 {
1000 struct brw_reg reg;
1001
1002 if (src->SrcRegister.File == TGSI_FILE_NULL)
1003 return brw_null_reg();
1004
1005 reg = get_src_reg(c, argIndex,
1006 src->SrcRegister.File,
1007 src->SrcRegister.Index,
1008 src->SrcRegister.Indirect);
1009
1010 /* Convert 3-bit swizzle to 2-bit.
1011 */
1012 reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->SrcRegister.SwizzleX,
1013 src->SrcRegister.SwizzleY,
1014 src->SrcRegister.SwizzleZ,
1015 src->SrcRegister.SwizzleW);
1016
1017 reg.negate = src->SrcRegister.Negate ? 1 : 0;
1018
1019 /* XXX: abs, absneg
1020 */
1021
1022 return reg;
1023 }
1024
1025
1026 /**
1027 * Get brw register for the given program dest register.
1028 */
1029 static struct brw_reg get_dst( struct brw_vs_compile *c,
1030 unsigned file,
1031 unsigned index,
1032 unsigned writemask )
1033 {
1034 struct brw_reg reg;
1035
1036 switch (file) {
1037 case TGSI_FILE_TEMPORARY:
1038 case TGSI_FILE_OUTPUT:
1039 assert(c->regs[file][index].nr != 0);
1040 reg = c->regs[file][index];
1041 break;
1042 case TGSI_FILE_ADDRESS:
1043 assert(index == 0);
1044 reg = c->regs[file][index];
1045 break;
1046 case TGSI_FILE_NULL:
1047 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1048 reg = brw_null_reg();
1049 break;
1050 default:
1051 assert(0);
1052 reg = brw_null_reg();
1053 }
1054
1055 reg.dw1.bits.writemask = writemask;
1056
1057 return reg;
1058 }
1059
1060
1061
1062
1063 /**
1064 * Post-vertex-program processing. Send the results to the URB.
1065 */
1066 static void emit_vertex_write( struct brw_vs_compile *c)
1067 {
1068 struct brw_compile *p = &c->func;
1069 struct brw_reg m0 = brw_message_reg(0);
1070 struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_HPOS];
1071 struct brw_reg ndc;
1072 int eot;
1073 GLuint len_vertext_header = 2;
1074
1075 if (c->key.copy_edgeflag) {
1076 assert(0);
1077 brw_MOV(p,
1078 get_reg(c, TGSI_FILE_OUTPUT, 0),
1079 get_reg(c, TGSI_FILE_INPUT, 0));
1080 }
1081
1082 /* Build ndc coords */
1083 ndc = get_tmp(c);
1084 /* ndc = 1.0 / pos.w */
1085 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1086 /* ndc.xyz = pos * ndc */
1087 brw_MUL(p, brw_writemask(ndc, BRW_WRITEMASK_XYZ), pos, ndc);
1088
1089 /* Update the header for point size, user clipping flags, and -ve rhw
1090 * workaround.
1091 */
1092 if (c->prog_data.writes_psiz ||
1093 c->key.nr_userclip ||
1094 c->chipset.is_965)
1095 {
1096 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1097 GLuint i;
1098
1099 brw_MOV(p, header1, brw_imm_ud(0));
1100
1101 brw_set_access_mode(p, BRW_ALIGN_16);
1102
1103 if (c->prog_data.writes_psiz) {
1104 struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_PSIZ];
1105 brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1106 brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1107 }
1108
1109 for (i = 0; i < c->key.nr_userclip; i++) {
1110 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1111 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1112 brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<i));
1113 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1114 }
1115
1116 /* i965 clipping workaround:
1117 * 1) Test for -ve rhw
1118 * 2) If set,
1119 * set ndc = (0,0,0,0)
1120 * set ucp[6] = 1
1121 *
1122 * Later, clipping will detect ucp[6] and ensure the primitive is
1123 * clipped against all fixed planes.
1124 */
1125 if (c->chipset.is_965) {
1126 brw_CMP(p,
1127 vec8(brw_null_reg()),
1128 BRW_CONDITIONAL_L,
1129 brw_swizzle1(ndc, 3),
1130 brw_imm_f(0));
1131
1132 brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<6));
1133 brw_MOV(p, ndc, brw_imm_f(0));
1134 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1135 }
1136
1137 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1138 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1139 brw_set_access_mode(p, BRW_ALIGN_16);
1140
1141 release_tmp(c, header1);
1142 }
1143 else {
1144 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1145 }
1146
1147 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1148 * of zeros followed by two sets of NDC coordinates:
1149 */
1150 brw_set_access_mode(p, BRW_ALIGN_1);
1151 brw_MOV(p, offset(m0, 2), ndc);
1152
1153 if (c->chipset.is_igdng) {
1154 /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
1155 brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
1156 /* m4, m5 contain the distances from vertex to the user clip planeXXX.
1157 * Seems it is useless for us.
1158 * m6 is used for aligning, so that the remainder of vertex element is
1159 * reg-aligned.
1160 */
1161 brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
1162 len_vertext_header = 6;
1163 } else {
1164 brw_MOV(p, offset(m0, 3), pos);
1165 len_vertext_header = 2;
1166 }
1167
1168 eot = (c->first_overflow_output == 0);
1169
1170 brw_urb_WRITE(p,
1171 brw_null_reg(), /* dest */
1172 0, /* starting mrf reg nr */
1173 c->r0, /* src */
1174 0, /* allocate */
1175 1, /* used */
1176 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
1177 0, /* response len */
1178 eot, /* eot */
1179 eot, /* writes complete */
1180 0, /* urb destination offset */
1181 BRW_URB_SWIZZLE_INTERLEAVE);
1182
1183 if (c->first_overflow_output > 0) {
1184 /* Not all of the vertex outputs/results fit into the MRF.
1185 * Move the overflowed attributes from the GRF to the MRF and
1186 * issue another brw_urb_WRITE().
1187 */
1188 /* XXX I'm not 100% sure about which MRF regs to use here. Starting
1189 * at mrf[4] atm...
1190 */
1191 GLuint i, mrf = 0;
1192 for (i = c->first_overflow_output; i < c->prog_data.nr_outputs; i++) {
1193 /* move from GRF to MRF */
1194 brw_MOV(p, brw_message_reg(4+mrf), c->regs[TGSI_FILE_OUTPUT][i]);
1195 mrf++;
1196 }
1197
1198 brw_urb_WRITE(p,
1199 brw_null_reg(), /* dest */
1200 4, /* starting mrf reg nr */
1201 c->r0, /* src */
1202 0, /* allocate */
1203 1, /* used */
1204 mrf+1, /* msg len */
1205 0, /* response len */
1206 1, /* eot */
1207 1, /* writes complete */
1208 BRW_MAX_MRF-1, /* urb destination offset */
1209 BRW_URB_SWIZZLE_INTERLEAVE);
1210 }
1211 }
1212
1213
1214 /**
1215 * Called after code generation to resolve subroutine calls and the
1216 * END instruction.
1217 * \param end_inst points to brw code for END instruction
1218 * \param last_inst points to last instruction emitted before vertex write
1219 */
1220 static void
1221 post_vs_emit( struct brw_vs_compile *c,
1222 struct brw_instruction *end_inst,
1223 struct brw_instruction *last_inst )
1224 {
1225 GLint offset;
1226
1227 brw_resolve_cals(&c->func);
1228
1229 /* patch up the END code to jump past subroutines, etc */
1230 offset = last_inst - end_inst;
1231 if (offset > 1) {
1232 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1233 } else {
1234 end_inst->header.opcode = BRW_OPCODE_NOP;
1235 }
1236 }
1237
1238 static uint32_t
1239 get_predicate(const struct tgsi_full_instruction *inst)
1240 {
1241 /* XXX: disabling for now
1242 */
1243 #if 0
1244 if (inst->dst.CondMask == COND_TR)
1245 return BRW_PREDICATE_NONE;
1246
1247 /* All of GLSL only produces predicates for COND_NE and one channel per
1248 * vector. Fail badly if someone starts doing something else, as it might
1249 * mean infinite looping or something.
1250 *
1251 * We'd like to support all the condition codes, but our hardware doesn't
1252 * quite match the Mesa IR, which is modeled after the NV extensions. For
1253 * those, the instruction may update the condition codes or not, then any
1254 * later instruction may use one of those condition codes. For gen4, the
1255 * instruction may update the flags register based on one of the condition
1256 * codes output by the instruction, and then further instructions may
1257 * predicate on that. We can probably support this, but it won't
1258 * necessarily be easy.
1259 */
1260 /* assert(inst->dst.CondMask == COND_NE); */
1261
1262 switch (inst->dst.CondSwizzle) {
1263 case SWIZZLE_XXXX:
1264 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1265 case SWIZZLE_YYYY:
1266 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1267 case SWIZZLE_ZZZZ:
1268 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1269 case SWIZZLE_WWWW:
1270 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1271 default:
1272 debug_printf("Unexpected predicate: 0x%08x\n",
1273 inst->dst.CondMask);
1274 return BRW_PREDICATE_NORMAL;
1275 }
1276 #else
1277 return BRW_PREDICATE_NORMAL;
1278 #endif
1279 }
1280
1281 static void emit_insn(struct brw_vs_compile *c,
1282 const struct tgsi_full_instruction *inst)
1283 {
1284 unsigned opcode = inst->Instruction.Opcode;
1285 unsigned label = inst->InstructionExtLabel.Label;
1286 struct brw_compile *p = &c->func;
1287 struct brw_reg args[3], dst;
1288 GLuint i;
1289
1290 #if 0
1291 printf("%d: ", insn);
1292 _mesa_print_instruction(inst);
1293 #endif
1294
1295 /* Get argument regs.
1296 */
1297 for (i = 0; i < 3; i++) {
1298 args[i] = get_arg(c, &inst->FullSrcRegisters[i], i);
1299 }
1300
1301 /* Get dest regs. Note that it is possible for a reg to be both
1302 * dst and arg, given the static allocation of registers. So
1303 * care needs to be taken emitting multi-operation instructions.
1304 */
1305 dst = get_dst(c,
1306 inst->FullDstRegisters[0].DstRegister.File,
1307 inst->FullDstRegisters[0].DstRegister.Index,
1308 inst->FullDstRegisters[0].DstRegister.WriteMask);
1309
1310 /* XXX: saturate
1311 */
1312 if (inst->Instruction.Saturate != TGSI_SAT_NONE) {
1313 debug_printf("Unsupported saturate in vertex shader");
1314 }
1315
1316 switch (opcode) {
1317 case TGSI_OPCODE_ABS:
1318 brw_MOV(p, dst, brw_abs(args[0]));
1319 break;
1320 case TGSI_OPCODE_ADD:
1321 brw_ADD(p, dst, args[0], args[1]);
1322 break;
1323 case TGSI_OPCODE_COS:
1324 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1325 break;
1326 case TGSI_OPCODE_DP3:
1327 brw_DP3(p, dst, args[0], args[1]);
1328 break;
1329 case TGSI_OPCODE_DP4:
1330 brw_DP4(p, dst, args[0], args[1]);
1331 break;
1332 case TGSI_OPCODE_DPH:
1333 brw_DPH(p, dst, args[0], args[1]);
1334 break;
1335 case TGSI_OPCODE_NRM:
1336 emit_nrm(c, dst, args[0], 3);
1337 break;
1338 case TGSI_OPCODE_NRM4:
1339 emit_nrm(c, dst, args[0], 4);
1340 break;
1341 case TGSI_OPCODE_DST:
1342 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1343 break;
1344 case TGSI_OPCODE_EXP:
1345 unalias1(c, dst, args[0], emit_exp_noalias);
1346 break;
1347 case TGSI_OPCODE_EX2:
1348 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1349 break;
1350 case TGSI_OPCODE_ARL:
1351 emit_arl(c, dst, args[0]);
1352 break;
1353 case TGSI_OPCODE_FLR:
1354 brw_RNDD(p, dst, args[0]);
1355 break;
1356 case TGSI_OPCODE_FRC:
1357 brw_FRC(p, dst, args[0]);
1358 break;
1359 case TGSI_OPCODE_LOG:
1360 unalias1(c, dst, args[0], emit_log_noalias);
1361 break;
1362 case TGSI_OPCODE_LG2:
1363 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1364 break;
1365 case TGSI_OPCODE_LIT:
1366 unalias1(c, dst, args[0], emit_lit_noalias);
1367 break;
1368 case TGSI_OPCODE_LRP:
1369 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1370 break;
1371 case TGSI_OPCODE_MAD:
1372 brw_MOV(p, brw_acc_reg(), args[2]);
1373 brw_MAC(p, dst, args[0], args[1]);
1374 break;
1375 case TGSI_OPCODE_MAX:
1376 emit_max(p, dst, args[0], args[1]);
1377 break;
1378 case TGSI_OPCODE_MIN:
1379 emit_min(p, dst, args[0], args[1]);
1380 break;
1381 case TGSI_OPCODE_MOV:
1382 brw_MOV(p, dst, args[0]);
1383 break;
1384 case TGSI_OPCODE_MUL:
1385 brw_MUL(p, dst, args[0], args[1]);
1386 break;
1387 case TGSI_OPCODE_POW:
1388 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1389 break;
1390 case TGSI_OPCODE_RCP:
1391 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1392 break;
1393 case TGSI_OPCODE_RSQ:
1394 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1395 break;
1396 case TGSI_OPCODE_SEQ:
1397 emit_seq(p, dst, args[0], args[1]);
1398 break;
1399 case TGSI_OPCODE_SIN:
1400 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1401 break;
1402 case TGSI_OPCODE_SNE:
1403 emit_sne(p, dst, args[0], args[1]);
1404 break;
1405 case TGSI_OPCODE_SGE:
1406 emit_sge(p, dst, args[0], args[1]);
1407 break;
1408 case TGSI_OPCODE_SGT:
1409 emit_sgt(p, dst, args[0], args[1]);
1410 break;
1411 case TGSI_OPCODE_SLT:
1412 emit_slt(p, dst, args[0], args[1]);
1413 break;
1414 case TGSI_OPCODE_SLE:
1415 emit_sle(p, dst, args[0], args[1]);
1416 break;
1417 case TGSI_OPCODE_SUB:
1418 brw_ADD(p, dst, args[0], negate(args[1]));
1419 break;
1420 case TGSI_OPCODE_TRUNC:
1421 /* round toward zero */
1422 brw_RNDZ(p, dst, args[0]);
1423 break;
1424 case TGSI_OPCODE_XPD:
1425 emit_xpd(p, dst, args[0], args[1]);
1426 break;
1427 case TGSI_OPCODE_IF:
1428 assert(c->if_depth < MAX_IF_DEPTH);
1429 c->if_inst[c->if_depth] = brw_IF(p, BRW_EXECUTE_8);
1430 /* Note that brw_IF smashes the predicate_control field. */
1431 c->if_inst[c->if_depth]->header.predicate_control = get_predicate(inst);
1432 c->if_depth++;
1433 break;
1434 case TGSI_OPCODE_ELSE:
1435 c->if_inst[c->if_depth-1] = brw_ELSE(p, c->if_inst[c->if_depth-1]);
1436 break;
1437 case TGSI_OPCODE_ENDIF:
1438 assert(c->if_depth > 0);
1439 brw_ENDIF(p, c->if_inst[--c->if_depth]);
1440 break;
1441 case TGSI_OPCODE_BGNLOOP:
1442 c->loop_inst[c->loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1443 break;
1444 case TGSI_OPCODE_BRK:
1445 brw_set_predicate_control(p, get_predicate(inst));
1446 brw_BREAK(p);
1447 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1448 break;
1449 case TGSI_OPCODE_CONT:
1450 brw_set_predicate_control(p, get_predicate(inst));
1451 brw_CONT(p);
1452 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1453 break;
1454 case TGSI_OPCODE_ENDLOOP:
1455 {
1456 struct brw_instruction *inst0, *inst1;
1457 GLuint br = 1;
1458
1459 c->loop_depth--;
1460
1461 if (c->chipset.is_igdng)
1462 br = 2;
1463
1464 inst0 = inst1 = brw_WHILE(p, c->loop_inst[c->loop_depth]);
1465 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1466 while (inst0 > c->loop_inst[c->loop_depth]) {
1467 inst0--;
1468 if (inst0->header.opcode == TGSI_OPCODE_BRK) {
1469 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1470 inst0->bits3.if_else.pop_count = 0;
1471 }
1472 else if (inst0->header.opcode == TGSI_OPCODE_CONT) {
1473 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1474 inst0->bits3.if_else.pop_count = 0;
1475 }
1476 }
1477 }
1478 break;
1479 case TGSI_OPCODE_BRA:
1480 brw_set_predicate_control(p, get_predicate(inst));
1481 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1482 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1483 break;
1484 case TGSI_OPCODE_CAL:
1485 brw_set_access_mode(p, BRW_ALIGN_1);
1486 brw_ADD(p, deref_1d(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1487 brw_set_access_mode(p, BRW_ALIGN_16);
1488 brw_ADD(p, get_addr_reg(c->stack_index),
1489 get_addr_reg(c->stack_index), brw_imm_d(4));
1490 brw_save_call(p, label, p->nr_insn);
1491 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1492 break;
1493 case TGSI_OPCODE_RET:
1494 brw_ADD(p, get_addr_reg(c->stack_index),
1495 get_addr_reg(c->stack_index), brw_imm_d(-4));
1496 brw_set_access_mode(p, BRW_ALIGN_1);
1497 brw_MOV(p, brw_ip_reg(), deref_1d(c->stack_index, 0));
1498 brw_set_access_mode(p, BRW_ALIGN_16);
1499 break;
1500 case TGSI_OPCODE_END:
1501 c->end_offset = p->nr_insn;
1502 /* this instruction will get patched later to jump past subroutine
1503 * code, etc.
1504 */
1505 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1506 break;
1507 case TGSI_OPCODE_BGNSUB:
1508 brw_save_label(p, p->nr_insn, p->nr_insn);
1509 break;
1510 case TGSI_OPCODE_ENDSUB:
1511 /* no-op */
1512 break;
1513 default:
1514 debug_printf("Unsupported opcode %i (%s) in vertex shader",
1515 opcode,
1516 tgsi_get_opcode_name(opcode));
1517 }
1518
1519 /* Set the predication update on the last instruction of the native
1520 * instruction sequence.
1521 *
1522 * This would be problematic if it was set on a math instruction,
1523 * but that shouldn't be the case with the current GLSL compiler.
1524 */
1525 #if 0
1526 /* XXX: disabled
1527 */
1528 if (inst->CondUpdate) {
1529 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1530
1531 assert(hw_insn->header.destreg__conditionalmod == 0);
1532 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1533 }
1534 #endif
1535
1536 release_tmps(c);
1537 }
1538
1539
1540 /* Emit the vertex program instructions here.
1541 */
1542 void brw_vs_emit(struct brw_vs_compile *c)
1543 {
1544 struct brw_compile *p = &c->func;
1545 const struct tgsi_token *tokens = c->vp->tokens;
1546 struct brw_instruction *end_inst, *last_inst;
1547 struct tgsi_parse_context parse;
1548 struct tgsi_full_instruction *inst;
1549 boolean done = FALSE;
1550 int i;
1551
1552 if (BRW_DEBUG & DEBUG_VS)
1553 tgsi_dump(c->vp->tokens, 0);
1554
1555 c->stack_index = brw_indirect(0, 0);
1556
1557 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1558 brw_set_access_mode(p, BRW_ALIGN_16);
1559
1560 /* Inputs */
1561 tgsi_parse_init( &parse, tokens );
1562 while( !tgsi_parse_end_of_tokens( &parse ) ) {
1563 tgsi_parse_token( &parse );
1564
1565 switch( parse.FullToken.Token.Type ) {
1566 case TGSI_TOKEN_TYPE_DECLARATION:
1567 /* Nothing to do -- using info from tgsi_scan().
1568 */
1569 break;
1570
1571 case TGSI_TOKEN_TYPE_IMMEDIATE: {
1572 static const float id[4] = {0,0,0,1};
1573 const float *imm = &parse.FullToken.FullImmediate.u[i].Float;
1574 unsigned size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
1575
1576 for (i = 0; i < size; i++)
1577 c->immediate[c->nr_immediates][i] = imm[i];
1578
1579 for ( ; i < 4; i++)
1580 c->immediate[c->nr_immediates][i] = id[i];
1581
1582 c->nr_immediates++;
1583 break;
1584 }
1585
1586 case TGSI_TOKEN_TYPE_INSTRUCTION:
1587 done = 1;
1588 break;
1589 }
1590 }
1591
1592 /* Static register allocation
1593 */
1594 brw_vs_alloc_regs(c);
1595 brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
1596
1597 /* Instructions
1598 */
1599 tgsi_parse_init( &parse, tokens );
1600 while( !tgsi_parse_end_of_tokens( &parse ) ) {
1601 tgsi_parse_token( &parse );
1602
1603 switch( parse.FullToken.Token.Type ) {
1604 case TGSI_TOKEN_TYPE_DECLARATION:
1605 case TGSI_TOKEN_TYPE_IMMEDIATE:
1606 break;
1607
1608 case TGSI_TOKEN_TYPE_INSTRUCTION:
1609 inst = &parse.FullToken.FullInstruction;
1610 emit_insn( c, inst );
1611 break;
1612
1613 default:
1614 assert( 0 );
1615 }
1616 }
1617 tgsi_parse_free( &parse );
1618
1619 end_inst = &p->store[c->end_offset];
1620 last_inst = &p->store[p->nr_insn];
1621
1622 /* The END instruction will be patched to jump to this code */
1623 emit_vertex_write(c);
1624
1625 post_vs_emit(c, end_inst, last_inst);
1626
1627 if (BRW_DEBUG & DEBUG_VS) {
1628 debug_printf("vs-native:\n");
1629 brw_disasm(stderr, p->store, p->nr_insn);
1630 }
1631 }