Merge branch 'gallium-edgeflags'
[mesa.git] / src / gallium / drivers / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32 #include "pipe/p_shader_tokens.h"
33
34 #include "util/u_memory.h"
35 #include "util/u_math.h"
36
37 #include "tgsi/tgsi_parse.h"
38 #include "tgsi/tgsi_dump.h"
39 #include "tgsi/tgsi_info.h"
40
41 #include "brw_context.h"
42 #include "brw_vs.h"
43 #include "brw_debug.h"
44 #include "brw_disasm.h"
45
46 /* Choose one of the 4 vec4's which can be packed into each 16-wide reg.
47 */
48 static INLINE struct brw_reg brw_vec4_grf_repeat( GLuint reg, GLuint slot )
49 {
50 int nr = reg + slot/2;
51 int subnr = (slot%2) * 4;
52
53 return stride(brw_vec4_grf(nr, subnr), 0, 4, 1);
54 }
55
56
57 static struct brw_reg get_tmp( struct brw_vs_compile *c )
58 {
59 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
60
61 if (++c->last_tmp > c->prog_data.total_grf)
62 c->prog_data.total_grf = c->last_tmp;
63
64 return tmp;
65 }
66
67 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
68 {
69 if (tmp.nr == c->last_tmp-1)
70 c->last_tmp--;
71 }
72
/* Free all scratch registers at once, returning the allocator to the
 * state left by brw_vs_alloc_regs().
 */
static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}
77
78
79 static boolean is_position_output( struct brw_vs_compile *c,
80 unsigned vs_output )
81 {
82 struct brw_vertex_shader *vs = c->vp;
83
84 if (vs_output == c->prog_data.output_edgeflag) {
85 return FALSE;
86 }
87 else {
88 unsigned semantic = vs->info.output_semantic_name[vs_output];
89 unsigned index = vs->info.output_semantic_index[vs_output];
90
91 return (semantic == TGSI_SEMANTIC_POSITION &&
92 index == 0);
93 }
94 }
95
96
97 static boolean find_output_slot( struct brw_vs_compile *c,
98 unsigned vs_output,
99 unsigned *fs_input_slot )
100 {
101 struct brw_vertex_shader *vs = c->vp;
102
103 if (vs_output == c->prog_data.output_edgeflag) {
104 *fs_input_slot = c->key.fs_signature.nr_inputs;
105 return TRUE;
106 }
107 else {
108 unsigned semantic = vs->info.output_semantic_name[vs_output];
109 unsigned index = vs->info.output_semantic_index[vs_output];
110 unsigned i;
111
112 for (i = 0; i < c->key.fs_signature.nr_inputs; i++) {
113 if (c->key.fs_signature.input[i].semantic == semantic &&
114 c->key.fs_signature.input[i].semantic_index == index) {
115 *fs_input_slot = i;
116 return TRUE;
117 }
118 }
119 }
120
121 return FALSE;
122 }
123
124
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Resulting GRF layout, in order: r0 (fixed), curbe (clip planes,
 * immediates, optionally cached constants), vertex inputs, overflow
 * output space, the position output, program temporaries, address
 * reg(s), constant-buffer staging regs, flow-control stack, and
 * finally the scratch area handed out by get_tmp().
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   GLuint i, reg = 0, subreg = 0, mrf;
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1 +
       c->vp->info.file_max[TGSI_FILE_IMMEDIATE] + 1 +
       c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 1 + 21 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else {
      /* XXX: immediates can go elsewhere if necessary:
       */
      assert(c->vp->info.file_max[TGSI_FILE_IMMEDIATE] + 1 +
             c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 1 + 21 <= BRW_MAX_GRF);

      c->vp->use_const_buffer = GL_FALSE;
   }

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      /* Skip over fixed planes:  Or never read them into vs unit?
       */
      subreg += 6;

      for (i = 0; i < c->key.nr_userclip; i++, subreg++) {
         c->userplane[i] =
            stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      subreg = align(subreg, 2);
      /*reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;*/
   }


   /* Immediates: always in the curbe.
    *
    * XXX: Can try to encode some immediates as brw immediates
    * XXX: Make sure ureg sets minimal immediate size and respect it
    * here.
    */
   for (i = 0; i < c->vp->info.immediate_count; i++, subreg++) {
      c->regs[TGSI_FILE_IMMEDIATE][i] =
         stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
   }
   c->prog_data.nr_params = c->vp->info.immediate_count * 4;


   /* Vertex constant buffer.
    *
    * Constants from the buffer can be either cached in the curbe or
    * loaded as needed from the actual constant buffer.
    */
   if (!c->vp->use_const_buffer) {
      GLuint nr_params = c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1;

      for (i = 0; i < nr_params; i++, subreg++) {
         c->regs[TGSI_FILE_CONSTANT][i] =
            stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
      }

      c->prog_data.nr_params += nr_params * 4;
   }

   /* All regs allocated
    */
   reg += (subreg + 1) / 2;
   c->prog_data.curb_read_length = reg - 1;


   /* Allocate input regs:
    */
   c->nr_inputs = c->vp->info.num_inputs;
   for (i = 0; i < c->nr_inputs; i++) {
      c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;



   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = c->prog_data.nr_outputs;

   /* Base MRF for output slots; IGDNG reserves more header space. */
   if (c->chipset.is_igdng)
      mrf = 8;
   else
      mrf = 4;


   /* Outputs that don't fit in the MRFs spill into GRFs here. */
   if (c->key.fs_signature.nr_inputs > BRW_MAX_MRF) {
      c->overflow_grf_start = reg;
      c->overflow_count = c->key.fs_signature.nr_inputs - BRW_MAX_MRF;
      reg += c->overflow_count;
   }

   /* XXX: need to access vertex output semantics here:
    */
   for (i = 0; i < c->nr_outputs; i++) {
      unsigned slot;

      /* XXX: Put output position in slot zero always.  Clipper, etc,
       * need access to this reg.
       */
      if (is_position_output(c, i)) {
         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); /* copy to mrf 0 */
         reg++;
      }
      else if (find_output_slot(c, i, &slot)) {

         if (0 /* is_psize_output(c, i) */ ) {
            /* c->psize_out.grf = reg; */
            /* c->psize_out.mrf = i; */
         }

         /* The first (16-4) outputs can go straight into the message regs.
          */
         if (slot + mrf < BRW_MAX_MRF) {
            c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(slot + mrf);
         }
         else {
            int grf = c->overflow_grf_start + slot - BRW_MAX_MRF;
            c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(grf, 0);
         }
      }
      else {
         /* Output not consumed by the fragment shader: discard writes. */
         c->regs[TGSI_FILE_OUTPUT][i] = brw_null_reg();
      }
   }

   /* Allocate program temporaries:
    */

   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1; i++) {
      c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_ADDRESS]+1; i++) {
      c->regs[TGSI_FILE_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
                                              reg,
                                              0,
                                              BRW_REGISTER_TYPE_D,
                                              BRW_VERTICAL_STRIDE_8,
                                              BRW_WIDTH_8,
                                              BRW_HORIZONTAL_STRIDE_1,
                                              BRW_SWIZZLE_XXXX,
                                              BRW_WRITEMASK_X);
      reg++;
   }

   /* Staging registers used by get_constant() for each of the (up to
    * three) source arguments; index -1 marks the cache as empty.
    */
   if (c->vp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

#if 0
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
#endif

   if (c->vp->has_flow_control) {
      c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;              /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;

   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   if (c->chipset.is_igdng)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (BRW_DEBUG & DEBUG_VS) {
      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__,
                   c->vp->info.file_max[TGSI_FILE_ADDRESS]+1);
      debug_printf("%s NumTemps %d\n", __FUNCTION__,
                   c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1);
      debug_printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
364
365
366 /**
367 * If an instruction uses a temp reg both as a src and the dest, we
368 * sometimes need to allocate an intermediate temporary.
369 */
370 static void unalias1( struct brw_vs_compile *c,
371 struct brw_reg dst,
372 struct brw_reg arg0,
373 void (*func)( struct brw_vs_compile *,
374 struct brw_reg,
375 struct brw_reg ))
376 {
377 if (dst.file == arg0.file && dst.nr == arg0.nr) {
378 struct brw_compile *p = &c->func;
379 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
380 func(c, tmp, arg0);
381 brw_MOV(p, dst, tmp);
382 release_tmp(c, tmp);
383 }
384 else {
385 func(c, dst, arg0);
386 }
387 }
388
389 /**
390 * \sa unalias2
391 * Checkes if 2-operand instruction needs an intermediate temporary.
392 */
393 static void unalias2( struct brw_vs_compile *c,
394 struct brw_reg dst,
395 struct brw_reg arg0,
396 struct brw_reg arg1,
397 void (*func)( struct brw_vs_compile *,
398 struct brw_reg,
399 struct brw_reg,
400 struct brw_reg ))
401 {
402 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
403 (dst.file == arg1.file && dst.nr == arg1.nr)) {
404 struct brw_compile *p = &c->func;
405 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
406 func(c, tmp, arg0, arg1);
407 brw_MOV(p, dst, tmp);
408 release_tmp(c, tmp);
409 }
410 else {
411 func(c, dst, arg0, arg1);
412 }
413 }
414
415 /**
416 * \sa unalias2
417 * Checkes if 3-operand instruction needs an intermediate temporary.
418 */
419 static void unalias3( struct brw_vs_compile *c,
420 struct brw_reg dst,
421 struct brw_reg arg0,
422 struct brw_reg arg1,
423 struct brw_reg arg2,
424 void (*func)( struct brw_vs_compile *,
425 struct brw_reg,
426 struct brw_reg,
427 struct brw_reg,
428 struct brw_reg ))
429 {
430 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
431 (dst.file == arg1.file && dst.nr == arg1.nr) ||
432 (dst.file == arg2.file && dst.nr == arg2.nr)) {
433 struct brw_compile *p = &c->func;
434 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
435 func(c, tmp, arg0, arg1, arg2);
436 brw_MOV(p, dst, tmp);
437 release_tmp(c, tmp);
438 }
439 else {
440 func(c, dst, arg0, arg1, arg2);
441 }
442 }
443
/**
 * Common body of the TGSI "set on condition" opcodes:
 *   dst.chan = (arg0.chan <cond> arg1.chan) ? 1.0 : 0.0
 *
 * NOTE(review): this relies on the brw_CMP helper leaving the
 * following MOV predicated on the comparison result, so only passing
 * channels are overwritten with 1.0; the last call restores the flag
 * value for subsequent instructions -- confirm against the brw_eu
 * emit helpers before reordering anything here.
 */
static void emit_sop( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
		      GLuint cond)
{
   brw_MOV(p, dst, brw_imm_f(0.0f));            /* default: all channels 0.0 */
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));            /* predicated: winners -> 1.0 */
   brw_set_predicate_control_flag_value(p, 0xff);
}
455
/** SEQ: dst = (arg0 == arg1) ? 1.0 : 0.0, per channel. */
static void emit_seq( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}
463
/** SNE: dst = (arg0 != arg1) ? 1.0 : 0.0, per channel. */
static void emit_sne( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/** SLT: dst = (arg0 < arg1) ? 1.0 : 0.0, per channel. */
static void emit_slt( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
}
478
/** SLE: dst = (arg0 <= arg1) ? 1.0 : 0.0, per channel. */
static void emit_sle( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}
486
/** SGT: dst = (arg0 > arg1) ? 1.0 : 0.0, per channel. */
static void emit_sgt( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
}
494
/** SGE: dst = (arg0 >= arg1) ? 1.0 : 0.0, per channel. */
static void emit_sge( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
502
/** dst = componentwise max(arg0, arg1). */
static void emit_max( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   /* Flag channels where arg0 < arg1, select the larger operand per
    * channel, then drop the predication so later code is unaffected.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg1, arg0);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
512
/** dst = componentwise min(arg0, arg1). */
static void emit_min( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   /* Same shape as emit_max() with the SEL operands swapped: channels
    * where arg0 < arg1 keep arg0, the rest take arg1.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
522
523
/**
 * Emit a single-source extended-math operation (EXP, LOG, RSQ, ...).
 * \param function   one of the BRW_MATH_FUNCTION_* codes
 * \param precision  BRW_MATH_PRECISION_FULL or _PARTIAL
 */
static void emit_math1( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   /* Route the result through a scratch GRF whenever dst is partially
    * write-masked or is not a GRF at all.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
559
560
/**
 * Emit a two-source extended-math operation (e.g. POW).  The second
 * operand is staged in message register m3 before the math SEND;
 * brw_math() handles the delivery of arg0.
 */
static void emit_math2( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			struct brw_reg arg1,
			GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   /* As in emit_math1(): use a scratch GRF when dst is write-masked
    * or not a GRF.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
592
593
/**
 * Expand the TGSI EXP opcode ("noalias": dst must not overlap arg0):
 *   dst.x = 2 ^ floor(arg0.x)  (built by packing the FP exponent field)
 *   dst.y = arg0.x - floor(arg0.x)
 *   dst.z = 2 ^ arg0.x         (full-precision mathbox call)
 *   dst.w = 1.0
 * Each channel is computed only if enabled in dst's writemask.
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, BRW_WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), BRW_WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, BRW_WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), brw_imm_f(1));
   }
}
649
650
/**
 * Expand the TGSI LOG opcode ("noalias": dst must not overlap arg0):
 *   dst.x = unbiased exponent of arg0.x
 *   dst.y = mantissa of arg0.x, re-packed as a float in [1, 2)
 *   dst.z = log2(arg0.x) = dst.x + LOG2(dst.y)  (mathbox)
 *   dst.w = 1.0
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* Work in a scratch GRF when dst is write-masked or not a GRF,
    * since the steps below read back their own intermediate results.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
    * according to spec:
    *
    * These almost look like they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_XZ) {
      /* Mask off the sign bit, then shift the exponent field down... */
      brw_AND(p,
	      brw_writemask(tmp_ud, BRW_WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, BRW_WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      /* ...and remove the bias to get the true exponent. */
      brw_ADD(p,
	      brw_writemask(tmp, BRW_WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_YZ) {
      /* Keep the mantissa bits and splice in a zero exponent (bias
       * 127), producing a float in [1, 2).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, BRW_WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, BRW_WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
740
741
/* Expand TGSI DST (distance vector):
 *   dst = { 1.0, arg0.y * arg1.y, arg0.z, arg1.w }
 *
 * Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, BRW_WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), arg1);
}
762
763
/**
 * Cross product dst = t x u, computed with a MUL into the accumulator
 * followed by a multiply-accumulate:
 *   dst = t.yzxw * u.zxyw - t.zxyw * u.yzxw
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
772
773
/**
 * Expand TGSI LIT (lighting coefficients):
 *   dst.x = 1.0
 *   dst.y = max(arg0.x, 0)
 *   dst.z = (arg0.x > 0) ? max(arg0.y, 0) ^ arg0.w : 0
 *   dst.w = 1.0
 *
 * NOTE(review): in the need_tmp path the pre-zeroing below writes dst,
 * not tmp, yet the POW reads tmp.z which is only conditionally written
 * inside the IF -- looks like tmp.z may be read uninitialized when
 * arg0.y <= 0; confirm before relying on that channel.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0,0));

      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, BRW_WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   /* Harmless when tmp == dst: release_tmp() only frees the newest
    * scratch register.
    */
   release_tmp(c, tmp);
}
815
/**
 * Linear interpolation: dst = arg0 * arg1 + (1 - arg0) * arg2.
 * Uses the accumulator (MUL into null, then MAC), so dst must not
 * alias any source ("noalias").
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));  /* dst = 1 - arg0 */
   brw_MUL(p, brw_null_reg(), dst, arg2);          /* acc = (1-arg0)*arg2 */
   brw_MAC(p, dst, arg0, arg1);                    /* dst = arg0*arg1 + acc */
}
828
/** 3 or 4-component vector normalization: dst = arg0 / |arg0|.
 * \param num_comps  3 selects DP3, anything else DP4
 */
static void emit_nrm( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      int num_comps)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);

   /* tmp = dot(arg0, arg0) */
   if (num_comps == 3)
      brw_DP3(p, tmp, arg0, arg0);
   else
      brw_DP4(p, tmp, arg0, arg0);

   /* tmp = 1 / sqrt(tmp) */
   emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);

   /* dst = arg0 * tmp */
   brw_MUL(p, dst, arg0, tmp);

   release_tmp(c, tmp);
}
852
853
/**
 * Fetch a vec4 constant from the bound constant buffer into the
 * per-argument staging register allocated in brw_vs_alloc_regs().
 * A one-entry cache per argIndex skips the read when the same index
 * was fetched last time (never for relative addressing).
 *
 * With relative addressing two owords are read -- one per half of the
 * SIMD8 pair, using the low and high halves of the address register --
 * and merged, since the two vertices may index different constants.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             GLuint argIndex,
             GLuint index,
             GLboolean relAddr)
{
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg;
   struct brw_reg const2_reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != index || relAddr) {
      struct brw_reg addrReg = c->regs[TGSI_FILE_ADDRESS][0];

      c->current_const[argIndex].index = index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src.Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       c->current_const[argIndex].reg,/* writeback dest */
                       0,                             /* oword */
                       relAddr,                       /* relative indexing? */
                       addrReg,                       /* address register */
                       16 * index,                    /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );

      if (relAddr) {
         /* second read */
         const2_reg = get_tmp(c);

         /* use upper half of address reg for second read */
         addrReg = stride(addrReg, 0, 4, 0);
         addrReg.subnr = 16;

         brw_dp_READ_4_vs(p,
                          const2_reg,              /* writeback dest */
                          1,                       /* oword */
                          relAddr,                 /* relative indexing? */
                          addrReg,                 /* address register */
                          16 * index,              /* byte offset */
                          SURF_INDEX_VERT_CONST_BUFFER
                          );
      }
   }

   const_reg = c->current_const[argIndex].reg;

   if (relAddr) {
      /* merge the two Owords into the constant register */
      /* const_reg[7..4] = const2_reg[7..4] */
      brw_MOV(p,
              suboffset(stride(const_reg, 0, 4, 1), 4),
              suboffset(stride(const2_reg, 0, 4, 1), 4));
      release_tmp(c, const2_reg);
   }
   else {
      /* replicate lower four floats into upper half (to get XYZWXYZW) */
      const_reg = stride(const_reg, 0, 4, 0);
      const_reg.subnr = 0;
   }

   return const_reg;
}
922
923
924
925 /* TODO: relative addressing!
926 */
927 static struct brw_reg get_reg( struct brw_vs_compile *c,
928 enum tgsi_file_type file,
929 GLuint index )
930 {
931 switch (file) {
932 case TGSI_FILE_TEMPORARY:
933 case TGSI_FILE_INPUT:
934 case TGSI_FILE_OUTPUT:
935 case TGSI_FILE_CONSTANT:
936 assert(c->regs[file][index].nr != 0);
937 return c->regs[file][index];
938
939 case TGSI_FILE_ADDRESS:
940 assert(index == 0);
941 return c->regs[file][index];
942
943 case TGSI_FILE_NULL: /* undef values */
944 return brw_null_reg();
945
946 default:
947 assert(0);
948 return brw_null_reg();
949 }
950 }
951
952
/**
 * Indirect addressing: get reg[[arg] + offset].
 *
 * Loads one vec4 for each half of the SIMD8 pair through the hardware
 * address register a0, since the two vertices may have computed
 * different index values.  The result lives in a freshly allocated
 * scratch register that is deliberately NOT released here -- callers
 * use it as a plain source operand.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = vec4(get_tmp(c));
   struct brw_reg addr_reg = c->regs[TGSI_FILE_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
   /* Base byte address of the indexed slot: 32 bytes per GRF plus the
    * static element offset (16 bytes per vec4).  The TGSI address reg
    * itself already holds a byte offset -- see emit_arl().
    */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
   struct brw_reg indirect = brw_vec4_indirect(0,0);

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* This is pretty clunky - load the address register twice and
       * fetch each 4-dword value in turn.  There must be a way to do
       * this in a single pass, but I couldn't get it to work.
       */
      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
      brw_MOV(p, tmp, indirect);

      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
      brw_MOV(p, suboffset(tmp, 4), indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return vec8(tmp);
}
987
988
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 *
 * Relative addressing is handled by deref() for GRF-resident files,
 * and for the constant file either by get_constant() (real constant
 * buffer) or by deref() of the curbe copy.
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             GLuint argIndex,
             GLuint file,
             GLint index,
             GLboolean relAddr )
{

   switch (file) {
   case TGSI_FILE_TEMPORARY:
   case TGSI_FILE_INPUT:
   case TGSI_FILE_OUTPUT:
      if (relAddr) {
         return deref(c, c->regs[file][0], index);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case TGSI_FILE_IMMEDIATE:
      return c->regs[file][index];

   case TGSI_FILE_CONSTANT:
      if (c->vp->use_const_buffer) {
         /* fetched on demand from the bound constant buffer */
         return get_constant(c, argIndex, index, relAddr);
      }
      else if (relAddr) {
         /* constants were cached in the curbe; index them there */
         return deref(c, c->regs[TGSI_FILE_CONSTANT][0], index);
      }
      else {
         assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
         return c->regs[TGSI_FILE_CONSTANT][index];
      }
   case TGSI_FILE_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case TGSI_FILE_NULL:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   default:
      assert(0);
      return brw_null_reg();
   }
}
1040
1041
/**
 * TGSI ARL: dst (the address register) = floor(arg0) * 16.
 * The scale by 16 converts an element index into a byte offset (one
 * vec4 = 16 bytes), matching the byte arithmetic done in deref().
 */
static void emit_arl( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_RNDD(p, tmp, arg0);               /* tmp = round(arg0) */
   brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */

   if (need_tmp)
      release_tmp(c, tmp);
}
1059
1060
/**
 * Return the brw reg for the given instruction's src argument, with
 * the TGSI swizzle and negate modifiers applied.
 */
static struct brw_reg get_arg( struct brw_vs_compile *c,
                               const struct tgsi_full_src_register *src,
                               GLuint argIndex )
{
   struct brw_reg reg;

   if (src->Register.File == TGSI_FILE_NULL)
      return brw_null_reg();

   reg = get_src_reg(c, argIndex,
                     src->Register.File,
                     src->Register.Index,
                     src->Register.Indirect);

   /* Convert 3-bit swizzle to 2-bit.
    */
   reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->Register.SwizzleX,
                                       src->Register.SwizzleY,
                                       src->Register.SwizzleZ,
                                       src->Register.SwizzleW);

   /* Only plain negation is applied here.
    */
   reg.negate = src->Register.Negate ? 1 : 0;

   /* XXX: abs, absneg
    */

   return reg;
}
1092
1093
1094 /**
1095 * Get brw register for the given program dest register.
1096 */
1097 static struct brw_reg get_dst( struct brw_vs_compile *c,
1098 unsigned file,
1099 unsigned index,
1100 unsigned writemask )
1101 {
1102 struct brw_reg reg;
1103
1104 switch (file) {
1105 case TGSI_FILE_TEMPORARY:
1106 case TGSI_FILE_OUTPUT:
1107 assert(c->regs[file][index].nr != 0);
1108 reg = c->regs[file][index];
1109 break;
1110 case TGSI_FILE_ADDRESS:
1111 assert(index == 0);
1112 reg = c->regs[file][index];
1113 break;
1114 case TGSI_FILE_NULL:
1115 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1116 reg = brw_null_reg();
1117 break;
1118 default:
1119 assert(0);
1120 reg = brw_null_reg();
1121 }
1122
1123 reg.dw1.bits.writemask = writemask;
1124
1125 return reg;
1126 }
1127
1128
1129
1130
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Builds the vertex header (NDC coordinates, optional point size /
 * user-clip flags / negative-rhw workaround bits) and issues one or
 * more URB write messages.  Outputs that did not fit in the MRF are
 * flushed from the GRF overflow area with additional URB writes.
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_reg m0 = brw_message_reg(0);
   struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   int i;
   GLuint len_vertext_header = 2;

   /* Rasterizer wants an edgeflag output but the shader doesn't write
    * one: synthesize a constant 1.0 (all edges visible).
    */
   if (c->key.copy_edgeflag) {
      brw_MOV(p,
              get_reg(c, TGSI_FILE_OUTPUT, c->prog_data.output_edgeflag),
              brw_imm_f(1));
   }

   /* Build ndc coords */
   ndc = get_tmp(c);
   /* ndc = 1.0 / pos.w */
   emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
   /* ndc.xyz = pos * ndc */
   brw_MUL(p, brw_writemask(ndc, BRW_WRITEMASK_XYZ), pos, ndc);

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if (c->prog_data.writes_psiz ||
       c->key.nr_userclip ||
       c->chipset.is_965)
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.writes_psiz) {
         struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_PSIZ];
         /* Scale by 2^11 then mask to bits 8..18 — presumably the
          * point-size field position in the VUE header DWord; TODO
          * confirm against the 965 PRM VUE layout.
          */
         brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
         brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
      }

      /* For each user clip plane: DP4 with conditional-mod L sets the
       * flag register when the vertex is on the negative side; the OR
       * then records that plane's bit in the header.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (c->chipset.is_965) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      /* No special header bits needed — m1 is all zeros. */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, offset(m0, 2), ndc);

   if (c->chipset.is_igdng) {
      /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
      brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
      /* m4, m5 contain the distances from vertex to the user clip planeXXX.
       * Seems it is useless for us.
       * m6 is used for aligning, so that the remainder of vertex element is
       * reg-aligned.
       */
      brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
      len_vertext_header = 6;
   } else {
      brw_MOV(p, offset(m0, 3), pos);
      len_vertext_header = 2;
   }

   /* Only mark end-of-thread if there is no overflow data to follow. */
   eot = (c->overflow_count == 0);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,              /* starting mrf reg nr */
                 c->r0,          /* src */
                 0,              /* allocate */
                 1,              /* used */
                 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
                 0,              /* response len */
                 eot,            /* eot */
                 eot,            /* writes complete */
                 0,              /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   /* Not all of the vertex outputs/results fit into the MRF.
    * Move the overflowed attributes from the GRF to the MRF and
    * issue another brw_urb_WRITE().
    */
   for (i = 0; i < c->overflow_count; i += BRW_MAX_MRF) {
      unsigned nr = MIN2(c->overflow_count - i, BRW_MAX_MRF);
      GLuint j;

      /* Last chunk of overflow data terminates the thread. */
      eot = (i + nr >= c->overflow_count);

      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
       * at mrf[4] atm...
       */
      for (j = 0; j < nr; j++) {
         brw_MOV(p, brw_message_reg(4+j),
                 brw_vec8_grf(c->overflow_grf_start + i + j, 0));
      }

      /* NOTE(review): the urb destination offset `i-1` is -1 on the
       * first iteration (i==0) — looks suspicious; verify against the
       * URB offset units expected by brw_urb_WRITE.
       */
      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    4,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    nr+1,           /* msg len */
                    0,              /* response len */
                    eot,            /* eot */
                    eot,            /* writes complete */
                    i-1,            /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1283
1284
1285 /**
1286 * Called after code generation to resolve subroutine calls and the
1287 * END instruction.
1288 * \param end_inst points to brw code for END instruction
1289 * \param last_inst points to last instruction emitted before vertex write
1290 */
1291 static void
1292 post_vs_emit( struct brw_vs_compile *c,
1293 struct brw_instruction *end_inst,
1294 struct brw_instruction *last_inst )
1295 {
1296 GLint offset;
1297
1298 brw_resolve_cals(&c->func);
1299
1300 /* patch up the END code to jump past subroutines, etc */
1301 offset = last_inst - end_inst;
1302 if (offset > 1) {
1303 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1304 } else {
1305 end_inst->header.opcode = BRW_OPCODE_NOP;
1306 }
1307 }
1308
/**
 * Map an instruction's condition swizzle to a hardware predicate
 * control value.
 *
 * Condition-code support is currently disabled (the #if 0 block
 * below), so every caller gets BRW_PREDICATE_NORMAL for now.
 */
static uint32_t
get_predicate(const struct tgsi_full_instruction *inst)
{
   /* XXX: disabling for now
    */
#if 0
   if (inst->dst.CondMask == COND_TR)
      return BRW_PREDICATE_NONE;

   /* All of GLSL only produces predicates for COND_NE and one channel per
    * vector.  Fail badly if someone starts doing something else, as it might
    * mean infinite looping or something.
    *
    * We'd like to support all the condition codes, but our hardware doesn't
    * quite match the Mesa IR, which is modeled after the NV extensions.  For
    * those, the instruction may update the condition codes or not, then any
    * later instruction may use one of those condition codes.  For gen4, the
    * instruction may update the flags register based on one of the condition
    * codes output by the instruction, and then further instructions may
    * predicate on that.  We can probably support this, but it won't
    * necessarily be easy.
    */
   /* assert(inst->dst.CondMask == COND_NE); */

   switch (inst->dst.CondSwizzle) {
   case SWIZZLE_XXXX:
      return BRW_PREDICATE_ALIGN16_REPLICATE_X;
   case SWIZZLE_YYYY:
      return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
   case SWIZZLE_ZZZZ:
      return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
   case SWIZZLE_WWWW:
      return BRW_PREDICATE_ALIGN16_REPLICATE_W;
   default:
      debug_printf("Unexpected predicate: 0x%08x\n",
                   inst->dst.CondMask);
      return BRW_PREDICATE_NORMAL;
   }
#else
   return BRW_PREDICATE_NORMAL;
#endif
}
1351
1352 static void emit_insn(struct brw_vs_compile *c,
1353 const struct tgsi_full_instruction *inst)
1354 {
1355 unsigned opcode = inst->Instruction.Opcode;
1356 unsigned label = inst->Label.Label;
1357 struct brw_compile *p = &c->func;
1358 struct brw_reg args[3], dst;
1359 GLuint i;
1360
1361 #if 0
1362 printf("%d: ", insn);
1363 _mesa_print_instruction(inst);
1364 #endif
1365
1366 /* Get argument regs.
1367 */
1368 for (i = 0; i < 3; i++) {
1369 args[i] = get_arg(c, &inst->Src[i], i);
1370 }
1371
1372 /* Get dest regs. Note that it is possible for a reg to be both
1373 * dst and arg, given the static allocation of registers. So
1374 * care needs to be taken emitting multi-operation instructions.
1375 */
1376 dst = get_dst(c,
1377 inst->Dst[0].Register.File,
1378 inst->Dst[0].Register.Index,
1379 inst->Dst[0].Register.WriteMask);
1380
1381 /* XXX: saturate
1382 */
1383 if (inst->Instruction.Saturate != TGSI_SAT_NONE) {
1384 debug_printf("Unsupported saturate in vertex shader");
1385 }
1386
1387 switch (opcode) {
1388 case TGSI_OPCODE_ABS:
1389 brw_MOV(p, dst, brw_abs(args[0]));
1390 break;
1391 case TGSI_OPCODE_ADD:
1392 brw_ADD(p, dst, args[0], args[1]);
1393 break;
1394 case TGSI_OPCODE_COS:
1395 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1396 break;
1397 case TGSI_OPCODE_DP3:
1398 brw_DP3(p, dst, args[0], args[1]);
1399 break;
1400 case TGSI_OPCODE_DP4:
1401 brw_DP4(p, dst, args[0], args[1]);
1402 break;
1403 case TGSI_OPCODE_DPH:
1404 brw_DPH(p, dst, args[0], args[1]);
1405 break;
1406 case TGSI_OPCODE_NRM:
1407 emit_nrm(c, dst, args[0], 3);
1408 break;
1409 case TGSI_OPCODE_NRM4:
1410 emit_nrm(c, dst, args[0], 4);
1411 break;
1412 case TGSI_OPCODE_DST:
1413 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1414 break;
1415 case TGSI_OPCODE_EXP:
1416 unalias1(c, dst, args[0], emit_exp_noalias);
1417 break;
1418 case TGSI_OPCODE_EX2:
1419 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1420 break;
1421 case TGSI_OPCODE_ARL:
1422 emit_arl(c, dst, args[0]);
1423 break;
1424 case TGSI_OPCODE_FLR:
1425 brw_RNDD(p, dst, args[0]);
1426 break;
1427 case TGSI_OPCODE_FRC:
1428 brw_FRC(p, dst, args[0]);
1429 break;
1430 case TGSI_OPCODE_LOG:
1431 unalias1(c, dst, args[0], emit_log_noalias);
1432 break;
1433 case TGSI_OPCODE_LG2:
1434 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1435 break;
1436 case TGSI_OPCODE_LIT:
1437 unalias1(c, dst, args[0], emit_lit_noalias);
1438 break;
1439 case TGSI_OPCODE_LRP:
1440 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1441 break;
1442 case TGSI_OPCODE_MAD:
1443 brw_MOV(p, brw_acc_reg(), args[2]);
1444 brw_MAC(p, dst, args[0], args[1]);
1445 break;
1446 case TGSI_OPCODE_MAX:
1447 emit_max(p, dst, args[0], args[1]);
1448 break;
1449 case TGSI_OPCODE_MIN:
1450 emit_min(p, dst, args[0], args[1]);
1451 break;
1452 case TGSI_OPCODE_MOV:
1453 brw_MOV(p, dst, args[0]);
1454 break;
1455 case TGSI_OPCODE_MUL:
1456 brw_MUL(p, dst, args[0], args[1]);
1457 break;
1458 case TGSI_OPCODE_POW:
1459 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1460 break;
1461 case TGSI_OPCODE_RCP:
1462 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1463 break;
1464 case TGSI_OPCODE_RSQ:
1465 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst,
1466 brw_swizzle(args[0], 0,0,0,0), BRW_MATH_PRECISION_FULL);
1467 break;
1468 case TGSI_OPCODE_SEQ:
1469 emit_seq(p, dst, args[0], args[1]);
1470 break;
1471 case TGSI_OPCODE_SIN:
1472 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1473 break;
1474 case TGSI_OPCODE_SNE:
1475 emit_sne(p, dst, args[0], args[1]);
1476 break;
1477 case TGSI_OPCODE_SGE:
1478 emit_sge(p, dst, args[0], args[1]);
1479 break;
1480 case TGSI_OPCODE_SGT:
1481 emit_sgt(p, dst, args[0], args[1]);
1482 break;
1483 case TGSI_OPCODE_SLT:
1484 emit_slt(p, dst, args[0], args[1]);
1485 break;
1486 case TGSI_OPCODE_SLE:
1487 emit_sle(p, dst, args[0], args[1]);
1488 break;
1489 case TGSI_OPCODE_SUB:
1490 brw_ADD(p, dst, args[0], negate(args[1]));
1491 break;
1492 case TGSI_OPCODE_TRUNC:
1493 /* round toward zero */
1494 brw_RNDZ(p, dst, args[0]);
1495 break;
1496 case TGSI_OPCODE_XPD:
1497 emit_xpd(p, dst, args[0], args[1]);
1498 break;
1499 case TGSI_OPCODE_IF:
1500 assert(c->if_depth < MAX_IF_DEPTH);
1501 c->if_inst[c->if_depth] = brw_IF(p, BRW_EXECUTE_8);
1502 /* Note that brw_IF smashes the predicate_control field. */
1503 c->if_inst[c->if_depth]->header.predicate_control = get_predicate(inst);
1504 c->if_depth++;
1505 break;
1506 case TGSI_OPCODE_ELSE:
1507 c->if_inst[c->if_depth-1] = brw_ELSE(p, c->if_inst[c->if_depth-1]);
1508 break;
1509 case TGSI_OPCODE_ENDIF:
1510 assert(c->if_depth > 0);
1511 brw_ENDIF(p, c->if_inst[--c->if_depth]);
1512 break;
1513 case TGSI_OPCODE_BGNLOOP:
1514 c->loop_inst[c->loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1515 break;
1516 case TGSI_OPCODE_BRK:
1517 brw_set_predicate_control(p, get_predicate(inst));
1518 brw_BREAK(p);
1519 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1520 break;
1521 case TGSI_OPCODE_CONT:
1522 brw_set_predicate_control(p, get_predicate(inst));
1523 brw_CONT(p);
1524 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1525 break;
1526 case TGSI_OPCODE_ENDLOOP:
1527 {
1528 struct brw_instruction *inst0, *inst1;
1529 GLuint br = 1;
1530
1531 c->loop_depth--;
1532
1533 if (c->chipset.is_igdng)
1534 br = 2;
1535
1536 inst0 = inst1 = brw_WHILE(p, c->loop_inst[c->loop_depth]);
1537 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1538 while (inst0 > c->loop_inst[c->loop_depth]) {
1539 inst0--;
1540 if (inst0->header.opcode == TGSI_OPCODE_BRK) {
1541 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1542 inst0->bits3.if_else.pop_count = 0;
1543 }
1544 else if (inst0->header.opcode == TGSI_OPCODE_CONT) {
1545 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1546 inst0->bits3.if_else.pop_count = 0;
1547 }
1548 }
1549 }
1550 break;
1551 case TGSI_OPCODE_BRA:
1552 brw_set_predicate_control(p, get_predicate(inst));
1553 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1554 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1555 break;
1556 case TGSI_OPCODE_CAL:
1557 brw_set_access_mode(p, BRW_ALIGN_1);
1558 brw_ADD(p, deref_1d(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1559 brw_set_access_mode(p, BRW_ALIGN_16);
1560 brw_ADD(p, get_addr_reg(c->stack_index),
1561 get_addr_reg(c->stack_index), brw_imm_d(4));
1562 brw_save_call(p, label, p->nr_insn);
1563 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1564 break;
1565 case TGSI_OPCODE_RET:
1566 brw_ADD(p, get_addr_reg(c->stack_index),
1567 get_addr_reg(c->stack_index), brw_imm_d(-4));
1568 brw_set_access_mode(p, BRW_ALIGN_1);
1569 brw_MOV(p, brw_ip_reg(), deref_1d(c->stack_index, 0));
1570 brw_set_access_mode(p, BRW_ALIGN_16);
1571 break;
1572 case TGSI_OPCODE_END:
1573 c->end_offset = p->nr_insn;
1574 /* this instruction will get patched later to jump past subroutine
1575 * code, etc.
1576 */
1577 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1578 break;
1579 case TGSI_OPCODE_BGNSUB:
1580 brw_save_label(p, p->nr_insn, p->nr_insn);
1581 break;
1582 case TGSI_OPCODE_ENDSUB:
1583 /* no-op */
1584 break;
1585 default:
1586 debug_printf("Unsupported opcode %i (%s) in vertex shader",
1587 opcode,
1588 tgsi_get_opcode_name(opcode));
1589 }
1590
1591 /* Set the predication update on the last instruction of the native
1592 * instruction sequence.
1593 *
1594 * This would be problematic if it was set on a math instruction,
1595 * but that shouldn't be the case with the current GLSL compiler.
1596 */
1597 #if 0
1598 /* XXX: disabled
1599 */
1600 if (inst->CondUpdate) {
1601 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1602
1603 assert(hw_insn->header.destreg__conditionalmod == 0);
1604 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1605 }
1606 #endif
1607
1608 release_tmps(c);
1609 }
1610
1611
1612 /* Emit the vertex program instructions here.
1613 */
1614 void brw_vs_emit(struct brw_vs_compile *c)
1615 {
1616 struct brw_compile *p = &c->func;
1617 const struct tgsi_token *tokens = c->vp->tokens;
1618 struct brw_instruction *end_inst, *last_inst;
1619 struct tgsi_parse_context parse;
1620 struct tgsi_full_instruction *inst;
1621
1622 if (BRW_DEBUG & DEBUG_VS)
1623 tgsi_dump(c->vp->tokens, 0);
1624
1625 c->stack_index = brw_indirect(0, 0);
1626
1627 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1628 brw_set_access_mode(p, BRW_ALIGN_16);
1629
1630
1631 /* Static register allocation
1632 */
1633 brw_vs_alloc_regs(c);
1634
1635 if (c->vp->has_flow_control) {
1636 brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
1637 }
1638
1639 /* Instructions
1640 */
1641 tgsi_parse_init( &parse, tokens );
1642 while( !tgsi_parse_end_of_tokens( &parse ) ) {
1643 tgsi_parse_token( &parse );
1644
1645 switch( parse.FullToken.Token.Type ) {
1646 case TGSI_TOKEN_TYPE_DECLARATION:
1647 case TGSI_TOKEN_TYPE_IMMEDIATE:
1648 break;
1649
1650 case TGSI_TOKEN_TYPE_INSTRUCTION:
1651 inst = &parse.FullToken.FullInstruction;
1652 emit_insn( c, inst );
1653 break;
1654
1655 default:
1656 assert( 0 );
1657 }
1658 }
1659 tgsi_parse_free( &parse );
1660
1661 end_inst = &p->store[c->end_offset];
1662 last_inst = &p->store[p->nr_insn];
1663
1664 /* The END instruction will be patched to jump to this code */
1665 emit_vertex_write(c);
1666
1667 post_vs_emit(c, end_inst, last_inst);
1668
1669 if (BRW_DEBUG & DEBUG_VS) {
1670 debug_printf("vs-native:\n");
1671 brw_disasm(stderr, p->store, p->nr_insn);
1672 }
1673 }