/* src/mesa/drivers/dri/i965/brw_vs_emit.c */
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keith@tungstengraphics.com>
 */


#include "main/macros.h"
#include "program/program.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "brw_context.h"
#include "brw_vs.h"

/* Determine whether the given source argument of an instruction can be
 * an immediate float operand, rather than a PROGRAM_CONSTANT value
 * loaded through push/pull constants.
 */
static GLboolean
brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
{
   int opcode_array[] = {
      [OPCODE_MOV] = 1,
      [OPCODE_ADD] = 2,
      [OPCODE_CMP] = 3,
      [OPCODE_DP2] = 2,
      [OPCODE_DP3] = 2,
      [OPCODE_DP4] = 2,
      [OPCODE_DPH] = 2,
      [OPCODE_MAX] = 2,
      [OPCODE_MIN] = 2,
      [OPCODE_MUL] = 2,
      [OPCODE_SEQ] = 2,
      [OPCODE_SGE] = 2,
      [OPCODE_SGT] = 2,
      [OPCODE_SLE] = 2,
      [OPCODE_SLT] = 2,
      [OPCODE_SNE] = 2,
      [OPCODE_XPD] = 2,
   };

   /* These opcodes get broken down in a way that allow two
    * args to be immediates.
    */
   if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
      if (arg == 1 || arg == 2)
         return GL_TRUE;
   }

   if (opcode >= ARRAY_SIZE(opcode_array))
      return GL_FALSE;

   return arg == opcode_array[opcode] - 1;
}
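
/* Worked example: opcode_array[OPCODE_ADD] == 2, so only arg 1 (the
 * second operand) of an ADD may be an immediate.  This mirrors the gen4
 * encoding restriction that at most one source can be an immediate,
 * and it must be the last source operand.
 */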

static struct brw_reg get_tmp( struct brw_vs_compile *c )
{
   struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);

   if (++c->last_tmp > c->prog_data.total_grf)
      c->prog_data.total_grf = c->last_tmp;

   return tmp;
}

static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
{
   if (tmp.nr == c->last_tmp-1)
      c->last_tmp--;
}

static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}
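
/* The three helpers above manage temporaries as a trivial stack:
 * get_tmp() grows it, release_tmp() can only pop the most recent
 * allocation, and release_tmps() (called after every IR instruction
 * in brw_vs_emit()) rewinds the stack to first_tmp.
 */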

static int
get_first_reladdr_output(struct gl_vertex_program *vp)
{
   int i;
   int first_reladdr_output = VERT_RESULT_MAX;

   for (i = 0; i < vp->Base.NumInstructions; i++) {
      struct prog_instruction *inst = vp->Base.Instructions + i;

      if (inst->DstReg.File == PROGRAM_OUTPUT &&
          inst->DstReg.RelAddr &&
          inst->DstReg.Index < first_reladdr_output)
         first_reladdr_output = inst->DstReg.Index;
   }

   return first_reladdr_output;
}

/* Clears the record of which vp_const_buffer elements have been
 * loaded into our constant buffer registers, for the starts of new
 * blocks after control flow.
 */
static void
clear_current_const(struct brw_vs_compile *c)
{
   unsigned int i;

   if (c->vp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
      }
   }
}

/**
 * Preallocate GRF registers before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;
   int first_reladdr_output;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The latter is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      for (i = 0; i < c->key.nr_userclip; i++) {
         c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
      int constant = 0;

      /* We've got more constants than we can load with the push
       * mechanism.  This is often correlated with reladdr loads where
       * we should probably be using a pull mechanism anyway to avoid
       * excessive reading.  However, the pull mechanism is slow in
       * general.  So, we try to allocate as many non-reladdr-loaded
       * constants through the push buffer as we can before giving up.
       */
      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
      for (i = 0;
           i < c->vp->program.Base.NumInstructions && constant < max_constant;
           i++) {
         struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
         int arg;

         for (arg = 0; arg < 3 && constant < max_constant; arg++) {
            if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
                 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
                 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
                 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
                 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
                inst->SrcReg[arg].RelAddr)
               continue;

            if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
               c->constant_map[inst->SrcReg[arg].Index] = constant++;
            }
         }
      }

      for (i = 0; i < constant; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
                                                              (i%2) * 4),
                                                 0, 4, 1);
      }
      reg += (constant + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      /* XXX 0 causes a bug elsewhere... */
      c->prog_data.nr_params = MAX2(constant * 4, 4);
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
         c->nr_inputs++;
         c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   if (intel->gen >= 6)
      mrf = 3;   /* no more pos store in attribute */
   else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   first_reladdr_output = get_first_reladdr_output(&c->vp->program);
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
         c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
         if (i == VERT_RESULT_HPOS) {
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
         }
         else if (i == VERT_RESULT_PSIZ) {
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
            mrf++;   /* just a placeholder?  XXX fix later stages & remove this */
         }
         else {
            /* Two restrictions on our compute-to-MRF here.  The
             * message length for all SEND messages is restricted to
             * [1,15], so we can't use mrf 15, as that means a length
             * of 16.
             *
             * Additionally, URB writes are aligned to URB rows, so we
             * need to put an even number of registers of URB data in
             * each URB write so that the later write is aligned.  A
             * message length of 15 means 1 message header reg plus 14
             * regs of URB data.
             *
             * For attributes beyond the compute-to-MRF, we compute to
             * GRFs and they will be written in the second URB_WRITE.
             */
            if (first_reladdr_output > i && mrf < 15) {
               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
               mrf++;
            }
            else {
               if (mrf >= 15 && !c->first_overflow_output)
                  c->first_overflow_output = i;
               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
               reg++;
               mrf++;
            }
         }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
                                            reg,
                                            0,
                                            BRW_REGISTER_TYPE_D,
                                            BRW_VERTICAL_STRIDE_8,
                                            BRW_WIDTH_8,
                                            BRW_HORIZONTAL_STRIDE_1,
                                            BRW_SWIZZLE_XXXX,
                                            WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
      clear_current_const(c);
   }

   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   if (c->needs_stack) {
      c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg; /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 6)
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8;
   else if (intel->gen == 5)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}


/**
 * If an instruction uses a temp reg both as a src and the dest, we
 * sometimes need to allocate an intermediate temporary.
 */
static void unalias1( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      void (*func)( struct brw_vs_compile *,
                                    struct brw_reg,
                                    struct brw_reg ))
{
   if (dst.file == arg0.file && dst.nr == arg0.nr) {
      struct brw_compile *p = &c->func;
      struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
      func(c, tmp, arg0);
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
   else {
      func(c, dst, arg0);
   }
}

/**
 * \sa unalias1
 * Checks if a 2-operand instruction needs an intermediate temporary.
 */
static void unalias2( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      void (*func)( struct brw_vs_compile *,
                                    struct brw_reg,
                                    struct brw_reg,
                                    struct brw_reg ))
{
   if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
       (dst.file == arg1.file && dst.nr == arg1.nr)) {
      struct brw_compile *p = &c->func;
      struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
      func(c, tmp, arg0, arg1);
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
   else {
      func(c, dst, arg0, arg1);
   }
}

/**
 * \sa unalias2
 * Checks if a 3-operand instruction needs an intermediate temporary.
 */
static void unalias3( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      struct brw_reg arg2,
                      void (*func)( struct brw_vs_compile *,
                                    struct brw_reg,
                                    struct brw_reg,
                                    struct brw_reg,
                                    struct brw_reg ))
{
   if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
       (dst.file == arg1.file && dst.nr == arg1.nr) ||
       (dst.file == arg2.file && dst.nr == arg2.nr)) {
      struct brw_compile *p = &c->func;
      struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
      func(c, tmp, arg0, arg1, arg2);
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
   else {
      func(c, dst, arg0, arg1, arg2);
   }
}
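
/* Set-on-compare (SEQ/SNE/SLT/...): the EU has no single instruction
 * for these, so write 0.0 unconditionally, CMP against the null
 * register to set the flag register, then write 1.0 predicated on the
 * comparison result.  brw_set_predicate_control_flag_value(p, 0xff)
 * restores unpredicated execution afterwards.
 */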
static void emit_sop( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}

static void emit_seq( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}

static void emit_sne( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}

static void emit_slt( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}

static void emit_sle( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}

static void emit_sgt( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}

static void emit_sge( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
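
/* Mesa CMP: dst = (arg0 < 0.0) ? arg1 : arg2, via a flags-only CMP
 * followed by a per-channel predicated SEL.
 */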
static void emit_cmp( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}

static void emit_sign(struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0));

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}

static void emit_max( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}

static void emit_min( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}


static void emit_math1( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   if (dst.file != BRW_GENERAL_REGISTER_FILE)
      need_tmp = GL_TRUE;

   if (intel->gen < 6 && dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}


static void emit_math2( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        struct brw_reg arg1,
                        GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   if (dst.file != BRW_GENERAL_REGISTER_FILE)
      need_tmp = GL_TRUE;

   if (intel->gen < 6 && dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);
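
   /* The second operand of two-operand math (only POW here) is passed
    * in message register m3; arg0 reaches m2 through the SEND's
    * implied move, since brw_math() below is told the message starts
    * at mrf 2.
    */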
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}


static void emit_exp_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
              tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }
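
   /* Worked example: for arg0.x == 3.2, RNDD gives 3; 3 + 127 = 130,
    * and 130 << 23 is the bit pattern 0x41000000, which reinterpreted
    * as a float is 8.0f == 2^3.
    */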

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a Taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_EXP,
                 brw_writemask(dst, WRITEMASK_Z),
                 brw_swizzle1(arg0, 0),
                 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}


static void emit_log_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
    * according to spec:
    *
    * These almost look like they could be joined up, but not really
    * practical:
    *
    *    result[0].f = ((x.i & ((1U<<31)-1)) >> 23) - 127
    *    result[1].i = (x.i & ((1<<23)-1)) | (127<<23)
    */
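
   /* Worked example: x = 6.0f has bit pattern 0x40C00000, so
    * result[0] = (0x40C00000 >> 23) - 127 = 129 - 127 = 2, and
    * result[1] gets the mantissa bits re-biased to 1.5f; then
    * LOG2(6.0) = 2 + log2(1.5) ~= 2.585.
    */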
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      brw_AND(p,
              brw_writemask(tmp_ud, WRITEMASK_X),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
              brw_writemask(tmp_ud, WRITEMASK_X),
              tmp_ud,
              brw_imm_ud(23));

      brw_ADD(p,
              brw_writemask(tmp, WRITEMASK_X),
              retype(tmp_ud, BRW_REGISTER_TYPE_D),   /* does it matter? */
              brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      brw_AND(p,
              brw_writemask(tmp_ud, WRITEMASK_Y),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1<<23)-1));

      brw_OR(p,
             brw_writemask(tmp_ud, WRITEMASK_Y),
             tmp_ud,
             brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * Taylor series.  Maybe we *should* use a Taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_LOG,
                 brw_writemask(tmp, WRITEMASK_Z),
                 brw_swizzle1(tmp, 1),
                 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
              brw_writemask(tmp, WRITEMASK_Z),
              brw_swizzle1(tmp, 2),
              brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}


/* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0,
                              struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
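
/* Cross product: dst = t.yzx * u.zxy - t.zxy * u.yzx.  The MUL's
 * explicit destination is the null register, so its product lands
 * only in the accumulator (accumulator writes are enabled in
 * brw_vs_emit()); the MAC then subtracts via the negated operand.
 */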
static void emit_xpd( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg t,
                      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}


static void emit_lit_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisons.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
                 BRW_MATH_FUNCTION_POW,
                 brw_writemask(dst, WRITEMASK_Z),
                 brw_swizzle1(tmp, 2),
                 brw_swizzle1(arg0, 3),
                 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   release_tmp(c, tmp);
}
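
/* LRP: dst = arg0 * arg1 + (1 - arg0) * arg2.  The ADD computes
 * (1 - arg0) into dst, the MUL folds that times arg2 into the
 * accumulator, and the MAC adds arg0 * arg1 on top.  "noalias"
 * because dst must not overlap the sources; unalias3() ensures that.
 */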
static void emit_lrp_noalias(struct brw_vs_compile *c,
                             struct brw_reg dst,
                             struct brw_reg arg0,
                             struct brw_reg arg1,
                             struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}

/** 3 or 4-component vector normalization */
static void emit_nrm( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      int num_comps)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);

   /* tmp = dot(arg0, arg0) */
   if (num_comps == 3)
      brw_DP3(p, tmp, arg0, arg0);
   else
      brw_DP4(p, tmp, arg0, arg0);

   /* tmp = 1 / sqrt(tmp) */
   emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);

   /* dst = arg0 * tmp */
   brw_MUL(p, dst, arg0, tmp);

   release_tmp(c, tmp);
}


static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   assert(c->func.brw->intel.gen < 6); /* FINISHME */

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                    /* writeback dest */
                       16 * src->Index,              /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER  /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}

static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
                     const struct prog_instruction *inst,
                     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);

   assert(argIndex < 3);

   assert(c->func.brw->intel.gen < 6); /* FINISHME */

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

#if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
          src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
                             const_reg,                    /* writeback dest */
                             byte_addr_reg,                /* address register */
                             16 * src->Index,              /* byte offset */
                             SURF_INDEX_VERT_CONST_BUFFER  /* binding table index */
                             );

   return const_reg;
}



/* TODO: relative addressing!
 */
static struct brw_reg get_reg( struct brw_vs_compile *c,
                               gl_register_file file,
                               GLuint index )
{
   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      assert(c->regs[file][index].nr != 0);
      return c->regs[file][index];
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
      assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
      return c->regs[PROGRAM_STATE_VAR][index];
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:   /* undef values */
      return brw_null_reg();

   case PROGRAM_LOCAL_PARAM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}


/**
 * Indirect addressing: get reg[[arg] + offset].
 */
static struct brw_reg deref( struct brw_vs_compile *c,
                             struct brw_reg arg,
                             GLint offset,
                             GLuint reg_size )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Set the vertical stride on the register access so that the first
    * 4 components come from a0.0 and the second 4 from a0.1.
    */
   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));

      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));

      brw_MOV(p, tmp, indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return tmp;
}

static void
move_to_reladdr_dst(struct brw_vs_compile *c,
                    const struct prog_instruction *inst,
                    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
   GLuint byte_offset = base.nr * 32 + base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Because destination register indirect addressing can only use
    * one index, we'll write each vertex's vec4 value separately.
    */
   val.width = BRW_WIDTH_4;
   val.vstride = BRW_VERTICAL_STRIDE_4;

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
           brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}

/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
      const struct prog_src_register *src = &inst->SrcReg[argIndex];

      if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
                                        SWIZZLE_ZERO,
                                        SWIZZLE_ZERO,
                                        SWIZZLE_ZERO)) {
         return brw_imm_f(0.0f);
      } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
                                               SWIZZLE_ONE,
                                               SWIZZLE_ONE,
                                               SWIZZLE_ONE)) {
         if (src->Negate)
            return brw_imm_f(-1.0F);
         else
            return brw_imm_f(1.0F);
      } else if (src->File == PROGRAM_CONSTANT) {
         const struct gl_program_parameter_list *params;
         float f;
         int component = -1;

         switch (src->Swizzle) {
         case SWIZZLE_XXXX:
            component = 0;
            break;
         case SWIZZLE_YYYY:
            component = 1;
            break;
         case SWIZZLE_ZZZZ:
            component = 2;
            break;
         case SWIZZLE_WWWW:
            component = 3;
            break;
         }

         if (component >= 0) {
            params = c->vp->program.Base.Parameters;
            f = params->ParameterValues[src->Index][component];

            if (src->Abs)
               f = fabs(f);
            if (src->Negate)
               f = -f;
            return brw_imm_f(f);
         }
      }
   }

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         return deref(c, c->regs[file][0], index, 32);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
         if (!relAddr && c->constant_map[index] != -1) {
            assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
            return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
         } else if (relAddr)
            return get_reladdr_constant(c, inst, argIndex);
         else
            return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
      }
      else {
         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}

/**
 * Return the brw reg for the given instruction's src argument.
 * Will return mangled results for SWZ op.  The emit_swz() function
 * ignores this result and recalculates taking extended swizzles into
 * account.
 */
static struct brw_reg get_arg( struct brw_vs_compile *c,
                               const struct prog_instruction *inst,
                               GLuint argIndex )
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_reg reg;

   if (src->File == PROGRAM_UNDEFINED)
      return brw_null_reg();

   reg = get_src_reg(c, inst, argIndex);

   /* Convert 3-bit swizzle to 2-bit.
    */
   if (reg.file != BRW_IMMEDIATE_VALUE) {
      reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
                                          GET_SWZ(src->Swizzle, 1),
                                          GET_SWZ(src->Swizzle, 2),
                                          GET_SWZ(src->Swizzle, 3));
   }

   /* Note this is ok for non-swizzle instructions:
    */
   reg.negate = src->Negate ? 1 : 0;

   return reg;
}


/**
 * Get brw register for the given program dest register.
 */
static struct brw_reg get_dst( struct brw_vs_compile *c,
                               struct prog_dst_register dst )
{
   struct brw_reg reg;

   switch (dst.File) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_OUTPUT:
      /* register-indirect addressing is only 1x1, not VxH, for
       * destination regs.  So, for RelAddr we'll return a temporary
       * for the dest and do a move of the result to the RelAddr
       * register after the instruction emit.
       */
      if (dst.RelAddr) {
         reg = get_tmp(c);
      } else {
         assert(c->regs[dst.File][dst.Index].nr != 0);
         reg = c->regs[dst.File][dst.Index];
      }
      break;
   case PROGRAM_ADDRESS:
      assert(dst.Index == 0);
      reg = c->regs[dst.File][dst.Index];
      break;
   case PROGRAM_UNDEFINED:
      /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
      reg = brw_null_reg();
      break;
   default:
      assert(0);
      reg = brw_null_reg();
   }

   assert(reg.file != BRW_IMMEDIATE_VALUE);
   reg.dw1.bits.writemask = dst.WriteMask;

   return reg;
}


static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}


/**
 * Post-vertex-program processing.  Send the results to the URB.
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;
   int next_mrf, i;

   if (c->key.copy_edgeflag) {
      brw_MOV(p,
              get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
              get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and the
    * negative-rhw workaround.
    */
   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
       c->key.nr_userclip || brw->has_negative_rhw_bug)
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
         if (intel->gen < 6) {
            brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
            brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
         } else
            brw_MOV(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0));
      }

      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1);   /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_acc_write_control(p, 0);

   /* The VUE layout is documented in Volume 2a. */
   if (intel->gen >= 6) {
      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
       * enabled.  We don't use it, so skip it.
       * m3 is the first vertex element data we fill, which is the vertex
       * position.
       */
      brw_MOV(p, brw_message_reg(2), pos);
      len_vertex_header = 1;
   } else if (intel->gen == 5) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      brw_MOV(p, brw_message_reg(7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always set to
       * the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   }

   /* Move variable-addressed, non-overflow outputs to their MRFs. */
   next_mrf = 2 + len_vertex_header;
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->first_overflow_output > 0 && i >= c->first_overflow_output)
         break;
      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
         continue;

      if (i >= VERT_RESULT_TEX0 &&
          c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
         brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]);
         next_mrf++;
      } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) {
         next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1;
      }
   }

   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,              /* starting mrf reg nr */
                 c->r0,          /* src */
                 0,              /* allocate */
                 1,              /* used */
                 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
                 0,              /* response len */
                 eot,            /* eot */
                 eot,            /* writes complete */
                 0,              /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      GLuint i, mrf = 1;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
         if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
            /* move from GRF to MRF */
            brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
            mrf++;
         }
      }

      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    0,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    mrf,            /* msg len */
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    14 / 2,         /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
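
/* Returns GL_TRUE if the previous instruction was a full-writemask
 * MOV, MAC or MUL that wrote `val` (and, with accumulator writes
 * enabled, the accumulator too), so OPCODE_MAD below can skip the
 * explicit reload of the accumulator.
 */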
static GLboolean
accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];

   if (p->nr_insn == 0)
      return GL_FALSE;

   if (val.address_mode != BRW_ADDRESS_DIRECT)
      return GL_FALSE;

   switch (prev_insn->header.opcode) {
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MUL:
      if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
          prev_insn->header.execution_size == val.width &&
          prev_insn->bits1.da1.dest_reg_file == val.file &&
          prev_insn->bits1.da1.dest_reg_type == val.type &&
          prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
          prev_insn->bits1.da1.dest_reg_nr == val.nr &&
          prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
          prev_insn->bits1.da16.dest_writemask == 0xf)
         return GL_TRUE;
      else
         return GL_FALSE;
   default:
      return GL_FALSE;
   }
}

static uint32_t
get_predicate(const struct prog_instruction *inst)
{
   if (inst->DstReg.CondMask == COND_TR)
      return BRW_PREDICATE_NONE;

   /* All of GLSL only produces predicates for COND_NE and one channel per
    * vector.  Fail badly if someone starts doing something else, as it might
    * mean infinite looping or something.
    *
    * We'd like to support all the condition codes, but our hardware doesn't
    * quite match the Mesa IR, which is modeled after the NV extensions.  For
    * those, the instruction may update the condition codes or not, then any
    * later instruction may use one of those condition codes.  For gen4, the
    * instruction may update the flags register based on one of the condition
    * codes output by the instruction, and then further instructions may
    * predicate on that.  We can probably support this, but it won't
    * necessarily be easy.
    */
   assert(inst->DstReg.CondMask == COND_NE);

   switch (inst->DstReg.CondSwizzle) {
   case SWIZZLE_XXXX:
      return BRW_PREDICATE_ALIGN16_REPLICATE_X;
   case SWIZZLE_YYYY:
      return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
   case SWIZZLE_ZZZZ:
      return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
   case SWIZZLE_WWWW:
      return BRW_PREDICATE_ALIGN16_REPLICATE_W;
   default:
      _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
                    inst->DstReg.CondSwizzle);
      return BRW_PREDICATE_NORMAL;
   }
}

/* Emit the vertex program instructions here.
 */
void brw_vs_emit(struct brw_vs_compile *c )
{
#define MAX_IF_DEPTH 32
#define MAX_LOOP_DEPTH 32
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
   GLuint insn, if_depth = 0, loop_depth = 0;
   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
   int if_depth_in_loop[MAX_LOOP_DEPTH];
   const struct brw_indirect stack_index = brw_indirect(0, 0);
   GLuint index;
   GLuint file;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("vs-mesa:\n");
      _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
                               GL_TRUE);
      printf("\n");
   }

   /* FIXME: Need to fix conditional instruction emission to remove this. */
   if (intel->gen >= 6)
      p->single_program_flow = GL_TRUE;

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_access_mode(p, BRW_ALIGN_16);
   if_depth_in_loop[loop_depth] = 0;

   brw_set_acc_write_control(p, 1);

   for (insn = 0; insn < nr_insns; insn++) {
      GLuint i;
      struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];

      /* Message registers can't be read, so copy each output into a GRF
       * register if it is also used as a source.
       */
      for (i = 0; i < 3; i++) {
         struct prog_src_register *src = &inst->SrcReg[i];
         GLuint index = src->Index;
         GLuint file = src->File;
         if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
            c->output_regs[index].used_in_src = GL_TRUE;
      }

      switch (inst->Opcode) {
      case OPCODE_CAL:
      case OPCODE_RET:
         c->needs_stack = GL_TRUE;
         break;
      default:
         break;
      }
   }

   /* Static register allocation
    */
   brw_vs_alloc_regs(c);

   if (c->needs_stack)
      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   for (insn = 0; insn < nr_insns; insn++) {

      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
      struct brw_reg args[3], dst;
      GLuint i;

#if 0
      printf("%d: ", insn);
      _mesa_print_instruction(inst);
#endif

      /* Get argument regs.  SWZ is special and does this itself.
       */
      if (inst->Opcode != OPCODE_SWZ)
         for (i = 0; i < 3; i++) {
            const struct prog_src_register *src = &inst->SrcReg[i];
            index = src->Index;
            file = src->File;
            if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
               args[i] = c->output_regs[index].reg;
            else
               args[i] = get_arg(c, inst, i);
         }

      /* Get dest regs.  Note that it is possible for a reg to be both
       * dst and arg, given the static allocation of registers.  So
       * care needs to be taken emitting multi-operation instructions.
       */
      index = inst->DstReg.Index;
      file = inst->DstReg.File;
      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
         dst = c->output_regs[index].reg;
      else
         dst = get_dst(c, inst->DstReg);

      if (inst->SaturateMode != SATURATE_OFF) {
         _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
                       inst->SaturateMode);
      }

      switch (inst->Opcode) {
      case OPCODE_ABS:
         brw_MOV(p, dst, brw_abs(args[0]));
         break;
      case OPCODE_ADD:
         brw_ADD(p, dst, args[0], args[1]);
         break;
      case OPCODE_COS:
         emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_DP2:
         brw_DP2(p, dst, args[0], args[1]);
         break;
      case OPCODE_DP3:
         brw_DP3(p, dst, args[0], args[1]);
         break;
      case OPCODE_DP4:
         brw_DP4(p, dst, args[0], args[1]);
         break;
      case OPCODE_DPH:
         brw_DPH(p, dst, args[0], args[1]);
         break;
      case OPCODE_NRM3:
         emit_nrm(c, dst, args[0], 3);
         break;
      case OPCODE_NRM4:
         emit_nrm(c, dst, args[0], 4);
         break;
      case OPCODE_DST:
         unalias2(c, dst, args[0], args[1], emit_dst_noalias);
         break;
      case OPCODE_EXP:
         unalias1(c, dst, args[0], emit_exp_noalias);
         break;
      case OPCODE_EX2:
         emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_ARL:
         brw_RNDD(p, dst, args[0]);
         break;
      case OPCODE_FLR:
         brw_RNDD(p, dst, args[0]);
         break;
      case OPCODE_FRC:
         brw_FRC(p, dst, args[0]);
         break;
      case OPCODE_LOG:
         unalias1(c, dst, args[0], emit_log_noalias);
         break;
      case OPCODE_LG2:
         emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_LIT:
         unalias1(c, dst, args[0], emit_lit_noalias);
         break;
      case OPCODE_LRP:
         unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
         break;
      case OPCODE_MAD:
         if (!accumulator_contains(c, args[2]))
            brw_MOV(p, brw_acc_reg(), args[2]);
         brw_MAC(p, dst, args[0], args[1]);
         break;
      case OPCODE_CMP:
         emit_cmp(p, dst, args[0], args[1], args[2]);
         break;
      case OPCODE_MAX:
         emit_max(p, dst, args[0], args[1]);
         break;
      case OPCODE_MIN:
         emit_min(p, dst, args[0], args[1]);
         break;
      case OPCODE_MOV:
         brw_MOV(p, dst, args[0]);
         break;
      case OPCODE_MUL:
         brw_MUL(p, dst, args[0], args[1]);
         break;
      case OPCODE_POW:
         emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_RCP:
         emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_RSQ:
         emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;

      case OPCODE_SEQ:
         unalias2(c, dst, args[0], args[1], emit_seq);
         break;
      case OPCODE_SIN:
         emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_SNE:
         unalias2(c, dst, args[0], args[1], emit_sne);
         break;
      case OPCODE_SGE:
         unalias2(c, dst, args[0], args[1], emit_sge);
         break;
      case OPCODE_SGT:
         unalias2(c, dst, args[0], args[1], emit_sgt);
         break;
      case OPCODE_SLT:
         unalias2(c, dst, args[0], args[1], emit_slt);
         break;
      case OPCODE_SLE:
         unalias2(c, dst, args[0], args[1], emit_sle);
         break;
      case OPCODE_SSG:
         unalias1(c, dst, args[0], emit_sign);
         break;
      case OPCODE_SUB:
         brw_ADD(p, dst, args[0], negate(args[1]));
         break;
      case OPCODE_SWZ:
         /* The args[0] value can't be used here as it won't have
          * correctly encoded the full swizzle:
          */
         emit_swz(c, dst, inst);
         break;
      case OPCODE_TRUNC:
         /* round toward zero */
         brw_RNDZ(p, dst, args[0]);
         break;
      case OPCODE_XPD:
         emit_xpd(p, dst, args[0], args[1]);
         break;
      case OPCODE_IF:
         assert(if_depth < MAX_IF_DEPTH);
         if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
         /* Note that brw_IF smashes the predicate_control field. */
         if_inst[if_depth]->header.predicate_control = get_predicate(inst);
         if_depth_in_loop[loop_depth]++;
         if_depth++;
         break;
      case OPCODE_ELSE:
         clear_current_const(c);
         assert(if_depth > 0);
         if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
         break;
      case OPCODE_ENDIF:
         clear_current_const(c);
         assert(if_depth > 0);
         brw_ENDIF(p, if_inst[--if_depth]);
         if_depth_in_loop[loop_depth]--;
         break;
      case OPCODE_BGNLOOP:
         clear_current_const(c);
         loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
         if_depth_in_loop[loop_depth] = 0;
         break;
      case OPCODE_BRK:
         brw_set_predicate_control(p, get_predicate(inst));
         brw_BREAK(p, if_depth_in_loop[loop_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CONT:
         brw_set_predicate_control(p, get_predicate(inst));
         brw_CONT(p, if_depth_in_loop[loop_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_ENDLOOP: {
         struct brw_instruction *inst0, *inst1;
         GLuint br = 1;

         clear_current_const(c);
         loop_depth--;

         if (intel->gen == 5)
            br = 2;
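
         /* Ironlake apparently counts jump distances in units of 64
          * bits, while instructions are 128 bits wide, so the patched
          * jump counts below are doubled (br == 2).
          */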

         inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
         /* patch all the BREAK/CONT instructions from the last BGNLOOP */
         while (inst0 > loop_inst[loop_depth]) {
            inst0--;
            if (inst0->header.opcode == BRW_OPCODE_BREAK &&
                inst0->bits3.if_else.jump_count == 0) {
               inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
            }
            else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
                     inst0->bits3.if_else.jump_count == 0) {
               inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
            }
         }
         break;
      }
      case OPCODE_BRA:
         brw_set_predicate_control(p, get_predicate(inst));
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CAL:
         brw_set_access_mode(p, BRW_ALIGN_1);
         brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
         brw_set_access_mode(p, BRW_ALIGN_16);
         brw_ADD(p, get_addr_reg(stack_index),
                 get_addr_reg(stack_index), brw_imm_d(4));
         brw_save_call(p, inst->Comment, p->nr_insn);
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         break;
      case OPCODE_RET:
         brw_ADD(p, get_addr_reg(stack_index),
                 get_addr_reg(stack_index), brw_imm_d(-4));
         brw_set_access_mode(p, BRW_ALIGN_1);
         brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
         brw_set_access_mode(p, BRW_ALIGN_16);
         break;
      case OPCODE_END:
         emit_vertex_write(c);
         break;
      case OPCODE_PRINT:
         /* no-op */
         break;
      case OPCODE_BGNSUB:
         brw_save_label(p, inst->Comment, p->nr_insn);
         break;
      case OPCODE_ENDSUB:
         /* no-op */
         break;
      default:
         _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
                                     _mesa_opcode_string(inst->Opcode) :
                                     "unknown");
      }

      /* Set the predication update on the last instruction of the native
       * instruction sequence.
       *
       * This would be problematic if it was set on a math instruction,
       * but that shouldn't be the case with the current GLSL compiler.
       */
      if (inst->CondUpdate) {
         struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];

         assert(hw_insn->header.destreg__conditionalmod == 0);
         hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
      }

      if ((inst->DstReg.File == PROGRAM_OUTPUT)
          && (inst->DstReg.Index != VERT_RESULT_HPOS)
          && c->output_regs[inst->DstReg.Index].used_in_src) {
         brw_MOV(p, get_dst(c, inst->DstReg), dst);
      }

      /* Result color clamping.
       *
       * When the destination register is an output register and
       * it's a primary/secondary front/back color, we have to clamp
       * the result to [0,1].  This is done by enabling the
       * saturation bit for the last instruction.
       *
       * We don't use brw_set_saturate() as it modifies
       * p->current->header.saturate, which affects all the subsequent
       * instructions.  Instead, we directly modify the header
       * of the last (already stored) instruction.
       */
      if (inst->DstReg.File == PROGRAM_OUTPUT) {
         if ((inst->DstReg.Index == VERT_RESULT_COL0)
             || (inst->DstReg.Index == VERT_RESULT_COL1)
             || (inst->DstReg.Index == VERT_RESULT_BFC0)
             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
            p->store[p->nr_insn-1].header.saturate = 1;
         }
      }

      if (inst->DstReg.RelAddr) {
         assert(inst->DstReg.File == PROGRAM_TEMPORARY ||
                inst->DstReg.File == PROGRAM_OUTPUT);
         move_to_reladdr_dst(c, inst, dst);
      }

      release_tmps(c);
   }

   brw_resolve_cals(p);

   brw_optimize(p);

   if (INTEL_DEBUG & DEBUG_VS) {
      int i;

      printf("vs-native:\n");
      for (i = 0; i < p->nr_insn; i++)
         brw_disasm(stdout, &p->store[i], intel->gen);
      printf("\n");
   }
}