720a6566fd2f0d6f02d91c4e8547c7185464455d
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
/* Release every temporary allocated since brw_vs_alloc_regs() set
 * first_tmp, by resetting the allocation stack pointer.
 */
static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}
100
101 static int
102 get_first_reladdr_output(struct gl_vertex_program *vp)
103 {
104 int i;
105 int first_reladdr_output = VERT_RESULT_MAX;
106
107 for (i = 0; i < vp->Base.NumInstructions; i++) {
108 struct prog_instruction *inst = vp->Base.Instructions + i;
109
110 if (inst->DstReg.File == PROGRAM_OUTPUT &&
111 inst->DstReg.RelAddr &&
112 inst->DstReg.Index < first_reladdr_output)
113 first_reladdr_output = inst->DstReg.Index;
114 }
115
116 return first_reladdr_output;
117 }
118
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Lays out, in order: r0, curbe (clip planes + constants), inputs,
 * outputs (MRFs where possible), program temporaries, address regs,
 * const-buffer staging regs, output-read shadows, the call stack, and
 * finally the scratch-temp area.  Also derives urb_read_length,
 * urb_entry_size, curb_read_length, nr_params and total_grf.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;
   int first_reladdr_output;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      /* Two vec4 planes per GRF; the +3 skips over curbe contents that
       * precede the clip planes.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
	 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
      int constant = 0;

      /* We've got more constants than we can load with the push
       * mechanism.  This is often correlated with reladdr loads where
       * we should probably be using a pull mechanism anyway to avoid
       * excessive reading.  However, the pull mechanism is slow in
       * general.  So, we try to allocate as many non-reladdr-loaded
       * constants through the push buffer as we can before giving up.
       */
      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
      for (i = 0;
	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
	   i++) {
	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
	 int arg;

	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
	    /* Only constant-like files, addressed directly, can be
	     * promoted to a push-constant slot.
	     */
	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
		inst->SrcReg[arg].RelAddr)
	       continue;

	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
	    }
	 }
      }

      /* Two vec4 constants per GRF. */
      for (i = 0; i < constant; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
                                                              (i%2) * 4),
                                                 0, 4, 1);
      }
      reg += (constant + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      /* XXX 0 causes a bug elsewhere... */
      c->prog_data.nr_params = MAX2(constant * 4, 4);
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
	 c->nr_inputs++;
	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF available for compute-to-MRF outputs, per generation. */
   if (intel->gen >= 6)
      mrf = 4;
   else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   first_reladdr_output = get_first_reladdr_output(&c->vp->program);
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	 c->nr_outputs++;
	 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
	 if (i == VERT_RESULT_HPOS) {
	    /* Position gets a GRF: it needs post-processing (clip
	     * tests etc.) before the URB write.
	     */
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else if (i == VERT_RESULT_PSIZ) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
	 }
	 else {
	    /* Two restrictions on our compute-to-MRF here.  The
	     * message length for all SEND messages is restricted to
	     * [1,15], so we can't use mrf 15, as that means a length
	     * of 16.
	     *
	     * Additionally, URB writes are aligned to URB rows, so we
	     * need to put an even number of registers of URB data in
	     * each URB write so that the later write is aligned.  A
	     * message length of 15 means 1 message header reg plus 14
	     * regs of URB data.
	     *
	     * For attributes beyond the compute-to-MRF, we compute to
	     * GRFs and they will be written in the second URB_WRITE.
	     */
	    if (first_reladdr_output > i && mrf < 15) {
	       c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
	       mrf++;
	    }
	    else {
	       if (mrf >= 15 && !c->first_overflow_output)
		  c->first_overflow_output = i;
	       c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	       reg++;
	       mrf++;
	    }
	 }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
					     reg,
					     0,
					     BRW_REGISTER_TYPE_D,
					     BRW_VERTICAL_STRIDE_8,
					     BRW_WIDTH_8,
					     BRW_HORIZONTAL_STRIDE_1,
					     BRW_SWIZZLE_XXXX,
					     WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* One staging GRF per source argument slot for pull-constant loads. */
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Shadow GRFs for outputs that are also read as sources. */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   if (c->needs_stack) {
      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;		/* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 6)
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8;
   else if (intel->gen == 5)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
374
375
376 /**
377 * If an instruction uses a temp reg both as a src and the dest, we
378 * sometimes need to allocate an intermediate temporary.
379 */
380 static void unalias1( struct brw_vs_compile *c,
381 struct brw_reg dst,
382 struct brw_reg arg0,
383 void (*func)( struct brw_vs_compile *,
384 struct brw_reg,
385 struct brw_reg ))
386 {
387 if (dst.file == arg0.file && dst.nr == arg0.nr) {
388 struct brw_compile *p = &c->func;
389 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
390 func(c, tmp, arg0);
391 brw_MOV(p, dst, tmp);
392 release_tmp(c, tmp);
393 }
394 else {
395 func(c, dst, arg0);
396 }
397 }
398
399 /**
400 * \sa unalias2
401 * Checkes if 2-operand instruction needs an intermediate temporary.
402 */
403 static void unalias2( struct brw_vs_compile *c,
404 struct brw_reg dst,
405 struct brw_reg arg0,
406 struct brw_reg arg1,
407 void (*func)( struct brw_vs_compile *,
408 struct brw_reg,
409 struct brw_reg,
410 struct brw_reg ))
411 {
412 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
413 (dst.file == arg1.file && dst.nr == arg1.nr)) {
414 struct brw_compile *p = &c->func;
415 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
416 func(c, tmp, arg0, arg1);
417 brw_MOV(p, dst, tmp);
418 release_tmp(c, tmp);
419 }
420 else {
421 func(c, dst, arg0, arg1);
422 }
423 }
424
425 /**
426 * \sa unalias2
427 * Checkes if 3-operand instruction needs an intermediate temporary.
428 */
429 static void unalias3( struct brw_vs_compile *c,
430 struct brw_reg dst,
431 struct brw_reg arg0,
432 struct brw_reg arg1,
433 struct brw_reg arg2,
434 void (*func)( struct brw_vs_compile *,
435 struct brw_reg,
436 struct brw_reg,
437 struct brw_reg,
438 struct brw_reg ))
439 {
440 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
441 (dst.file == arg1.file && dst.nr == arg1.nr) ||
442 (dst.file == arg2.file && dst.nr == arg2.nr)) {
443 struct brw_compile *p = &c->func;
444 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
445 func(c, tmp, arg0, arg1, arg2);
446 brw_MOV(p, dst, tmp);
447 release_tmp(c, tmp);
448 }
449 else {
450 func(c, dst, arg0, arg1, arg2);
451 }
452 }
453
/**
 * Common body for the SLT/SGE/SEQ/etc. "set on condition" opcodes:
 * each channel of dst becomes 1.0 where (arg0 cond arg1) holds,
 * 0.0 elsewhere.
 */
static void emit_sop( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
		      GLuint cond)
{
   struct brw_compile *p = &c->func;

   /* Clear all channels first. */
   brw_MOV(p, dst, brw_imm_f(0.0f));
   /* Compare to the null register: only the flag result is wanted. */
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   /* NOTE(review): this MOV appears to run under the predicate set up
    * by the CMP, so only passing channels receive 1.0 -- confirm
    * against brw_CMP()'s handling of compile predication state.
    */
   brw_MOV(p, dst, brw_imm_f(1.0f));
   /* Restore the default all-channels flag value for subsequent code. */
   brw_set_predicate_control_flag_value(p, 0xff);
}
467
/* SEQ: per channel, dst = (arg0 == arg1) ? 1.0 : 0.0 */
static void emit_seq( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}
475
/* SNE: per channel, dst = (arg0 != arg1) ? 1.0 : 0.0 */
static void emit_sne( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/* SLT: per channel, dst = (arg0 < arg1) ? 1.0 : 0.0 */
static void emit_slt( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}
490
/* SLE: per channel, dst = (arg0 <= arg1) ? 1.0 : 0.0 */
static void emit_sle( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}
498
/* SGT: per channel, dst = (arg0 > arg1) ? 1.0 : 0.0 */
static void emit_sgt( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}
506
/* SGE: per channel, dst = (arg0 >= arg1) ? 1.0 : 0.0 */
static void emit_sge( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
  emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
514
/**
 * CMP opcode: per channel, dst = (arg0 < 0) ? arg1 : arg2.
 */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   /* Flag channels where arg0 < 0; the SEL then picks arg1 in flagged
    * channels and arg2 elsewhere.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   /* Turn predication back off for subsequent instructions. */
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
525
/**
 * SSG opcode: per channel, dst = -1.0, 0.0 or 1.0 according to the
 * sign of arg0.
 */
static void emit_sign(struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   /* Default every channel to 0.0 ... */
   brw_MOV(p, dst, brw_imm_f(0));

   /* ... overwrite with -1.0 where arg0 < 0 ... */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   /* ... and with 1.0 where arg0 > 0. */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
542
/**
 * MAX opcode: per channel, dst = max(arg0, arg1), via a predicated SEL.
 */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
552
/**
 * MIN opcode: per channel, dst = min(arg0, arg1), via a predicated SEL.
 */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
562
563
/**
 * Emit a one-operand math-box operation (EXP, LOG, RSQ, ...) and copy
 * the result into dst, using a scratch temporary on pre-gen6 when the
 * destination is writemasked or not a plain GRF.
 */
static void emit_math1( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (intel->gen < 6 &&
			 (dst.dw1.bits.writemask != 0xf ||
			  dst.file != BRW_GENERAL_REGISTER_FILE));

   if (need_tmp)
      tmp = get_tmp(c);

   /* NOTE(review): the literal 2 is presumably the message register
    * number for the send -- confirm against brw_math()'s signature.
    */
   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
601
602
/**
 * Emit a two-operand math-box operation (POW, ...).  The second
 * operand travels in message register 3; the same pre-gen6 temp
 * workaround as emit_math1() applies to the destination.
 */
static void emit_math2( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			struct brw_reg arg1,
			GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (intel->gen < 6 &&
			 (dst.dw1.bits.writemask != 0xf ||
			  dst.file != BRW_GENERAL_REGISTER_FILE));

   if (need_tmp)
      tmp = get_tmp(c);

   /* Stage the second operand in m3 for the math-box send. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
636
637
/**
 * EXP opcode (the four-result ARB_vertex_program variant), assuming
 * dst does not alias arg0:
 *   result.x = 2^floor(arg0.x)   (built by bit-stuffing the exponent)
 *   result.y = arg0.x - floor(arg0.x)
 *   result.z = 2^arg0.x          (full-precision, via the math box)
 *   result.w = 1.0
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
693
694
/**
 * LOG opcode (four-result ARB_vertex_program variant), assuming dst
 * does not alias arg0:
 *   result.x = unbiased exponent of |arg0.x|  (~floor(log2))
 *   result.y = mantissa of |arg0.x|, in [1.0, 2.0)
 *   result.z = log2(arg0.x)  (result.x + math-box LOG of result.y)
 *   result.w = 1.0
 * Exponent/mantissa are extracted by IEEE-754 bit manipulation on a
 * UD view of the register.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* Z reads X and Y, so compute into a full temp whenever dst is
    * writemasked or not a plain GRF.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* Strip the sign bit, shift the exponent down, rebias by -127. */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Keep the mantissa bits and install a biased exponent of 0,
       * yielding a float in [1.0, 2.0).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
784
785
/* DST opcode: dst = (1, arg0.y*arg1.y, arg0.z, arg1.w).
 * Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
806
807
/**
 * XPD opcode: dst.xyz = t cross u, computed as
 * t.yzx*u.zxy - t.zxy*u.yzx.  The MUL to the null register primes the
 * accumulator, which the MAC then combines with its own product.
 */
static void emit_xpd( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg t,
                      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
816
817
/**
 * LIT opcode: dst = (1, arg0.x, arg0.y^arg0.w [when the diffuse term
 * arg0.x > 0, else 0], 1), per the ARB lighting-coefficient
 * instruction.  Assumes dst does not alias arg0.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* NOTE(review): several stores below write 'dst' directly rather
    * than 'tmp' even when need_tmp is set, and no final tmp->dst copy
    * is emitted -- verify this is intentional for non-GRF dests.
    */
   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* Clamp the specular-dot term to 0 before raising it to the power. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   /* Harmless when tmp == dst: release_tmp() only pops a matching
    * top-of-stack temporary.
    */
   release_tmp(c, tmp);
}
859
/**
 * LRP opcode: dst = arg0*arg1 + (1-arg0)*arg2, assuming dst aliases
 * none of the sources.  The MUL to the null register leaves
 * (1-arg0)*arg2 in the accumulator for the MAC to pick up.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
872
/** 3 or 4-component vector normalization: dst = arg0 / |arg0|,
 * computed as arg0 * rsq(dot(arg0, arg0)).
 *
 * \param num_comps  3 for NRM3 (DP3) or 4 for NRM4 (DP4)
 */
static void emit_nrm( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      int num_comps)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);

   /* tmp = dot(arg0, arg0) */
   if (num_comps == 3)
      brw_DP3(p, tmp, arg0, arg0);
   else
      brw_DP4(p, tmp, arg0, arg0);

   /* tmp = 1 / sqrt(tmp) */
   emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);

   /* dst = arg0 * tmp */
   brw_MUL(p, dst, arg0, tmp);

   release_tmp(c, tmp);
}
896
897
/**
 * Return a register holding constant [src->Index] for source argument
 * [argIndex], fetching it from the pull-constant buffer if it is not
 * already cached in this slot's staging register.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                     /* writeback dest */
                       16 * src->Index,               /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
931
/**
 * Like get_constant(), but for a relative-addressed constant: the
 * address register (scaled to a byte offset) is added to the constant
 * buffer offset by the dataport read.  The staging-slot cache is
 * invalidated because the address is not known at compile time.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
   /* NOTE(review): byte_addr_reg is not released here; presumably the
    * per-instruction release_tmps() reclaims it -- verify at callers.
    */
   struct brw_reg byte_addr_reg = get_tmp(c);

   assert(argIndex < 3);

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

 #if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   /* Convert the vec4 element index in a0 into a byte offset. */
   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
			     const_reg,                     /* writeback dest */
			     byte_addr_reg,                 /* address register */
			     16 * src->Index,               /* byte offset */
			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
			     );

   return const_reg;
}
965
966
967
/* Map a Mesa (file, index) register reference onto the hardware
 * register preallocated for it by brw_vs_alloc_regs().
 * TODO: relative addressing!
 */
static struct brw_reg get_reg( struct brw_vs_compile *c,
			       gl_register_file file,
			       GLuint index )
{
   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      assert(c->regs[file][index].nr != 0);
      return c->regs[file][index];
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
      /* All constant-like files share the PROGRAM_STATE_VAR slots. */
      assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
      return c->regs[PROGRAM_STATE_VAR][index];
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:			/* undef values */
      return brw_null_reg();

   case PROGRAM_LOCAL_PARAM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1000
1001
/**
 * Indirect addressing: get reg[[arg] + offset].
 *
 * Computes per-vertex byte addresses from the vertex program's address
 * register into the hardware a0.0/a0.1 registers, then copies the
 * indirectly-addressed data into a fresh temporary, which the caller
 * owns (it is deliberately NOT released here).
 *
 * \param reg_size  byte stride per logical register in this file
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset,
			     GLuint reg_size )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Set the vertical stride on the register access so that the first
    * 4 components come from a0.0 and the second 4 from a0.1.
    */
   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = address for vertex 0, a0.1 = address for vertex 1
       * (vp_address holds one index per vertex; suboffset(,4) selects
       * the second vertex's copy).
       */
      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));

      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));

      brw_MOV(p, tmp, indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return tmp;
}
1041
/**
 * Store 'val' to the instruction's relative-addressed destination:
 * base register + address-register index, one half-register (one
 * vertex's worth) at a time through an a0-indirect MOV.
 */
static void
move_to_reladdr_dst(struct brw_vs_compile *c,
		    const struct prog_instruction *inst,
		    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
   GLuint byte_offset = base.nr * 32 + base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   /* NOTE(review): this temporary is not released here; presumably
    * reclaimed by the per-instruction release_tmps() -- verify.
    */
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* First vertex: write the low half of val. */
   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   /* Second vertex: its address lives at suboffset 4, and its data in
    * the second half of both the register pair and val.
    */
   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
	   brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}
1070
1071 /**
1072 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1073 * TODO: relative addressing!
1074 */
1075 static struct brw_reg
1076 get_src_reg( struct brw_vs_compile *c,
1077 const struct prog_instruction *inst,
1078 GLuint argIndex )
1079 {
1080 const GLuint file = inst->SrcReg[argIndex].File;
1081 const GLint index = inst->SrcReg[argIndex].Index;
1082 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
1083
1084 if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
1085 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1086
1087 if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
1088 SWIZZLE_ZERO,
1089 SWIZZLE_ZERO,
1090 SWIZZLE_ZERO)) {
1091 return brw_imm_f(0.0f);
1092 } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
1093 SWIZZLE_ONE,
1094 SWIZZLE_ONE,
1095 SWIZZLE_ONE)) {
1096 if (src->Negate)
1097 return brw_imm_f(-1.0F);
1098 else
1099 return brw_imm_f(1.0F);
1100 } else if (src->File == PROGRAM_CONSTANT) {
1101 const struct gl_program_parameter_list *params;
1102 float f;
1103 int component = -1;
1104
1105 switch (src->Swizzle) {
1106 case SWIZZLE_XXXX:
1107 component = 0;
1108 break;
1109 case SWIZZLE_YYYY:
1110 component = 1;
1111 break;
1112 case SWIZZLE_ZZZZ:
1113 component = 2;
1114 break;
1115 case SWIZZLE_WWWW:
1116 component = 3;
1117 break;
1118 }
1119
1120 if (component >= 0) {
1121 params = c->vp->program.Base.Parameters;
1122 f = params->ParameterValues[src->Index][component];
1123
1124 if (src->Abs)
1125 f = fabs(f);
1126 if (src->Negate)
1127 f = -f;
1128 return brw_imm_f(f);
1129 }
1130 }
1131 }
1132
1133 switch (file) {
1134 case PROGRAM_TEMPORARY:
1135 case PROGRAM_INPUT:
1136 case PROGRAM_OUTPUT:
1137 if (relAddr) {
1138 return deref(c, c->regs[file][0], index, 32);
1139 }
1140 else {
1141 assert(c->regs[file][index].nr != 0);
1142 return c->regs[file][index];
1143 }
1144
1145 case PROGRAM_STATE_VAR:
1146 case PROGRAM_CONSTANT:
1147 case PROGRAM_UNIFORM:
1148 case PROGRAM_ENV_PARAM:
1149 case PROGRAM_LOCAL_PARAM:
1150 if (c->vp->use_const_buffer) {
1151 if (!relAddr && c->constant_map[index] != -1) {
1152 assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
1153 return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
1154 } else if (relAddr)
1155 return get_reladdr_constant(c, inst, argIndex);
1156 else
1157 return get_constant(c, inst, argIndex);
1158 }
1159 else if (relAddr) {
1160 return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
1161 }
1162 else {
1163 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1164 return c->regs[PROGRAM_STATE_VAR][index];
1165 }
1166 case PROGRAM_ADDRESS:
1167 assert(index == 0);
1168 return c->regs[file][index];
1169
1170 case PROGRAM_UNDEFINED:
1171 /* this is a normal case since we loop over all three src args */
1172 return brw_null_reg();
1173
1174 case PROGRAM_WRITE_ONLY:
1175 default:
1176 assert(0);
1177 return brw_null_reg();
1178 }
1179 }
1180
1181 /**
1182 * Return the brw reg for the given instruction's src argument.
1183 * Will return mangled results for SWZ op. The emit_swz() function
1184 * ignores this result and recalculates taking extended swizzles into
1185 * account.
1186 */
1187 static struct brw_reg get_arg( struct brw_vs_compile *c,
1188 const struct prog_instruction *inst,
1189 GLuint argIndex )
1190 {
1191 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1192 struct brw_reg reg;
1193
1194 if (src->File == PROGRAM_UNDEFINED)
1195 return brw_null_reg();
1196
1197 reg = get_src_reg(c, inst, argIndex);
1198
1199 /* Convert 3-bit swizzle to 2-bit.
1200 */
1201 if (reg.file != BRW_IMMEDIATE_VALUE) {
1202 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1203 GET_SWZ(src->Swizzle, 1),
1204 GET_SWZ(src->Swizzle, 2),
1205 GET_SWZ(src->Swizzle, 3));
1206 }
1207
1208 /* Note this is ok for non-swizzle instructions:
1209 */
1210 reg.negate = src->Negate ? 1 : 0;
1211
1212 return reg;
1213 }
1214
1215
1216 /**
1217 * Get brw register for the given program dest register.
1218 */
1219 static struct brw_reg get_dst( struct brw_vs_compile *c,
1220 struct prog_dst_register dst )
1221 {
1222 struct brw_reg reg;
1223
1224 switch (dst.File) {
1225 case PROGRAM_TEMPORARY:
1226 case PROGRAM_OUTPUT:
1227 /* register-indirect addressing is only 1x1, not VxH, for
1228 * destination regs. So, for RelAddr we'll return a temporary
1229 * for the dest and do a move of the result to the RelAddr
1230 * register after the instruction emit.
1231 */
1232 if (dst.RelAddr) {
1233 reg = get_tmp(c);
1234 } else {
1235 assert(c->regs[dst.File][dst.Index].nr != 0);
1236 reg = c->regs[dst.File][dst.Index];
1237 }
1238 break;
1239 case PROGRAM_ADDRESS:
1240 assert(dst.Index == 0);
1241 reg = c->regs[dst.File][dst.Index];
1242 break;
1243 case PROGRAM_UNDEFINED:
1244 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1245 reg = brw_null_reg();
1246 break;
1247 default:
1248 assert(0);
1249 reg = brw_null_reg();
1250 }
1251
1252 assert(reg.type != BRW_IMMEDIATE_VALUE);
1253 reg.dw1.bits.writemask = dst.WriteMask;
1254
1255 return reg;
1256 }
1257
1258
/**
 * Emit an extended SWZ instruction, which unlike normal swizzling can also
 * select the constants 0 and 1 per channel and negate individual channels.
 *
 * Splits the destination channels into three groups (real source channels,
 * zeros, ones), emits one masked MOV per non-empty group, then applies
 * per-channel negation.  A temporary is used when negation is required and
 * the destination is not a GRF (e.g. a message register, which can't be
 * read back).
 */
static void emit_swz( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   GLboolean need_tmp = (src.Negate &&
			 dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written channel: real swizzle component, zero, or one. */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
	 GLubyte s = GET_SWZ(src.Swizzle, i);
	 switch (s) {
	 case SWIZZLE_X:
	 case SWIZZLE_Y:
	 case SWIZZLE_Z:
	 case SWIZZLE_W:
	    src_mask |= 1<<i;
	    src_swz[i] = s;
	    break;
	 case SWIZZLE_ZERO:
	    zeros_mask |= 1<<i;
	    break;
	 case SWIZZLE_ONE:
	    ones_mask |= 1<<i;
	    break;
	 }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
			 src_swz[0], src_swz[1],
			 src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is a per-channel bitmask; it is used directly as the
    * writemask here so only the negated channels get rewritten.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1327
1328
1329 /**
1330 * Post-vertex-program processing. Send the results to the URB.
1331 */
1332 static void emit_vertex_write( struct brw_vs_compile *c)
1333 {
1334 struct brw_compile *p = &c->func;
1335 struct brw_context *brw = p->brw;
1336 struct intel_context *intel = &brw->intel;
1337 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1338 struct brw_reg ndc;
1339 int eot;
1340 GLuint len_vertex_header = 2;
1341 int next_mrf, i;
1342
1343 if (c->key.copy_edgeflag) {
1344 brw_MOV(p,
1345 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1346 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1347 }
1348
1349 if (intel->gen < 6) {
1350 /* Build ndc coords */
1351 ndc = get_tmp(c);
1352 /* ndc = 1.0 / pos.w */
1353 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1354 /* ndc.xyz = pos * ndc */
1355 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1356 }
1357
1358 /* Update the header for point size, user clipping flags, and -ve rhw
1359 * workaround.
1360 */
1361 if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1362 c->key.nr_userclip || brw->has_negative_rhw_bug)
1363 {
1364 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1365 GLuint i;
1366
1367 brw_MOV(p, header1, brw_imm_ud(0));
1368
1369 brw_set_access_mode(p, BRW_ALIGN_16);
1370
1371 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1372 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1373 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1374 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1375 }
1376
1377 for (i = 0; i < c->key.nr_userclip; i++) {
1378 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1379 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1380 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1381 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1382 }
1383
1384 /* i965 clipping workaround:
1385 * 1) Test for -ve rhw
1386 * 2) If set,
1387 * set ndc = (0,0,0,0)
1388 * set ucp[6] = 1
1389 *
1390 * Later, clipping will detect ucp[6] and ensure the primitive is
1391 * clipped against all fixed planes.
1392 */
1393 if (brw->has_negative_rhw_bug) {
1394 brw_CMP(p,
1395 vec8(brw_null_reg()),
1396 BRW_CONDITIONAL_L,
1397 brw_swizzle1(ndc, 3),
1398 brw_imm_f(0));
1399
1400 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1401 brw_MOV(p, ndc, brw_imm_f(0));
1402 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1403 }
1404
1405 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1406 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1407 brw_set_access_mode(p, BRW_ALIGN_16);
1408
1409 release_tmp(c, header1);
1410 }
1411 else {
1412 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1413 }
1414
1415 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1416 * of zeros followed by two sets of NDC coordinates:
1417 */
1418 brw_set_access_mode(p, BRW_ALIGN_1);
1419 brw_set_acc_write_control(p, 0);
1420
1421 /* The VUE layout is documented in Volume 2a. */
1422 if (intel->gen >= 6) {
1423 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1424 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1425 * dword 4-7 (m2) is the 4D space position
1426 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1427 * enabled. We don't use it, so skip it.
1428 * m3 is the first vertex element data we fill, which is the vertex
1429 * position.
1430 */
1431 brw_MOV(p, brw_message_reg(2), pos);
1432 brw_MOV(p, brw_message_reg(3), pos);
1433 len_vertex_header = 2;
1434 } else if (intel->gen == 5) {
1435 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1436 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1437 * dword 4-7 (m2) is the ndc position (set above)
1438 * dword 8-11 (m3) of the vertex header is the 4D space position
1439 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1440 * m6 is a pad so that the vertex element data is aligned
1441 * m7 is the first vertex data we fill, which is the vertex position.
1442 */
1443 brw_MOV(p, brw_message_reg(2), ndc);
1444 brw_MOV(p, brw_message_reg(3), pos);
1445 brw_MOV(p, brw_message_reg(7), pos);
1446 len_vertex_header = 6;
1447 } else {
1448 /* There are 8 dwords in VUE header pre-Ironlake:
1449 * dword 0-3 (m1) is indices, point width, clip flags.
1450 * dword 4-7 (m2) is ndc position (set above)
1451 *
1452 * dword 8-11 (m3) is the first vertex data, which we always have be the
1453 * vertex position.
1454 */
1455 brw_MOV(p, brw_message_reg(2), ndc);
1456 brw_MOV(p, brw_message_reg(3), pos);
1457 len_vertex_header = 2;
1458 }
1459
1460 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1461 next_mrf = 2 + len_vertex_header;
1462 for (i = 0; i < VERT_RESULT_MAX; i++) {
1463 if (c->first_overflow_output > 0 && i >= c->first_overflow_output)
1464 break;
1465 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
1466 continue;
1467
1468 if (i >= VERT_RESULT_TEX0 &&
1469 c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
1470 brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]);
1471 next_mrf++;
1472 } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) {
1473 next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1;
1474 }
1475 }
1476
1477 eot = (c->first_overflow_output == 0);
1478
1479 brw_urb_WRITE(p,
1480 brw_null_reg(), /* dest */
1481 0, /* starting mrf reg nr */
1482 c->r0, /* src */
1483 0, /* allocate */
1484 1, /* used */
1485 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
1486 0, /* response len */
1487 eot, /* eot */
1488 eot, /* writes complete */
1489 0, /* urb destination offset */
1490 BRW_URB_SWIZZLE_INTERLEAVE);
1491
1492 if (c->first_overflow_output > 0) {
1493 /* Not all of the vertex outputs/results fit into the MRF.
1494 * Move the overflowed attributes from the GRF to the MRF and
1495 * issue another brw_urb_WRITE().
1496 */
1497 GLuint i, mrf = 1;
1498 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1499 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
1500 /* move from GRF to MRF */
1501 brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
1502 mrf++;
1503 }
1504 }
1505
1506 brw_urb_WRITE(p,
1507 brw_null_reg(), /* dest */
1508 0, /* starting mrf reg nr */
1509 c->r0, /* src */
1510 0, /* allocate */
1511 1, /* used */
1512 mrf, /* msg len */
1513 0, /* response len */
1514 1, /* eot */
1515 1, /* writes complete */
1516 14 / 2, /* urb destination offset */
1517 BRW_URB_SWIZZLE_INTERLEAVE);
1518 }
1519 }
1520
1521 static GLboolean
1522 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1523 {
1524 struct brw_compile *p = &c->func;
1525 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1526
1527 if (p->nr_insn == 0)
1528 return GL_FALSE;
1529
1530 if (val.address_mode != BRW_ADDRESS_DIRECT)
1531 return GL_FALSE;
1532
1533 switch (prev_insn->header.opcode) {
1534 case BRW_OPCODE_MOV:
1535 case BRW_OPCODE_MAC:
1536 case BRW_OPCODE_MUL:
1537 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1538 prev_insn->header.execution_size == val.width &&
1539 prev_insn->bits1.da1.dest_reg_file == val.file &&
1540 prev_insn->bits1.da1.dest_reg_type == val.type &&
1541 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1542 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1543 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1544 prev_insn->bits1.da16.dest_writemask == 0xf)
1545 return GL_TRUE;
1546 else
1547 return GL_FALSE;
1548 default:
1549 return GL_FALSE;
1550 }
1551 }
1552
1553 static uint32_t
1554 get_predicate(const struct prog_instruction *inst)
1555 {
1556 if (inst->DstReg.CondMask == COND_TR)
1557 return BRW_PREDICATE_NONE;
1558
1559 /* All of GLSL only produces predicates for COND_NE and one channel per
1560 * vector. Fail badly if someone starts doing something else, as it might
1561 * mean infinite looping or something.
1562 *
1563 * We'd like to support all the condition codes, but our hardware doesn't
1564 * quite match the Mesa IR, which is modeled after the NV extensions. For
1565 * those, the instruction may update the condition codes or not, then any
1566 * later instruction may use one of those condition codes. For gen4, the
1567 * instruction may update the flags register based on one of the condition
1568 * codes output by the instruction, and then further instructions may
1569 * predicate on that. We can probably support this, but it won't
1570 * necessarily be easy.
1571 */
1572 assert(inst->DstReg.CondMask == COND_NE);
1573
1574 switch (inst->DstReg.CondSwizzle) {
1575 case SWIZZLE_XXXX:
1576 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1577 case SWIZZLE_YYYY:
1578 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1579 case SWIZZLE_ZZZZ:
1580 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1581 case SWIZZLE_WWWW:
1582 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1583 default:
1584 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1585 inst->DstReg.CondMask);
1586 return BRW_PREDICATE_NORMAL;
1587 }
1588 }
1589
/* Emit the vertex program instructions here.
 *
 * Main entry point for VS code generation: scans the Mesa IR once to note
 * outputs read as sources and whether a call stack is needed, allocates
 * registers, then translates each IR instruction into gen4-6 EU code,
 * handling flow control (IF/LOOP/CAL), condition updates, color clamping
 * and relative-addressed destinations along the way.
 */
void brw_vs_emit(struct brw_vs_compile *c )
{
#define MAX_IF_DEPTH 32
#define MAX_LOOP_DEPTH 32
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
   GLuint insn, if_depth = 0, loop_depth = 0;
   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
   /* Tracks how many IFs are open inside the current loop nesting level,
    * needed as the pop count for BREAK/CONT.
    */
   int if_depth_in_loop[MAX_LOOP_DEPTH];
   const struct brw_indirect stack_index = brw_indirect(0, 0);
   GLuint index;
   GLuint file;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("vs-mesa:\n");
      _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
			       GL_TRUE);
      printf("\n");
   }

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_access_mode(p, BRW_ALIGN_16);
   if_depth_in_loop[loop_depth] = 0;

   brw_set_acc_write_control(p, 1);

   /* First pass over the IR: gather information needed before register
    * allocation.
    */
   for (insn = 0; insn < nr_insns; insn++) {
       GLuint i;
       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];

       /* Message registers can't be read, so copy the output into GRF
	* register if they are used in source registers
	*/
       for (i = 0; i < 3; i++) {
	   struct prog_src_register *src = &inst->SrcReg[i];
	   GLuint index = src->Index;
	   GLuint file = src->File;
	   if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
	       c->output_regs[index].used_in_src = GL_TRUE;
       }

       switch (inst->Opcode) {
       case OPCODE_CAL:
       case OPCODE_RET:
	   c->needs_stack = GL_TRUE;
	   break;
       default:
	   break;
       }
   }

   /* Static register allocation
    */
   brw_vs_alloc_regs(c);

   if (c->needs_stack)
      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   /* Second pass: emit hardware instructions. */
   for (insn = 0; insn < nr_insns; insn++) {

      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
      struct brw_reg args[3], dst;
      GLuint i;

#if 0
      printf("%d: ", insn);
      _mesa_print_instruction(inst);
#endif

      /* Get argument regs.  SWZ is special and does this itself.
       */
      if (inst->Opcode != OPCODE_SWZ)
	  for (i = 0; i < 3; i++) {
	      const struct prog_src_register *src = &inst->SrcReg[i];
	      index = src->Index;
	      file = src->File;
	      /* Outputs read as sources were shadowed in a GRF (MRFs are
	       * write-only); use the shadow copy.
	       */
	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
		  args[i] = c->output_regs[index].reg;
	      else
                  args[i] = get_arg(c, inst, i);
	  }

      /* Get dest regs.  Note that it is possible for a reg to be both
       * dst and arg, given the static allocation of registers.  So
       * care needs to be taken emitting multi-operation instructions.
       */
      index = inst->DstReg.Index;
      file = inst->DstReg.File;
      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
	  dst = c->output_regs[index].reg;
      else
	  dst = get_dst(c, inst->DstReg);

      if (inst->SaturateMode != SATURATE_OFF) {
	 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
                       inst->SaturateMode);
      }

      switch (inst->Opcode) {
      case OPCODE_ABS:
	 brw_MOV(p, dst, brw_abs(args[0]));
	 break;
      case OPCODE_ADD:
	 brw_ADD(p, dst, args[0], args[1]);
	 break;
      case OPCODE_COS:
	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_DP2:
	 brw_DP2(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP3:
	 brw_DP3(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP4:
	 brw_DP4(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DPH:
	 brw_DPH(p, dst, args[0], args[1]);
	 break;
      case OPCODE_NRM3:
	 emit_nrm(c, dst, args[0], 3);
	 break;
      case OPCODE_NRM4:
	 emit_nrm(c, dst, args[0], 4);
	 break;
      case OPCODE_DST:
	 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
	 break;
      case OPCODE_EXP:
	 unalias1(c, dst, args[0], emit_exp_noalias);
	 break;
      case OPCODE_EX2:
	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_ARL:
	 /* ARL is floor-to-int into the address reg, same as FLR here. */
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FLR:
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FRC:
	 brw_FRC(p, dst, args[0]);
	 break;
      case OPCODE_LOG:
	 unalias1(c, dst, args[0], emit_log_noalias);
	 break;
      case OPCODE_LG2:
	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_LIT:
	 unalias1(c, dst, args[0], emit_lit_noalias);
	 break;
      case OPCODE_LRP:
	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
	 break;
      case OPCODE_MAD:
	 /* MAD = MOV acc + MAC; the MOV is skipped if the accumulator
	  * already holds args[2] from the previous instruction.
	  */
	 if (!accumulator_contains(c, args[2]))
	    brw_MOV(p, brw_acc_reg(), args[2]);
	 brw_MAC(p, dst, args[0], args[1]);
	 break;
      case OPCODE_CMP:
	 emit_cmp(p, dst, args[0], args[1], args[2]);
	 break;
      case OPCODE_MAX:
	 emit_max(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MIN:
	 emit_min(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MOV:
	 brw_MOV(p, dst, args[0]);
	 break;
      case OPCODE_MUL:
	 brw_MUL(p, dst, args[0], args[1]);
	 break;
      case OPCODE_POW:
	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RCP:
	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RSQ:
	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;

      case OPCODE_SEQ:
         unalias2(c, dst, args[0], args[1], emit_seq);
         break;
      case OPCODE_SIN:
	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_SNE:
         unalias2(c, dst, args[0], args[1], emit_sne);
         break;
      case OPCODE_SGE:
         unalias2(c, dst, args[0], args[1], emit_sge);
	 break;
      case OPCODE_SGT:
         unalias2(c, dst, args[0], args[1], emit_sgt);
         break;
      case OPCODE_SLT:
         unalias2(c, dst, args[0], args[1], emit_slt);
	 break;
      case OPCODE_SLE:
         unalias2(c, dst, args[0], args[1], emit_sle);
         break;
      case OPCODE_SSG:
         unalias1(c, dst, args[0], emit_sign);
         break;
      case OPCODE_SUB:
	 brw_ADD(p, dst, args[0], negate(args[1]));
	 break;
      case OPCODE_SWZ:
	 /* The args[0] value can't be used here as it won't have
	  * correctly encoded the full swizzle:
	  */
	 emit_swz(c, dst, inst);
	 break;
      case OPCODE_TRUNC:
         /* round toward zero */
	 brw_RNDZ(p, dst, args[0]);
	 break;
      case OPCODE_XPD:
	 emit_xpd(p, dst, args[0], args[1]);
	 break;
      case OPCODE_IF:
	 assert(if_depth < MAX_IF_DEPTH);
	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
	 /* Note that brw_IF smashes the predicate_control field. */
	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
	 if_depth_in_loop[loop_depth]++;
	 if_depth++;
	 break;
      case OPCODE_ELSE:
	 assert(if_depth > 0);
	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
	 break;
      case OPCODE_ENDIF:
	 assert(if_depth > 0);
	 brw_ENDIF(p, if_inst[--if_depth]);
	 if_depth_in_loop[loop_depth]--;
	 break;
      case OPCODE_BGNLOOP:
	 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
	 if_depth_in_loop[loop_depth] = 0;
	 break;
      case OPCODE_BRK:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_BREAK(p, if_depth_in_loop[loop_depth]);
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case OPCODE_CONT:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_CONT(p, if_depth_in_loop[loop_depth]);
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case OPCODE_ENDLOOP:
	 {
	    struct brw_instruction *inst0, *inst1;
	    /* Jump distances are counted in units of 2 on gen5 (br = 2),
	     * 1 elsewhere.
	     */
	    GLuint br = 1;

	    loop_depth--;

	    if (intel->gen == 5)
	       br = 2;

	    inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
	    /* patch all the BREAK/CONT instructions from last BEGINLOOP */
	    while (inst0 > loop_inst[loop_depth]) {
	       inst0--;
	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
		   inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
	       }
	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
			inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
	       }
	    }
	 }
	 break;
      case OPCODE_BRA:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case OPCODE_CAL:
	 /* Push the return IP onto the software stack, bump the stack
	  * pointer, then record the call site for later resolution.
	  */
	 brw_set_access_mode(p, BRW_ALIGN_1);
	 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 brw_ADD(p, get_addr_reg(stack_index),
		 get_addr_reg(stack_index), brw_imm_d(4));
	 brw_save_call(p, inst->Comment, p->nr_insn);
	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
	 break;
      case OPCODE_RET:
	 /* Pop the software stack and jump to the saved return IP. */
	 brw_ADD(p, get_addr_reg(stack_index),
		 get_addr_reg(stack_index), brw_imm_d(-4));
	 brw_set_access_mode(p, BRW_ALIGN_1);
	 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 break;
      case OPCODE_END:
	 emit_vertex_write(c);
	 break;
      case OPCODE_PRINT:
	 /* no-op */
	 break;
      case OPCODE_BGNSUB:
	 brw_save_label(p, inst->Comment, p->nr_insn);
	 break;
      case OPCODE_ENDSUB:
	 /* no-op */
	 break;
      default:
	 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
				    _mesa_opcode_string(inst->Opcode) :
				    "unknown");
      }

      /* Set the predication update on the last instruction of the native
       * instruction sequence.
       *
       * This would be problematic if it was set on a math instruction,
       * but that shouldn't be the case with the current GLSL compiler.
       */
      if (inst->CondUpdate) {
	 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];

	 assert(hw_insn->header.destreg__conditionalmod == 0);
	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
      }

      /* If the output was shadowed in a GRF (because it's also read as a
       * source), copy the shadow into the real output register now.
       */
      if ((inst->DstReg.File == PROGRAM_OUTPUT)
	  && (inst->DstReg.Index != VERT_RESULT_HPOS)
	  && c->output_regs[inst->DstReg.Index].used_in_src) {
	 brw_MOV(p, get_dst(c, inst->DstReg), dst);
      }

      /* Result color clamping.
       *
       * When destination register is an output register and
       * it's primary/secondary front/back color, we have to clamp
       * the result to [0,1]. This is done by enabling the
       * saturation bit for the last instruction.
       *
       * We don't use brw_set_saturate() as it modifies
       * p->current->header.saturate, which affects all the subsequent
       * instructions. Instead, we directly modify the header
       * of the last (already stored) instruction.
       */
      if (inst->DstReg.File == PROGRAM_OUTPUT) {
	 if ((inst->DstReg.Index == VERT_RESULT_COL0)
	     || (inst->DstReg.Index == VERT_RESULT_COL1)
	     || (inst->DstReg.Index == VERT_RESULT_BFC0)
	     || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
	    p->store[p->nr_insn-1].header.saturate = 1;
	 }
      }

      /* Relative-addressed destinations were written to a temporary
       * (see get_dst); move the result into place now.
       */
      if (inst->DstReg.RelAddr) {
	 assert(inst->DstReg.File == PROGRAM_TEMPORARY||
		inst->DstReg.File == PROGRAM_OUTPUT);
	 move_to_reladdr_dst(c, inst, dst);
      }

      release_tmps(c);
   }

   brw_resolve_cals(p);

   brw_optimize(p);

   if (INTEL_DEBUG & DEBUG_VS) {
      int i;

      printf("vs-native:\n");
      for (i = 0; i < p->nr_insn; i++)
	 brw_disasm(stdout, &p->store[i], intel->gen);
      printf("\n");
   }
}