Merge remote branch 'origin/master' into glsl2
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP3] = 2,
51 [OPCODE_DP4] = 2,
52 [OPCODE_DPH] = 2,
53 [OPCODE_MAX] = 2,
54 [OPCODE_MIN] = 2,
55 [OPCODE_MUL] = 2,
56 [OPCODE_SEQ] = 2,
57 [OPCODE_SGE] = 2,
58 [OPCODE_SGT] = 2,
59 [OPCODE_SLE] = 2,
60 [OPCODE_SLT] = 2,
61 [OPCODE_SNE] = 2,
62 [OPCODE_XPD] = 2,
63 };
64
65 /* These opcodes get broken down in a way that allow two
66 * args to be immediates.
67 */
68 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
69 if (arg == 1 || arg == 2)
70 return GL_TRUE;
71 }
72
73 if (opcode > ARRAY_SIZE(opcode_array))
74 return GL_FALSE;
75
76 return arg == opcode_array[opcode] - 1;
77 }
78
79 static struct brw_reg get_tmp( struct brw_vs_compile *c )
80 {
81 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
82
83 if (++c->last_tmp > c->prog_data.total_grf)
84 c->prog_data.total_grf = c->last_tmp;
85
86 return tmp;
87 }
88
89 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
90 {
91 if (tmp.nr == c->last_tmp-1)
92 c->last_tmp--;
93 }
94
/* Release every temporary allocated since register allocation, by
 * resetting the temp stack pointer to its initial position.
 */
static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}
99
100
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Layout order is: r0 header, clip planes, constants (push), vertex
 * inputs, overflow outputs, temporaries, address regs, pull-constant
 * staging regs, output-used-as-src copies, stack, then scratch temps.
 * The code emitters rely on this exact ordering via the c->regs[] map.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The latter is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      /* Two clip planes per GRF, each plane a vec4 in one half. */
      for (i = 0; i < c->key.nr_userclip; i++) {
	 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
      int constant = 0;

      /* We've got more constants than we can load with the push
       * mechanism.  This is often correlated with reladdr loads where
       * we should probably be using a pull mechanism anyway to avoid
       * excessive reading.  However, the pull mechanism is slow in
       * general.  So, we try to allocate as many non-reladdr-loaded
       * constants through the push buffer as we can before giving up.
       */
      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
      for (i = 0;
	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
	   i++) {
	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
	 int arg;

	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
	    /* Only constant-like files, and only when not relative-addressed,
	     * are eligible for a push slot.
	     */
	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
		inst->SrcReg[arg].RelAddr)
	       continue;

	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
	    }
	 }
      }

      /* Two push constants packed per GRF (vec4 in each half). */
      for (i = 0; i < constant; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
							      (i%2) * 4),
						 0, 4, 1);
      }
      reg += (constant + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      /* XXX 0 causes a bug elsewhere... */
      c->prog_data.nr_params = MAX2(constant * 4, 4);
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
	 c->nr_inputs++;
	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF used for outputs: m8 on gen5, m4 otherwise. */
   if (intel->gen >= 6)
      mrf = 4;
   else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	 c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
	 if (i == VERT_RESULT_HPOS) {
	    /* Position is computed into a GRF, not directly to an MRF. */
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else if (i == VERT_RESULT_PSIZ) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
	 }
	 else {
	    /* Two restrictions on our compute-to-MRF here.  The
	     * message length for all SEND messages is restricted to
	     * [1,15], so we can't use mrf 15, as that means a length
	     * of 16.
	     *
	     * Additionally, URB writes are aligned to URB rows, so we
	     * need to put an even number of registers of URB data in
	     * each URB write so that the later write is aligned.  A
	     * message length of 15 means 1 message header reg plus 14
	     * regs of URB data.
	     *
	     * For attributes beyond the compute-to-MRF, we compute to
	     * GRFs and they will be written in the second URB_WRITE.
	     */
            if (mrf < 15) {
               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
               mrf++;
            }
            else {
               /* too many vertex results to fit in MRF, use GRF for overflow */
               if (!c->first_overflow_output)
                  c->first_overflow_output = i;
               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
               reg++;
            }
	 }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
					     reg,
					     0,
					     BRW_REGISTER_TYPE_D,
					     BRW_VERTICAL_STRIDE_8,
					     BRW_WIDTH_8,
					     BRW_HORIZONTAL_STRIDE_1,
					     BRW_SWIZZLE_XXXX,
					     WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* Staging registers for pull-constant loads, one per src slot. */
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Outputs that are also read as sources get a shadow GRF copy. */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   if (c->needs_stack) {
      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;		/* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 6)
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8;
   else if (intel->gen == 5)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
353
354
355 /**
356 * If an instruction uses a temp reg both as a src and the dest, we
357 * sometimes need to allocate an intermediate temporary.
358 */
359 static void unalias1( struct brw_vs_compile *c,
360 struct brw_reg dst,
361 struct brw_reg arg0,
362 void (*func)( struct brw_vs_compile *,
363 struct brw_reg,
364 struct brw_reg ))
365 {
366 if (dst.file == arg0.file && dst.nr == arg0.nr) {
367 struct brw_compile *p = &c->func;
368 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
369 func(c, tmp, arg0);
370 brw_MOV(p, dst, tmp);
371 release_tmp(c, tmp);
372 }
373 else {
374 func(c, dst, arg0);
375 }
376 }
377
378 /**
379 * \sa unalias2
380 * Checkes if 2-operand instruction needs an intermediate temporary.
381 */
382 static void unalias2( struct brw_vs_compile *c,
383 struct brw_reg dst,
384 struct brw_reg arg0,
385 struct brw_reg arg1,
386 void (*func)( struct brw_vs_compile *,
387 struct brw_reg,
388 struct brw_reg,
389 struct brw_reg ))
390 {
391 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
392 (dst.file == arg1.file && dst.nr == arg1.nr)) {
393 struct brw_compile *p = &c->func;
394 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
395 func(c, tmp, arg0, arg1);
396 brw_MOV(p, dst, tmp);
397 release_tmp(c, tmp);
398 }
399 else {
400 func(c, dst, arg0, arg1);
401 }
402 }
403
404 /**
405 * \sa unalias2
406 * Checkes if 3-operand instruction needs an intermediate temporary.
407 */
408 static void unalias3( struct brw_vs_compile *c,
409 struct brw_reg dst,
410 struct brw_reg arg0,
411 struct brw_reg arg1,
412 struct brw_reg arg2,
413 void (*func)( struct brw_vs_compile *,
414 struct brw_reg,
415 struct brw_reg,
416 struct brw_reg,
417 struct brw_reg ))
418 {
419 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
420 (dst.file == arg1.file && dst.nr == arg1.nr) ||
421 (dst.file == arg2.file && dst.nr == arg2.nr)) {
422 struct brw_compile *p = &c->func;
423 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
424 func(c, tmp, arg0, arg1, arg2);
425 brw_MOV(p, dst, tmp);
426 release_tmp(c, tmp);
427 }
428 else {
429 func(c, dst, arg0, arg1, arg2);
430 }
431 }
432
/* Emit a set-on-condition op: dst = (arg0 <cond> arg1) ? 1.0 : 0.0 per
 * channel.  The CMP to the null register sets the flag and predicates
 * the following MOV of 1.0 (the same CMP-then-predicated-op pattern is
 * used by emit_cmp/emit_max/emit_min below); the final call restores
 * the predicate flag value.
 */
static void emit_sop( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
		      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
446
/* SEQ: dst = (arg0 == arg1) ? 1.0 : 0.0, per channel. */
static void emit_seq( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}
454
/* SNE: dst = (arg0 != arg1) ? 1.0 : 0.0, per channel. */
static void emit_sne( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/* SLT: dst = (arg0 < arg1) ? 1.0 : 0.0, per channel. */
static void emit_slt( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}
469
/* SLE: dst = (arg0 <= arg1) ? 1.0 : 0.0, per channel. */
static void emit_sle( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}
477
/* SGT: dst = (arg0 > arg1) ? 1.0 : 0.0, per channel. */
static void emit_sgt( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}
485
/* SGE: dst = (arg0 >= arg1) ? 1.0 : 0.0, per channel. */
static void emit_sge( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
  emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
493
/* CMP: dst = (arg0 < 0) ? arg1 : arg2, per channel.  The CMP sets the
 * flag register which steers the SEL; predication is then explicitly
 * cleared so subsequent instructions are unconditional.
 */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
504
/* SSG: dst = sign(arg0) in {-1, 0, +1}, per channel.  Starts from 0,
 * then two predicated MOVs overwrite channels that compare < 0 or > 0.
 */
static void emit_sign(struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0));

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
521
/* MAX: dst = max(arg0, arg1) per channel, via flag-steered SEL. */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
531
/* MIN: dst = min(arg0, arg1) per channel, via flag-steered SEL. */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
541
542
/* Emit a one-source math-box operation (EXP, LOG, RSQ, ...) of the
 * given 'function' and 'precision' into dst.
 */
static void emit_math1( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   /* Pre-gen6, route through a full-writemask GRF temp when dst is
    * writemasked or not a GRF (see comment above).
    */
   GLboolean need_tmp = (intel->gen < 6 &&
			 (dst.dw1.bits.writemask != 0xf ||
			  dst.file != BRW_GENERAL_REGISTER_FILE));

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
580
581
/* Emit a two-source math-box operation (e.g. POW).  The second operand
 * is passed through message register m3; see emit_math1 for why a GRF
 * temp may be needed for the result pre-gen6.
 */
static void emit_math2( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			struct brw_reg arg1,
			GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (intel->gen < 6 &&
			 (dst.dw1.bits.writemask != 0xf ||
			  dst.file != BRW_GENERAL_REGISTER_FILE));

   if (need_tmp)
      tmp = get_tmp(c);

   /* Second math operand goes out in m3 ahead of the math SEND. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
 	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
615
616
/* EXP (the legacy ARB_vertex_program instruction):
 *   result.x = 2^floor(arg0.x)  (built by bit manipulation)
 *   result.y = arg0.x - floor(arg0.x)
 *   result.z = 2^arg0.x         (via the math box)
 *   result.w = 1.0
 * Caller guarantees dst does not alias arg0 (see "noalias" naming).
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
672
673
/* LOG (the legacy ARB_vertex_program instruction):
 *   result.x = exponent of arg0.x (unbiased, via bit extraction)
 *   result.y = mantissa of arg0.x rebuilt as a float in [1,2)
 *   result.z = log2(arg0.x) = result.x + LOG2(result.y) (math box)
 *   result.w = 1.0
 * Caller guarantees dst does not alias arg0.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* Need a real GRF temp when dst is writemasked or not a GRF, since
    * intermediate X/Y results are re-read below via swizzles.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look like they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* Strip the sign bit, then shift the biased exponent down and
       * un-bias it (-127) as a signed integer add.
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Keep the mantissa bits and OR in a biased exponent of 0,
       * producing a float in [1, 2).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
763
764
/* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 *
 * DST: dst = (1, arg0.y*arg1.y, arg0.z, arg1.w), per the ARB spec.
 * Caller guarantees dst aliases neither source.
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
785
786
/* XPD: dst = t x u (cross product) using MUL into the accumulator
 * followed by a multiply-accumulate with negated operands:
 *   dst = t.yzx*u.zxy - t.zxy*u.yzx
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
795
796
/* LIT: lighting coefficients, per the ARB spec:
 *   dst.x = 1, dst.w = 1, dst.y = max(arg0.x, 0),
 *   dst.z = (arg0.x > 0) ? ((arg0.y > 0) ? arg0.y^arg0.w : 0) : 0
 * Caller guarantees dst does not alias arg0.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisons.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* Clamp the spotlight y-coordinate to 0 before the POW below;
       * the clamped value lives in tmp.z.
       */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   /* NOTE(review): called even when tmp == dst (no temp allocated);
    * release_tmp() only pops the most recently allocated temp, so this
    * is a harmless no-op in that case.
    */
   release_tmp(c, tmp);
}
838
/* LRP: dst = arg0 * arg1 + (1 - arg0) * arg2, built as
 *   dst = 1 - arg0;  acc = dst * arg2;  dst = arg0 * arg1 + acc
 * using the MUL-into-accumulator + MAC pairing.  Caller guarantees
 * dst aliases none of the sources.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
851
852 /** 3 or 4-component vector normalization */
853 static void emit_nrm( struct brw_vs_compile *c,
854 struct brw_reg dst,
855 struct brw_reg arg0,
856 int num_comps)
857 {
858 struct brw_compile *p = &c->func;
859 struct brw_reg tmp = get_tmp(c);
860
861 /* tmp = dot(arg0, arg0) */
862 if (num_comps == 3)
863 brw_DP3(p, tmp, arg0, arg0);
864 else
865 brw_DP4(p, tmp, arg0, arg0);
866
867 /* tmp = 1 / sqrt(tmp) */
868 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
869
870 /* dst = arg0 * tmp */
871 brw_MUL(p, dst, arg0, tmp);
872
873 release_tmp(c, tmp);
874 }
875
876
/* Fetch a non-relative constant for src arg 'argIndex' through the
 * pull-constant buffer.  The per-slot staging register set up in
 * brw_vs_alloc_regs() caches the last constant loaded, so a reload is
 * only emitted when the index changes.  Returns the staging reg with
 * the vec4 replicated to XYZWXYZW across both halves.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                     /* writeback dest */
                       16 * src->Index,               /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
910
/* Fetch a relative-addressed constant (const[a0.x + Index]) through the
 * pull-constant buffer.  The address register is scaled to a byte
 * offset (16 bytes per vec4) and the base byte offset is passed to the
 * relative read.  The cache slot is invalidated since a reladdr load
 * can't be reused.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
   /* NOTE(review): this temp is never passed to release_tmp(), so it
    * stays allocated until the next release_tmps() — looks like a temp
    * leak; confirm against the emit loop's release discipline.
    */
   struct brw_reg byte_addr_reg = get_tmp(c);

   assert(argIndex < 3);

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

 #if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   /* a0.x holds a vec4 index; convert to a byte offset (16 bytes/vec4). */
   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
			     const_reg,                     /* writeback dest */
			     byte_addr_reg,                 /* address register */
			     16 * src->Index,               /* byte offset */
			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
			     );

   return const_reg;
}
944
945
946
/* TODO: relative addressing!
 *
 * Map a (file, index) pair to the preallocated hardware register from
 * brw_vs_alloc_regs().  All constant-like files share the
 * PROGRAM_STATE_VAR slot.  ENV/LOCAL params are not expected here
 * (they are handled in get_src_reg) and assert.
 */
static struct brw_reg get_reg( struct brw_vs_compile *c,
			       gl_register_file file,
			       GLuint index )
{
   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      assert(c->regs[file][index].nr != 0);
      return c->regs[file][index];
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
      assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
      return c->regs[PROGRAM_STATE_VAR][index];
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:			/* undef values */
      return brw_null_reg();

   case PROGRAM_LOCAL_PARAM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
979
980
/**
 * Indirect addressing:  get reg[[arg] + offset].
 *
 * Computes a GRF byte address from the vertex program's address reg
 * (a0), loads it into the hardware address registers, and MOVs the
 * indirectly-addressed data into a fresh temp.  Both SIMD4x2 vertex
 * halves are handled: a0.0 addresses the first 4 channels, a0.1 the
 * second 4.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset,
			     GLuint reg_size )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   /* Base byte offset of 'arg' within the GRF file (32 bytes per reg). */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   /* NOTE(review): this second temp is never released; it is reclaimed
    * only when release_tmps() resets the pool.
    */
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Set the vertical stride on the register access so that the first
    * 4 components come from a0.0 and the second 4 from a0.1.
    */
   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = vp_address[0] * reg_size + byte_offset */
      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));

      /* a0.1 = vp_address[1] * reg_size + byte_offset (second half) */
      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));

      brw_MOV(p, tmp, indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return tmp;
}
1020
/* Store 'val' to a relative-addressed destination reg
 * (DstReg.File[DstReg.Index + a0]).  Mirrors deref(): computes a byte
 * address per vertex half and writes each half through the indirect
 * register, reusing a0.0 for both writes since they are sequential.
 */
static void
move_to_reladdr_dst(struct brw_vs_compile *c,
		    const struct prog_instruction *inst,
		    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg temp_base = c->regs[inst->DstReg.File][0];
   GLuint byte_offset = temp_base.nr * 32 + temp_base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   /* NOTE(review): this temp is never released; reclaimed only by
    * release_tmps().
    */
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   byte_offset += inst->DstReg.Index * reg_size;

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* First vertex half: address from vp_address[0]. */
   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   /* Second vertex half: address from vp_address[4], second half of reg. */
   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
	   brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}
1051
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 * TODO: relative addressing!
 *
 * First tries to fold the source into an immediate float (swizzle of
 * all-zero, all-one, or a single replicated PROGRAM_CONSTANT
 * component) when the opcode allows it; otherwise resolves the
 * register through the preallocated map, the constant buffer, or an
 * indirect deref for relative addressing.
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
      const struct prog_src_register *src = &inst->SrcReg[argIndex];

      if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO)) {
	 return brw_imm_f(0.0f);
      } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE)) {
	 if (src->Negate)
	    return brw_imm_f(-1.0F);
	 else
	    return brw_imm_f(1.0F);
      } else if (src->File == PROGRAM_CONSTANT) {
	 const struct gl_program_parameter_list *params;
	 float f;
	 int component = -1;

	 /* Only a fully-replicated single component can become an
	  * immediate; any other swizzle leaves component == -1.
	  */
	 switch (src->Swizzle) {
	 case SWIZZLE_XXXX:
	    component = 0;
	    break;
	 case SWIZZLE_YYYY:
	    component = 1;
	    break;
	 case SWIZZLE_ZZZZ:
	    component = 2;
	    break;
	 case SWIZZLE_WWWW:
	    component = 3;
	    break;
	 }

	 if (component >= 0) {
	    params = c->vp->program.Base.Parameters;
	    f = params->ParameterValues[src->Index][component];

	    if (src->Abs)
	       f = fabs(f);
	    if (src->Negate)
	       f = -f;
	    return brw_imm_f(f);
	 }
      }
   }

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         return deref(c, c->regs[file][0], index, 32);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
	 if (!relAddr && c->constant_map[index] != -1) {
	    /* Constant got a push slot in brw_vs_alloc_regs(). */
	    assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
	    return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
	 } else if (relAddr)
	    return get_reladdr_constant(c, inst, argIndex);
	 else
	    return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
      }
      else {
         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1161
1162 /**
1163 * Return the brw reg for the given instruction's src argument.
1164 * Will return mangled results for SWZ op. The emit_swz() function
1165 * ignores this result and recalculates taking extended swizzles into
1166 * account.
1167 */
1168 static struct brw_reg get_arg( struct brw_vs_compile *c,
1169 const struct prog_instruction *inst,
1170 GLuint argIndex )
1171 {
1172 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1173 struct brw_reg reg;
1174
1175 if (src->File == PROGRAM_UNDEFINED)
1176 return brw_null_reg();
1177
1178 reg = get_src_reg(c, inst, argIndex);
1179
1180 /* Convert 3-bit swizzle to 2-bit.
1181 */
1182 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1183 GET_SWZ(src->Swizzle, 1),
1184 GET_SWZ(src->Swizzle, 2),
1185 GET_SWZ(src->Swizzle, 3));
1186
1187 /* Note this is ok for non-swizzle instructions:
1188 */
1189 reg.negate = src->Negate ? 1 : 0;
1190
1191 return reg;
1192 }
1193
1194
1195 /**
1196 * Get brw register for the given program dest register.
1197 */
1198 static struct brw_reg get_dst( struct brw_vs_compile *c,
1199 struct prog_dst_register dst )
1200 {
1201 struct brw_reg reg;
1202
1203 switch (dst.File) {
1204 case PROGRAM_TEMPORARY:
1205 case PROGRAM_OUTPUT:
1206 /* register-indirect addressing is only 1x1, not VxH, for
1207 * destination regs. So, for RelAddr we'll return a temporary
1208 * for the dest and do a move of the result to the RelAddr
1209 * register after the instruction emit.
1210 */
1211 if (dst.RelAddr) {
1212 reg = get_tmp(c);
1213 } else {
1214 assert(c->regs[dst.File][dst.Index].nr != 0);
1215 reg = c->regs[dst.File][dst.Index];
1216 }
1217 break;
1218 case PROGRAM_ADDRESS:
1219 assert(dst.Index == 0);
1220 reg = c->regs[dst.File][dst.Index];
1221 break;
1222 case PROGRAM_UNDEFINED:
1223 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1224 reg = brw_null_reg();
1225 break;
1226 default:
1227 assert(0);
1228 reg = brw_null_reg();
1229 }
1230
1231 reg.dw1.bits.writemask = dst.WriteMask;
1232
1233 return reg;
1234 }
1235
1236
/**
 * Emit OPCODE_SWZ: an extended swizzle whose per-channel selectors may
 * include ZERO and ONE in addition to X/Y/Z/W, with per-channel negate.
 * Built from up to four MOVs, one per category of selector.
 */
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* The final per-channel negate MOV reads the partially-written dst;
    * if dst isn't a readable GRF, stage through a temporary instead.
    */
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written channel by its selector: a real source
    * channel, the constant 0, or the constant 1.
    */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate's per-channel bits line up with writemask bits, so it
    * selects exactly the channels to negate in place.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   /* If we staged through a temporary, copy the finished value out. */
   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1305
1306
1307 /**
1308 * Post-vertex-program processing. Send the results to the URB.
1309 */
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Builds the per-generation vertex header (clip flags, point size, NDC
 * position), copies the shader outputs into message registers, and emits
 * one or two URB write messages (a second one if the outputs overflow
 * the available MRFs).
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;

   if (c->key.copy_edgeflag) {
      brw_MOV(p,
              get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
              get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
       c->key.nr_userclip || brw->has_negative_rhw_bug)
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         /* Scale the float point size into the header's fixed-point
          * field and mask it to the field's bit range.
          */
         struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
         brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
         brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
      }

      /* Set one clip-flag bit per user clip plane the vertex is outside
       * of, using the conditional-mod/predicated-OR pair.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* The VUE layout is documented in Volume 2a. */
   if (intel->gen >= 6) {
      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
       * enabled.  We don't use it, so skip it.
       * m3 is the first vertex element data we fill, which is the vertex
       * position.
       */
      brw_MOV(p, brw_message_reg(2), pos);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   } else if (intel->gen == 5) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      brw_MOV(p, brw_message_reg(7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always have be the
       * vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   }

   /* If everything fits in one message, it is also the end-of-thread. */
   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,              /* starting mrf reg nr */
                 c->r0,          /* src */
                 0,              /* allocate */
                 1,              /* used */
                 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
                 0,              /* response len */
                 eot,            /* eot */
                 eot,            /* writes complete */
                 0,              /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      GLuint i, mrf = 1;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
         if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
            /* move from GRF to MRF */
            brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
            mrf++;
         }
      }

      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    0,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    mrf,            /* msg len */
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    14 / 2,         /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1479
1480 static GLboolean
1481 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1482 {
1483 struct brw_compile *p = &c->func;
1484 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1485
1486 if (p->nr_insn == 0)
1487 return GL_FALSE;
1488
1489 if (val.address_mode != BRW_ADDRESS_DIRECT)
1490 return GL_FALSE;
1491
1492 switch (prev_insn->header.opcode) {
1493 case BRW_OPCODE_MOV:
1494 case BRW_OPCODE_MAC:
1495 case BRW_OPCODE_MUL:
1496 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1497 prev_insn->header.execution_size == val.width &&
1498 prev_insn->bits1.da1.dest_reg_file == val.file &&
1499 prev_insn->bits1.da1.dest_reg_type == val.type &&
1500 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1501 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1502 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1503 prev_insn->bits1.da16.dest_writemask == 0xf)
1504 return GL_TRUE;
1505 else
1506 return GL_FALSE;
1507 default:
1508 return GL_FALSE;
1509 }
1510 }
1511
1512 static uint32_t
1513 get_predicate(const struct prog_instruction *inst)
1514 {
1515 if (inst->DstReg.CondMask == COND_TR)
1516 return BRW_PREDICATE_NONE;
1517
1518 /* All of GLSL only produces predicates for COND_NE and one channel per
1519 * vector. Fail badly if someone starts doing something else, as it might
1520 * mean infinite looping or something.
1521 *
1522 * We'd like to support all the condition codes, but our hardware doesn't
1523 * quite match the Mesa IR, which is modeled after the NV extensions. For
1524 * those, the instruction may update the condition codes or not, then any
1525 * later instruction may use one of those condition codes. For gen4, the
1526 * instruction may update the flags register based on one of the condition
1527 * codes output by the instruction, and then further instructions may
1528 * predicate on that. We can probably support this, but it won't
1529 * necessarily be easy.
1530 */
1531 assert(inst->DstReg.CondMask == COND_NE);
1532
1533 switch (inst->DstReg.CondSwizzle) {
1534 case SWIZZLE_XXXX:
1535 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1536 case SWIZZLE_YYYY:
1537 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1538 case SWIZZLE_ZZZZ:
1539 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1540 case SWIZZLE_WWWW:
1541 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1542 default:
1543 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1544 inst->DstReg.CondMask);
1545 return BRW_PREDICATE_NORMAL;
1546 }
1547 }
1548
1549 /* Emit the vertex program instructions here.
1550 */
1551 void brw_vs_emit(struct brw_vs_compile *c )
1552 {
1553 #define MAX_IF_DEPTH 32
1554 #define MAX_LOOP_DEPTH 32
1555 struct brw_compile *p = &c->func;
1556 struct brw_context *brw = p->brw;
1557 struct intel_context *intel = &brw->intel;
1558 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1559 GLuint insn, if_depth = 0, loop_depth = 0;
1560 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1561 const struct brw_indirect stack_index = brw_indirect(0, 0);
1562 GLuint index;
1563 GLuint file;
1564
1565 if (INTEL_DEBUG & DEBUG_VS) {
1566 printf("vs-mesa:\n");
1567 _mesa_print_program(&c->vp->program.Base);
1568 printf("\n");
1569 }
1570
1571 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1572 brw_set_access_mode(p, BRW_ALIGN_16);
1573
1574 for (insn = 0; insn < nr_insns; insn++) {
1575 GLuint i;
1576 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1577
1578 /* Message registers can't be read, so copy the output into GRF
1579 * register if they are used in source registers
1580 */
1581 for (i = 0; i < 3; i++) {
1582 struct prog_src_register *src = &inst->SrcReg[i];
1583 GLuint index = src->Index;
1584 GLuint file = src->File;
1585 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1586 c->output_regs[index].used_in_src = GL_TRUE;
1587 }
1588
1589 switch (inst->Opcode) {
1590 case OPCODE_CAL:
1591 case OPCODE_RET:
1592 c->needs_stack = GL_TRUE;
1593 break;
1594 default:
1595 break;
1596 }
1597 }
1598
1599 /* Static register allocation
1600 */
1601 brw_vs_alloc_regs(c);
1602
1603 if (c->needs_stack)
1604 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1605
1606 for (insn = 0; insn < nr_insns; insn++) {
1607
1608 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1609 struct brw_reg args[3], dst;
1610 GLuint i;
1611
1612 #if 0
1613 printf("%d: ", insn);
1614 _mesa_print_instruction(inst);
1615 #endif
1616
1617 /* Get argument regs. SWZ is special and does this itself.
1618 */
1619 if (inst->Opcode != OPCODE_SWZ)
1620 for (i = 0; i < 3; i++) {
1621 const struct prog_src_register *src = &inst->SrcReg[i];
1622 index = src->Index;
1623 file = src->File;
1624 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1625 args[i] = c->output_regs[index].reg;
1626 else
1627 args[i] = get_arg(c, inst, i);
1628 }
1629
1630 /* Get dest regs. Note that it is possible for a reg to be both
1631 * dst and arg, given the static allocation of registers. So
1632 * care needs to be taken emitting multi-operation instructions.
1633 */
1634 index = inst->DstReg.Index;
1635 file = inst->DstReg.File;
1636 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1637 dst = c->output_regs[index].reg;
1638 else
1639 dst = get_dst(c, inst->DstReg);
1640
1641 if (inst->SaturateMode != SATURATE_OFF) {
1642 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1643 inst->SaturateMode);
1644 }
1645
1646 switch (inst->Opcode) {
1647 case OPCODE_ABS:
1648 brw_MOV(p, dst, brw_abs(args[0]));
1649 break;
1650 case OPCODE_ADD:
1651 brw_ADD(p, dst, args[0], args[1]);
1652 break;
1653 case OPCODE_COS:
1654 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1655 break;
1656 case OPCODE_DP3:
1657 brw_DP3(p, dst, args[0], args[1]);
1658 break;
1659 case OPCODE_DP4:
1660 brw_DP4(p, dst, args[0], args[1]);
1661 break;
1662 case OPCODE_DPH:
1663 brw_DPH(p, dst, args[0], args[1]);
1664 break;
1665 case OPCODE_NRM3:
1666 emit_nrm(c, dst, args[0], 3);
1667 break;
1668 case OPCODE_NRM4:
1669 emit_nrm(c, dst, args[0], 4);
1670 break;
1671 case OPCODE_DST:
1672 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1673 break;
1674 case OPCODE_EXP:
1675 unalias1(c, dst, args[0], emit_exp_noalias);
1676 break;
1677 case OPCODE_EX2:
1678 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1679 break;
1680 case OPCODE_ARL:
1681 brw_RNDD(p, dst, args[0]);
1682 break;
1683 case OPCODE_FLR:
1684 brw_RNDD(p, dst, args[0]);
1685 break;
1686 case OPCODE_FRC:
1687 brw_FRC(p, dst, args[0]);
1688 break;
1689 case OPCODE_LOG:
1690 unalias1(c, dst, args[0], emit_log_noalias);
1691 break;
1692 case OPCODE_LG2:
1693 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1694 break;
1695 case OPCODE_LIT:
1696 unalias1(c, dst, args[0], emit_lit_noalias);
1697 break;
1698 case OPCODE_LRP:
1699 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1700 break;
1701 case OPCODE_MAD:
1702 if (!accumulator_contains(c, args[2]))
1703 brw_MOV(p, brw_acc_reg(), args[2]);
1704 brw_MAC(p, dst, args[0], args[1]);
1705 break;
1706 case OPCODE_CMP:
1707 emit_cmp(p, dst, args[0], args[1], args[2]);
1708 break;
1709 case OPCODE_MAX:
1710 emit_max(p, dst, args[0], args[1]);
1711 break;
1712 case OPCODE_MIN:
1713 emit_min(p, dst, args[0], args[1]);
1714 break;
1715 case OPCODE_MOV:
1716 brw_MOV(p, dst, args[0]);
1717 break;
1718 case OPCODE_MUL:
1719 brw_MUL(p, dst, args[0], args[1]);
1720 break;
1721 case OPCODE_POW:
1722 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1723 break;
1724 case OPCODE_RCP:
1725 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1726 break;
1727 case OPCODE_RSQ:
1728 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1729 break;
1730
1731 case OPCODE_SEQ:
1732 unalias2(c, dst, args[0], args[1], emit_seq);
1733 break;
1734 case OPCODE_SIN:
1735 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1736 break;
1737 case OPCODE_SNE:
1738 unalias2(c, dst, args[0], args[1], emit_sne);
1739 break;
1740 case OPCODE_SGE:
1741 unalias2(c, dst, args[0], args[1], emit_sge);
1742 break;
1743 case OPCODE_SGT:
1744 unalias2(c, dst, args[0], args[1], emit_sgt);
1745 break;
1746 case OPCODE_SLT:
1747 unalias2(c, dst, args[0], args[1], emit_slt);
1748 break;
1749 case OPCODE_SLE:
1750 unalias2(c, dst, args[0], args[1], emit_sle);
1751 break;
1752 case OPCODE_SSG:
1753 unalias1(c, dst, args[0], emit_sign);
1754 break;
1755 case OPCODE_SUB:
1756 brw_ADD(p, dst, args[0], negate(args[1]));
1757 break;
1758 case OPCODE_SWZ:
1759 /* The args[0] value can't be used here as it won't have
1760 * correctly encoded the full swizzle:
1761 */
1762 emit_swz(c, dst, inst);
1763 break;
1764 case OPCODE_TRUNC:
1765 /* round toward zero */
1766 brw_RNDZ(p, dst, args[0]);
1767 break;
1768 case OPCODE_XPD:
1769 emit_xpd(p, dst, args[0], args[1]);
1770 break;
1771 case OPCODE_IF:
1772 assert(if_depth < MAX_IF_DEPTH);
1773 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1774 /* Note that brw_IF smashes the predicate_control field. */
1775 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1776 if_depth++;
1777 break;
1778 case OPCODE_ELSE:
1779 assert(if_depth > 0);
1780 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1781 break;
1782 case OPCODE_ENDIF:
1783 assert(if_depth > 0);
1784 brw_ENDIF(p, if_inst[--if_depth]);
1785 break;
1786 case OPCODE_BGNLOOP:
1787 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1788 break;
1789 case OPCODE_BRK:
1790 brw_set_predicate_control(p, get_predicate(inst));
1791 brw_BREAK(p);
1792 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1793 break;
1794 case OPCODE_CONT:
1795 brw_set_predicate_control(p, get_predicate(inst));
1796 brw_CONT(p);
1797 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1798 break;
1799 case OPCODE_ENDLOOP:
1800 {
1801 struct brw_instruction *inst0, *inst1;
1802 GLuint br = 1;
1803
1804 loop_depth--;
1805
1806 if (intel->gen == 5)
1807 br = 2;
1808
1809 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1810 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1811 while (inst0 > loop_inst[loop_depth]) {
1812 inst0--;
1813 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
1814 inst0->bits3.if_else.jump_count == 0) {
1815 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1816 inst0->bits3.if_else.pop_count = 0;
1817 }
1818 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
1819 inst0->bits3.if_else.jump_count == 0) {
1820 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1821 inst0->bits3.if_else.pop_count = 0;
1822 }
1823 }
1824 }
1825 break;
1826 case OPCODE_BRA:
1827 brw_set_predicate_control(p, get_predicate(inst));
1828 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1829 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1830 break;
1831 case OPCODE_CAL:
1832 brw_set_access_mode(p, BRW_ALIGN_1);
1833 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1834 brw_set_access_mode(p, BRW_ALIGN_16);
1835 brw_ADD(p, get_addr_reg(stack_index),
1836 get_addr_reg(stack_index), brw_imm_d(4));
1837 brw_save_call(p, inst->Comment, p->nr_insn);
1838 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1839 break;
1840 case OPCODE_RET:
1841 brw_ADD(p, get_addr_reg(stack_index),
1842 get_addr_reg(stack_index), brw_imm_d(-4));
1843 brw_set_access_mode(p, BRW_ALIGN_1);
1844 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1845 brw_set_access_mode(p, BRW_ALIGN_16);
1846 break;
1847 case OPCODE_END:
1848 emit_vertex_write(c);
1849 break;
1850 case OPCODE_PRINT:
1851 /* no-op */
1852 break;
1853 case OPCODE_BGNSUB:
1854 brw_save_label(p, inst->Comment, p->nr_insn);
1855 break;
1856 case OPCODE_ENDSUB:
1857 /* no-op */
1858 break;
1859 default:
1860 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1861 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1862 _mesa_opcode_string(inst->Opcode) :
1863 "unknown");
1864 }
1865
1866 /* Set the predication update on the last instruction of the native
1867 * instruction sequence.
1868 *
1869 * This would be problematic if it was set on a math instruction,
1870 * but that shouldn't be the case with the current GLSL compiler.
1871 */
1872 if (inst->CondUpdate) {
1873 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1874
1875 assert(hw_insn->header.destreg__conditionalmod == 0);
1876 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1877 }
1878
1879 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1880 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1881 && c->output_regs[inst->DstReg.Index].used_in_src) {
1882 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1883 }
1884
1885 /* Result color clamping.
1886 *
1887 * When destination register is an output register and
1888 * it's primary/secondary front/back color, we have to clamp
1889 * the result to [0,1]. This is done by enabling the
1890 * saturation bit for the last instruction.
1891 *
1892 * We don't use brw_set_saturate() as it modifies
1893 * p->current->header.saturate, which affects all the subsequent
1894 * instructions. Instead, we directly modify the header
1895 * of the last (already stored) instruction.
1896 */
1897 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1898 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1899 || (inst->DstReg.Index == VERT_RESULT_COL1)
1900 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1901 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1902 p->store[p->nr_insn-1].header.saturate = 1;
1903 }
1904 }
1905
1906 if (inst->DstReg.RelAddr && inst->DstReg.File == PROGRAM_TEMPORARY) {
1907 /* We don't do RelAddr of PROGRAM_OUTPUT yet, because of the
1908 * compute-to-mrf and the fact that we are allocating
1909 * registers for only the used PROGRAM_OUTPUTs.
1910 */
1911 move_to_reladdr_dst(c, inst, dst);
1912 }
1913
1914 release_tmps(c);
1915 }
1916
1917 brw_resolve_cals(p);
1918
1919 brw_optimize(p);
1920
1921 if (INTEL_DEBUG & DEBUG_VS) {
1922 int i;
1923
1924 printf("vs-native:\n");
1925 for (i = 0; i < p->nr_insn; i++)
1926 brw_disasm(stderr, &p->store[i], intel->gen);
1927 printf("\n");
1928 }
1929 }