2f4653fbda743d8d997b654645d91b1e7a5e9b53
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
96 static void release_tmps( struct brw_vs_compile *c )
97 {
98 c->last_tmp = c->first_tmp;
99 }
100
101
102 /**
103 * Preallocate GRF register before code emit.
104 * Do things as simply as possible. Allocate and populate all regs
105 * ahead of time.
106 */
107 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
108 {
109 struct intel_context *intel = &c->func.brw->intel;
110 GLuint i, reg = 0, mrf;
111 int attributes_in_vue;
112
113 /* Determine whether to use a real constant buffer or use a block
114 * of GRF registers for constants. The later is faster but only
115 * works if everything fits in the GRF.
116 * XXX this heuristic/check may need some fine tuning...
117 */
118 if (c->vp->program.Base.Parameters->NumParameters +
119 c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
120 c->vp->use_const_buffer = GL_TRUE;
121 else
122 c->vp->use_const_buffer = GL_FALSE;
123
124 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
125
126 /* r0 -- reserved as usual
127 */
128 c->r0 = brw_vec8_grf(reg, 0);
129 reg++;
130
131 /* User clip planes from curbe:
132 */
133 if (c->key.nr_userclip) {
134 for (i = 0; i < c->key.nr_userclip; i++) {
135 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
136 }
137
138 /* Deal with curbe alignment:
139 */
140 reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
141 }
142
143 /* Vertex program parameters from curbe:
144 */
145 if (c->vp->use_const_buffer) {
146 int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
147 int constant = 0;
148
149 /* We've got more constants than we can load with the push
150 * mechanism. This is often correlated with reladdr loads where
151 * we should probably be using a pull mechanism anyway to avoid
152 * excessive reading. However, the pull mechanism is slow in
153 * general. So, we try to allocate as many non-reladdr-loaded
154 * constants through the push buffer as we can before giving up.
155 */
156 memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
157 for (i = 0;
158 i < c->vp->program.Base.NumInstructions && constant < max_constant;
159 i++) {
160 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
161 int arg;
162
163 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
164 if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
165 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
166 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
167 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
168 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
169 inst->SrcReg[arg].RelAddr)
170 continue;
171
172 if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
173 c->constant_map[inst->SrcReg[arg].Index] = constant++;
174 }
175 }
176 }
177
   /* The constants that made it into the push window get two vec4s
    * packed per GRF, same layout as the non-const-buffer path below.
    */
178 for (i = 0; i < constant; i++) {
179 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
180 (i%2) * 4),
181 0, 4, 1);
182 }
183 reg += (constant + 1) / 2;
184 c->prog_data.curb_read_length = reg - 1;
185 /* XXX 0 causes a bug elsewhere... */
186 c->prog_data.nr_params = MAX2(constant * 4, 4);
187 }
188 else {
189 /* use a section of the GRF for constants */
190 GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
191 for (i = 0; i < nr_params; i++) {
192 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
193 }
194 reg += (nr_params + 1) / 2;
195 c->prog_data.curb_read_length = reg - 1;
196
197 c->prog_data.nr_params = nr_params * 4;
198 }
199
200 /* Allocate input regs:
201 */
202 c->nr_inputs = 0;
203 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
204 if (c->prog_data.inputs_read & (1 << i)) {
205 c->nr_inputs++;
206 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
207 reg++;
208 }
209 }
210 /* If there are no inputs, we'll still be reading one attribute's worth
211 * because it's required -- see urb_read_length setting.
212 */
213 if (c->nr_inputs == 0)
214 reg++;
215
216 /* Allocate outputs. The non-position outputs go straight into message regs.
217 */
218 c->nr_outputs = 0;
219 c->first_output = reg;
220 c->first_overflow_output = 0;
221
   /* NOTE(review): first MRF usable for output data; gen5 starts at m8,
    * everything else at m4 -- presumably the URB-write header layout
    * differs per generation.  Confirm against emit_vertex_write().
    */
222 if (intel->gen >= 6)
223 mrf = 4;
224 else if (intel->gen == 5)
225 mrf = 8;
226 else
227 mrf = 4;
228
229 for (i = 0; i < VERT_RESULT_MAX; i++) {
230 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
231 c->nr_outputs++;
232 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
233 if (i == VERT_RESULT_HPOS) {
234 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
235 reg++;
236 }
237 else if (i == VERT_RESULT_PSIZ) {
238 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
239 reg++;
240 mrf++; /* just a placeholder? XXX fix later stages & remove this */
241 }
242 else {
243 /* Two restrictions on our compute-to-MRF here. The
244 * message length for all SEND messages is restricted to
245 * [1,15], so we can't use mrf 15, as that means a length
246 * of 16.
247 *
248 * Additionally, URB writes are aligned to URB rows, so we
249 * need to put an even number of registers of URB data in
250 * each URB write so that the later write is aligned. A
251 * message length of 15 means 1 message header reg plus 14
252 * regs of URB data.
253 *
254 * For attributes beyond the compute-to-MRF, we compute to
255 * GRFs and they will be written in the second URB_WRITE.
256 */
257 if (mrf < 15) {
258 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
259 mrf++;
260 }
261 else {
262 if (!c->first_overflow_output)
263 c->first_overflow_output = i;
264 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
265 reg++;
266 }
267 }
268 }
269 }
270
271 /* Allocate program temporaries:
272 */
273 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
274 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
275 reg++;
276 }
277
278 /* Address reg(s). Don't try to use the internal address reg until
279 * deref time.
280 */
281 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
282 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
283 reg,
284 0,
285 BRW_REGISTER_TYPE_D,
286 BRW_VERTICAL_STRIDE_8,
287 BRW_WIDTH_8,
288 BRW_HORIZONTAL_STRIDE_1,
289 BRW_SWIZZLE_XXXX,
290 WRITEMASK_X);
291 reg++;
292 }
293
   /* Scratch registers used by get_constant()/get_reladdr_constant()
    * as write-back destinations for pull-constant reads, one per
    * possible source argument.
    */
294 if (c->vp->use_const_buffer) {
295 for (i = 0; i < 3; i++) {
296 c->current_const[i].index = -1;
297 c->current_const[i].reg = brw_vec8_grf(reg, 0);
298 reg++;
299 }
300 }
301
   /* Outputs that are also read as sources get a GRF shadow copy,
    * since MRFs cannot be read back.  NOTE(review): 128 looks like the
    * size of the output_regs array -- confirm against brw_vs.h.
    */
302 for (i = 0; i < 128; i++) {
303 if (c->output_regs[i].used_in_src) {
304 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
305 reg++;
306 }
307 }
308
309 if (c->needs_stack) {
310 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
311 reg += 2;
312 }
313
314 /* Some opcodes need an internal temporary:
315 */
316 c->first_tmp = reg;
317 c->last_tmp = reg; /* for allocation purposes */
318
319 /* Each input reg holds data from two vertices. The
320 * urb_read_length is the number of registers read from *each*
321 * vertex urb, so is half the amount:
322 */
323 c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
324 /* Setting this field to 0 leads to undefined behavior according to the
325 * the VS_STATE docs. Our VUEs will always have at least one attribute
326 * sitting in them, even if it's padding.
327 */
328 if (c->prog_data.urb_read_length == 0)
329 c->prog_data.urb_read_length = 1;
330
331 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
332 * them to fit the biggest thing they need to.
333 */
334 attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
335
336 /* See emit_vertex_write() for where the VUE's overhead on top of the
337 * attributes comes from.
338 */
339 if (intel->gen >= 6)
340 c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8;
341 else if (intel->gen == 5)
342 c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
343 else
344 c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
345
346 c->prog_data.total_grf = reg;
347
348 if (INTEL_DEBUG & DEBUG_VS) {
349 printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
350 printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
351 printf("%s reg = %d\n", __FUNCTION__, reg);
352 }
353 }
354
355
356 /**
357 * If an instruction uses a temp reg both as a src and the dest, we
358 * sometimes need to allocate an intermediate temporary.
359 */
360 static void unalias1( struct brw_vs_compile *c,
361 struct brw_reg dst,
362 struct brw_reg arg0,
363 void (*func)( struct brw_vs_compile *,
364 struct brw_reg,
365 struct brw_reg ))
366 {
367 if (dst.file == arg0.file && dst.nr == arg0.nr) {
368 struct brw_compile *p = &c->func;
369 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
370 func(c, tmp, arg0);
371 brw_MOV(p, dst, tmp);
372 release_tmp(c, tmp);
373 }
374 else {
375 func(c, dst, arg0);
376 }
377 }
378
379 /**
380 * \sa unalias2
381 * Checkes if 2-operand instruction needs an intermediate temporary.
382 */
383 static void unalias2( struct brw_vs_compile *c,
384 struct brw_reg dst,
385 struct brw_reg arg0,
386 struct brw_reg arg1,
387 void (*func)( struct brw_vs_compile *,
388 struct brw_reg,
389 struct brw_reg,
390 struct brw_reg ))
391 {
392 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
393 (dst.file == arg1.file && dst.nr == arg1.nr)) {
394 struct brw_compile *p = &c->func;
395 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
396 func(c, tmp, arg0, arg1);
397 brw_MOV(p, dst, tmp);
398 release_tmp(c, tmp);
399 }
400 else {
401 func(c, dst, arg0, arg1);
402 }
403 }
404
405 /**
406 * \sa unalias2
407 * Checkes if 3-operand instruction needs an intermediate temporary.
408 */
409 static void unalias3( struct brw_vs_compile *c,
410 struct brw_reg dst,
411 struct brw_reg arg0,
412 struct brw_reg arg1,
413 struct brw_reg arg2,
414 void (*func)( struct brw_vs_compile *,
415 struct brw_reg,
416 struct brw_reg,
417 struct brw_reg,
418 struct brw_reg ))
419 {
420 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
421 (dst.file == arg1.file && dst.nr == arg1.nr) ||
422 (dst.file == arg2.file && dst.nr == arg2.nr)) {
423 struct brw_compile *p = &c->func;
424 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
425 func(c, tmp, arg0, arg1, arg2);
426 brw_MOV(p, dst, tmp);
427 release_tmp(c, tmp);
428 }
429 else {
430 func(c, dst, arg0, arg1, arg2);
431 }
432 }
433
434 static void emit_sop( struct brw_vs_compile *c,
435 struct brw_reg dst,
436 struct brw_reg arg0,
437 struct brw_reg arg1,
438 GLuint cond)
439 {
440 struct brw_compile *p = &c->func;
441
   /* dst = (arg0 <cond> arg1) ? 1.0 : 0.0, per channel.
    * The CMP to the null register sets the flag, and the following MOV
    * of 1.0 lands only in the channels that passed; the final call
    * restores an all-ones flag value so later instructions are not
    * predicated.  NOTE(review): this relies on brw_CMP()/brw_MOV()
    * predication side effects implemented in brw_eu_emit.c -- confirm
    * there before reordering these statements.
    */
442 brw_MOV(p, dst, brw_imm_f(0.0f));
443 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
444 brw_MOV(p, dst, brw_imm_f(1.0f));
445 brw_set_predicate_control_flag_value(p, 0xff);
446 }
447
/* SEQ: per-channel dst = (arg0 == arg1) ? 1.0 : 0.0 */
448 static void emit_seq( struct brw_vs_compile *c,
449 struct brw_reg dst,
450 struct brw_reg arg0,
451 struct brw_reg arg1 )
452 {
453 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
454 }
455
/* SNE: per-channel dst = (arg0 != arg1) ? 1.0 : 0.0 */
456 static void emit_sne( struct brw_vs_compile *c,
457 struct brw_reg dst,
458 struct brw_reg arg0,
459 struct brw_reg arg1 )
460 {
461 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
462 }
/* SLT: per-channel dst = (arg0 < arg1) ? 1.0 : 0.0 */
463 static void emit_slt( struct brw_vs_compile *c,
464 struct brw_reg dst,
465 struct brw_reg arg0,
466 struct brw_reg arg1 )
467 {
468 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
469 }
470
/* SLE: per-channel dst = (arg0 <= arg1) ? 1.0 : 0.0 */
471 static void emit_sle( struct brw_vs_compile *c,
472 struct brw_reg dst,
473 struct brw_reg arg0,
474 struct brw_reg arg1 )
475 {
476 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
477 }
478
/* SGT: per-channel dst = (arg0 > arg1) ? 1.0 : 0.0 */
479 static void emit_sgt( struct brw_vs_compile *c,
480 struct brw_reg dst,
481 struct brw_reg arg0,
482 struct brw_reg arg1 )
483 {
484 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
485 }
486
/* SGE: per-channel dst = (arg0 >= arg1) ? 1.0 : 0.0 */
487 static void emit_sge( struct brw_vs_compile *c,
488 struct brw_reg dst,
489 struct brw_reg arg0,
490 struct brw_reg arg1 )
491 {
492 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
493 }
494
/* CMP: per-channel dst = (arg0 < 0) ? arg1 : arg2.
 * The CMP to null sets the flag; the SEL is predicated on it; the
 * final call clears predication for subsequent instructions.
 */
495 static void emit_cmp( struct brw_compile *p,
496 struct brw_reg dst,
497 struct brw_reg arg0,
498 struct brw_reg arg1,
499 struct brw_reg arg2 )
500 {
501 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
502 brw_SEL(p, dst, arg1, arg2);
503 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
504 }
505
/* SSG: per-channel dst = -1.0, 0.0 or 1.0 according to the sign of
 * arg0.  Start with 0, then overwrite the negative channels with -1
 * and the positive channels with +1 via predicated MOVs.
 */
506 static void emit_sign(struct brw_vs_compile *c,
507 struct brw_reg dst,
508 struct brw_reg arg0)
509 {
510 struct brw_compile *p = &c->func;
511
512 brw_MOV(p, dst, brw_imm_f(0));
513
514 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
515 brw_MOV(p, dst, brw_imm_f(-1.0));
516 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
517
518 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
519 brw_MOV(p, dst, brw_imm_f(1.0));
520 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
521 }
522
/* MAX: per-channel dst = max(arg0, arg1), via flag-predicated SEL. */
523 static void emit_max( struct brw_compile *p,
524 struct brw_reg dst,
525 struct brw_reg arg0,
526 struct brw_reg arg1 )
527 {
528 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
529 brw_SEL(p, dst, arg0, arg1);
530 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
531 }
532
/* MIN: per-channel dst = min(arg0, arg1), via flag-predicated SEL. */
533 static void emit_min( struct brw_compile *p,
534 struct brw_reg dst,
535 struct brw_reg arg0,
536 struct brw_reg arg1 )
537 {
538 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
539 brw_SEL(p, dst, arg0, arg1);
540 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
541 }
542
543
/* Emit a one-source mathbox operation (RCP/RSQ/EXP/LOG/...).  On
 * pre-gen6, SEND results cannot be safely writemasked or sent to
 * non-GRF destinations, so the result is staged through a temporary
 * and copied out with a MOV.
 */
544 static void emit_math1( struct brw_vs_compile *c,
545 GLuint function,
546 struct brw_reg dst,
547 struct brw_reg arg0,
548 GLuint precision)
549 {
550 /* There are various odd behaviours with SEND on the simulator. In
551 * addition there are documented issues with the fact that the GEN4
552 * processor doesn't do dependency control properly on SEND
553 * results. So, on balance, this kludge to get around failures
554 * with writemasked math results looks like it might be necessary
555 * whether that turns out to be a simulator bug or not:
556 */
557 struct brw_compile *p = &c->func;
558 struct intel_context *intel = &p->brw->intel;
559 struct brw_reg tmp = dst;
560 GLboolean need_tmp = (intel->gen < 6 &&
561 (dst.dw1.bits.writemask != 0xf ||
562 dst.file != BRW_GENERAL_REGISTER_FILE));
563
564 if (need_tmp)
565 tmp = get_tmp(c);
566
   /* NOTE(review): the literal 2 is presumably the message register
    * number for the math message payload -- confirm against brw_math()
    * in brw_eu_emit.c.
    */
567 brw_math(p,
568 tmp,
569 function,
570 BRW_MATH_SATURATE_NONE,
571 2,
572 arg0,
573 BRW_MATH_DATA_SCALAR,
574 precision);
575
576 if (need_tmp) {
577 brw_MOV(p, dst, tmp);
578 release_tmp(c, tmp);
579 }
580 }
581
582
/* Emit a two-source mathbox operation (e.g. POW).  The second operand
 * is placed in m3, the register following the m2 payload passed to
 * brw_math() below.  Same pre-gen6 writemask/destination staging as
 * emit_math1().
 */
583 static void emit_math2( struct brw_vs_compile *c,
584 GLuint function,
585 struct brw_reg dst,
586 struct brw_reg arg0,
587 struct brw_reg arg1,
588 GLuint precision)
589 {
590 struct brw_compile *p = &c->func;
591 struct intel_context *intel = &p->brw->intel;
592 struct brw_reg tmp = dst;
593 GLboolean need_tmp = (intel->gen < 6 &&
594 (dst.dw1.bits.writemask != 0xf ||
595 dst.file != BRW_GENERAL_REGISTER_FILE));
596
597 if (need_tmp)
598 tmp = get_tmp(c);
599
   /* Load the second operand into the message payload before the SEND. */
600 brw_MOV(p, brw_message_reg(3), arg1);
601
602 brw_math(p,
603 tmp,
604 function,
605 BRW_MATH_SATURATE_NONE,
606 2,
607 arg0,
608 BRW_MATH_DATA_SCALAR,
609 precision);
610
611 if (need_tmp) {
612 brw_MOV(p, dst, tmp);
613 release_tmp(c, tmp);
614 }
615 }
616
617
/* EXP opcode (the four-result ARB_vertex_program EXP, not a plain
 * exponential): result = { 2^floor(x), frac(x), 2^x, 1.0 }, with each
 * channel emitted only if enabled in dst's writemask.  Caller must
 * guarantee dst does not alias arg0 ("noalias").
 */
618 static void emit_exp_noalias( struct brw_vs_compile *c,
619 struct brw_reg dst,
620 struct brw_reg arg0 )
621 {
622 struct brw_compile *p = &c->func;
623
624
625 if (dst.dw1.bits.writemask & WRITEMASK_X) {
626 struct brw_reg tmp = get_tmp(c);
627 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
628
629 /* tmp_d = floor(arg0.x) */
630 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
631
632 /* result[0] = 2.0 ^ tmp */
633
634 /* Adjust exponent for floating point:
635 * exp += 127
636 */
637 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
638
639 /* Install exponent and sign.
640 * Excess drops off the edge:
641 */
       /* Shifting the biased exponent into bits 30:23 constructs the
        * IEEE-754 float 2^floor(x) directly in dst.x.
        */
642 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
643 tmp_d, brw_imm_d(23));
644
645 release_tmp(c, tmp);
646 }
647
648 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
649 /* result[1] = arg0.x - floor(arg0.x) */
650 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
651 }
652
653 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
654 /* As with the LOG instruction, we might be better off just
655 * doing a taylor expansion here, seeing as we have to do all
656 * the prep work.
657 *
658 * If mathbox partial precision is too low, consider also:
659 * result[3] = result[0] * EXP(result[1])
660 */
661 emit_math1(c,
662 BRW_MATH_FUNCTION_EXP,
663 brw_writemask(dst, WRITEMASK_Z),
664 brw_swizzle1(arg0, 0),
665 BRW_MATH_PRECISION_FULL);
666 }
667
668 if (dst.dw1.bits.writemask & WRITEMASK_W) {
669 /* result[3] = 1.0; */
670 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
671 }
672 }
673
674
/* LOG opcode (four-result ARB_vertex_program LOG):
 * result = { exponent(|x|), mantissa(|x|), log2(|x|), 1.0 }, emitted
 * per enabled writemask channel.  Works on the raw IEEE-754 bits of
 * arg0 reinterpreted as unsigned.  Caller must guarantee dst does not
 * alias arg0 ("noalias"); a temporary is used when dst is writemasked
 * or not a GRF because Y/Z intermediate channels are read back.
 */
675 static void emit_log_noalias( struct brw_vs_compile *c,
676 struct brw_reg dst,
677 struct brw_reg arg0 )
678 {
679 struct brw_compile *p = &c->func;
680 struct brw_reg tmp = dst;
681 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
682 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
683 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
684 dst.file != BRW_GENERAL_REGISTER_FILE);
685
686 if (need_tmp) {
687 tmp = get_tmp(c);
688 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
689 }
690
691 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
692 * according to spec:
693 *
694 * These almost look likey they could be joined up, but not really
695 * practical:
696 *
697 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
698 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
699 */
   /* X (and the Z path that consumes it): unbiased exponent.
    * Mask off the sign bit, shift the exponent field down, subtract
    * the IEEE-754 single-precision bias.
    */
700 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
701 brw_AND(p,
702 brw_writemask(tmp_ud, WRITEMASK_X),
703 brw_swizzle1(arg0_ud, 0),
704 brw_imm_ud((1U<<31)-1));
705
706 brw_SHR(p,
707 brw_writemask(tmp_ud, WRITEMASK_X),
708 tmp_ud,
709 brw_imm_ud(23));
710
711 brw_ADD(p,
712 brw_writemask(tmp, WRITEMASK_X),
713 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
714 brw_imm_d(-127));
715 }
716
   /* Y (and Z): mantissa in [1,2) -- keep the fraction bits and OR in
    * a zero (biased-127) exponent.
    */
717 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
718 brw_AND(p,
719 brw_writemask(tmp_ud, WRITEMASK_Y),
720 brw_swizzle1(arg0_ud, 0),
721 brw_imm_ud((1<<23)-1));
722
723 brw_OR(p,
724 brw_writemask(tmp_ud, WRITEMASK_Y),
725 tmp_ud,
726 brw_imm_ud(127<<23));
727 }
728
729 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
730 /* result[2] = result[0] + LOG2(result[1]); */
731
732 /* Why bother? The above is just a hint how to do this with a
733 * taylor series. Maybe we *should* use a taylor series as by
734 * the time all the above has been done it's almost certainly
735 * quicker than calling the mathbox, even with low precision.
736 *
737 * Options are:
738 * - result[0] + mathbox.LOG2(result[1])
739 * - mathbox.LOG2(arg0.x)
740 * - result[0] + inline_taylor_approx(result[1])
741 */
742 emit_math1(c,
743 BRW_MATH_FUNCTION_LOG,
744 brw_writemask(tmp, WRITEMASK_Z),
745 brw_swizzle1(tmp, 1),
746 BRW_MATH_PRECISION_FULL);
747
748 brw_ADD(p,
749 brw_writemask(tmp, WRITEMASK_Z),
750 brw_swizzle1(tmp, 2),
751 brw_swizzle1(tmp, 0));
752 }
753
754 if (dst.dw1.bits.writemask & WRITEMASK_W) {
755 /* result[3] = 1.0; */
756 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
757 }
758
759 if (need_tmp) {
760 brw_MOV(p, dst, tmp);
761 release_tmp(c, tmp);
762 }
763 }
764
765
766 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
767 */
/* DST opcode: result = { 1.0, arg0.y*arg1.y, arg0.z, arg1.w }, each
 * channel emitted only if enabled in dst's writemask.  Caller must
 * guarantee dst aliases neither source ("noalias").
 */
768 static void emit_dst_noalias( struct brw_vs_compile *c,
769 struct brw_reg dst,
770 struct brw_reg arg0,
771 struct brw_reg arg1)
772 {
773 struct brw_compile *p = &c->func;
774
775 /* There must be a better way to do this:
776 */
777 if (dst.dw1.bits.writemask & WRITEMASK_X)
778 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
779 if (dst.dw1.bits.writemask & WRITEMASK_Y)
780 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
781 if (dst.dw1.bits.writemask & WRITEMASK_Z)
782 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
783 if (dst.dw1.bits.writemask & WRITEMASK_W)
784 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
785 }
786
787
/* XPD: cross product dst.xyz = t x u, computed as
 * t.yzx*u.zxy - t.zxy*u.yzx using the accumulator: the MUL to null
 * leaves its product in the accumulator and MAC adds the (negated)
 * second product to it while writing dst.
 */
788 static void emit_xpd( struct brw_compile *p,
789 struct brw_reg dst,
790 struct brw_reg t,
791 struct brw_reg u)
792 {
793 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
794 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
795 }
796
797
/* LIT opcode: lighting coefficients
 * result = { 1, max(arg0.x,0), (arg0.x>0 ? max(arg0.y,0)^arg0.w : 0), 1 }.
 * Caller must guarantee dst does not alias arg0 ("noalias").
 */
798 static void emit_lit_noalias( struct brw_vs_compile *c,
799 struct brw_reg dst,
800 struct brw_reg arg0 )
801 {
802 struct brw_compile *p = &c->func;
803 struct brw_instruction *if_insn;
804 struct brw_reg tmp = dst;
805 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
806
807 if (need_tmp)
808 tmp = get_tmp(c);
809
   /* Defaults: y and z are 0 unless overwritten inside the IF below;
    * x and w are always 1.
    */
810 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
811 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
812
813 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
814 * to get all channels active inside the IF. In the clipping code
815 * we run with NoMask, so it's not an option and we can use
816 * BRW_EXECUTE_1 for all comparisions.
817 */
818 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
819 if_insn = brw_IF(p, BRW_EXECUTE_8);
820 {
821 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
822
       /* Clamp arg0.y to >= 0 in tmp.z (predicated copy of the positive
        * channels only), then raise it to the arg0.w power.
        */
823 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
824 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
825 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
826
827 emit_math2(c,
828 BRW_MATH_FUNCTION_POW,
829 brw_writemask(dst, WRITEMASK_Z),
830 brw_swizzle1(tmp, 2),
831 brw_swizzle1(arg0, 3),
832 BRW_MATH_PRECISION_PARTIAL);
833 }
834
835 brw_ENDIF(p, if_insn);
836
   /* NOTE(review): release_tmp() is called even when !need_tmp, i.e.
    * tmp == dst; it is a no-op unless dst.nr happens to equal the top
    * of the temp stack -- confirm that can't occur.
    */
837 release_tmp(c, tmp);
838 }
839
/* LRP: dst = arg0 * arg1 + (1 - arg0) * arg2.
 * Computed as dst = 1 - arg0, accumulator = dst * arg2 (MUL to null
 * leaves the product in the accumulator), then MAC folds in
 * arg0 * arg1.  Caller must guarantee dst aliases no source.
 */
840 static void emit_lrp_noalias(struct brw_vs_compile *c,
841 struct brw_reg dst,
842 struct brw_reg arg0,
843 struct brw_reg arg1,
844 struct brw_reg arg2)
845 {
846 struct brw_compile *p = &c->func;
847
848 brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
849 brw_MUL(p, brw_null_reg(), dst, arg2);
850 brw_MAC(p, dst, arg0, arg1);
851 }
852
853 /** 3 or 4-component vector normalization */
854 static void emit_nrm( struct brw_vs_compile *c,
855 struct brw_reg dst,
856 struct brw_reg arg0,
857 int num_comps)
858 {
859 struct brw_compile *p = &c->func;
860 struct brw_reg tmp = get_tmp(c);
861
862 /* tmp = dot(arg0, arg0) */
863 if (num_comps == 3)
864 brw_DP3(p, tmp, arg0, arg0);
865 else
866 brw_DP4(p, tmp, arg0, arg0);
867
868 /* tmp = 1 / sqrt(tmp) */
869 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
870
871 /* dst = arg0 * tmp */
872 brw_MUL(p, dst, arg0, tmp);
873
874 release_tmp(c, tmp);
875 }
876
877
/* Fetch a non-reladdr constant from the pull constant buffer into the
 * per-argument scratch register allocated in brw_vs_alloc_regs(),
 * skipping the read when that slot already holds the wanted constant.
 * Returns the scratch register with the vec4 replicated to both
 * halves.
 */
878 static struct brw_reg
879 get_constant(struct brw_vs_compile *c,
880 const struct prog_instruction *inst,
881 GLuint argIndex)
882 {
883 const struct prog_src_register *src = &inst->SrcReg[argIndex];
884 struct brw_compile *p = &c->func;
885 struct brw_reg const_reg = c->current_const[argIndex].reg;
886
887 assert(argIndex < 3);
888
889 if (c->current_const[argIndex].index != src->Index) {
890 /* Keep track of the last constant loaded in this slot, for reuse. */
891 c->current_const[argIndex].index = src->Index;
892
893 #if 0
894 printf(" fetch const[%d] for arg %d into reg %d\n",
895 src->Index, argIndex, c->current_const[argIndex].reg.nr);
896 #endif
897 /* need to fetch the constant now */
898 brw_dp_READ_4_vs(p,
899 const_reg, /* writeback dest */
900 16 * src->Index, /* byte offset */
901 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
902 );
903 }
904
905 /* replicate lower four floats into upper half (to get XYZWXYZW) */
906 const_reg = stride(const_reg, 0, 4, 0);
907 const_reg.subnr = 0;
908
909 return const_reg;
910 }
911
/* Fetch a relative-addressed constant (const[a0.x + Index]) from the
 * pull constant buffer into the per-argument scratch register.  The
 * address register value is scaled to a byte offset first.  Unlike
 * get_constant(), the cached index is invalidated since the effective
 * address depends on runtime a0 contents.
 */
912 static struct brw_reg
913 get_reladdr_constant(struct brw_vs_compile *c,
914 const struct prog_instruction *inst,
915 GLuint argIndex)
916 {
917 const struct prog_src_register *src = &inst->SrcReg[argIndex];
918 struct brw_compile *p = &c->func;
919 struct brw_reg const_reg = c->current_const[argIndex].reg;
920 struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
921 struct brw_reg byte_addr_reg = get_tmp(c);
922
923 assert(argIndex < 3);
924
925 /* Can't reuse a reladdr constant load. */
926 c->current_const[argIndex].index = -1;
927
928 #if 0
929 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
930 src->Index, argIndex, c->current_const[argIndex].reg.nr);
931 #endif
932
   /* a0 holds a vec4 element index; scale to bytes (16 bytes/vec4). */
933 brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));
934
935 /* fetch the first vec4 */
936 brw_dp_READ_4_vs_relative(p,
937 const_reg, /* writeback dest */
938 byte_addr_reg, /* address register */
939 16 * src->Index, /* byte offset */
940 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
941 );
942
   /* NOTE(review): byte_addr_reg is never passed to release_tmp(), so
    * the temporary stays allocated until release_tmps() -- confirm
    * whether releasing it here after the send is emitted would be safe.
    */
943 return const_reg;
944 }
945
946
947
948 /* TODO: relative addressing!
949 */
950 static struct brw_reg get_reg( struct brw_vs_compile *c,
951 gl_register_file file,
952 GLuint index )
953 {
954 switch (file) {
955 case PROGRAM_TEMPORARY:
956 case PROGRAM_INPUT:
957 case PROGRAM_OUTPUT:
958 assert(c->regs[file][index].nr != 0);
959 return c->regs[file][index];
960 case PROGRAM_STATE_VAR:
961 case PROGRAM_CONSTANT:
962 case PROGRAM_UNIFORM:
963 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
964 return c->regs[PROGRAM_STATE_VAR][index];
965 case PROGRAM_ADDRESS:
966 assert(index == 0);
967 return c->regs[file][index];
968
969 case PROGRAM_UNDEFINED: /* undef values */
970 return brw_null_reg();
971
972 case PROGRAM_LOCAL_PARAM:
973 case PROGRAM_ENV_PARAM:
974 case PROGRAM_WRITE_ONLY:
975 default:
976 assert(0);
977 return brw_null_reg();
978 }
979 }
980
981
982 /**
983 * Indirect addressing: get reg[[arg] + offset].
984 */
985 static struct brw_reg deref( struct brw_vs_compile *c,
986 struct brw_reg arg,
987 GLint offset,
988 GLuint reg_size )
989 {
990 struct brw_compile *p = &c->func;
991 struct brw_reg tmp = get_tmp(c);
992 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
993 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
994 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
995 struct brw_reg indirect = brw_vec4_indirect(0,0);
996 struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
997
998 /* Set the vertical stride on the register access so that the first
999 * 4 components come from a0.0 and the second 4 from a0.1.
1000 */
1001 indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
1002
1003 {
1004 brw_push_insn_state(p);
1005 brw_set_access_mode(p, BRW_ALIGN_1);
1006
    /* a0.0 = base byte offset + first vertex's index * reg_size */
1007 brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1008 brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1009
    /* a0.1 = same for the second vertex (address component 4) */
1010 brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1011 brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));
1012
    /* Gather both vertices' vec4s through the indirect access. */
1013 brw_MOV(p, tmp, indirect);
1014
1015 brw_pop_insn_state(p);
1016 }
1017
1018 /* NOTE: tmp not released */
   /* NOTE(review): 'acc' (a second get_tmp) is also never released; it
    * stays allocated until release_tmps() -- confirm this is intended.
    */
1019 return tmp;
1020 }
1021
/* Store 'val' to a relative-addressed destination register
 * (DstReg.File[DstReg.Index + a0]).  The write is done in two halves
 * through the a0.0 indirect: low 16 bytes for the first vertex's
 * address, high 16 bytes (suboffset 4 of both address and value) for
 * the second.
 */
1022 static void
1023 move_to_reladdr_dst(struct brw_vs_compile *c,
1024 const struct prog_instruction *inst,
1025 struct brw_reg val)
1026 {
1027 struct brw_compile *p = &c->func;
1028 int reg_size = 32;
1029 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1030 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1031 struct brw_reg temp_base = c->regs[inst->DstReg.File][0];
1032 GLuint byte_offset = temp_base.nr * 32 + temp_base.subnr;
1033 struct brw_reg indirect = brw_vec4_indirect(0,0);
1034 struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1035
1036 byte_offset += inst->DstReg.Index * reg_size;
1037
1038 brw_push_insn_state(p);
1039 brw_set_access_mode(p, BRW_ALIGN_1);
1040
   /* First vertex: a0.0 = byte_offset + index * reg_size */
1041 brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1042 brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1043 brw_MOV(p, indirect, val);
1044
   /* Second vertex lives in the upper half of the register pair. */
1045 brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1046 brw_ADD(p, brw_address_reg(0), acc,
1047 brw_imm_uw(byte_offset + reg_size / 2));
1048 brw_MOV(p, indirect, suboffset(val, 4));
1049
   /* NOTE(review): 'acc' is never released; it stays allocated until
    * release_tmps() -- confirm this is intended.
    */
1050 brw_pop_insn_state(p);
1051 }
1052
1053 /**
1054 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1055 * TODO: relative addressing!
1056 */
1057 static struct brw_reg
1058 get_src_reg( struct brw_vs_compile *c,
1059 const struct prog_instruction *inst,
1060 GLuint argIndex )
1061 {
1062 const GLuint file = inst->SrcReg[argIndex].File;
1063 const GLint index = inst->SrcReg[argIndex].Index;
1064 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
1065
   /* Fast path: certain sources (0.0, +/-1.0 swizzles, or a uniformly
    * swizzled PROGRAM_CONSTANT whose value is known at compile time)
    * can be turned into inline immediate floats, skipping constant
    * register allocation entirely.
    */
1066 if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
1067 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1068
1069 if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
1070 SWIZZLE_ZERO,
1071 SWIZZLE_ZERO,
1072 SWIZZLE_ZERO)) {
1073 return brw_imm_f(0.0f);
1074 } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
1075 SWIZZLE_ONE,
1076 SWIZZLE_ONE,
1077 SWIZZLE_ONE)) {
1078 if (src->Negate)
1079 return brw_imm_f(-1.0F);
1080 else
1081 return brw_imm_f(1.0F);
1082 } else if (src->File == PROGRAM_CONSTANT) {
1083 const struct gl_program_parameter_list *params;
1084 float f;
1085 int component = -1;
1086
       /* Only a uniform .xxxx/.yyyy/.zzzz/.wwww swizzle maps to a
        * single scalar we can inline; otherwise component stays -1
        * and we fall through to normal register lookup.
        */
1087 switch (src->Swizzle) {
1088 case SWIZZLE_XXXX:
1089 component = 0;
1090 break;
1091 case SWIZZLE_YYYY:
1092 component = 1;
1093 break;
1094 case SWIZZLE_ZZZZ:
1095 component = 2;
1096 break;
1097 case SWIZZLE_WWWW:
1098 component = 3;
1099 break;
1100 }
1101
1102 if (component >= 0) {
1103 params = c->vp->program.Base.Parameters;
1104 f = params->ParameterValues[src->Index][component];
1105
        /* Fold Abs/Negate modifiers into the immediate itself. */
1106 if (src->Abs)
1107 f = fabs(f);
1108 if (src->Negate)
1109 f = -f;
1110 return brw_imm_f(f);
1111 }
1112 }
1113 }
1114
1115 switch (file) {
1116 case PROGRAM_TEMPORARY:
1117 case PROGRAM_INPUT:
1118 case PROGRAM_OUTPUT:
1119 if (relAddr) {
1120 return deref(c, c->regs[file][0], index, 32);
1121 }
1122 else {
1123 assert(c->regs[file][index].nr != 0);
1124 return c->regs[file][index];
1125 }
1126
1127 case PROGRAM_STATE_VAR:
1128 case PROGRAM_CONSTANT:
1129 case PROGRAM_UNIFORM:
1130 case PROGRAM_ENV_PARAM:
1131 case PROGRAM_LOCAL_PARAM:
    /* Pull-constant mode: use the push-mapped register when this
     * constant made it into the push window, otherwise issue a
     * dataport read (relative or absolute).
     */
1132 if (c->vp->use_const_buffer) {
1133 if (!relAddr && c->constant_map[index] != -1) {
1134 assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
1135 return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
1136 } else if (relAddr)
1137 return get_reladdr_constant(c, inst, argIndex);
1138 else
1139 return get_constant(c, inst, argIndex);
1140 }
1141 else if (relAddr) {
     /* reg_size 16: push constants pack two vec4s per GRF. */
1142 return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
1143 }
1144 else {
1145 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1146 return c->regs[PROGRAM_STATE_VAR][index];
1147 }
1148 case PROGRAM_ADDRESS:
1149 assert(index == 0);
1150 return c->regs[file][index];
1151
1152 case PROGRAM_UNDEFINED:
1153 /* this is a normal case since we loop over all three src args */
1154 return brw_null_reg();
1155
1156 case PROGRAM_WRITE_ONLY:
1157 default:
1158 assert(0);
1159 return brw_null_reg();
1160 }
1161 }
1162
1163 /**
1164 * Return the brw reg for the given instruction's src argument.
1165 * Will return mangled results for SWZ op. The emit_swz() function
1166 * ignores this result and recalculates taking extended swizzles into
1167 * account.
1168 */
1169 static struct brw_reg get_arg( struct brw_vs_compile *c,
1170 const struct prog_instruction *inst,
1171 GLuint argIndex )
1172 {
1173 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1174 struct brw_reg reg;
1175
1176 if (src->File == PROGRAM_UNDEFINED)
1177 return brw_null_reg();
1178
1179 reg = get_src_reg(c, inst, argIndex);
1180
1181 /* Convert 3-bit swizzle to 2-bit.
1182 */
1183 if (reg.file != BRW_IMMEDIATE_VALUE) {
1184 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1185 GET_SWZ(src->Swizzle, 1),
1186 GET_SWZ(src->Swizzle, 2),
1187 GET_SWZ(src->Swizzle, 3));
1188 }
1189
1190 /* Note this is ok for non-swizzle instructions:
1191 */
1192 reg.negate = src->Negate ? 1 : 0;
1193
1194 return reg;
1195 }
1196
1197
1198 /**
1199 * Get brw register for the given program dest register.
1200 */
1201 static struct brw_reg get_dst( struct brw_vs_compile *c,
1202 struct prog_dst_register dst )
1203 {
1204 struct brw_reg reg;
1205
1206 switch (dst.File) {
1207 case PROGRAM_TEMPORARY:
1208 case PROGRAM_OUTPUT:
1209 /* register-indirect addressing is only 1x1, not VxH, for
1210 * destination regs. So, for RelAddr we'll return a temporary
1211 * for the dest and do a move of the result to the RelAddr
1212 * register after the instruction emit.
1213 */
1214 if (dst.RelAddr) {
1215 reg = get_tmp(c);
1216 } else {
1217 assert(c->regs[dst.File][dst.Index].nr != 0);
1218 reg = c->regs[dst.File][dst.Index];
1219 }
1220 break;
1221 case PROGRAM_ADDRESS:
1222 assert(dst.Index == 0);
1223 reg = c->regs[dst.File][dst.Index];
1224 break;
1225 case PROGRAM_UNDEFINED:
1226 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1227 reg = brw_null_reg();
1228 break;
1229 default:
1230 assert(0);
1231 reg = brw_null_reg();
1232 }
1233
1234 assert(reg.type != BRW_IMMEDIATE_VALUE);
1235 reg.dw1.bits.writemask = dst.WriteMask;
1236
1237 return reg;
1238 }
1239
1240
/**
 * Emit code for OPCODE_SWZ, which supports the extended swizzle values
 * SWIZZLE_ZERO and SWIZZLE_ONE in addition to plain component selects,
 * plus per-channel negation.
 *
 * The channels enabled in dst's writemask are partitioned into three
 * groups (real source components, constant zeros, constant ones) and
 * each group is written with a separate MOV.
 */
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* A temporary is needed when negation applies and dst is not a GRF
    * (e.g. a message register), since negate(tmp) reads tmp back.
    */
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written channel by its swizzle kind. */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is a per-channel bitmask here, reused as the writemask
    * so only the negated channels are flipped in place.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1309
1310
1311 /**
1312 * Post-vertex-program processing. Send the results to the URB.
1313 */
1314 static void emit_vertex_write( struct brw_vs_compile *c)
1315 {
1316 struct brw_compile *p = &c->func;
1317 struct brw_context *brw = p->brw;
1318 struct intel_context *intel = &brw->intel;
1319 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1320 struct brw_reg ndc;
1321 int eot;
1322 GLuint len_vertex_header = 2;
1323
1324 if (c->key.copy_edgeflag) {
1325 brw_MOV(p,
1326 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1327 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1328 }
1329
1330 if (intel->gen < 6) {
1331 /* Build ndc coords */
1332 ndc = get_tmp(c);
1333 /* ndc = 1.0 / pos.w */
1334 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1335 /* ndc.xyz = pos * ndc */
1336 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1337 }
1338
1339 /* Update the header for point size, user clipping flags, and -ve rhw
1340 * workaround.
1341 */
1342 if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1343 c->key.nr_userclip || brw->has_negative_rhw_bug)
1344 {
1345 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1346 GLuint i;
1347
1348 brw_MOV(p, header1, brw_imm_ud(0));
1349
1350 brw_set_access_mode(p, BRW_ALIGN_16);
1351
1352 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1353 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1354 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1355 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1356 }
1357
1358 for (i = 0; i < c->key.nr_userclip; i++) {
1359 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1360 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1361 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1362 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1363 }
1364
1365 /* i965 clipping workaround:
1366 * 1) Test for -ve rhw
1367 * 2) If set,
1368 * set ndc = (0,0,0,0)
1369 * set ucp[6] = 1
1370 *
1371 * Later, clipping will detect ucp[6] and ensure the primitive is
1372 * clipped against all fixed planes.
1373 */
1374 if (brw->has_negative_rhw_bug) {
1375 brw_CMP(p,
1376 vec8(brw_null_reg()),
1377 BRW_CONDITIONAL_L,
1378 brw_swizzle1(ndc, 3),
1379 brw_imm_f(0));
1380
1381 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1382 brw_MOV(p, ndc, brw_imm_f(0));
1383 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1384 }
1385
1386 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1387 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1388 brw_set_access_mode(p, BRW_ALIGN_16);
1389
1390 release_tmp(c, header1);
1391 }
1392 else {
1393 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1394 }
1395
1396 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1397 * of zeros followed by two sets of NDC coordinates:
1398 */
1399 brw_set_access_mode(p, BRW_ALIGN_1);
1400 brw_set_acc_write_control(p, 0);
1401
1402 /* The VUE layout is documented in Volume 2a. */
1403 if (intel->gen >= 6) {
1404 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1405 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1406 * dword 4-7 (m2) is the 4D space position
1407 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1408 * enabled. We don't use it, so skip it.
1409 * m3 is the first vertex element data we fill, which is the vertex
1410 * position.
1411 */
1412 brw_MOV(p, brw_message_reg(2), pos);
1413 brw_MOV(p, brw_message_reg(3), pos);
1414 len_vertex_header = 2;
1415 } else if (intel->gen == 5) {
1416 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1417 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1418 * dword 4-7 (m2) is the ndc position (set above)
1419 * dword 8-11 (m3) of the vertex header is the 4D space position
1420 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1421 * m6 is a pad so that the vertex element data is aligned
1422 * m7 is the first vertex data we fill, which is the vertex position.
1423 */
1424 brw_MOV(p, brw_message_reg(2), ndc);
1425 brw_MOV(p, brw_message_reg(3), pos);
1426 brw_MOV(p, brw_message_reg(7), pos);
1427 len_vertex_header = 6;
1428 } else {
1429 /* There are 8 dwords in VUE header pre-Ironlake:
1430 * dword 0-3 (m1) is indices, point width, clip flags.
1431 * dword 4-7 (m2) is ndc position (set above)
1432 *
1433 * dword 8-11 (m3) is the first vertex data, which we always have be the
1434 * vertex position.
1435 */
1436 brw_MOV(p, brw_message_reg(2), ndc);
1437 brw_MOV(p, brw_message_reg(3), pos);
1438 len_vertex_header = 2;
1439 }
1440
1441 eot = (c->first_overflow_output == 0);
1442
1443 brw_urb_WRITE(p,
1444 brw_null_reg(), /* dest */
1445 0, /* starting mrf reg nr */
1446 c->r0, /* src */
1447 0, /* allocate */
1448 1, /* used */
1449 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
1450 0, /* response len */
1451 eot, /* eot */
1452 eot, /* writes complete */
1453 0, /* urb destination offset */
1454 BRW_URB_SWIZZLE_INTERLEAVE);
1455
1456 if (c->first_overflow_output > 0) {
1457 /* Not all of the vertex outputs/results fit into the MRF.
1458 * Move the overflowed attributes from the GRF to the MRF and
1459 * issue another brw_urb_WRITE().
1460 */
1461 GLuint i, mrf = 1;
1462 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1463 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
1464 /* move from GRF to MRF */
1465 brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
1466 mrf++;
1467 }
1468 }
1469
1470 brw_urb_WRITE(p,
1471 brw_null_reg(), /* dest */
1472 0, /* starting mrf reg nr */
1473 c->r0, /* src */
1474 0, /* allocate */
1475 1, /* used */
1476 mrf, /* msg len */
1477 0, /* response len */
1478 1, /* eot */
1479 1, /* writes complete */
1480 14 / 2, /* urb destination offset */
1481 BRW_URB_SWIZZLE_INTERLEAVE);
1482 }
1483 }
1484
1485 static GLboolean
1486 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1487 {
1488 struct brw_compile *p = &c->func;
1489 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1490
1491 if (p->nr_insn == 0)
1492 return GL_FALSE;
1493
1494 if (val.address_mode != BRW_ADDRESS_DIRECT)
1495 return GL_FALSE;
1496
1497 switch (prev_insn->header.opcode) {
1498 case BRW_OPCODE_MOV:
1499 case BRW_OPCODE_MAC:
1500 case BRW_OPCODE_MUL:
1501 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1502 prev_insn->header.execution_size == val.width &&
1503 prev_insn->bits1.da1.dest_reg_file == val.file &&
1504 prev_insn->bits1.da1.dest_reg_type == val.type &&
1505 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1506 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1507 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1508 prev_insn->bits1.da16.dest_writemask == 0xf)
1509 return GL_TRUE;
1510 else
1511 return GL_FALSE;
1512 default:
1513 return GL_FALSE;
1514 }
1515 }
1516
1517 static uint32_t
1518 get_predicate(const struct prog_instruction *inst)
1519 {
1520 if (inst->DstReg.CondMask == COND_TR)
1521 return BRW_PREDICATE_NONE;
1522
1523 /* All of GLSL only produces predicates for COND_NE and one channel per
1524 * vector. Fail badly if someone starts doing something else, as it might
1525 * mean infinite looping or something.
1526 *
1527 * We'd like to support all the condition codes, but our hardware doesn't
1528 * quite match the Mesa IR, which is modeled after the NV extensions. For
1529 * those, the instruction may update the condition codes or not, then any
1530 * later instruction may use one of those condition codes. For gen4, the
1531 * instruction may update the flags register based on one of the condition
1532 * codes output by the instruction, and then further instructions may
1533 * predicate on that. We can probably support this, but it won't
1534 * necessarily be easy.
1535 */
1536 assert(inst->DstReg.CondMask == COND_NE);
1537
1538 switch (inst->DstReg.CondSwizzle) {
1539 case SWIZZLE_XXXX:
1540 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1541 case SWIZZLE_YYYY:
1542 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1543 case SWIZZLE_ZZZZ:
1544 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1545 case SWIZZLE_WWWW:
1546 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1547 default:
1548 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1549 inst->DstReg.CondMask);
1550 return BRW_PREDICATE_NORMAL;
1551 }
1552 }
1553
/* Emit the vertex program instructions here.
 *
 * Three phases: (1) pre-scan the Mesa IR to mark outputs read as sources
 * and to detect CAL/RET (which need a call stack), (2) statically allocate
 * hardware registers, (3) translate each Mesa instruction to gen4 EU code,
 * patching up IF/ELSE/ENDIF and loop BREAK/CONT jump targets as we go.
 */
void brw_vs_emit(struct brw_vs_compile *c )
{
#define MAX_IF_DEPTH 32
#define MAX_LOOP_DEPTH 32
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
   GLuint insn, if_depth = 0, loop_depth = 0;
   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
   /* Number of open IFs inside the current loop level, needed for the
    * pop_count on BREAK/CONT instructions.
    */
   int if_depth_in_loop[MAX_LOOP_DEPTH];
   const struct brw_indirect stack_index = brw_indirect(0, 0);
   GLuint index;
   GLuint file;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("vs-mesa:\n");
      _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
                               GL_TRUE);
      printf("\n");
   }

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_access_mode(p, BRW_ALIGN_16);
   if_depth_in_loop[loop_depth] = 0;

   brw_set_acc_write_control(p, 1);

   /* Phase 1: pre-scan for output-as-source usage and subroutine calls. */
   for (insn = 0; insn < nr_insns; insn++) {
       GLuint i;
       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];

       /* Message registers can't be read, so copy the output into GRF
	* register if they are used in source registers
	*/
       for (i = 0; i < 3; i++) {
	   struct prog_src_register *src = &inst->SrcReg[i];
	   GLuint index = src->Index;
	   GLuint file = src->File;
	   if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
	       c->output_regs[index].used_in_src = GL_TRUE;
       }

       switch (inst->Opcode) {
       case OPCODE_CAL:
       case OPCODE_RET:
	   c->needs_stack = GL_TRUE;
	   break;
       default:
	   break;
       }
   }

   /* Static register allocation
    */
   brw_vs_alloc_regs(c);

   if (c->needs_stack)
      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   /* Phase 3: translate each Mesa IR instruction. */
   for (insn = 0; insn < nr_insns; insn++) {

      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
      struct brw_reg args[3], dst;
      GLuint i;
      struct brw_instruction *temp;

#if 0
      printf("%d: ", insn);
      _mesa_print_instruction(inst);
#endif

      /* Get argument regs.  SWZ is special and does this itself.
       */
      if (inst->Opcode != OPCODE_SWZ)
	  for (i = 0; i < 3; i++) {
	      const struct prog_src_register *src = &inst->SrcReg[i];
	      index = src->Index;
	      file = src->File;
	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
		  args[i] = c->output_regs[index].reg;
	      else
                  args[i] = get_arg(c, inst, i);
	  }

      /* Get dest regs.  Note that it is possible for a reg to be both
       * dst and arg, given the static allocation of registers.  So
       * care needs to be taken emitting multi-operation instructions.
       */
      index = inst->DstReg.Index;
      file = inst->DstReg.File;
      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
	  dst = c->output_regs[index].reg;
      else
	  dst = get_dst(c, inst->DstReg);

      if (inst->SaturateMode != SATURATE_OFF) {
	 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
                       inst->SaturateMode);
      }

      switch (inst->Opcode) {
      case OPCODE_ABS:
	 brw_MOV(p, dst, brw_abs(args[0]));
	 break;
      case OPCODE_ADD:
	 brw_ADD(p, dst, args[0], args[1]);
	 break;
      case OPCODE_COS:
	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_DP2:
	 brw_DP2(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP3:
	 brw_DP3(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP4:
	 brw_DP4(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DPH:
	 brw_DPH(p, dst, args[0], args[1]);
	 break;
      case OPCODE_NRM3:
	 emit_nrm(c, dst, args[0], 3);
	 break;
      case OPCODE_NRM4:
	 emit_nrm(c, dst, args[0], 4);
	 break;
      case OPCODE_DST:
	 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
	 break;
      case OPCODE_EXP:
	 unalias1(c, dst, args[0], emit_exp_noalias);
	 break;
      case OPCODE_EX2:
	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_ARL:
	 /* Address-register load: round-to-negative-infinity, same as FLR. */
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FLR:
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FRC:
	 brw_FRC(p, dst, args[0]);
	 break;
      case OPCODE_LOG:
	 unalias1(c, dst, args[0], emit_log_noalias);
	 break;
      case OPCODE_LG2:
	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_LIT:
	 unalias1(c, dst, args[0], emit_lit_noalias);
	 break;
      case OPCODE_LRP:
	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
	 break;
      case OPCODE_MAD:
	 /* MAD = acc-load + MAC; skip the load if acc already holds arg2. */
	 if (!accumulator_contains(c, args[2]))
	    brw_MOV(p, brw_acc_reg(), args[2]);
	 brw_MAC(p, dst, args[0], args[1]);
	 break;
      case OPCODE_CMP:
	 emit_cmp(p, dst, args[0], args[1], args[2]);
	 break;
      case OPCODE_MAX:
	 emit_max(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MIN:
	 emit_min(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MOV:
	 brw_MOV(p, dst, args[0]);
	 break;
      case OPCODE_MUL:
	 brw_MUL(p, dst, args[0], args[1]);
	 break;
      case OPCODE_POW:
	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RCP:
	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RSQ:
	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;

      case OPCODE_SEQ:
	 unalias2(c, dst, args[0], args[1], emit_seq);
	 break;
      case OPCODE_SIN:
	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_SNE:
	 unalias2(c, dst, args[0], args[1], emit_sne);
	 break;
      case OPCODE_SGE:
	 unalias2(c, dst, args[0], args[1], emit_sge);
	 break;
      case OPCODE_SGT:
	 unalias2(c, dst, args[0], args[1], emit_sgt);
	 break;
      case OPCODE_SLT:
	 unalias2(c, dst, args[0], args[1], emit_slt);
	 break;
      case OPCODE_SLE:
	 unalias2(c, dst, args[0], args[1], emit_sle);
	 break;
      case OPCODE_SSG:
	 unalias1(c, dst, args[0], emit_sign);
	 break;
      case OPCODE_SUB:
	 brw_ADD(p, dst, args[0], negate(args[1]));
	 break;
      case OPCODE_SWZ:
	 /* The args[0] value can't be used here as it won't have
	  * correctly encoded the full swizzle:
	  */
	 emit_swz(c, dst, inst);
	 break;
      case OPCODE_TRUNC:
	 /* round toward zero */
	 brw_RNDZ(p, dst, args[0]);
	 break;
      case OPCODE_XPD:
	 emit_xpd(p, dst, args[0], args[1]);
	 break;
      case OPCODE_IF:
	 assert(if_depth < MAX_IF_DEPTH);
	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
	 /* Note that brw_IF smashes the predicate_control field. */
	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
	 if_depth_in_loop[loop_depth]++;
	 if_depth++;
	 break;
      case OPCODE_ELSE:
	 assert(if_depth > 0);
	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
	 break;
      case OPCODE_ENDIF:
	 assert(if_depth > 0);
	 brw_ENDIF(p, if_inst[--if_depth]);
	 if_depth_in_loop[loop_depth]--;
	 break;
      case OPCODE_BGNLOOP:
	 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
	 if_depth_in_loop[loop_depth] = 0;
	 break;
      case OPCODE_BRK:
	 /* pop_count unwinds the IFs opened inside this loop level. */
	 brw_set_predicate_control(p, get_predicate(inst));
	 temp = brw_BREAK(p);
	 temp->bits3.if_else.pop_count = if_depth_in_loop[loop_depth];
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case OPCODE_CONT:
	 brw_set_predicate_control(p, get_predicate(inst));
	 temp = brw_CONT(p);
	 temp->bits3.if_else.pop_count = if_depth_in_loop[loop_depth];
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case OPCODE_ENDLOOP:
	 {
	    struct brw_instruction *inst0, *inst1;
	    /* Jump counts are in units of 8 bytes pre-Ironlake, 16 bytes
	     * (br == 2) on Ironlake.
	     */
	    GLuint br = 1;

	    loop_depth--;

	    if (intel->gen == 5)
	       br = 2;

	    inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
	    /* patch all the BREAK/CONT instructions from last BEGINLOOP */
	    while (inst0 > loop_inst[loop_depth]) {
	       inst0--;
	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
		   inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
	       }
	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
			inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
	       }
	    }
	 }
	 break;
      case OPCODE_BRA:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case OPCODE_CAL:
	 /* Push the return IP onto the software stack, bump the stack
	  * pointer by 4 bytes, then branch; the actual target is resolved
	  * later by brw_resolve_cals().
	  */
	 brw_set_access_mode(p, BRW_ALIGN_1);
	 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 brw_ADD(p, get_addr_reg(stack_index),
			 get_addr_reg(stack_index), brw_imm_d(4));
	 brw_save_call(p, inst->Comment, p->nr_insn);
	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
	 break;
      case OPCODE_RET:
	 /* Pop the return IP off the software stack and jump to it. */
	 brw_ADD(p, get_addr_reg(stack_index),
		 get_addr_reg(stack_index), brw_imm_d(-4));
	 brw_set_access_mode(p, BRW_ALIGN_1);
	 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 break;
      case OPCODE_END:
	 emit_vertex_write(c);
	 break;
      case OPCODE_PRINT:
	 /* no-op */
	 break;
      case OPCODE_BGNSUB:
	 brw_save_label(p, inst->Comment, p->nr_insn);
	 break;
      case OPCODE_ENDSUB:
	 /* no-op */
	 break;
      default:
	 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
				    _mesa_opcode_string(inst->Opcode) :
				    "unknown");
      }

      /* Set the predication update on the last instruction of the native
       * instruction sequence.
       *
       * This would be problematic if it was set on a math instruction,
       * but that shouldn't be the case with the current GLSL compiler.
       */
      if (inst->CondUpdate) {
	 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];

	 assert(hw_insn->header.destreg__conditionalmod == 0);
	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
      }

      /* If an output was written through its GRF shadow (because it is also
       * read as a source), propagate the value to the real output register.
       */
      if ((inst->DstReg.File == PROGRAM_OUTPUT)
          && (inst->DstReg.Index != VERT_RESULT_HPOS)
          && c->output_regs[inst->DstReg.Index].used_in_src) {
         brw_MOV(p, get_dst(c, inst->DstReg), dst);
      }

      /* Result color clamping.
       *
       * When destination register is an output register and
       * it's primary/secondary front/back color, we have to clamp
       * the result to [0,1]. This is done by enabling the
       * saturation bit for the last instruction.
       *
       * We don't use brw_set_saturate() as it modifies
       * p->current->header.saturate, which affects all the subsequent
       * instructions. Instead, we directly modify the header
       * of the last (already stored) instruction.
       */
      if (inst->DstReg.File == PROGRAM_OUTPUT) {
         if ((inst->DstReg.Index == VERT_RESULT_COL0)
             || (inst->DstReg.Index == VERT_RESULT_COL1)
             || (inst->DstReg.Index == VERT_RESULT_BFC0)
             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
            p->store[p->nr_insn-1].header.saturate = 1;
         }
      }

      if (inst->DstReg.RelAddr && inst->DstReg.File == PROGRAM_TEMPORARY) {
	 /* We don't do RelAddr of PROGRAM_OUTPUT yet, because of the
	  * compute-to-mrf and the fact that we are allocating
	  * registers for only the used PROGRAM_OUTPUTs.
	  */
	 move_to_reladdr_dst(c, inst, dst);
      }

      release_tmps(c);
   }

   brw_resolve_cals(p);

   brw_optimize(p);

   if (INTEL_DEBUG & DEBUG_VS) {
      int i;

      printf("vs-native:\n");
      for (i = 0; i < p->nr_insn; i++)
	 brw_disasm(stdout, &p->store[i], intel->gen);
      printf("\n");
   }
}