7cbf22f2da9391caab4340ce24f94276ae752c09
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
96 static void release_tmps( struct brw_vs_compile *c )
97 {
98 c->last_tmp = c->first_tmp;
99 }
100
101
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Layout (in allocation order): r0 header, curbe (clip planes +
 * constants), vertex inputs, outputs that overflow the MRFs, program
 * temporaries, address regs, pull-constant staging regs, copies of
 * outputs also read as sources, and the call stack.  Everything after
 * that is scratch space handed out by get_tmp().
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      /* Two planes per register, 4 floats each, starting 3 regs in. */
      for (i = 0; i < c->key.nr_userclip; i++) {
	 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
      int constant = 0;

      /* We've got more constants than we can load with the push
       * mechanism.  This is often correlated with reladdr loads where
       * we should probably be using a pull mechanism anyway to avoid
       * excessive reading.  However, the pull mechanism is slow in
       * general.  So, we try to allocate as many non-reladdr-loaded
       * constants through the push buffer as we can before giving up.
       */
      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
      for (i = 0;
	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
	   i++) {
	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
	 int arg;

	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
	    /* Only non-relative constant-style files get a push slot. */
	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
		inst->SrcReg[arg].RelAddr)
	       continue;

	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
	    }
	 }
      }

      /* Two push constants packed per GRF, vec4 each. */
      for (i = 0; i < constant; i++) {
	 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
							      (i%2) * 4),
						 0, 4, 1);
      }
      reg += (constant + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      /* XXX 0 causes a bug elsewhere... */
      c->prog_data.nr_params = MAX2(constant * 4, 4);
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
	 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
	 c->nr_inputs++;
	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF available for compute-to-MRF outputs; the lower MRFs are
    * reserved for the URB write header (gen5 reserves more).
    */
   if (intel->gen >= 6)
      mrf = 4;
   else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	 c->nr_outputs++;
	 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
	 if (i == VERT_RESULT_HPOS) {
	    /* Position is computed in a GRF (it needs post-processing
	     * before the URB write).
	     */
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else if (i == VERT_RESULT_PSIZ) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
	 }
	 else {
	    /* Two restrictions on our compute-to-MRF here.  The
	     * message length for all SEND messages is restricted to
	     * [1,15], so we can't use mrf 15, as that means a length
	     * of 16.
	     *
	     * Additionally, URB writes are aligned to URB rows, so we
	     * need to put an even number of registers of URB data in
	     * each URB write so that the later write is aligned.  A
	     * message length of 15 means 1 message header reg plus 14
	     * regs of URB data.
	     *
	     * For attributes beyond the compute-to-MRF, we compute to
	     * GRFs and they will be written in the second URB_WRITE.
	     */
	    if (mrf < 15) {
	       c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
	       mrf++;
	    }
	    else {
	       if (!c->first_overflow_output)
		  c->first_overflow_output = i;
	       c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	       reg++;
	    }
	 }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
					     reg,
					     0,
					     BRW_REGISTER_TYPE_D,
					     BRW_VERTICAL_STRIDE_8,
					     BRW_WIDTH_8,
					     BRW_HORIZONTAL_STRIDE_1,
					     BRW_SWIZZLE_XXXX,
					     WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* One staging register per source slot for pull-constant loads. */
      for (i = 0; i < 3; i++) {
	 c->current_const[i].index = -1;
	 c->current_const[i].reg = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }

   /* Outputs that are also read as sources get a shadow GRF copy. */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
	 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }

   if (c->needs_stack) {
      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;		/* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 6)
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8;
   else if (intel->gen == 5)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
354
355
356 /**
357 * If an instruction uses a temp reg both as a src and the dest, we
358 * sometimes need to allocate an intermediate temporary.
359 */
360 static void unalias1( struct brw_vs_compile *c,
361 struct brw_reg dst,
362 struct brw_reg arg0,
363 void (*func)( struct brw_vs_compile *,
364 struct brw_reg,
365 struct brw_reg ))
366 {
367 if (dst.file == arg0.file && dst.nr == arg0.nr) {
368 struct brw_compile *p = &c->func;
369 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
370 func(c, tmp, arg0);
371 brw_MOV(p, dst, tmp);
372 release_tmp(c, tmp);
373 }
374 else {
375 func(c, dst, arg0);
376 }
377 }
378
379 /**
380 * \sa unalias2
381 * Checkes if 2-operand instruction needs an intermediate temporary.
382 */
383 static void unalias2( struct brw_vs_compile *c,
384 struct brw_reg dst,
385 struct brw_reg arg0,
386 struct brw_reg arg1,
387 void (*func)( struct brw_vs_compile *,
388 struct brw_reg,
389 struct brw_reg,
390 struct brw_reg ))
391 {
392 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
393 (dst.file == arg1.file && dst.nr == arg1.nr)) {
394 struct brw_compile *p = &c->func;
395 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
396 func(c, tmp, arg0, arg1);
397 brw_MOV(p, dst, tmp);
398 release_tmp(c, tmp);
399 }
400 else {
401 func(c, dst, arg0, arg1);
402 }
403 }
404
405 /**
406 * \sa unalias2
407 * Checkes if 3-operand instruction needs an intermediate temporary.
408 */
409 static void unalias3( struct brw_vs_compile *c,
410 struct brw_reg dst,
411 struct brw_reg arg0,
412 struct brw_reg arg1,
413 struct brw_reg arg2,
414 void (*func)( struct brw_vs_compile *,
415 struct brw_reg,
416 struct brw_reg,
417 struct brw_reg,
418 struct brw_reg ))
419 {
420 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
421 (dst.file == arg1.file && dst.nr == arg1.nr) ||
422 (dst.file == arg2.file && dst.nr == arg2.nr)) {
423 struct brw_compile *p = &c->func;
424 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
425 func(c, tmp, arg0, arg1, arg2);
426 brw_MOV(p, dst, tmp);
427 release_tmp(c, tmp);
428 }
429 else {
430 func(c, dst, arg0, arg1, arg2);
431 }
432 }
433
/* Set-on-condition: dst = (arg0 <cond> arg1) ? 1.0 : 0.0, per channel.
 *
 * dst is cleared to 0.0 first; the CMP to the null register updates the
 * flag and (per the brw_compile state convention) leaves subsequent
 * instructions predicated, so the second MOV writes 1.0 only to passing
 * channels.  The final call restores flag/predication state so later
 * instructions are unpredicated.
 */
static void emit_sop( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
447
/* SEQ: dst = (arg0 == arg1) ? 1.0 : 0.0 */
static void emit_seq( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}

/* SNE: dst = (arg0 != arg1) ? 1.0 : 0.0 */
static void emit_sne( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}

/* SLT: dst = (arg0 < arg1) ? 1.0 : 0.0 */
static void emit_slt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}

/* SLE: dst = (arg0 <= arg1) ? 1.0 : 0.0 */
static void emit_sle( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}

/* SGT: dst = (arg0 > arg1) ? 1.0 : 0.0 */
static void emit_sgt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}

/* SGE: dst = (arg0 >= arg1) ? 1.0 : 0.0 */
static void emit_sge( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
  emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
494
/* CMP: dst = (arg0 < 0) ? arg1 : arg2, per channel.
 *
 * The CMP to null sets the flag; the predicated SEL then picks arg1 in
 * channels where arg0 < 0 and arg2 elsewhere.  Predication is explicitly
 * disabled afterwards.
 */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
505
/* SSG: dst = sign(arg0) in {-1, 0, +1}, per channel.
 *
 * Start from 0.0, then overwrite with -1.0 in channels where arg0 < 0
 * and +1.0 where arg0 > 0, using predicated MOVs after each CMP.
 */
static void emit_sign(struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0));

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
522
/* MAX: dst = max(arg0, arg1), via CMP >= then predicated SEL. */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
532
/* MIN: dst = min(arg0, arg1), via CMP < then predicated SEL. */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
542
543
/* Emit a single-operand math-box operation (RCP, RSQ, EXP, LOG, ...)
 * with the given precision, working around pre-gen6 SEND limitations.
 */
static void emit_math1( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results. So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   /* Pre-gen6 the math result must land in a full GRF; detour through a
    * scratch register if dst is writemasked or not a GRF.
    */
   GLboolean need_tmp = (intel->gen < 6 &&
			 (dst.dw1.bits.writemask != 0xf ||
			  dst.file != BRW_GENERAL_REGISTER_FILE));

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,			/* message starts at m2 */
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
581
582
/* Emit a two-operand math-box operation (POW): the second operand is
 * delivered in message register m3 before the math SEND.  Same pre-gen6
 * scratch-register workaround as emit_math1().
 */
static void emit_math2( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			struct brw_reg arg1,
			GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (intel->gen < 6 &&
			 (dst.dw1.bits.writemask != 0xf ||
			  dst.file != BRW_GENERAL_REGISTER_FILE));

   if (need_tmp)
      tmp = get_tmp(c);

   /* Second operand goes into the message payload at m3. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,			/* message starts at m2 */
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
616
617
/* EXP opcode (partial-precision exponential building blocks):
 *   dst.x = 2^floor(arg0.x)   (built by bit-twiddling the FP32 exponent)
 *   dst.y = frac(arg0.x)
 *   dst.z = 2^arg0.x          (via the math box)
 *   dst.w = 1.0
 * Caller guarantees dst does not alias arg0 (see unalias1).
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
673
674
/* LOG opcode (partial-precision logarithm building blocks):
 *   dst.x = floor(log2(|arg0.x|))   (unbiased FP32 exponent)
 *   dst.y = mantissa of |arg0.x| as a float in [1, 2)
 *   dst.z = log2(|arg0.x|)          (x + log2(y), via the math box)
 *   dst.w = 1.0
 * Caller guarantees dst does not alias arg0 (see unalias1).
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* The Z result reads the X and Y intermediates, so detour through a
    * scratch register unless dst can hold all channels directly.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = ((x.i & ((1U<<31)-1)) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* Mask off the sign bit, then shift the exponent down ... */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      /* ... and remove the bias to get the true exponent as a float. */
      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Keep the mantissa bits, then OR in a biased exponent of 0 so
       * the result reads as a float in [1, 2).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
764
765
/* DST opcode: dst = (1, arg0.y*arg1.y, arg0.z, arg1.w).
 *
 * Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
786
787
/* XPD (cross product): dst = t x u = t.yzx*u.zxy - t.zxy*u.yzx.
 *
 * The MUL to null loads the accumulator; the MAC then computes
 * -t.zxy*u.yzx + acc, writing the final result to dst.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
796
797
/* LIT opcode: lighting coefficients
 *   dst.x = 1, dst.w = 1, dst.y = dst.z = 0 initially;
 *   if arg0.x > 0: dst.y = arg0.x, dst.z = (max(arg0.y,0)) ^ arg0.w.
 * Caller guarantees dst does not alias arg0 (see unalias1).
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* tmp.z = max(arg0.y, 0), via predicated MOV after the CMP. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   /* NOTE(review): called even when !need_tmp (tmp == dst); harmless
    * unless dst.nr happens to equal last_tmp-1 -- verify intent.
    */
   release_tmp(c, tmp);
}
839
/* LRP opcode: dst = arg0 * arg1 + (1 - arg0) * arg2.
 *
 * ADD computes (1 - arg0) into dst, the MUL to null loads
 * (1 - arg0) * arg2 into the accumulator, and MAC finishes with
 * arg0 * arg1 + acc.  Caller guarantees dst does not alias any source.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
852
/** 3 or 4-component vector normalization: dst = arg0 / |arg0|. */
static void emit_nrm( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      int num_comps)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);

   /* tmp = dot(arg0, arg0) */
   if (num_comps == 3)
      brw_DP3(p, tmp, arg0, arg0);
   else
      brw_DP4(p, tmp, arg0, arg0);

   /* tmp = 1 / sqrt(tmp) */
   emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);

   /* dst = arg0 * tmp */
   brw_MUL(p, dst, arg0, tmp);

   release_tmp(c, tmp);
}
876
877
/* Fetch a non-relative constant from the pull constant buffer into the
 * staging register reserved for this source slot, reusing the previous
 * load when the same constant index is already resident there.
 * Returns the staging register with the vec4 replicated to XYZWXYZW.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                     /* writeback dest */
                       16 * src->Index,               /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
911
/* Fetch a relative-addressed constant (const[a0.x + Index]) from the
 * pull constant buffer.  The address register holds a vec4 index; it is
 * scaled by 16 to a byte offset for the data-port read.  The slot cache
 * is invalidated since a reladdr load can't be reused.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
   /* NOTE(review): this scratch register is never released -- looks like
    * a temp leak within the instruction; verify against release_tmps()
    * usage in the emit loop.
    */
   struct brw_reg byte_addr_reg = get_tmp(c);

   assert(argIndex < 3);

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

 #if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   /* Convert the vec4 address to a byte offset. */
   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
			     const_reg,                     /* writeback dest */
			     byte_addr_reg,                 /* address register */
			     16 * src->Index,               /* byte offset */
			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
			     );

   return const_reg;
}
945
/* Look up the hardware register preallocated for (file, index).
 * Constant-style files all map through the PROGRAM_STATE_VAR slot set
 * up by brw_vs_alloc_regs().
 *
 * TODO: relative addressing!
 */
static struct brw_reg get_reg( struct brw_vs_compile *c,
			       gl_register_file file,
			       GLuint index )
{
   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      assert(c->regs[file][index].nr != 0);
      return c->regs[file][index];
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
      assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
      return c->regs[PROGRAM_STATE_VAR][index];
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:                   /* undef values */
      return brw_null_reg();

   case PROGRAM_LOCAL_PARAM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
980
981
/**
 * Indirect addressing:  get reg[[arg] + offset].
 *
 * The vertex shader runs two vertices at a time; a0.0 is loaded with
 * the byte address for the first vertex's components and a0.1 for the
 * second, then a single indirect MOV gathers both into a scratch reg
 * (which the caller is responsible for, see note at the end).
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset,
			     GLuint reg_size )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   /* NOTE(review): acc is a scratch reg that is never released here;
    * presumably reclaimed by a later release_tmps() -- verify.
    */
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Set the vertical stride on the register access so that the first
    * 4 components come from a0.0 and the second 4 from a0.1.
    */
   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = vertex 0's byte address */
      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));

      /* a0.1 = vertex 1's byte address (address reg subregister 4) */
      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));

      brw_MOV(p, tmp, indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return tmp;
}
1021
/* Store `val` to a relative-addressed destination (dst[a0.x + Index]).
 *
 * Two ALIGN1 indirect MOVs are emitted: the first writes vertex 0's
 * four components via a0.0, the second rewrites a0.0 for vertex 1's
 * half (byte_offset + reg_size/2) and writes suboffset(val, 4).
 */
static void
move_to_reladdr_dst(struct brw_vs_compile *c,
		    const struct prog_instruction *inst,
		    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg temp_base = c->regs[inst->DstReg.File][0];
   GLuint byte_offset = temp_base.nr * 32 + temp_base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   /* NOTE(review): scratch reg never released here; presumably reclaimed
    * by a later release_tmps() -- verify.
    */
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   byte_offset += inst->DstReg.Index * reg_size;

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* Vertex 0: a0.0 = base + a0-relative offset, then indirect store. */
   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   /* Vertex 1: same address reg, second half of the register pair. */
   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
	   brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}
1052
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 *
 * First tries to fold the source into an immediate float (all-zero /
 * all-one swizzles, or a scalar-swizzled PROGRAM_CONSTANT whose value
 * is known at compile time and the opcode allows an immediate in this
 * slot).  Otherwise falls back to the preallocated registers, push or
 * pull constants, or indirect (reladdr) reads.
 * TODO: relative addressing!
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
      const struct prog_src_register *src = &inst->SrcReg[argIndex];

      if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO)) {
	 return brw_imm_f(0.0f);
      } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE)) {
	 if (src->Negate)
	    return brw_imm_f(-1.0F);
	 else
	    return brw_imm_f(1.0F);
      } else if (src->File == PROGRAM_CONSTANT) {
	 const struct gl_program_parameter_list *params;
	 float f;
	 int component = -1;

	 /* Only uniform scalar swizzles can become a single immediate. */
	 switch (src->Swizzle) {
	 case SWIZZLE_XXXX:
	    component = 0;
	    break;
	 case SWIZZLE_YYYY:
	    component = 1;
	    break;
	 case SWIZZLE_ZZZZ:
	    component = 2;
	    break;
	 case SWIZZLE_WWWW:
	    component = 3;
	    break;
	 }

	 if (component >= 0) {
	    params = c->vp->program.Base.Parameters;
	    f = params->ParameterValues[src->Index][component];

	    if (src->Abs)
	       f = fabs(f);
	    if (src->Negate)
	       f = -f;
	    return brw_imm_f(f);
	 }
      }
   }

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         return deref(c, c->regs[file][0], index, 32);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
	 /* Push-allocated constants live in GRFs; everything else goes
	  * through the pull constant buffer.
	  */
	 if (!relAddr && c->constant_map[index] != -1) {
	    assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
	    return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
	 } else if (relAddr)
	    return get_reladdr_constant(c, inst, argIndex);
	 else
	    return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
      }
      else {
         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1162
1163 /**
1164 * Return the brw reg for the given instruction's src argument.
1165 * Will return mangled results for SWZ op. The emit_swz() function
1166 * ignores this result and recalculates taking extended swizzles into
1167 * account.
1168 */
1169 static struct brw_reg get_arg( struct brw_vs_compile *c,
1170 const struct prog_instruction *inst,
1171 GLuint argIndex )
1172 {
1173 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1174 struct brw_reg reg;
1175
1176 if (src->File == PROGRAM_UNDEFINED)
1177 return brw_null_reg();
1178
1179 reg = get_src_reg(c, inst, argIndex);
1180
1181 /* Convert 3-bit swizzle to 2-bit.
1182 */
1183 if (reg.file != BRW_IMMEDIATE_VALUE) {
1184 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1185 GET_SWZ(src->Swizzle, 1),
1186 GET_SWZ(src->Swizzle, 2),
1187 GET_SWZ(src->Swizzle, 3));
1188 }
1189
1190 /* Note this is ok for non-swizzle instructions:
1191 */
1192 reg.negate = src->Negate ? 1 : 0;
1193
1194 return reg;
1195 }
1196
1197
1198 /**
1199 * Get brw register for the given program dest register.
1200 */
1201 static struct brw_reg get_dst( struct brw_vs_compile *c,
1202 struct prog_dst_register dst )
1203 {
1204 struct brw_reg reg;
1205
1206 switch (dst.File) {
1207 case PROGRAM_TEMPORARY:
1208 case PROGRAM_OUTPUT:
1209 /* register-indirect addressing is only 1x1, not VxH, for
1210 * destination regs. So, for RelAddr we'll return a temporary
1211 * for the dest and do a move of the result to the RelAddr
1212 * register after the instruction emit.
1213 */
1214 if (dst.RelAddr) {
1215 reg = get_tmp(c);
1216 } else {
1217 assert(c->regs[dst.File][dst.Index].nr != 0);
1218 reg = c->regs[dst.File][dst.Index];
1219 }
1220 break;
1221 case PROGRAM_ADDRESS:
1222 assert(dst.Index == 0);
1223 reg = c->regs[dst.File][dst.Index];
1224 break;
1225 case PROGRAM_UNDEFINED:
1226 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1227 reg = brw_null_reg();
1228 break;
1229 default:
1230 assert(0);
1231 reg = brw_null_reg();
1232 }
1233
1234 assert(reg.type != BRW_IMMEDIATE_VALUE);
1235 reg.dw1.bits.writemask = dst.WriteMask;
1236
1237 return reg;
1238 }
1239
1240
/**
 * Emit an OPCODE_SWZ instruction, whose extended swizzle supports the
 * ZERO and ONE components in addition to X/Y/Z/W, plus per-channel
 * negation.  Implemented as up to four MOVs with disjoint writemasks.
 */
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* The negate pass below reads tmp back, which a message register
    * (non-GRF) destination can't support -- stage through a temporary.
    */
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Partition the written channels into: taken from the source,
    * constant 0.0, and constant 1.0.
    */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is used directly as the writemask here, i.e. it is
    * treated as a per-channel negate bitmask (NEGATE_X..W) -- only the
    * negated channels are rewritten.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1309
1310
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Assembles the per-vertex VUE header (clip flags, point size, NDC
 * position -- the layout varies per hardware generation) in the message
 * registers, then issues one URB write, plus a second one if not all
 * outputs fit within the MRF.
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;

   if (c->key.copy_edgeflag) {
      brw_MOV(p,
              get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
              get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
       c->key.nr_userclip || brw->has_negative_rhw_bug)
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
         /* Scale the float point size into the header's point-width
          * field and mask it down to the field's 11 bits.
          * NOTE(review): the shift/mask values encode the field's
          * position in the VUE header -- confirm against the PRM.
          */
         brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
         brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
      }

      /* One clip-flag bit per user clip plane: the DP4 sets the flag
       * register (conditional-mod L), and the OR is predicated on it.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* The VUE layout is documented in Volume 2a. */
   if (intel->gen >= 6) {
      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
       * enabled.  We don't use it, so skip it.
       * m3 is the first vertex element data we fill, which is the vertex
       * position.
       */
      brw_MOV(p, brw_message_reg(2), pos);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   } else if (intel->gen == 5) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      brw_MOV(p, brw_message_reg(7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always have be the
       * vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   }

   /* Terminate the thread with this write only if there is no second
    * (overflow) write below.
    */
   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,              /* starting mrf reg nr */
                 c->r0,          /* src */
                 0,              /* allocate */
                 1,              /* used */
                 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
                 0,              /* response len */
                 eot,            /* eot */
                 eot,            /* writes complete */
                 0,              /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      GLuint i, mrf = 1;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
         if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
            /* move from GRF to MRF */
            brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
            mrf++;
         }
      }

      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    0,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    mrf,            /* msg len */
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    14 / 2,         /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1483
1484 static GLboolean
1485 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1486 {
1487 struct brw_compile *p = &c->func;
1488 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1489
1490 if (p->nr_insn == 0)
1491 return GL_FALSE;
1492
1493 if (val.address_mode != BRW_ADDRESS_DIRECT)
1494 return GL_FALSE;
1495
1496 switch (prev_insn->header.opcode) {
1497 case BRW_OPCODE_MOV:
1498 case BRW_OPCODE_MAC:
1499 case BRW_OPCODE_MUL:
1500 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1501 prev_insn->header.execution_size == val.width &&
1502 prev_insn->bits1.da1.dest_reg_file == val.file &&
1503 prev_insn->bits1.da1.dest_reg_type == val.type &&
1504 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1505 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1506 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1507 prev_insn->bits1.da16.dest_writemask == 0xf)
1508 return GL_TRUE;
1509 else
1510 return GL_FALSE;
1511 default:
1512 return GL_FALSE;
1513 }
1514 }
1515
1516 static uint32_t
1517 get_predicate(const struct prog_instruction *inst)
1518 {
1519 if (inst->DstReg.CondMask == COND_TR)
1520 return BRW_PREDICATE_NONE;
1521
1522 /* All of GLSL only produces predicates for COND_NE and one channel per
1523 * vector. Fail badly if someone starts doing something else, as it might
1524 * mean infinite looping or something.
1525 *
1526 * We'd like to support all the condition codes, but our hardware doesn't
1527 * quite match the Mesa IR, which is modeled after the NV extensions. For
1528 * those, the instruction may update the condition codes or not, then any
1529 * later instruction may use one of those condition codes. For gen4, the
1530 * instruction may update the flags register based on one of the condition
1531 * codes output by the instruction, and then further instructions may
1532 * predicate on that. We can probably support this, but it won't
1533 * necessarily be easy.
1534 */
1535 assert(inst->DstReg.CondMask == COND_NE);
1536
1537 switch (inst->DstReg.CondSwizzle) {
1538 case SWIZZLE_XXXX:
1539 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1540 case SWIZZLE_YYYY:
1541 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1542 case SWIZZLE_ZZZZ:
1543 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1544 case SWIZZLE_WWWW:
1545 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1546 default:
1547 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1548 inst->DstReg.CondMask);
1549 return BRW_PREDICATE_NORMAL;
1550 }
1551 }
1552
1553 /* Emit the vertex program instructions here.
1554 */
1555 void brw_vs_emit(struct brw_vs_compile *c )
1556 {
1557 #define MAX_IF_DEPTH 32
1558 #define MAX_LOOP_DEPTH 32
1559 struct brw_compile *p = &c->func;
1560 struct brw_context *brw = p->brw;
1561 struct intel_context *intel = &brw->intel;
1562 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1563 GLuint insn, if_depth = 0, loop_depth = 0;
1564 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1565 const struct brw_indirect stack_index = brw_indirect(0, 0);
1566 GLuint index;
1567 GLuint file;
1568
1569 if (INTEL_DEBUG & DEBUG_VS) {
1570 printf("vs-mesa:\n");
1571 _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
1572 GL_TRUE);
1573 printf("\n");
1574 }
1575
1576 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1577 brw_set_access_mode(p, BRW_ALIGN_16);
1578
1579 for (insn = 0; insn < nr_insns; insn++) {
1580 GLuint i;
1581 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1582
1583 /* Message registers can't be read, so copy the output into GRF
1584 * register if they are used in source registers
1585 */
1586 for (i = 0; i < 3; i++) {
1587 struct prog_src_register *src = &inst->SrcReg[i];
1588 GLuint index = src->Index;
1589 GLuint file = src->File;
1590 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1591 c->output_regs[index].used_in_src = GL_TRUE;
1592 }
1593
1594 switch (inst->Opcode) {
1595 case OPCODE_CAL:
1596 case OPCODE_RET:
1597 c->needs_stack = GL_TRUE;
1598 break;
1599 default:
1600 break;
1601 }
1602 }
1603
1604 /* Static register allocation
1605 */
1606 brw_vs_alloc_regs(c);
1607
1608 if (c->needs_stack)
1609 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1610
1611 for (insn = 0; insn < nr_insns; insn++) {
1612
1613 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1614 struct brw_reg args[3], dst;
1615 GLuint i;
1616
1617 #if 0
1618 printf("%d: ", insn);
1619 _mesa_print_instruction(inst);
1620 #endif
1621
1622 /* Get argument regs. SWZ is special and does this itself.
1623 */
1624 if (inst->Opcode != OPCODE_SWZ)
1625 for (i = 0; i < 3; i++) {
1626 const struct prog_src_register *src = &inst->SrcReg[i];
1627 index = src->Index;
1628 file = src->File;
1629 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1630 args[i] = c->output_regs[index].reg;
1631 else
1632 args[i] = get_arg(c, inst, i);
1633 }
1634
1635 /* Get dest regs. Note that it is possible for a reg to be both
1636 * dst and arg, given the static allocation of registers. So
1637 * care needs to be taken emitting multi-operation instructions.
1638 */
1639 index = inst->DstReg.Index;
1640 file = inst->DstReg.File;
1641 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1642 dst = c->output_regs[index].reg;
1643 else
1644 dst = get_dst(c, inst->DstReg);
1645
1646 if (inst->SaturateMode != SATURATE_OFF) {
1647 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1648 inst->SaturateMode);
1649 }
1650
1651 switch (inst->Opcode) {
1652 case OPCODE_ABS:
1653 brw_MOV(p, dst, brw_abs(args[0]));
1654 break;
1655 case OPCODE_ADD:
1656 brw_ADD(p, dst, args[0], args[1]);
1657 break;
1658 case OPCODE_COS:
1659 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1660 break;
1661 case OPCODE_DP2:
1662 brw_DP2(p, dst, args[0], args[1]);
1663 break;
1664 case OPCODE_DP3:
1665 brw_DP3(p, dst, args[0], args[1]);
1666 break;
1667 case OPCODE_DP4:
1668 brw_DP4(p, dst, args[0], args[1]);
1669 break;
1670 case OPCODE_DPH:
1671 brw_DPH(p, dst, args[0], args[1]);
1672 break;
1673 case OPCODE_NRM3:
1674 emit_nrm(c, dst, args[0], 3);
1675 break;
1676 case OPCODE_NRM4:
1677 emit_nrm(c, dst, args[0], 4);
1678 break;
1679 case OPCODE_DST:
1680 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1681 break;
1682 case OPCODE_EXP:
1683 unalias1(c, dst, args[0], emit_exp_noalias);
1684 break;
1685 case OPCODE_EX2:
1686 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1687 break;
1688 case OPCODE_ARL:
1689 brw_RNDD(p, dst, args[0]);
1690 break;
1691 case OPCODE_FLR:
1692 brw_RNDD(p, dst, args[0]);
1693 break;
1694 case OPCODE_FRC:
1695 brw_FRC(p, dst, args[0]);
1696 break;
1697 case OPCODE_LOG:
1698 unalias1(c, dst, args[0], emit_log_noalias);
1699 break;
1700 case OPCODE_LG2:
1701 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1702 break;
1703 case OPCODE_LIT:
1704 unalias1(c, dst, args[0], emit_lit_noalias);
1705 break;
1706 case OPCODE_LRP:
1707 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1708 break;
1709 case OPCODE_MAD:
1710 if (!accumulator_contains(c, args[2]))
1711 brw_MOV(p, brw_acc_reg(), args[2]);
1712 brw_MAC(p, dst, args[0], args[1]);
1713 break;
1714 case OPCODE_CMP:
1715 emit_cmp(p, dst, args[0], args[1], args[2]);
1716 break;
1717 case OPCODE_MAX:
1718 emit_max(p, dst, args[0], args[1]);
1719 break;
1720 case OPCODE_MIN:
1721 emit_min(p, dst, args[0], args[1]);
1722 break;
1723 case OPCODE_MOV:
1724 brw_MOV(p, dst, args[0]);
1725 break;
1726 case OPCODE_MUL:
1727 brw_MUL(p, dst, args[0], args[1]);
1728 break;
1729 case OPCODE_POW:
1730 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1731 break;
1732 case OPCODE_RCP:
1733 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1734 break;
1735 case OPCODE_RSQ:
1736 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1737 break;
1738
1739 case OPCODE_SEQ:
1740 unalias2(c, dst, args[0], args[1], emit_seq);
1741 break;
1742 case OPCODE_SIN:
1743 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1744 break;
1745 case OPCODE_SNE:
1746 unalias2(c, dst, args[0], args[1], emit_sne);
1747 break;
1748 case OPCODE_SGE:
1749 unalias2(c, dst, args[0], args[1], emit_sge);
1750 break;
1751 case OPCODE_SGT:
1752 unalias2(c, dst, args[0], args[1], emit_sgt);
1753 break;
1754 case OPCODE_SLT:
1755 unalias2(c, dst, args[0], args[1], emit_slt);
1756 break;
1757 case OPCODE_SLE:
1758 unalias2(c, dst, args[0], args[1], emit_sle);
1759 break;
1760 case OPCODE_SSG:
1761 unalias1(c, dst, args[0], emit_sign);
1762 break;
1763 case OPCODE_SUB:
1764 brw_ADD(p, dst, args[0], negate(args[1]));
1765 break;
1766 case OPCODE_SWZ:
1767 /* The args[0] value can't be used here as it won't have
1768 * correctly encoded the full swizzle:
1769 */
1770 emit_swz(c, dst, inst);
1771 break;
1772 case OPCODE_TRUNC:
1773 /* round toward zero */
1774 brw_RNDZ(p, dst, args[0]);
1775 break;
1776 case OPCODE_XPD:
1777 emit_xpd(p, dst, args[0], args[1]);
1778 break;
1779 case OPCODE_IF:
1780 assert(if_depth < MAX_IF_DEPTH);
1781 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1782 /* Note that brw_IF smashes the predicate_control field. */
1783 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1784 if_depth++;
1785 break;
1786 case OPCODE_ELSE:
1787 assert(if_depth > 0);
1788 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1789 break;
1790 case OPCODE_ENDIF:
1791 assert(if_depth > 0);
1792 brw_ENDIF(p, if_inst[--if_depth]);
1793 break;
1794 case OPCODE_BGNLOOP:
1795 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1796 break;
1797 case OPCODE_BRK:
1798 brw_set_predicate_control(p, get_predicate(inst));
1799 brw_BREAK(p);
1800 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1801 break;
1802 case OPCODE_CONT:
1803 brw_set_predicate_control(p, get_predicate(inst));
1804 brw_CONT(p);
1805 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1806 break;
1807 case OPCODE_ENDLOOP:
1808 {
1809 struct brw_instruction *inst0, *inst1;
1810 GLuint br = 1;
1811
1812 loop_depth--;
1813
1814 if (intel->gen == 5)
1815 br = 2;
1816
1817 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1818 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1819 while (inst0 > loop_inst[loop_depth]) {
1820 inst0--;
1821 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
1822 inst0->bits3.if_else.jump_count == 0) {
1823 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1824 inst0->bits3.if_else.pop_count = 0;
1825 }
1826 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
1827 inst0->bits3.if_else.jump_count == 0) {
1828 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1829 inst0->bits3.if_else.pop_count = 0;
1830 }
1831 }
1832 }
1833 break;
1834 case OPCODE_BRA:
1835 brw_set_predicate_control(p, get_predicate(inst));
1836 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1837 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1838 break;
1839 case OPCODE_CAL:
1840 brw_set_access_mode(p, BRW_ALIGN_1);
1841 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1842 brw_set_access_mode(p, BRW_ALIGN_16);
1843 brw_ADD(p, get_addr_reg(stack_index),
1844 get_addr_reg(stack_index), brw_imm_d(4));
1845 brw_save_call(p, inst->Comment, p->nr_insn);
1846 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1847 break;
1848 case OPCODE_RET:
1849 brw_ADD(p, get_addr_reg(stack_index),
1850 get_addr_reg(stack_index), brw_imm_d(-4));
1851 brw_set_access_mode(p, BRW_ALIGN_1);
1852 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1853 brw_set_access_mode(p, BRW_ALIGN_16);
1854 break;
1855 case OPCODE_END:
1856 emit_vertex_write(c);
1857 break;
1858 case OPCODE_PRINT:
1859 /* no-op */
1860 break;
1861 case OPCODE_BGNSUB:
1862 brw_save_label(p, inst->Comment, p->nr_insn);
1863 break;
1864 case OPCODE_ENDSUB:
1865 /* no-op */
1866 break;
1867 default:
1868 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1869 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1870 _mesa_opcode_string(inst->Opcode) :
1871 "unknown");
1872 }
1873
1874 /* Set the predication update on the last instruction of the native
1875 * instruction sequence.
1876 *
1877 * This would be problematic if it was set on a math instruction,
1878 * but that shouldn't be the case with the current GLSL compiler.
1879 */
1880 if (inst->CondUpdate) {
1881 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1882
1883 assert(hw_insn->header.destreg__conditionalmod == 0);
1884 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1885 }
1886
1887 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1888 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1889 && c->output_regs[inst->DstReg.Index].used_in_src) {
1890 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1891 }
1892
1893 /* Result color clamping.
1894 *
1895 * When destination register is an output register and
1896 * it's primary/secondary front/back color, we have to clamp
1897 * the result to [0,1]. This is done by enabling the
1898 * saturation bit for the last instruction.
1899 *
1900 * We don't use brw_set_saturate() as it modifies
1901 * p->current->header.saturate, which affects all the subsequent
1902 * instructions. Instead, we directly modify the header
1903 * of the last (already stored) instruction.
1904 */
1905 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1906 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1907 || (inst->DstReg.Index == VERT_RESULT_COL1)
1908 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1909 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1910 p->store[p->nr_insn-1].header.saturate = 1;
1911 }
1912 }
1913
1914 if (inst->DstReg.RelAddr && inst->DstReg.File == PROGRAM_TEMPORARY) {
1915 /* We don't do RelAddr of PROGRAM_OUTPUT yet, because of the
1916 * compute-to-mrf and the fact that we are allocating
1917 * registers for only the used PROGRAM_OUTPUTs.
1918 */
1919 move_to_reladdr_dst(c, inst, dst);
1920 }
1921
1922 release_tmps(c);
1923 }
1924
1925 brw_resolve_cals(p);
1926
1927 brw_optimize(p);
1928
1929 if (INTEL_DEBUG & DEBUG_VS) {
1930 int i;
1931
1932 printf("vs-native:\n");
1933 for (i = 0; i < p->nr_insn; i++)
1934 brw_disasm(stdout, &p->store[i], intel->gen);
1935 printf("\n");
1936 }
1937 }