i965: also using align1 mode for math2 on sandybridge
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
/* Release every scratch reg allocated since brw_vs_alloc_regs() recorded
 * first_tmp; called once per-instruction scratch values are dead.
 */
static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}
100
101 static int
102 get_first_reladdr_output(struct gl_vertex_program *vp)
103 {
104 int i;
105 int first_reladdr_output = VERT_RESULT_MAX;
106
107 for (i = 0; i < vp->Base.NumInstructions; i++) {
108 struct prog_instruction *inst = vp->Base.Instructions + i;
109
110 if (inst->DstReg.File == PROGRAM_OUTPUT &&
111 inst->DstReg.RelAddr &&
112 inst->DstReg.Index < first_reladdr_output)
113 first_reladdr_output = inst->DstReg.Index;
114 }
115
116 return first_reladdr_output;
117 }
118
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
122 */
123 static void
124 clear_current_const(struct brw_vs_compile *c)
125 {
126 unsigned int i;
127
128 if (c->vp->use_const_buffer) {
129 for (i = 0; i < 3; i++) {
130 c->current_const[i].index = -1;
131 }
132 }
133 }
134
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Layout built here, in order: r0 header, user clip planes, push
 * constants (or pull-constant staging regs later), inputs, outputs
 * (position/psiz in GRFs, the rest straight into MRFs until the
 * compute-to-MRF limit), temporaries, address regs, output-copy regs,
 * subroutine stack, then the per-instruction scratch area.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;
   int first_reladdr_output;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The latter is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      if (intel->gen >= 6) {
         /* Two planes per reg; gen6 packs them starting at this reg. */
         for (i = 0; i < c->key.nr_userclip; i++) {
            c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
                                                  (i % 2) * 4), 0, 4, 1);
         }
         reg += ALIGN(c->key.nr_userclip, 2) / 2;
      } else {
         /* Pre-gen6 the planes live 3 regs (6 half-regs) into the curbe. */
         for (i = 0; i < c->key.nr_userclip; i++) {
            c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
                                                  (i % 2) * 4), 0, 4, 1);
         }
         reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
      }

   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
      int constant = 0;

      /* We've got more constants than we can load with the push
       * mechanism.  This is often correlated with reladdr loads where
       * we should probably be using a pull mechanism anyway to avoid
       * excessive reading.  However, the pull mechanism is slow in
       * general.  So, we try to allocate as many non-reladdr-loaded
       * constants through the push buffer as we can before giving up.
       */
      /* NOTE(review): this memset assumes constant_map entries are one
       * byte each -- confirm against the declaration in brw_vs.h.
       */
      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
      for (i = 0;
           i < c->vp->program.Base.NumInstructions && constant < max_constant;
           i++) {
         struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
         int arg;

         for (arg = 0; arg < 3 && constant < max_constant; arg++) {
            /* Skip sources that aren't constants, and reladdr loads
             * (which must go through the pull path anyway).
             */
            if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
                 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
                 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
                 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
                 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
                inst->SrcReg[arg].RelAddr)
               continue;

            if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
               c->constant_map[inst->SrcReg[arg].Index] = constant++;
            }
         }
      }

      /* Two push constants per reg (vec4 in each half). */
      for (i = 0; i < constant; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
                                                              (i%2) * 4),
                                                 0, 4, 1);
      }
      reg += (constant + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      /* XXX 0 causes a bug elsewhere... */
      c->prog_data.nr_params = MAX2(constant * 4, 4);
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
         c->nr_inputs++;
         c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF available for compute-to-MRF output; earlier MRFs hold
    * the URB write header (and clip distances on gen6).
    */
   if (intel->gen >= 6) {
      mrf = 3;
      if (c->key.nr_userclip)
         mrf += 2;
   } else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   first_reladdr_output = get_first_reladdr_output(&c->vp->program);
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
         c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
         if (i == VERT_RESULT_HPOS) {
            /* Position needs post-processing, so keep it in a GRF. */
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
         }
         else if (i == VERT_RESULT_PSIZ) {
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
         }
         else {
            /* Two restrictions on our compute-to-MRF here.  The
             * message length for all SEND messages is restricted to
             * [1,15], so we can't use mrf 15, as that means a length
             * of 16.
             *
             * Additionally, URB writes are aligned to URB rows, so we
             * need to put an even number of registers of URB data in
             * each URB write so that the later write is aligned.  A
             * message length of 15 means 1 message header reg plus 14
             * regs of URB data.
             *
             * For attributes beyond the compute-to-MRF, we compute to
             * GRFs and they will be written in the second URB_WRITE.
             */
            if (first_reladdr_output > i && mrf < 15) {
               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
               mrf++;
            }
            else {
               if (mrf >= 15 && !c->first_overflow_output)
                  c->first_overflow_output = i;
               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
               reg++;
               mrf++;
            }
         }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
                                            reg,
                                            0,
                                            BRW_REGISTER_TYPE_D,
                                            BRW_VERTICAL_STRIDE_8,
                                            BRW_WIDTH_8,
                                            BRW_HORIZONTAL_STRIDE_1,
                                            BRW_SWIZZLE_XXXX,
                                            WRITEMASK_X);
      reg++;
   }

   /* Staging registers for pull-constant loads, one per source slot. */
   if (c->vp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
      clear_current_const(c);
   }

   /* Shadow copies for outputs that are also read as sources. */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   if (c->needs_stack) {
      c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;      /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 6) {
      int header_regs = 2;
      if (c->key.nr_userclip)
         header_regs += 2;

      c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 7) / 8;
   } else if (intel->gen == 5)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
402
403
404 /**
405 * If an instruction uses a temp reg both as a src and the dest, we
406 * sometimes need to allocate an intermediate temporary.
407 */
408 static void unalias1( struct brw_vs_compile *c,
409 struct brw_reg dst,
410 struct brw_reg arg0,
411 void (*func)( struct brw_vs_compile *,
412 struct brw_reg,
413 struct brw_reg ))
414 {
415 if (dst.file == arg0.file && dst.nr == arg0.nr) {
416 struct brw_compile *p = &c->func;
417 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
418 func(c, tmp, arg0);
419 brw_MOV(p, dst, tmp);
420 release_tmp(c, tmp);
421 }
422 else {
423 func(c, dst, arg0);
424 }
425 }
426
427 /**
428 * \sa unalias2
429 * Checkes if 2-operand instruction needs an intermediate temporary.
430 */
431 static void unalias2( struct brw_vs_compile *c,
432 struct brw_reg dst,
433 struct brw_reg arg0,
434 struct brw_reg arg1,
435 void (*func)( struct brw_vs_compile *,
436 struct brw_reg,
437 struct brw_reg,
438 struct brw_reg ))
439 {
440 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
441 (dst.file == arg1.file && dst.nr == arg1.nr)) {
442 struct brw_compile *p = &c->func;
443 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
444 func(c, tmp, arg0, arg1);
445 brw_MOV(p, dst, tmp);
446 release_tmp(c, tmp);
447 }
448 else {
449 func(c, dst, arg0, arg1);
450 }
451 }
452
453 /**
454 * \sa unalias2
455 * Checkes if 3-operand instruction needs an intermediate temporary.
456 */
457 static void unalias3( struct brw_vs_compile *c,
458 struct brw_reg dst,
459 struct brw_reg arg0,
460 struct brw_reg arg1,
461 struct brw_reg arg2,
462 void (*func)( struct brw_vs_compile *,
463 struct brw_reg,
464 struct brw_reg,
465 struct brw_reg,
466 struct brw_reg ))
467 {
468 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
469 (dst.file == arg1.file && dst.nr == arg1.nr) ||
470 (dst.file == arg2.file && dst.nr == arg2.nr)) {
471 struct brw_compile *p = &c->func;
472 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
473 func(c, tmp, arg0, arg1, arg2);
474 brw_MOV(p, dst, tmp);
475 release_tmp(c, tmp);
476 }
477 else {
478 func(c, dst, arg0, arg1, arg2);
479 }
480 }
481
/* Set-on-condition: dst.chan = (arg0.chan cond arg1.chan) ? 1.0 : 0.0.
 *
 * dst is first cleared to 0.0.  The CMP to the null register loads the
 * flag register (and, presumably via brw_CMP's handling of a null dest,
 * enables predication on the following instruction -- see brw_eu_emit.c),
 * so the MOV of 1.0 only lands in channels that passed.  The final call
 * restores unpredicated execution.
 */
static void emit_sop( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));   /* predicated by the CMP above */
   brw_set_predicate_control_flag_value(p, 0xff);
}
495
/* SEQ: dst.chan = (arg0.chan == arg1.chan) ? 1.0 : 0.0 */
static void emit_seq( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}
503
/* SNE: dst.chan = (arg0.chan != arg1.chan) ? 1.0 : 0.0 */
static void emit_sne( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/* SLT: dst.chan = (arg0.chan < arg1.chan) ? 1.0 : 0.0 */
static void emit_slt( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}
518
/* SLE: dst.chan = (arg0.chan <= arg1.chan) ? 1.0 : 0.0 */
static void emit_sle( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}
526
/* SGT: dst.chan = (arg0.chan > arg1.chan) ? 1.0 : 0.0 */
static void emit_sgt( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}
534
/* SGE: dst.chan = (arg0.chan >= arg1.chan) ? 1.0 : 0.0 */
static void emit_sge( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
542
/* CMP: dst.chan = (arg0.chan < 0) ? arg1.chan : arg2.chan.
 *
 * Note this takes the raw brw_compile, not brw_vs_compile.  The CMP to
 * the null reg loads the flag (enabling predication on the SEL, as in
 * emit_sop); SEL then picks arg1 where the predicate is set, arg2
 * elsewhere, and predication is restored afterwards.
 */
static void emit_cmp( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
553
/* SSG: per-channel sign of arg0: -1.0, 0.0 or 1.0.
 *
 * dst starts at 0.0; each CMP loads the flag so the following MOV of
 * -1.0 / 1.0 is predicated into only the channels that compared true.
 */
static void emit_sign(struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0));

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));   /* predicated: channels < 0 */
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));    /* predicated: channels > 0 */
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
570
/* MAX: dst.chan = max(arg0.chan, arg1.chan).
 *
 * Gen6+ uses SEL with a conditional modifier, so no separate flag-loading
 * CMP is needed; older parts do CMP then a predicated SEL.  Cond-mod and
 * predicate state are reset to defaults afterwards.
 */
static void emit_max( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
589
/* MIN: dst.chan = min(arg0.chan, arg1.chan).
 * Mirror of emit_max() with a less-than condition.
 */
static void emit_min( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
608
/* Single-operand mathbox operation (EXP/LOG/RSQ/...) for gen4/gen5,
 * where math is a SEND to the shared mathbox unit.
 */
static void emit_math1_gen4(struct brw_vs_compile *c,
                            GLuint function,
                            struct brw_reg dst,
                            struct brw_reg arg0,
                            GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   /* The math result must land in a full-writemask GRF; otherwise
    * compute into a scratch reg and MOV out with the real writemask.
    */
   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
       dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,             /* message reg -- presumably msg_reg_nr; confirm
                            * against brw_math() in brw_eu_emit.c */
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
647
/* Single-operand math for gen6 (Sandybridge): math is an ordinary
 * instruction rather than a SEND, but 16-wide align16 math misbehaves
 * (see below), so the operation is staged through temporaries and run
 * in align1 mode, which has no writemasking or swizzles.
 */
static void
emit_math1_gen6(struct brw_vs_compile *c,
                GLuint function,
                struct brw_reg dst,
                struct brw_reg arg0,
                GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp_src, tmp_dst;

   /* Something is strange on gen6 math in 16-wide mode, though the
    * docs say it's supposed to work.  Punt to using align1 mode,
    * which doesn't do writemasking and swizzles.
    */
   tmp_src = get_tmp(c);
   tmp_dst = get_tmp(c);

   /* Resolve arg0's swizzle in align16 mode before switching. */
   brw_MOV(p, tmp_src, arg0);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math(p,
            tmp_dst,
            function,
            BRW_MATH_SATURATE_NONE,
            2,             /* message reg -- unused on gen6? see brw_math() */
            tmp_src,
            BRW_MATH_DATA_SCALAR,
            precision);
   brw_set_access_mode(p, BRW_ALIGN_16);

   /* Apply dst's writemask in align16 mode after the fact. */
   brw_MOV(p, dst, tmp_dst);

   release_tmp(c, tmp_src);
   release_tmp(c, tmp_dst);
}
683
684 static void
685 emit_math1(struct brw_vs_compile *c,
686 GLuint function,
687 struct brw_reg dst,
688 struct brw_reg arg0,
689 GLuint precision)
690 {
691 struct brw_compile *p = &c->func;
692 struct intel_context *intel = &p->brw->intel;
693
694 if (intel->gen >= 6)
695 emit_math1_gen6(c, function, dst, arg0, precision);
696 else
697 emit_math1_gen4(c, function, dst, arg0, precision);
698 }
699
/* Two-operand mathbox operation (e.g. POW) for gen4/gen5.  arg1 is
 * copied into message reg 3 ahead of the SEND (arg0 travels with the
 * math send itself -- confirm exact payload layout against brw_math()).
 * As in emit_math1_gen4, the result can't be writemasked, so a temp is
 * used unless dst is a full-writemask GRF.
 */
static void emit_math2_gen4( struct brw_vs_compile *c,
                             GLuint function,
                             struct brw_reg dst,
                             struct brw_reg arg0,
                             struct brw_reg arg1,
                             GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
       dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Stage the second operand in the message payload. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,             /* message reg */
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
734
/* Two-operand math for gen6 (Sandybridge).  As with emit_math1_gen6,
 * align16 math is unreliable, so both sources are resolved into temps,
 * the math2 instruction runs in align1 mode (no writemasks/swizzles),
 * and the result is MOVed out with dst's writemask afterwards.
 */
static void emit_math2_gen6( struct brw_vs_compile *c,
                             GLuint function,
                             struct brw_reg dst,
                             struct brw_reg arg0,
                             struct brw_reg arg1,
                             GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp_src0, tmp_src1, tmp_dst;

   tmp_src0 = get_tmp(c);
   tmp_src1 = get_tmp(c);
   tmp_dst = get_tmp(c);

   /* Resolve source swizzles while still in align16 mode. */
   brw_MOV(p, tmp_src0, arg0);
   brw_MOV(p, tmp_src1, arg1);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math2(p,
             tmp_dst,
             function,
             tmp_src0,
             tmp_src1);
   brw_set_access_mode(p, BRW_ALIGN_16);

   /* Apply dst's writemask in align16 mode. */
   brw_MOV(p, dst, tmp_dst);

   release_tmp(c, tmp_src0);
   release_tmp(c, tmp_src1);
   release_tmp(c, tmp_dst);
}
766
767 static void emit_math2( struct brw_vs_compile *c,
768 GLuint function,
769 struct brw_reg dst,
770 struct brw_reg arg0,
771 struct brw_reg arg1,
772 GLuint precision)
773 {
774 struct brw_compile *p = &c->func;
775 struct intel_context *intel = &p->brw->intel;
776
777 if (intel->gen >= 6)
778 emit_math2_gen6(c, function, dst, arg0, arg1, precision);
779 else
780 emit_math2_gen4(c, function, dst, arg0, arg1, precision);
781 }
782
/* EXP: dst = (2^floor(x), x - floor(x), 2^x, 1.0) where x = arg0.x.
 * dst must not alias arg0 (noalias variant).  The X result is built by
 * constructing the IEEE-754 exponent bits directly instead of calling
 * the mathbox.
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
              tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_EXP,
                 brw_writemask(dst, WRITEMASK_Z),
                 brw_swizzle1(arg0, 0),
                 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
838
839
/* LOG: dst = (exponent(|x|), mantissa(|x|), log2(|x|), 1.0), x = arg0.x.
 * X and Y are extracted by bit-twiddling the IEEE-754 encoding; Z is
 * then refined as X + LOG2(Y) via the mathbox.  dst must not alias arg0.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* Z reads back X and Y, so anything less than a full-writemask GRF
    * dest needs a scratch reg.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look like they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* Strip the sign bit, shift the biased exponent down, unbias. */
      brw_AND(p,
              brw_writemask(tmp_ud, WRITEMASK_X),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
              brw_writemask(tmp_ud, WRITEMASK_X),
              tmp_ud,
              brw_imm_ud(23));

      brw_ADD(p,
              brw_writemask(tmp, WRITEMASK_X),
              retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
              brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Keep the mantissa bits, force the exponent field to 127 so the
       * result is the mantissa scaled into [1, 2).
       */
      brw_AND(p,
              brw_writemask(tmp_ud, WRITEMASK_Y),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1<<23)-1));

      brw_OR(p,
             brw_writemask(tmp_ud, WRITEMASK_Y),
             tmp_ud,
             brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_LOG,
                 brw_writemask(tmp, WRITEMASK_Z),
                 brw_swizzle1(tmp, 1),
                 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
              brw_writemask(tmp, WRITEMASK_Z),
              brw_swizzle1(tmp, 2),
              brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
929
930
931 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
932 */
933 static void emit_dst_noalias( struct brw_vs_compile *c,
934 struct brw_reg dst,
935 struct brw_reg arg0,
936 struct brw_reg arg1)
937 {
938 struct brw_compile *p = &c->func;
939
940 /* There must be a better way to do this:
941 */
942 if (dst.dw1.bits.writemask & WRITEMASK_X)
943 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
944 if (dst.dw1.bits.writemask & WRITEMASK_Y)
945 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
946 if (dst.dw1.bits.writemask & WRITEMASK_Z)
947 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
948 if (dst.dw1.bits.writemask & WRITEMASK_W)
949 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
950 }
951
952
/* XPD: dst = t cross u, via the accumulator:
 *   acc = t.yzxw * u.zxyw            (MUL to null primes the accumulator)
 *   dst = acc - t.zxyw * u.yzxw      (MAC with negated src0)
 * The .w channel computes t.w*u.w - t.w*u.w = 0.
 */
static void emit_xpd( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg t,
                      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
961
962
/* LIT: lighting coefficients.  With x = arg0.x, y = arg0.y, w = arg0.w:
 *   dst = (1, max(x,0), (x > 0) ? clamp(y)^w : 0, 1)
 * The Y/Z defaults are written up front and only overwritten inside the
 * IF when x > 0.  dst must not alias arg0.
 *
 * NOTE(review): release_tmp() runs even when tmp aliases dst (need_tmp
 * false).  release_tmp only frees the newest scratch reg, so this is
 * harmless unless dst.nr can equal last_tmp-1 -- verify.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* Clamp the POW base: tmp.z = arg0.y where arg0.y > 0
       * (predicated by the CMP).
       */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
                 BRW_MATH_FUNCTION_POW,
                 brw_writemask(dst, WRITEMASK_Z),
                 brw_swizzle1(tmp, 2),
                 brw_swizzle1(arg0, 3),
                 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   release_tmp(c, tmp);
}
1004
/* LRP: dst = arg0 * arg1 + (1 - arg0) * arg2, using the accumulator:
 *   dst = 1 - arg0
 *   acc = dst * arg2              (MUL to null primes the accumulator)
 *   dst = acc + arg0 * arg1      (MAC)
 * dst must not alias any source (noalias variant).
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
                             struct brw_reg dst,
                             struct brw_reg arg0,
                             struct brw_reg arg1,
                             struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
1017
1018 /** 3 or 4-component vector normalization */
1019 static void emit_nrm( struct brw_vs_compile *c,
1020 struct brw_reg dst,
1021 struct brw_reg arg0,
1022 int num_comps)
1023 {
1024 struct brw_compile *p = &c->func;
1025 struct brw_reg tmp = get_tmp(c);
1026
1027 /* tmp = dot(arg0, arg0) */
1028 if (num_comps == 3)
1029 brw_DP3(p, tmp, arg0, arg0);
1030 else
1031 brw_DP4(p, tmp, arg0, arg0);
1032
1033 /* tmp = 1 / sqrt(tmp) */
1034 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
1035
1036 /* dst = arg0 * tmp */
1037 brw_MUL(p, dst, arg0, tmp);
1038
1039 release_tmp(c, tmp);
1040 }
1041
1042
/* Get a brw_reg for a pull-constant source operand with a constant
 * (non-relative) index.
 *
 * Each of the three source-arg slots caches the index of the constant
 * last loaded into its staging register, so the data-port read is only
 * emitted when the cached index differs.  The returned region replicates
 * the lower four floats into both register halves (XYZWXYZW) so both
 * vertices of the pair see the same vec4.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   assert(c->func.brw->intel.gen < 6); /* FINISHME */

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                /* writeback dest */
                       16 * src->Index,          /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
1078
/* Get a brw_reg for a pull-constant source operand addressed relative to
 * the address register.  The byte address is a0 * 16 plus the
 * instruction's constant offset.  Because the address varies at runtime,
 * the per-slot cache is invalidated rather than reused.
 *
 * NOTE(review): byte_addr_reg comes from get_tmp() and is not released
 * here -- presumably reclaimed by the caller's per-instruction
 * release_tmps(); verify at the call site.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
                     const struct prog_instruction *inst,
                     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);

   assert(argIndex < 3);

   assert(c->func.brw->intel.gen < 6); /* FINISHME */

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

 #if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
          src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   /* Convert the vec4 element index in a0 into a byte offset. */
   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
                             const_reg,                /* writeback dest */
                             byte_addr_reg,            /* address register */
                             16 * src->Index,          /* byte offset */
                             SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
                             );

   return const_reg;
}
1114
1115
1116
1117 /* TODO: relative addressing!
1118 */
1119 static struct brw_reg get_reg( struct brw_vs_compile *c,
1120 gl_register_file file,
1121 GLuint index )
1122 {
1123 switch (file) {
1124 case PROGRAM_TEMPORARY:
1125 case PROGRAM_INPUT:
1126 case PROGRAM_OUTPUT:
1127 assert(c->regs[file][index].nr != 0);
1128 return c->regs[file][index];
1129 case PROGRAM_STATE_VAR:
1130 case PROGRAM_CONSTANT:
1131 case PROGRAM_UNIFORM:
1132 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1133 return c->regs[PROGRAM_STATE_VAR][index];
1134 case PROGRAM_ADDRESS:
1135 assert(index == 0);
1136 return c->regs[file][index];
1137
1138 case PROGRAM_UNDEFINED: /* undef values */
1139 return brw_null_reg();
1140
1141 case PROGRAM_LOCAL_PARAM:
1142 case PROGRAM_ENV_PARAM:
1143 case PROGRAM_WRITE_ONLY:
1144 default:
1145 assert(0);
1146 return brw_null_reg();
1147 }
1148 }
1149
1150
/**
 * Indirect addressing: get reg[[arg] + offset].
 *
 * Loads both interleaved vertices' vec4s (addressed by a0.0 and a0.1)
 * through a VxH indirect MOV into a fresh temporary, which is returned.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset,
			     GLuint reg_size )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   /* Only the .x (and dword-4) components of the address reg are used. */
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   /* Byte address of the base register before adding the ARL value. */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Set the vertical stride on the register access so that the first
    * 4 components come from a0.0 and the second 4 from a0.1.
    */
   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = first vertex's address component * reg_size + base */
      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));

      /* a0.1 = second vertex's component (4 dwords further into a0 src) */
      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));

      /* Pull both vec4s through the indirect access into tmp. */
      brw_MOV(p, tmp, indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return tmp;
}
1190
/**
 * Store a computed value to a relative-addressed destination
 * (dst[a0 + Index]).  Destination indirect addressing can only use a
 * single address subregister (1x1, not VxH), so each of the two
 * interleaved vertices' vec4s is written with its own MOV, with a0.0
 * recomputed in between.
 */
static void
move_to_reladdr_dst(struct brw_vs_compile *c,
		    const struct prog_instruction *inst,
		    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
   /* Byte address of the destination's base register. */
   GLuint byte_offset = base.nr * 32 + base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Because destination register indirect addressing can only use
    * one index, we'll write each vertex's vec4 value separately.
    */
   val.width = BRW_WIDTH_4;
   val.vstride = BRW_VERTICAL_STRIDE_4;

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* First vertex: a0.0 = address.x * reg_size + base byte offset. */
   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   /* Second vertex: address component 4 dwords further in, data in the
    * second half of the destination register.
    */
   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
	   brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
   /* NOTE(review): acc tmp not released here; presumably reclaimed by
    * the caller's release_tmps() — confirm.
    */
}
1225
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 * TODO: relative addressing!
 *
 * First tries to fold simple constants into a hardware immediate (for
 * opcodes that permit it); otherwise maps the Mesa file/index to the
 * statically allocated register, handling const-buffer and
 * relative-addressed cases along the way.
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
      const struct prog_src_register *src = &inst->SrcReg[argIndex];

      /* All-zero and all-one swizzles need no constant slot at all. */
      if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO)) {
	 return brw_imm_f(0.0f);
      } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE)) {
	 if (src->Negate)
	    return brw_imm_f(-1.0F);
	 else
	    return brw_imm_f(1.0F);
      } else if (src->File == PROGRAM_CONSTANT) {
	 const struct gl_program_parameter_list *params;
	 float f;
	 int component = -1;

	 /* Only single-component replicate swizzles can become a scalar
	  * immediate; anything else falls through to the register path.
	  */
	 switch (src->Swizzle) {
	 case SWIZZLE_XXXX:
	    component = 0;
	    break;
	 case SWIZZLE_YYYY:
	    component = 1;
	    break;
	 case SWIZZLE_ZZZZ:
	    component = 2;
	    break;
	 case SWIZZLE_WWWW:
	    component = 3;
	    break;
	 }

	 if (component >= 0) {
	    params = c->vp->program.Base.Parameters;
	    f = params->ParameterValues[src->Index][component];

	    /* Bake Abs/Negate into the immediate value itself. */
	    if (src->Abs)
	       f = fabs(f);
	    if (src->Negate)
	       f = -f;
	    return brw_imm_f(f);
	 }
      }
   }

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
	 /* GRF-resident files use 32-byte (full register) strides. */
	 return deref(c, c->regs[file][0], index, 32);
      }
      else {
	 assert(c->regs[file][index].nr != 0);
	 return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
	 /* Push constants when mapped; otherwise pull from the buffer. */
	 if (!relAddr && c->constant_map[index] != -1) {
	    assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
	    return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
	 } else if (relAddr)
	    return get_reladdr_constant(c, inst, argIndex);
	 else
	    return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
	 /* Packed param file uses 16-byte (half register) strides. */
	 return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
      }
      else {
	 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
	 return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1335
1336 /**
1337 * Return the brw reg for the given instruction's src argument.
1338 * Will return mangled results for SWZ op. The emit_swz() function
1339 * ignores this result and recalculates taking extended swizzles into
1340 * account.
1341 */
1342 static struct brw_reg get_arg( struct brw_vs_compile *c,
1343 const struct prog_instruction *inst,
1344 GLuint argIndex )
1345 {
1346 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1347 struct brw_reg reg;
1348
1349 if (src->File == PROGRAM_UNDEFINED)
1350 return brw_null_reg();
1351
1352 reg = get_src_reg(c, inst, argIndex);
1353
1354 /* Convert 3-bit swizzle to 2-bit.
1355 */
1356 if (reg.file != BRW_IMMEDIATE_VALUE) {
1357 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1358 GET_SWZ(src->Swizzle, 1),
1359 GET_SWZ(src->Swizzle, 2),
1360 GET_SWZ(src->Swizzle, 3));
1361 }
1362
1363 /* Note this is ok for non-swizzle instructions:
1364 */
1365 reg.negate = src->Negate ? 1 : 0;
1366
1367 return reg;
1368 }
1369
1370
1371 /**
1372 * Get brw register for the given program dest register.
1373 */
1374 static struct brw_reg get_dst( struct brw_vs_compile *c,
1375 struct prog_dst_register dst )
1376 {
1377 struct brw_reg reg;
1378
1379 switch (dst.File) {
1380 case PROGRAM_TEMPORARY:
1381 case PROGRAM_OUTPUT:
1382 /* register-indirect addressing is only 1x1, not VxH, for
1383 * destination regs. So, for RelAddr we'll return a temporary
1384 * for the dest and do a move of the result to the RelAddr
1385 * register after the instruction emit.
1386 */
1387 if (dst.RelAddr) {
1388 reg = get_tmp(c);
1389 } else {
1390 assert(c->regs[dst.File][dst.Index].nr != 0);
1391 reg = c->regs[dst.File][dst.Index];
1392 }
1393 break;
1394 case PROGRAM_ADDRESS:
1395 assert(dst.Index == 0);
1396 reg = c->regs[dst.File][dst.Index];
1397 break;
1398 case PROGRAM_UNDEFINED:
1399 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1400 reg = brw_null_reg();
1401 break;
1402 default:
1403 assert(0);
1404 reg = brw_null_reg();
1405 }
1406
1407 assert(reg.type != BRW_IMMEDIATE_VALUE);
1408 reg.dw1.bits.writemask = dst.WriteMask;
1409
1410 return reg;
1411 }
1412
1413
/**
 * Emit OPCODE_SWZ: an extended swizzle whose per-channel sources may be
 * 0.0 or 1.0 in addition to x/y/z/w.  Written channels are grouped into
 * three masks (register-sourced, zero, one) and handled with up to
 * three MOVs, followed by a per-channel negate pass.
 */
static void emit_swz( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* The negate pass reads the destination back, so a non-GRF dst
    * (e.g. an MRF, which can't be read) needs a temporary.
    */
   GLboolean need_tmp = (src.Negate &&
			 dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written channel by its swizzle source. */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
	 GLubyte s = GET_SWZ(src.Swizzle, i);
	 switch (s) {
	 case SWIZZLE_X:
	 case SWIZZLE_Y:
	 case SWIZZLE_Z:
	 case SWIZZLE_W:
	    src_mask |= 1<<i;
	    src_swz[i] = s;
	    break;
	 case SWIZZLE_ZERO:
	    zeros_mask |= 1<<i;
	    break;
	 case SWIZZLE_ONE:
	    ones_mask |= 1<<i;
	    break;
	 }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
			 src_swz[0], src_swz[1],
			 src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is used directly as the writemask here — the NEGATE_X..W
    * bits appear to line up with the XYZW writemask bits (presumably by
    * design of the Mesa IR; confirm against prog_instruction.h).
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1482
1483
1484 /**
1485 * Post-vertex-program processing. Send the results to the URB.
1486 */
1487 static void emit_vertex_write( struct brw_vs_compile *c)
1488 {
1489 struct brw_compile *p = &c->func;
1490 struct brw_context *brw = p->brw;
1491 struct intel_context *intel = &brw->intel;
1492 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1493 struct brw_reg ndc;
1494 int eot;
1495 GLuint len_vertex_header = 2;
1496 int next_mrf, i;
1497
1498 if (c->key.copy_edgeflag) {
1499 brw_MOV(p,
1500 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1501 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1502 }
1503
1504 if (intel->gen < 6) {
1505 /* Build ndc coords */
1506 ndc = get_tmp(c);
1507 /* ndc = 1.0 / pos.w */
1508 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1509 /* ndc.xyz = pos * ndc */
1510 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1511 }
1512
1513 /* Update the header for point size, user clipping flags, and -ve rhw
1514 * workaround.
1515 */
1516 if (intel->gen >= 6) {
1517 struct brw_reg m1 = brw_message_reg(1);
1518
1519 /* On gen6, m1 has each value in a separate dword, so we never
1520 * need to mess with a temporary for computing the m1 value.
1521 */
1522 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1523 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1524 brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
1525 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
1526 }
1527
1528 /* Set the user clip distances in dword 8-15. (m3-4)*/
1529 if (c->key.nr_userclip) {
1530 for (i = 0; i < c->key.nr_userclip; i++) {
1531 struct brw_reg m;
1532 if (i < 4)
1533 m = brw_message_reg(3);
1534 else
1535 m = brw_message_reg(4);
1536
1537 brw_DP4(p, brw_writemask(m, (1 << (i & 7))),pos, c->userplane[i]);
1538 }
1539 }
1540 } else if ((c->prog_data.outputs_written &
1541 BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1542 c->key.nr_userclip || brw->has_negative_rhw_bug) {
1543 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1544 GLuint i;
1545
1546 brw_MOV(p, header1, brw_imm_ud(0));
1547
1548 brw_set_access_mode(p, BRW_ALIGN_16);
1549
1550 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1551 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1552 brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
1553 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1554 brw_AND(p, brw_writemask(header1, WRITEMASK_W),
1555 header1, brw_imm_ud(0x7ff<<8));
1556 }
1557
1558 for (i = 0; i < c->key.nr_userclip; i++) {
1559 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1560 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1561 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1562 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1563 }
1564
1565 /* i965 clipping workaround:
1566 * 1) Test for -ve rhw
1567 * 2) If set,
1568 * set ndc = (0,0,0,0)
1569 * set ucp[6] = 1
1570 *
1571 * Later, clipping will detect ucp[6] and ensure the primitive is
1572 * clipped against all fixed planes.
1573 */
1574 if (brw->has_negative_rhw_bug) {
1575 brw_CMP(p,
1576 vec8(brw_null_reg()),
1577 BRW_CONDITIONAL_L,
1578 brw_swizzle1(ndc, 3),
1579 brw_imm_f(0));
1580
1581 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1582 brw_MOV(p, ndc, brw_imm_f(0));
1583 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1584 }
1585
1586 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1587 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1588 brw_set_access_mode(p, BRW_ALIGN_16);
1589
1590 release_tmp(c, header1);
1591 }
1592 else {
1593 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1594 }
1595
1596 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1597 * of zeros followed by two sets of NDC coordinates:
1598 */
1599 brw_set_access_mode(p, BRW_ALIGN_1);
1600 brw_set_acc_write_control(p, 0);
1601
1602 /* The VUE layout is documented in Volume 2a. */
1603 if (intel->gen >= 6) {
1604 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1605 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1606 * dword 4-7 (m2) is the 4D space position
1607 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1608 * enabled.
1609 * m3 or 5 is the first vertex element data we fill, which is
1610 * the vertex position.
1611 */
1612 brw_MOV(p, brw_message_reg(2), pos);
1613 len_vertex_header = 1;
1614 if (c->key.nr_userclip > 0)
1615 len_vertex_header += 2;
1616 } else if (intel->gen == 5) {
1617 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1618 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1619 * dword 4-7 (m2) is the ndc position (set above)
1620 * dword 8-11 (m3) of the vertex header is the 4D space position
1621 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1622 * m6 is a pad so that the vertex element data is aligned
1623 * m7 is the first vertex data we fill, which is the vertex position.
1624 */
1625 brw_MOV(p, brw_message_reg(2), ndc);
1626 brw_MOV(p, brw_message_reg(3), pos);
1627 brw_MOV(p, brw_message_reg(7), pos);
1628 len_vertex_header = 6;
1629 } else {
1630 /* There are 8 dwords in VUE header pre-Ironlake:
1631 * dword 0-3 (m1) is indices, point width, clip flags.
1632 * dword 4-7 (m2) is ndc position (set above)
1633 *
1634 * dword 8-11 (m3) is the first vertex data, which we always have be the
1635 * vertex position.
1636 */
1637 brw_MOV(p, brw_message_reg(2), ndc);
1638 brw_MOV(p, brw_message_reg(3), pos);
1639 len_vertex_header = 2;
1640 }
1641
1642 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1643 next_mrf = 2 + len_vertex_header;
1644 for (i = 0; i < VERT_RESULT_MAX; i++) {
1645 if (c->first_overflow_output > 0 && i >= c->first_overflow_output)
1646 break;
1647 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
1648 continue;
1649 if (i == VERT_RESULT_PSIZ)
1650 continue;
1651
1652 if (i >= VERT_RESULT_TEX0 &&
1653 c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
1654 brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]);
1655 next_mrf++;
1656 } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) {
1657 next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1;
1658 }
1659 }
1660
1661 eot = (c->first_overflow_output == 0);
1662
1663 brw_urb_WRITE(p,
1664 brw_null_reg(), /* dest */
1665 0, /* starting mrf reg nr */
1666 c->r0, /* src */
1667 0, /* allocate */
1668 1, /* used */
1669 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
1670 0, /* response len */
1671 eot, /* eot */
1672 eot, /* writes complete */
1673 0, /* urb destination offset */
1674 BRW_URB_SWIZZLE_INTERLEAVE);
1675
1676 if (c->first_overflow_output > 0) {
1677 /* Not all of the vertex outputs/results fit into the MRF.
1678 * Move the overflowed attributes from the GRF to the MRF and
1679 * issue another brw_urb_WRITE().
1680 */
1681 GLuint i, mrf = 1;
1682 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1683 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
1684 /* move from GRF to MRF */
1685 brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
1686 mrf++;
1687 }
1688 }
1689
1690 brw_urb_WRITE(p,
1691 brw_null_reg(), /* dest */
1692 0, /* starting mrf reg nr */
1693 c->r0, /* src */
1694 0, /* allocate */
1695 1, /* used */
1696 mrf, /* msg len */
1697 0, /* response len */
1698 1, /* eot */
1699 1, /* writes complete */
1700 14 / 2, /* urb destination offset */
1701 BRW_URB_SWIZZLE_INTERLEAVE);
1702 }
1703 }
1704
1705 static GLboolean
1706 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1707 {
1708 struct brw_compile *p = &c->func;
1709 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1710
1711 if (p->nr_insn == 0)
1712 return GL_FALSE;
1713
1714 if (val.address_mode != BRW_ADDRESS_DIRECT)
1715 return GL_FALSE;
1716
1717 switch (prev_insn->header.opcode) {
1718 case BRW_OPCODE_MOV:
1719 case BRW_OPCODE_MAC:
1720 case BRW_OPCODE_MUL:
1721 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1722 prev_insn->header.execution_size == val.width &&
1723 prev_insn->bits1.da1.dest_reg_file == val.file &&
1724 prev_insn->bits1.da1.dest_reg_type == val.type &&
1725 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1726 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1727 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1728 prev_insn->bits1.da16.dest_writemask == 0xf)
1729 return GL_TRUE;
1730 else
1731 return GL_FALSE;
1732 default:
1733 return GL_FALSE;
1734 }
1735 }
1736
1737 static uint32_t
1738 get_predicate(const struct prog_instruction *inst)
1739 {
1740 if (inst->DstReg.CondMask == COND_TR)
1741 return BRW_PREDICATE_NONE;
1742
1743 /* All of GLSL only produces predicates for COND_NE and one channel per
1744 * vector. Fail badly if someone starts doing something else, as it might
1745 * mean infinite looping or something.
1746 *
1747 * We'd like to support all the condition codes, but our hardware doesn't
1748 * quite match the Mesa IR, which is modeled after the NV extensions. For
1749 * those, the instruction may update the condition codes or not, then any
1750 * later instruction may use one of those condition codes. For gen4, the
1751 * instruction may update the flags register based on one of the condition
1752 * codes output by the instruction, and then further instructions may
1753 * predicate on that. We can probably support this, but it won't
1754 * necessarily be easy.
1755 */
1756 assert(inst->DstReg.CondMask == COND_NE);
1757
1758 switch (inst->DstReg.CondSwizzle) {
1759 case SWIZZLE_XXXX:
1760 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1761 case SWIZZLE_YYYY:
1762 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1763 case SWIZZLE_ZZZZ:
1764 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1765 case SWIZZLE_WWWW:
1766 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1767 default:
1768 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1769 inst->DstReg.CondMask);
1770 return BRW_PREDICATE_NORMAL;
1771 }
1772 }
1773
/* Emit the vertex program instructions here.
 *
 * Two passes over the Mesa IR: the first marks outputs that are read
 * back as sources and notes whether a call stack is needed; after
 * static register allocation, the second translates each instruction
 * into native code, then patches predication/saturation and handles
 * relative-addressed destinations.
 */
void brw_vs_emit(struct brw_vs_compile *c )
{
#define MAX_IF_DEPTH 32
#define MAX_LOOP_DEPTH 32
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
   GLuint insn, if_depth = 0, loop_depth = 0;
   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
   /* Tracks how many IFs are open inside each loop level, for the
    * extra jump counts BREAK/CONT need.
    */
   int if_depth_in_loop[MAX_LOOP_DEPTH];
   const struct brw_indirect stack_index = brw_indirect(0, 0);
   GLuint index;
   GLuint file;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("vs-mesa:\n");
      _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
			       GL_TRUE);
      printf("\n");
   }

   /* Default instruction state for the whole shader. */
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_access_mode(p, BRW_ALIGN_16);
   if_depth_in_loop[loop_depth] = 0;

   brw_set_acc_write_control(p, 1);

   /* Pass 1: scan the IR for outputs read as sources and for CAL/RET. */
   for (insn = 0; insn < nr_insns; insn++) {
       GLuint i;
       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];

       /* Message registers can't be read, so copy the output into GRF
	* register if they are used in source registers
	*/
       for (i = 0; i < 3; i++) {
	   struct prog_src_register *src = &inst->SrcReg[i];
	   GLuint index = src->Index;
	   GLuint file = src->File;
	   if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
	       c->output_regs[index].used_in_src = GL_TRUE;
       }

       switch (inst->Opcode) {
       case OPCODE_CAL:
       case OPCODE_RET:
	   c->needs_stack = GL_TRUE;
	   break;
       default:
	   break;
       }
   }

   /* Static register allocation
    */
   brw_vs_alloc_regs(c);

   if (c->needs_stack)
      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   /* Pass 2: translate each Mesa instruction into native code. */
   for (insn = 0; insn < nr_insns; insn++) {

      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
      struct brw_reg args[3], dst;
      GLuint i;

#if 0
      printf("%d: ", insn);
      _mesa_print_instruction(inst);
#endif

      /* Get argument regs.  SWZ is special and does this itself.
       */
      if (inst->Opcode != OPCODE_SWZ)
	  for (i = 0; i < 3; i++) {
	      const struct prog_src_register *src = &inst->SrcReg[i];
	      index = src->Index;
	      file = src->File;
	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
		  args[i] = c->output_regs[index].reg;
	      else
                  args[i] = get_arg(c, inst, i);
	  }

      /* Get dest regs.  Note that it is possible for a reg to be both
       * dst and arg, given the static allocation of registers.  So
       * care needs to be taken emitting multi-operation instructions.
       */
      index = inst->DstReg.Index;
      file = inst->DstReg.File;
      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
	  dst = c->output_regs[index].reg;
      else
	  dst = get_dst(c, inst->DstReg);

      if (inst->SaturateMode != SATURATE_OFF) {
	 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
                       inst->SaturateMode);
      }

      switch (inst->Opcode) {
      case OPCODE_ABS:
	 brw_MOV(p, dst, brw_abs(args[0]));
	 break;
      case OPCODE_ADD:
	 brw_ADD(p, dst, args[0], args[1]);
	 break;
      case OPCODE_COS:
	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_DP2:
	 brw_DP2(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP3:
	 brw_DP3(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP4:
	 brw_DP4(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DPH:
	 brw_DPH(p, dst, args[0], args[1]);
	 break;
      case OPCODE_NRM3:
	 emit_nrm(c, dst, args[0], 3);
	 break;
      case OPCODE_NRM4:
	 emit_nrm(c, dst, args[0], 4);
	 break;
      case OPCODE_DST:
	 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
	 break;
      case OPCODE_EXP:
	 unalias1(c, dst, args[0], emit_exp_noalias);
	 break;
      case OPCODE_EX2:
	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_ARL:
	 /* ARL and FLR both round toward -inf (RNDD). */
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FLR:
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FRC:
	 brw_FRC(p, dst, args[0]);
	 break;
      case OPCODE_LOG:
	 unalias1(c, dst, args[0], emit_log_noalias);
	 break;
      case OPCODE_LG2:
	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_LIT:
	 unalias1(c, dst, args[0], emit_lit_noalias);
	 break;
      case OPCODE_LRP:
	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
	 break;
      case OPCODE_MAD:
	 /* MAD uses the accumulator: load args[2] unless the previous
	  * instruction already left it there.
	  */
	 if (!accumulator_contains(c, args[2]))
	    brw_MOV(p, brw_acc_reg(), args[2]);
	 brw_MAC(p, dst, args[0], args[1]);
	 break;
      case OPCODE_CMP:
	 emit_cmp(p, dst, args[0], args[1], args[2]);
	 break;
      case OPCODE_MAX:
	 emit_max(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MIN:
	 emit_min(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MOV:
	 brw_MOV(p, dst, args[0]);
	 break;
      case OPCODE_MUL:
	 brw_MUL(p, dst, args[0], args[1]);
	 break;
      case OPCODE_POW:
	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RCP:
	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RSQ:
	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;

      case OPCODE_SEQ:
	 unalias2(c, dst, args[0], args[1], emit_seq);
	 break;
      case OPCODE_SIN:
	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_SNE:
	 unalias2(c, dst, args[0], args[1], emit_sne);
	 break;
      case OPCODE_SGE:
	 unalias2(c, dst, args[0], args[1], emit_sge);
	 break;
      case OPCODE_SGT:
	 unalias2(c, dst, args[0], args[1], emit_sgt);
	 break;
      case OPCODE_SLT:
	 unalias2(c, dst, args[0], args[1], emit_slt);
	 break;
      case OPCODE_SLE:
	 unalias2(c, dst, args[0], args[1], emit_sle);
	 break;
      case OPCODE_SSG:
	 unalias1(c, dst, args[0], emit_sign);
	 break;
      case OPCODE_SUB:
	 brw_ADD(p, dst, args[0], negate(args[1]));
	 break;
      case OPCODE_SWZ:
	 /* The args[0] value can't be used here as it won't have
	  * correctly encoded the full swizzle:
	  */
	 emit_swz(c, dst, inst);
	 break;
      case OPCODE_TRUNC:
         /* round toward zero */
	 brw_RNDZ(p, dst, args[0]);
	 break;
      case OPCODE_XPD:
	 emit_xpd(p, dst, args[0], args[1]);
	 break;
      case OPCODE_IF:
	 assert(if_depth < MAX_IF_DEPTH);
	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
	 /* Note that brw_IF smashes the predicate_control field. */
	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
	 if_depth_in_loop[loop_depth]++;
	 if_depth++;
	 break;
      case OPCODE_ELSE:
	 clear_current_const(c);
	 assert(if_depth > 0);
	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
	 break;
      case OPCODE_ENDIF:
	 clear_current_const(c);
         assert(if_depth > 0);
	 brw_ENDIF(p, if_inst[--if_depth]);
	 if_depth_in_loop[loop_depth]--;
	 break;
      case OPCODE_BGNLOOP:
	 clear_current_const(c);
         loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
	 if_depth_in_loop[loop_depth] = 0;
         break;
      case OPCODE_BRK:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_BREAK(p, if_depth_in_loop[loop_depth]);
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CONT:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_CONT(p, if_depth_in_loop[loop_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_ENDLOOP:
         {
	    clear_current_const(c);
            struct brw_instruction *inst0, *inst1;
	    GLuint br = 1;

            loop_depth--;

	    /* Ironlake jump counts are in units of half an instruction. */
	    if (intel->gen == 5)
	       br = 2;

            inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
            /* patch all the BREAK/CONT instructions from last BEGINLOOP */
            while (inst0 > loop_inst[loop_depth]) {
	       inst0--;
	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
		   inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
               }
	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
			inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
               }
            }
         }
         break;
      case OPCODE_BRA:
	 brw_set_predicate_control(p, get_predicate(inst));
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CAL:
	 /* Push the return IP onto the software stack, bump the stack
	  * pointer, then jump; the actual target is resolved later by
	  * brw_resolve_cals().
	  */
	 brw_set_access_mode(p, BRW_ALIGN_1);
	 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 brw_ADD(p, get_addr_reg(stack_index),
			 get_addr_reg(stack_index), brw_imm_d(4));
	 brw_save_call(p, inst->Comment, p->nr_insn);
	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         break;
      case OPCODE_RET:
	 /* Pop the return IP off the software stack and jump to it. */
	 brw_ADD(p, get_addr_reg(stack_index),
		 get_addr_reg(stack_index), brw_imm_d(-4));
	 brw_set_access_mode(p, BRW_ALIGN_1);
         brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 break;
      case OPCODE_END:
	 emit_vertex_write(c);
         break;
      case OPCODE_PRINT:
         /* no-op */
         break;
      case OPCODE_BGNSUB:
	 brw_save_label(p, inst->Comment, p->nr_insn);
         break;
      case OPCODE_ENDSUB:
         /* no-op */
         break;
      default:
	 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
				    _mesa_opcode_string(inst->Opcode) :
				    "unknown");
      }

      /* Set the predication update on the last instruction of the native
       * instruction sequence.
       *
       * This would be problematic if it was set on a math instruction,
       * but that shouldn't be the case with the current GLSL compiler.
       */
      if (inst->CondUpdate) {
	 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];

	 assert(hw_insn->header.destreg__conditionalmod == 0);
	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
      }

      /* Outputs read as sources were computed in a GRF shadow; copy the
       * result out to the real (MRF) output register now.
       */
      if ((inst->DstReg.File == PROGRAM_OUTPUT)
          && (inst->DstReg.Index != VERT_RESULT_HPOS)
          && c->output_regs[inst->DstReg.Index].used_in_src) {
         brw_MOV(p, get_dst(c, inst->DstReg), dst);
      }

      /* Result color clamping.
       *
       * When destination register is an output register and
       * it's primary/secondary front/back color, we have to clamp
       * the result to [0,1]. This is done by enabling the
       * saturation bit for the last instruction.
       *
       * We don't use brw_set_saturate() as it modifies
       * p->current->header.saturate, which affects all the subsequent
       * instructions. Instead, we directly modify the header
       * of the last (already stored) instruction.
       */
      if (inst->DstReg.File == PROGRAM_OUTPUT) {
         if ((inst->DstReg.Index == VERT_RESULT_COL0)
             || (inst->DstReg.Index == VERT_RESULT_COL1)
             || (inst->DstReg.Index == VERT_RESULT_BFC0)
             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
            p->store[p->nr_insn-1].header.saturate = 1;
         }
      }

      if (inst->DstReg.RelAddr) {
	 assert(inst->DstReg.File == PROGRAM_TEMPORARY||
		inst->DstReg.File == PROGRAM_OUTPUT);
	 move_to_reladdr_dst(c, inst, dst);
      }

      release_tmps(c);
   }

   brw_resolve_cals(p);

   brw_optimize(p);

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      int i;

      printf("vs-native:\n");
      for (i = 0; i < p->nr_insn; i++)
	 brw_disasm(stdout, &p->store[i], intel->gen);
      printf("\n");
   }
}