i965: Use the new embedded compare in SEL on gen6 for VS MIN and MAX opcodes.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
96 static void release_tmps( struct brw_vs_compile *c )
97 {
98 c->last_tmp = c->first_tmp;
99 }
100
101 static int
102 get_first_reladdr_output(struct gl_vertex_program *vp)
103 {
104 int i;
105 int first_reladdr_output = VERT_RESULT_MAX;
106
107 for (i = 0; i < vp->Base.NumInstructions; i++) {
108 struct prog_instruction *inst = vp->Base.Instructions + i;
109
110 if (inst->DstReg.File == PROGRAM_OUTPUT &&
111 inst->DstReg.RelAddr &&
112 inst->DstReg.Index < first_reladdr_output)
113 first_reladdr_output = inst->DstReg.Index;
114 }
115
116 return first_reladdr_output;
117 }
118
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
122 */
123 static void
124 clear_current_const(struct brw_vs_compile *c)
125 {
126 unsigned int i;
127
128 if (c->vp->use_const_buffer) {
129 for (i = 0; i < 3; i++) {
130 c->current_const[i].index = -1;
131 }
132 }
133 }
134
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Allocation order: r0 header, curbe data (user clip planes then
 * constants), vertex inputs, output regs that don't go straight to
 * MRFs, program temporaries, address reg(s), pull-constant staging
 * regs, output shadow regs, stack, then the scratch-temp area starting
 * at first_tmp.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;
   int first_reladdr_output;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      if (intel->gen >= 6) {
	 /* Gen6: clip planes are packed two vec4s per GRF at the start
	  * of the curbe.
	  */
	 for (i = 0; i < c->key.nr_userclip; i++) {
	    c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
						  (i % 2) * 4), 0, 4, 1);
	 }
	 reg += ALIGN(c->key.nr_userclip, 2) / 2;
      } else {
	 /* Pre-gen6: planes start 6 vec4 slots in -- NOTE(review): the
	  * (6 + i) offset presumably skips fixed curbe contents; confirm
	  * against the curbe upload code.
	  */
	 for (i = 0; i < c->key.nr_userclip; i++) {
	    c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
						  (i % 2) * 4), 0, 4, 1);
	 }
	 reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
      }

   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
      int constant = 0;

      /* We've got more constants than we can load with the push
       * mechanism.  This is often correlated with reladdr loads where
       * we should probably be using a pull mechanism anyway to avoid
       * excessive reading.  However, the pull mechanism is slow in
       * general.  So, we try to allocate as many non-reladdr-loaded
       * constants through the push buffer as we can before giving up.
       */
      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
      for (i = 0;
	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
	   i++) {
	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
	 int arg;

	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
	    /* Only non-reladdr reads from constant-like files get a
	     * push slot; everything else stays on the pull path.
	     */
	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
		inst->SrcReg[arg].RelAddr)
	       continue;

	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
	    }
	 }
      }

      /* Two vec4 constants per GRF, in the low/high halves. */
      for (i = 0; i < constant; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
							      (i%2) * 4),
						 0, 4, 1);
      }
      reg += (constant + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      /* XXX 0 causes a bug elsewhere... */
      c->prog_data.nr_params = MAX2(constant * 4, 4);
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
	 c->nr_inputs++;
	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF available for compute-to-MRF outputs; the earlier MRFs
    * are reserved per generation (URB write header etc.).
    */
   if (intel->gen >= 6) {
      mrf = 3;
      if (c->key.nr_userclip)
	 mrf += 2;
   } else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   first_reladdr_output = get_first_reladdr_output(&c->vp->program);
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	 c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
	 if (i == VERT_RESULT_HPOS) {
	    /* Position lands in a GRF rather than an MRF (it is read
	     * again later, e.g. for the header/clip handling).
	     */
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else if (i == VERT_RESULT_PSIZ) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
	 }
	 else {
	    /* Two restrictions on our compute-to-MRF here.  The
	     * message length for all SEND messages is restricted to
	     * [1,15], so we can't use mrf 15, as that means a length
	     * of 16.
	     *
	     * Additionally, URB writes are aligned to URB rows, so we
	     * need to put an even number of registers of URB data in
	     * each URB write so that the later write is aligned.  A
	     * message length of 15 means 1 message header reg plus 14
	     * regs of URB data.
	     *
	     * For attributes beyond the compute-to-MRF, we compute to
	     * GRFs and they will be written in the second URB_WRITE.
	     */
	    if (first_reladdr_output > i && mrf < 15) {
	       c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
	       mrf++;
	    }
	    else {
	       /* Overflow (or reladdr-reachable) outputs go to GRFs;
		* remember where the overflow started.
		*/
	       if (mrf >= 15 && !c->first_overflow_output)
		  c->first_overflow_output = i;
	       c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	       reg++;
	       mrf++;
	    }
	 }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
					    reg,
					    0,
					    BRW_REGISTER_TYPE_D,
					    BRW_VERTICAL_STRIDE_8,
					    BRW_WIDTH_8,
					    BRW_HORIZONTAL_STRIDE_1,
					    BRW_SWIZZLE_XXXX,
					    WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* One pull-constant staging reg per source operand slot. */
      for (i = 0; i < 3; i++) {
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
      clear_current_const(c);
   }

   /* Shadow regs for outputs that are also read as sources
    * (used_in_src), since MRFs can't be read back.
    */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   if (c->needs_stack) {
      /* Two GRFs of 16-bit entries for the subroutine return stack. */
      c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg; /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 6) {
      int header_regs = 2;
      if (c->key.nr_userclip)
	 header_regs += 2;

      c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 7) / 8;
   } else if (intel->gen == 5)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
403
404
405 /**
406 * If an instruction uses a temp reg both as a src and the dest, we
407 * sometimes need to allocate an intermediate temporary.
408 */
409 static void unalias1( struct brw_vs_compile *c,
410 struct brw_reg dst,
411 struct brw_reg arg0,
412 void (*func)( struct brw_vs_compile *,
413 struct brw_reg,
414 struct brw_reg ))
415 {
416 if (dst.file == arg0.file && dst.nr == arg0.nr) {
417 struct brw_compile *p = &c->func;
418 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
419 func(c, tmp, arg0);
420 brw_MOV(p, dst, tmp);
421 release_tmp(c, tmp);
422 }
423 else {
424 func(c, dst, arg0);
425 }
426 }
427
428 /**
429 * \sa unalias2
430 * Checkes if 2-operand instruction needs an intermediate temporary.
431 */
432 static void unalias2( struct brw_vs_compile *c,
433 struct brw_reg dst,
434 struct brw_reg arg0,
435 struct brw_reg arg1,
436 void (*func)( struct brw_vs_compile *,
437 struct brw_reg,
438 struct brw_reg,
439 struct brw_reg ))
440 {
441 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
442 (dst.file == arg1.file && dst.nr == arg1.nr)) {
443 struct brw_compile *p = &c->func;
444 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
445 func(c, tmp, arg0, arg1);
446 brw_MOV(p, dst, tmp);
447 release_tmp(c, tmp);
448 }
449 else {
450 func(c, dst, arg0, arg1);
451 }
452 }
453
454 /**
455 * \sa unalias2
456 * Checkes if 3-operand instruction needs an intermediate temporary.
457 */
458 static void unalias3( struct brw_vs_compile *c,
459 struct brw_reg dst,
460 struct brw_reg arg0,
461 struct brw_reg arg1,
462 struct brw_reg arg2,
463 void (*func)( struct brw_vs_compile *,
464 struct brw_reg,
465 struct brw_reg,
466 struct brw_reg,
467 struct brw_reg ))
468 {
469 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
470 (dst.file == arg1.file && dst.nr == arg1.nr) ||
471 (dst.file == arg2.file && dst.nr == arg2.nr)) {
472 struct brw_compile *p = &c->func;
473 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
474 func(c, tmp, arg0, arg1, arg2);
475 brw_MOV(p, dst, tmp);
476 release_tmp(c, tmp);
477 }
478 else {
479 func(c, dst, arg0, arg1, arg2);
480 }
481 }
482
/* Per-channel "set on condition": dst = cond(arg0, arg1) ? 1.0 : 0.0.
 * Shared backend for the SEQ/SNE/SLT/SLE/SGT/SGE wrappers below.
 * Instruction order is load-bearing: the second MOV is predicated on
 * the CMP result.
 */
static void emit_sop( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      GLuint cond)
{
   /* Default every channel to 0.0. */
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   /* CMP to the null reg just updates the flag register.
    * NOTE(review): relies on brw_CMP arming predication for the
    * following instruction -- confirm in brw_eu_emit.c.
    */
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   /* Predicated: writes 1.0 only in channels where the compare passed. */
   brw_MOV(p, dst, brw_imm_f(1.0f));
   /* Restore unconditional execution for subsequent instructions. */
   brw_set_predicate_control_flag_value(p, 0xff);
}
496
/* SEQ: per-channel dst = (arg0 == arg1) ? 1.0 : 0.0 */
static void emit_seq( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}

/* SNE: per-channel dst = (arg0 != arg1) ? 1.0 : 0.0 */
static void emit_sne( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/* SLT: per-channel dst = (arg0 < arg1) ? 1.0 : 0.0 */
static void emit_slt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}

/* SLE: per-channel dst = (arg0 <= arg1) ? 1.0 : 0.0 */
static void emit_sle( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}

/* SGT: per-channel dst = (arg0 > arg1) ? 1.0 : 0.0 */
static void emit_sgt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}

/* SGE: per-channel dst = (arg0 >= arg1) ? 1.0 : 0.0 */
static void emit_sge( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
  emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
543
/* CMP: per-channel dst = (arg0 < 0) ? arg1 : arg2. */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   /* Set the flag reg per channel where arg0 < 0 (null dest: flag only). */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   /* Predicated SEL: arg1 where the flag is set, otherwise arg2. */
   brw_SEL(p, dst, arg1, arg2);
   /* Back to unpredicated execution for later instructions. */
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
554
/* Per-channel sign: dst = -1.0, 0.0 or 1.0 depending on arg0's sign. */
static void emit_sign(struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   /* Default all channels to 0.0 (the arg0 == 0 case). */
   brw_MOV(p, dst, brw_imm_f(0));

   /* Channels where arg0 < 0 get -1.0 via a predicated MOV. */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   /* Channels where arg0 > 0 get 1.0. */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
571
/* MAX: per-channel dst = max(arg0, arg1). */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      /* Gen6+ SEL supports an embedded compare via the conditional
       * modifier, so a single SEL.ge does the whole job.
       */
      brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
      brw_SEL(p, dst, arg0, arg1);
      /* Reset instruction state so later emits aren't affected. */
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      /* Pre-gen6: flag-setting CMP followed by a predicated SEL. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
590
/* MIN: per-channel dst = min(arg0, arg1). */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      /* Gen6+ SEL with an embedded less-than compare (SEL.l) is min
       * in one instruction.
       */
      brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
      brw_SEL(p, dst, arg0, arg1);
      /* Reset instruction state so later emits aren't affected. */
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      /* Pre-gen6: flag-setting CMP followed by a predicated SEL. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
609
610 static void emit_math1_gen4(struct brw_vs_compile *c,
611 GLuint function,
612 struct brw_reg dst,
613 struct brw_reg arg0,
614 GLuint precision)
615 {
616 /* There are various odd behaviours with SEND on the simulator. In
617 * addition there are documented issues with the fact that the GEN4
618 * processor doesn't do dependency control properly on SEND
619 * results. So, on balance, this kludge to get around failures
620 * with writemasked math results looks like it might be necessary
621 * whether that turns out to be a simulator bug or not:
622 */
623 struct brw_compile *p = &c->func;
624 struct brw_reg tmp = dst;
625 GLboolean need_tmp = GL_FALSE;
626
627 if (dst.file != BRW_GENERAL_REGISTER_FILE ||
628 dst.dw1.bits.writemask != 0xf)
629 need_tmp = GL_TRUE;
630
631 if (need_tmp)
632 tmp = get_tmp(c);
633
634 brw_math(p,
635 tmp,
636 function,
637 BRW_MATH_SATURATE_NONE,
638 2,
639 arg0,
640 BRW_MATH_DATA_SCALAR,
641 precision);
642
643 if (need_tmp) {
644 brw_MOV(p, dst, tmp);
645 release_tmp(c, tmp);
646 }
647 }
648
649 static void
650 emit_math1_gen6(struct brw_vs_compile *c,
651 GLuint function,
652 struct brw_reg dst,
653 struct brw_reg arg0,
654 GLuint precision)
655 {
656 struct brw_compile *p = &c->func;
657 struct brw_reg tmp_src, tmp_dst;
658
659 /* Something is strange on gen6 math in 16-wide mode, though the
660 * docs say it's supposed to work. Punt to using align1 mode,
661 * which doesn't do writemasking and swizzles.
662 */
663 tmp_src = get_tmp(c);
664 tmp_dst = get_tmp(c);
665
666 brw_MOV(p, tmp_src, arg0);
667
668 brw_set_access_mode(p, BRW_ALIGN_1);
669 brw_math(p,
670 tmp_dst,
671 function,
672 BRW_MATH_SATURATE_NONE,
673 2,
674 tmp_src,
675 BRW_MATH_DATA_SCALAR,
676 precision);
677 brw_set_access_mode(p, BRW_ALIGN_16);
678
679 brw_MOV(p, dst, tmp_dst);
680
681 release_tmp(c, tmp_src);
682 release_tmp(c, tmp_dst);
683 }
684
685 static void
686 emit_math1(struct brw_vs_compile *c,
687 GLuint function,
688 struct brw_reg dst,
689 struct brw_reg arg0,
690 GLuint precision)
691 {
692 struct brw_compile *p = &c->func;
693 struct intel_context *intel = &p->brw->intel;
694
695 if (intel->gen >= 6)
696 emit_math1_gen6(c, function, dst, arg0, precision);
697 else
698 emit_math1_gen4(c, function, dst, arg0, precision);
699 }
700
701 static void emit_math2( struct brw_vs_compile *c,
702 GLuint function,
703 struct brw_reg dst,
704 struct brw_reg arg0,
705 struct brw_reg arg1,
706 GLuint precision)
707 {
708 struct brw_compile *p = &c->func;
709 struct intel_context *intel = &p->brw->intel;
710 struct brw_reg tmp = dst;
711 GLboolean need_tmp = GL_FALSE;
712
713 if (dst.file != BRW_GENERAL_REGISTER_FILE)
714 need_tmp = GL_TRUE;
715
716 if (intel->gen < 6 && dst.dw1.bits.writemask != 0xf)
717 need_tmp = GL_TRUE;
718
719 if (need_tmp)
720 tmp = get_tmp(c);
721
722 brw_MOV(p, brw_message_reg(3), arg1);
723
724 brw_math(p,
725 tmp,
726 function,
727 BRW_MATH_SATURATE_NONE,
728 2,
729 arg0,
730 BRW_MATH_DATA_SCALAR,
731 precision);
732
733 if (need_tmp) {
734 brw_MOV(p, dst, tmp);
735 release_tmp(c, tmp);
736 }
737 }
738
739
/* EXP: dst = (2^floor(x), x - floor(x), 2^x, 1.0) with x = arg0.x,
 * honoring dst's writemask.  Caller must guarantee dst does not alias
 * arg0 (presumably routed through unalias1 -- dispatch is outside this
 * chunk).
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
795
796
/* LOG: dst = (exponent, mantissa, log2(|x|), 1.0) with x = arg0.x,
 * honoring dst's writemask.  The sign bit is masked off, so the result
 * is based on |x|.  Caller must guarantee dst does not alias arg0.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* Staging is needed when dst is partially masked or not a GRF,
    * because tmp is read back as a source below.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* X = unbiased exponent: strip sign, shift the exponent field
       * down, subtract the 127 bias.
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Y = mantissa as a float in [1, 2): keep the mantissa bits and
       * force a biased-zero (127) exponent.
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
886
887
888 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
889 */
890 static void emit_dst_noalias( struct brw_vs_compile *c,
891 struct brw_reg dst,
892 struct brw_reg arg0,
893 struct brw_reg arg1)
894 {
895 struct brw_compile *p = &c->func;
896
897 /* There must be a better way to do this:
898 */
899 if (dst.dw1.bits.writemask & WRITEMASK_X)
900 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
901 if (dst.dw1.bits.writemask & WRITEMASK_Y)
902 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
903 if (dst.dw1.bits.writemask & WRITEMASK_Z)
904 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
905 if (dst.dw1.bits.writemask & WRITEMASK_W)
906 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
907 }
908
909
/* XPD (cross product): dst = t x u, computed as
 * t.yzx * u.zxy - t.zxy * u.yzx per channel.
 * NOTE(review): the null-dest MUL appears to exist to load the implicit
 * accumulator for the MAC that follows -- instruction order is
 * load-bearing; confirm accumulator semantics in the EU docs.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
918
919
/* LIT (lighting coefficients), approximately:
 *   dst = (1, arg0.x, arg0.y ^ arg0.w, 1) when arg0.x > 0,
 *   dst = (1, 0, 0, 1) otherwise,
 * with arg0.y staged through a predicated MOV so non-positive values
 * leave the previously-initialized 0 in place.  Caller must guarantee
 * dst does not alias arg0.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   /* POW (emit_math2) reads tmp back, so a non-GRF dst needs staging. */
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* Start with the arg0.x <= 0 answer: (1, 0, 0, 1). */
   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* Stage arg0.y into tmp.z, but only in channels where it is > 0
       * (predicated MOV), then reset predication.
       */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      /* dst.z = tmp.z ^ arg0.w */
      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   /* NOTE(review): when !need_tmp, tmp aliases dst here; this release is
    * then expected to be a no-op since release_tmp only frees the most
    * recently allocated scratch reg -- confirm dst.nr can't collide with
    * last_tmp - 1.
    */
   release_tmp(c, tmp);
}
961
/* LRP: dst = arg0 * arg1 + (1 - arg0) * arg2.
 * Caller must guarantee dst aliases none of the sources (dst is used as
 * an intermediate below).
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   /* dst = 1 - arg0 */
   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   /* NOTE(review): the null-dest MUL appears to load the implicit
    * accumulator with (1 - arg0) * arg2 for the MAC below; statement
    * order is load-bearing -- confirm accumulator behavior in EU docs.
    */
   brw_MUL(p, brw_null_reg(), dst, arg2);
   /* dst = acc + arg0 * arg1 */
   brw_MAC(p, dst, arg0, arg1);
}
974
975 /** 3 or 4-component vector normalization */
976 static void emit_nrm( struct brw_vs_compile *c,
977 struct brw_reg dst,
978 struct brw_reg arg0,
979 int num_comps)
980 {
981 struct brw_compile *p = &c->func;
982 struct brw_reg tmp = get_tmp(c);
983
984 /* tmp = dot(arg0, arg0) */
985 if (num_comps == 3)
986 brw_DP3(p, tmp, arg0, arg0);
987 else
988 brw_DP4(p, tmp, arg0, arg0);
989
990 /* tmp = 1 / sqrt(tmp) */
991 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
992
993 /* dst = arg0 * tmp */
994 brw_MUL(p, dst, arg0, tmp);
995
996 release_tmp(c, tmp);
997 }
998
999
/* Pull-constant fetch for source operand `argIndex` of `inst`: loads
 * the constant at src->Index into that operand's staging register via
 * a data-port read, skipping the read when the slot already holds the
 * wanted constant.  Returns a reg view with the four floats replicated
 * to both vertex halves (XYZWXYZW).
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   assert(c->func.brw->intel.gen < 6); /* FINISHME */

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                        /* writeback dest */
                       16 * src->Index,                  /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER      /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
1035
/* Pull-constant fetch for a relative-addressed source operand:
 * address = a0 * 16 + 16 * src->Index bytes into the constant buffer.
 * Always re-reads, since the address register contents aren't tracked.
 * NOTE(review): byte_addr_reg comes from get_tmp() and is not released
 * here -- presumably reclaimed by a later release_tmps(); confirm.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);

   assert(argIndex < 3);

   assert(c->func.brw->intel.gen < 6); /* FINISHME */

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

#if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   /* Scale the vec4 address-reg index to a byte offset. */
   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
			     const_reg,                     /* writeback dest */
			     byte_addr_reg,                 /* address register */
			     16 * src->Index,               /* byte offset */
			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
			     );

   return const_reg;
}
1071
1072
1073
1074 /* TODO: relative addressing!
1075 */
1076 static struct brw_reg get_reg( struct brw_vs_compile *c,
1077 gl_register_file file,
1078 GLuint index )
1079 {
1080 switch (file) {
1081 case PROGRAM_TEMPORARY:
1082 case PROGRAM_INPUT:
1083 case PROGRAM_OUTPUT:
1084 assert(c->regs[file][index].nr != 0);
1085 return c->regs[file][index];
1086 case PROGRAM_STATE_VAR:
1087 case PROGRAM_CONSTANT:
1088 case PROGRAM_UNIFORM:
1089 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1090 return c->regs[PROGRAM_STATE_VAR][index];
1091 case PROGRAM_ADDRESS:
1092 assert(index == 0);
1093 return c->regs[file][index];
1094
1095 case PROGRAM_UNDEFINED: /* undef values */
1096 return brw_null_reg();
1097
1098 case PROGRAM_LOCAL_PARAM:
1099 case PROGRAM_ENV_PARAM:
1100 case PROGRAM_WRITE_ONLY:
1101 default:
1102 assert(0);
1103 return brw_null_reg();
1104 }
1105 }
1106
1107
/**
 * Indirect addressing: get reg[[arg] + offset].
 *
 * Emits ALIGN1 code that computes per-vertex byte addresses into a0.0
 * and a0.1 (the two interleaved vertices read different elements when
 * their address-register values differ) and then reads both vec4s
 * through a VxH indirect source into a fresh temporary.
 *
 * \param arg       base register of the indexed array
 * \param offset    constant element offset added to the dynamic index
 * \param reg_size  byte stride of one array element (32 for GRF-sized,
 *                  16 for packed constants)
 */
static struct brw_reg deref( struct brw_vs_compile *c,
                             struct brw_reg arg,
                             GLint offset,
                             GLuint reg_size )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   /* Static part of the address: base register plus constant offset. */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Set the vertical stride on the register access so that the first
    * 4 components come from a0.0 and the second 4 from a0.1.
    */
   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = index(vertex 0) * reg_size + byte_offset */
      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));

      /* a0.1 = index(vertex 1) * reg_size + byte_offset; the second
       * vertex's address lives 4 dwords into the address register.
       */
      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));

      /* Pull both vertices' values through the indirect access. */
      brw_MOV(p, tmp, indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return tmp;
}
1147
/* Store \p val to a relative-addressed destination (dst[a0.x]).
 *
 * Destination register indirect addressing can only use one index
 * (1x1, not VxH), so each of the two interleaved vertices' vec4
 * values is written with its own address computation and MOV.
 */
static void
move_to_reladdr_dst(struct brw_vs_compile *c,
                    const struct prog_instruction *inst,
                    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
   GLuint byte_offset = base.nr * 32 + base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Because destination register indirect addressing can only use
    * one index, we'll write each vertex's vec4 value separately.
    */
   val.width = BRW_WIDTH_4;
   val.vstride = BRW_VERTICAL_STRIDE_4;

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* Vertex 0: a0.0 = index * reg_size + static offset, then store. */
   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   /* Vertex 1: its index is 4 dwords in; its data is the second half
    * of the register (reg_size / 2 bytes further on).
    */
   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
           brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}
1182
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 *
 * First tries to fold the source into a hardware float immediate
 * (swizzled all-zero, all-one, or a single replicated component of a
 * PROGRAM_CONSTANT); otherwise dispatches on the register file,
 * handling relative addressing via deref()/get_reladdr_constant()
 * and constant-buffer pulls via get_constant().
 *
 * TODO: relative addressing!
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
      const struct prog_src_register *src = &inst->SrcReg[argIndex];

      /* .0000 swizzle is a literal zero regardless of the file. */
      if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
                                        SWIZZLE_ZERO,
                                        SWIZZLE_ZERO,
                                        SWIZZLE_ZERO)) {
         return brw_imm_f(0.0f);
      } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
                                               SWIZZLE_ONE,
                                               SWIZZLE_ONE,
                                               SWIZZLE_ONE)) {
         /* .1111 swizzle is +/-1 depending on the negate flag. */
         if (src->Negate)
            return brw_imm_f(-1.0F);
         else
            return brw_imm_f(1.0F);
      } else if (src->File == PROGRAM_CONSTANT) {
         const struct gl_program_parameter_list *params;
         float f;
         int component = -1;

         /* Only a single replicated component can become an immediate. */
         switch (src->Swizzle) {
         case SWIZZLE_XXXX:
            component = 0;
            break;
         case SWIZZLE_YYYY:
            component = 1;
            break;
         case SWIZZLE_ZZZZ:
            component = 2;
            break;
         case SWIZZLE_WWWW:
            component = 3;
            break;
         }

         if (component >= 0) {
            params = c->vp->program.Base.Parameters;
            f = params->ParameterValues[src->Index][component];

            /* Apply abs/negate at compile time on the immediate. */
            if (src->Abs)
               f = fabs(f);
            if (src->Negate)
               f = -f;
            return brw_imm_f(f);
         }
      }
   }

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         /* GRF-resident array, 32 bytes per element. */
         return deref(c, c->regs[file][0], index, 32);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
         /* Constants live in a surface; use the preloaded copy when one
          * was mapped, otherwise emit a (relative or absolute) pull.
          */
         if (!relAddr && c->constant_map[index] != -1) {
            assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
            return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
         } else if (relAddr)
            return get_reladdr_constant(c, inst, argIndex);
         else
            return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         /* Constants pushed to the GRF are packed, 16 bytes per element. */
         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
      }
      else {
         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1292
1293 /**
1294 * Return the brw reg for the given instruction's src argument.
1295 * Will return mangled results for SWZ op. The emit_swz() function
1296 * ignores this result and recalculates taking extended swizzles into
1297 * account.
1298 */
1299 static struct brw_reg get_arg( struct brw_vs_compile *c,
1300 const struct prog_instruction *inst,
1301 GLuint argIndex )
1302 {
1303 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1304 struct brw_reg reg;
1305
1306 if (src->File == PROGRAM_UNDEFINED)
1307 return brw_null_reg();
1308
1309 reg = get_src_reg(c, inst, argIndex);
1310
1311 /* Convert 3-bit swizzle to 2-bit.
1312 */
1313 if (reg.file != BRW_IMMEDIATE_VALUE) {
1314 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1315 GET_SWZ(src->Swizzle, 1),
1316 GET_SWZ(src->Swizzle, 2),
1317 GET_SWZ(src->Swizzle, 3));
1318 }
1319
1320 /* Note this is ok for non-swizzle instructions:
1321 */
1322 reg.negate = src->Negate ? 1 : 0;
1323
1324 return reg;
1325 }
1326
1327
1328 /**
1329 * Get brw register for the given program dest register.
1330 */
1331 static struct brw_reg get_dst( struct brw_vs_compile *c,
1332 struct prog_dst_register dst )
1333 {
1334 struct brw_reg reg;
1335
1336 switch (dst.File) {
1337 case PROGRAM_TEMPORARY:
1338 case PROGRAM_OUTPUT:
1339 /* register-indirect addressing is only 1x1, not VxH, for
1340 * destination regs. So, for RelAddr we'll return a temporary
1341 * for the dest and do a move of the result to the RelAddr
1342 * register after the instruction emit.
1343 */
1344 if (dst.RelAddr) {
1345 reg = get_tmp(c);
1346 } else {
1347 assert(c->regs[dst.File][dst.Index].nr != 0);
1348 reg = c->regs[dst.File][dst.Index];
1349 }
1350 break;
1351 case PROGRAM_ADDRESS:
1352 assert(dst.Index == 0);
1353 reg = c->regs[dst.File][dst.Index];
1354 break;
1355 case PROGRAM_UNDEFINED:
1356 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1357 reg = brw_null_reg();
1358 break;
1359 default:
1360 assert(0);
1361 reg = brw_null_reg();
1362 }
1363
1364 assert(reg.type != BRW_IMMEDIATE_VALUE);
1365 reg.dw1.bits.writemask = dst.WriteMask;
1366
1367 return reg;
1368 }
1369
1370
/* Emit code for OPCODE_SWZ, whose extended swizzle allows per-channel
 * ZERO/ONE selectors and per-channel negation.
 *
 * Channels are partitioned by their swizzle selector into three masks
 * (real source channel, literal zero, literal one) and each group is
 * written with its own MOV.  A temporary is used when negation would
 * otherwise be applied to a non-GRF destination.
 */
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written channel by its swizzle selector. */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is a per-channel bitmask here, so only the channels
    * flagged for negation are rewritten.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1439
1440
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Builds the vertex header (NDC position pre-gen6, point size, user
 * clip flags, negative-RHW workaround bits), lays out the per-gen VUE
 * header in the message registers, copies the remaining outputs into
 * MRFs, and issues the URB write(s) — a second write is emitted when
 * the outputs overflow the available MRFs.
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;
   int next_mrf, i;

   /* Copy the input edge flag through when fixed-function needs it. */
   if (c->key.copy_edgeflag) {
      brw_MOV(p,
              get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
              get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if (intel->gen >= 6) {
      struct brw_reg m1 = brw_message_reg(1);

      /* On gen6, m1 has each value in a separate dword, so we never
       * need to mess with a temporary for computing the m1 value.
       */
      brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
                 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
      }

      /* Set the user clip distances in dword 8-15. (m3-4)*/
      if (c->key.nr_userclip) {
         for (i = 0; i < c->key.nr_userclip; i++) {
            struct brw_reg m;
            if (i < 4)
               m = brw_message_reg(3);
            else
               m = brw_message_reg(4);

            /* One DP4 per plane; the writemask selects the dword. */
            brw_DP4(p, brw_writemask(m, (1 << (i & 7))),pos, c->userplane[i]);
         }
      }
   } else if ((c->prog_data.outputs_written &
               BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
              c->key.nr_userclip || brw->has_negative_rhw_bug) {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
         /* Scale point size into the 11-bit fixed-point header field
          * (bits 8..18 of header dword 3).
          */
         brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
                 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
         brw_AND(p, brw_writemask(header1, WRITEMASK_W),
                 header1, brw_imm_ud(0x7ff<<8));
      }

      /* Set a clip flag bit for each user plane the vertex is outside of:
       * the predicated OR only fires when the DP4 result is negative.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_acc_write_control(p, 0);

   /* The VUE layout is documented in Volume 2a. */
   if (intel->gen >= 6) {
      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
       * enabled.
       * m3 or 5 is the first vertex element data we fill, which is
       * the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), pos);
      len_vertex_header = 1;
      if (c->key.nr_userclip > 0)
         len_vertex_header += 2;
   } else if (intel->gen == 5) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      brw_MOV(p, brw_message_reg(7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always have be the
       * vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   }

   /* Move variable-addressed, non-overflow outputs to their MRFs. */
   next_mrf = 2 + len_vertex_header;
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->first_overflow_output > 0 && i >= c->first_overflow_output)
         break;
      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
         continue;

      if (i >= VERT_RESULT_TEX0 &&
          c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
         /* Output ended up in a GRF (e.g. it was also read as a source);
          * copy it into the message payload now.
          */
         brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]);
         next_mrf++;
      } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) {
         /* Output was allocated directly in an MRF; just track position. */
         next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1;
      }
   }

   /* Only the last URB write ends the thread; if outputs overflowed,
    * a second write follows below.
    */
   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,              /* starting mrf reg nr */
                 c->r0,          /* src */
                 0,              /* allocate */
                 1,              /* used */
                 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
                 0,              /* response len */
                 eot,            /* eot */
                 eot,            /* writes complete */
                 0,              /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      GLuint i, mrf = 1;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
         if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
            /* move from GRF to MRF */
            brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
            mrf++;
         }
      }

      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    0,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    mrf,            /* msg len */
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    14 / 2,         /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1659
/* Return GL_TRUE if the most recently emitted instruction already left
 * exactly \p val in the accumulator, so OPCODE_MAD can skip the
 * accumulator-loading MOV.
 *
 * Only MOV/MAC/MUL are considered (presumably the opcodes that update
 * the accumulator under the acc_write_control setting used here —
 * NOTE(review): confirm against the emitter), and the previous
 * instruction's ALIGN16 destination encoding must match val's
 * register/type exactly with a full xyzw writemask.
 */
static GLboolean
accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];

   if (p->nr_insn == 0)
      return GL_FALSE;

   /* Indirect destinations can't be matched against the encoding. */
   if (val.address_mode != BRW_ADDRESS_DIRECT)
      return GL_FALSE;

   switch (prev_insn->header.opcode) {
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MUL:
      if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
          prev_insn->header.execution_size == val.width &&
          prev_insn->bits1.da1.dest_reg_file == val.file &&
          prev_insn->bits1.da1.dest_reg_type == val.type &&
          prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
          prev_insn->bits1.da1.dest_reg_nr == val.nr &&
          prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
          prev_insn->bits1.da16.dest_writemask == 0xf)
         return GL_TRUE;
      else
         return GL_FALSE;
   default:
      return GL_FALSE;
   }
}
1691
1692 static uint32_t
1693 get_predicate(const struct prog_instruction *inst)
1694 {
1695 if (inst->DstReg.CondMask == COND_TR)
1696 return BRW_PREDICATE_NONE;
1697
1698 /* All of GLSL only produces predicates for COND_NE and one channel per
1699 * vector. Fail badly if someone starts doing something else, as it might
1700 * mean infinite looping or something.
1701 *
1702 * We'd like to support all the condition codes, but our hardware doesn't
1703 * quite match the Mesa IR, which is modeled after the NV extensions. For
1704 * those, the instruction may update the condition codes or not, then any
1705 * later instruction may use one of those condition codes. For gen4, the
1706 * instruction may update the flags register based on one of the condition
1707 * codes output by the instruction, and then further instructions may
1708 * predicate on that. We can probably support this, but it won't
1709 * necessarily be easy.
1710 */
1711 assert(inst->DstReg.CondMask == COND_NE);
1712
1713 switch (inst->DstReg.CondSwizzle) {
1714 case SWIZZLE_XXXX:
1715 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1716 case SWIZZLE_YYYY:
1717 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1718 case SWIZZLE_ZZZZ:
1719 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1720 case SWIZZLE_WWWW:
1721 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1722 default:
1723 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1724 inst->DstReg.CondMask);
1725 return BRW_PREDICATE_NORMAL;
1726 }
1727 }
1728
1729 /* Emit the vertex program instructions here.
1730 */
1731 void brw_vs_emit(struct brw_vs_compile *c )
1732 {
1733 #define MAX_IF_DEPTH 32
1734 #define MAX_LOOP_DEPTH 32
1735 struct brw_compile *p = &c->func;
1736 struct brw_context *brw = p->brw;
1737 struct intel_context *intel = &brw->intel;
1738 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1739 GLuint insn, if_depth = 0, loop_depth = 0;
1740 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1741 int if_depth_in_loop[MAX_LOOP_DEPTH];
1742 const struct brw_indirect stack_index = brw_indirect(0, 0);
1743 GLuint index;
1744 GLuint file;
1745
1746 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
1747 printf("vs-mesa:\n");
1748 _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
1749 GL_TRUE);
1750 printf("\n");
1751 }
1752
1753 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1754 brw_set_access_mode(p, BRW_ALIGN_16);
1755 if_depth_in_loop[loop_depth] = 0;
1756
1757 brw_set_acc_write_control(p, 1);
1758
1759 for (insn = 0; insn < nr_insns; insn++) {
1760 GLuint i;
1761 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1762
1763 /* Message registers can't be read, so copy the output into GRF
1764 * register if they are used in source registers
1765 */
1766 for (i = 0; i < 3; i++) {
1767 struct prog_src_register *src = &inst->SrcReg[i];
1768 GLuint index = src->Index;
1769 GLuint file = src->File;
1770 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1771 c->output_regs[index].used_in_src = GL_TRUE;
1772 }
1773
1774 switch (inst->Opcode) {
1775 case OPCODE_CAL:
1776 case OPCODE_RET:
1777 c->needs_stack = GL_TRUE;
1778 break;
1779 default:
1780 break;
1781 }
1782 }
1783
1784 /* Static register allocation
1785 */
1786 brw_vs_alloc_regs(c);
1787
1788 if (c->needs_stack)
1789 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1790
1791 for (insn = 0; insn < nr_insns; insn++) {
1792
1793 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1794 struct brw_reg args[3], dst;
1795 GLuint i;
1796
1797 #if 0
1798 printf("%d: ", insn);
1799 _mesa_print_instruction(inst);
1800 #endif
1801
1802 /* Get argument regs. SWZ is special and does this itself.
1803 */
1804 if (inst->Opcode != OPCODE_SWZ)
1805 for (i = 0; i < 3; i++) {
1806 const struct prog_src_register *src = &inst->SrcReg[i];
1807 index = src->Index;
1808 file = src->File;
1809 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1810 args[i] = c->output_regs[index].reg;
1811 else
1812 args[i] = get_arg(c, inst, i);
1813 }
1814
1815 /* Get dest regs. Note that it is possible for a reg to be both
1816 * dst and arg, given the static allocation of registers. So
1817 * care needs to be taken emitting multi-operation instructions.
1818 */
1819 index = inst->DstReg.Index;
1820 file = inst->DstReg.File;
1821 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1822 dst = c->output_regs[index].reg;
1823 else
1824 dst = get_dst(c, inst->DstReg);
1825
1826 if (inst->SaturateMode != SATURATE_OFF) {
1827 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1828 inst->SaturateMode);
1829 }
1830
1831 switch (inst->Opcode) {
1832 case OPCODE_ABS:
1833 brw_MOV(p, dst, brw_abs(args[0]));
1834 break;
1835 case OPCODE_ADD:
1836 brw_ADD(p, dst, args[0], args[1]);
1837 break;
1838 case OPCODE_COS:
1839 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1840 break;
1841 case OPCODE_DP2:
1842 brw_DP2(p, dst, args[0], args[1]);
1843 break;
1844 case OPCODE_DP3:
1845 brw_DP3(p, dst, args[0], args[1]);
1846 break;
1847 case OPCODE_DP4:
1848 brw_DP4(p, dst, args[0], args[1]);
1849 break;
1850 case OPCODE_DPH:
1851 brw_DPH(p, dst, args[0], args[1]);
1852 break;
1853 case OPCODE_NRM3:
1854 emit_nrm(c, dst, args[0], 3);
1855 break;
1856 case OPCODE_NRM4:
1857 emit_nrm(c, dst, args[0], 4);
1858 break;
1859 case OPCODE_DST:
1860 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1861 break;
1862 case OPCODE_EXP:
1863 unalias1(c, dst, args[0], emit_exp_noalias);
1864 break;
1865 case OPCODE_EX2:
1866 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1867 break;
1868 case OPCODE_ARL:
1869 brw_RNDD(p, dst, args[0]);
1870 break;
1871 case OPCODE_FLR:
1872 brw_RNDD(p, dst, args[0]);
1873 break;
1874 case OPCODE_FRC:
1875 brw_FRC(p, dst, args[0]);
1876 break;
1877 case OPCODE_LOG:
1878 unalias1(c, dst, args[0], emit_log_noalias);
1879 break;
1880 case OPCODE_LG2:
1881 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1882 break;
1883 case OPCODE_LIT:
1884 unalias1(c, dst, args[0], emit_lit_noalias);
1885 break;
1886 case OPCODE_LRP:
1887 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1888 break;
1889 case OPCODE_MAD:
1890 if (!accumulator_contains(c, args[2]))
1891 brw_MOV(p, brw_acc_reg(), args[2]);
1892 brw_MAC(p, dst, args[0], args[1]);
1893 break;
1894 case OPCODE_CMP:
1895 emit_cmp(p, dst, args[0], args[1], args[2]);
1896 break;
1897 case OPCODE_MAX:
1898 emit_max(p, dst, args[0], args[1]);
1899 break;
1900 case OPCODE_MIN:
1901 emit_min(p, dst, args[0], args[1]);
1902 break;
1903 case OPCODE_MOV:
1904 brw_MOV(p, dst, args[0]);
1905 break;
1906 case OPCODE_MUL:
1907 brw_MUL(p, dst, args[0], args[1]);
1908 break;
1909 case OPCODE_POW:
1910 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1911 break;
1912 case OPCODE_RCP:
1913 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1914 break;
1915 case OPCODE_RSQ:
1916 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1917 break;
1918
1919 case OPCODE_SEQ:
1920 unalias2(c, dst, args[0], args[1], emit_seq);
1921 break;
1922 case OPCODE_SIN:
1923 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1924 break;
1925 case OPCODE_SNE:
1926 unalias2(c, dst, args[0], args[1], emit_sne);
1927 break;
1928 case OPCODE_SGE:
1929 unalias2(c, dst, args[0], args[1], emit_sge);
1930 break;
1931 case OPCODE_SGT:
1932 unalias2(c, dst, args[0], args[1], emit_sgt);
1933 break;
1934 case OPCODE_SLT:
1935 unalias2(c, dst, args[0], args[1], emit_slt);
1936 break;
1937 case OPCODE_SLE:
1938 unalias2(c, dst, args[0], args[1], emit_sle);
1939 break;
1940 case OPCODE_SSG:
1941 unalias1(c, dst, args[0], emit_sign);
1942 break;
1943 case OPCODE_SUB:
1944 brw_ADD(p, dst, args[0], negate(args[1]));
1945 break;
1946 case OPCODE_SWZ:
1947 /* The args[0] value can't be used here as it won't have
1948 * correctly encoded the full swizzle:
1949 */
1950 emit_swz(c, dst, inst);
1951 break;
1952 case OPCODE_TRUNC:
1953 /* round toward zero */
1954 brw_RNDZ(p, dst, args[0]);
1955 break;
1956 case OPCODE_XPD:
1957 emit_xpd(p, dst, args[0], args[1]);
1958 break;
1959 case OPCODE_IF:
1960 assert(if_depth < MAX_IF_DEPTH);
1961 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1962 /* Note that brw_IF smashes the predicate_control field. */
1963 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1964 if_depth_in_loop[loop_depth]++;
1965 if_depth++;
1966 break;
1967 case OPCODE_ELSE:
1968 clear_current_const(c);
1969 assert(if_depth > 0);
1970 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1971 break;
1972 case OPCODE_ENDIF:
1973 clear_current_const(c);
1974 assert(if_depth > 0);
1975 brw_ENDIF(p, if_inst[--if_depth]);
1976 if_depth_in_loop[loop_depth]--;
1977 break;
1978 case OPCODE_BGNLOOP:
1979 clear_current_const(c);
1980 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1981 if_depth_in_loop[loop_depth] = 0;
1982 break;
1983 case OPCODE_BRK:
1984 brw_set_predicate_control(p, get_predicate(inst));
1985 brw_BREAK(p, if_depth_in_loop[loop_depth]);
1986 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1987 break;
1988 case OPCODE_CONT:
1989 brw_set_predicate_control(p, get_predicate(inst));
1990 brw_CONT(p, if_depth_in_loop[loop_depth]);
1991 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1992 break;
1993 case OPCODE_ENDLOOP:
1994 {
1995 clear_current_const(c);
1996 struct brw_instruction *inst0, *inst1;
1997 GLuint br = 1;
1998
1999 loop_depth--;
2000
2001 if (intel->gen == 5)
2002 br = 2;
2003
2004 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2005 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
2006 while (inst0 > loop_inst[loop_depth]) {
2007 inst0--;
2008 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2009 inst0->bits3.if_else.jump_count == 0) {
2010 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2011 }
2012 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2013 inst0->bits3.if_else.jump_count == 0) {
2014 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2015 }
2016 }
2017 }
2018 break;
2019 case OPCODE_BRA:
2020 brw_set_predicate_control(p, get_predicate(inst));
2021 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2022 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2023 break;
2024 case OPCODE_CAL:
2025 brw_set_access_mode(p, BRW_ALIGN_1);
2026 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2027 brw_set_access_mode(p, BRW_ALIGN_16);
2028 brw_ADD(p, get_addr_reg(stack_index),
2029 get_addr_reg(stack_index), brw_imm_d(4));
2030 brw_save_call(p, inst->Comment, p->nr_insn);
2031 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2032 break;
2033 case OPCODE_RET:
2034 brw_ADD(p, get_addr_reg(stack_index),
2035 get_addr_reg(stack_index), brw_imm_d(-4));
2036 brw_set_access_mode(p, BRW_ALIGN_1);
2037 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
2038 brw_set_access_mode(p, BRW_ALIGN_16);
2039 break;
2040 case OPCODE_END:
2041 emit_vertex_write(c);
2042 break;
2043 case OPCODE_PRINT:
2044 /* no-op */
2045 break;
2046 case OPCODE_BGNSUB:
2047 brw_save_label(p, inst->Comment, p->nr_insn);
2048 break;
2049 case OPCODE_ENDSUB:
2050 /* no-op */
2051 break;
2052 default:
2053 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
2054 inst->Opcode, inst->Opcode < MAX_OPCODE ?
2055 _mesa_opcode_string(inst->Opcode) :
2056 "unknown");
2057 }
2058
2059 /* Set the predication update on the last instruction of the native
2060 * instruction sequence.
2061 *
2062 * This would be problematic if it was set on a math instruction,
2063 * but that shouldn't be the case with the current GLSL compiler.
2064 */
2065 if (inst->CondUpdate) {
2066 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
2067
2068 assert(hw_insn->header.destreg__conditionalmod == 0);
2069 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
2070 }
2071
2072 if ((inst->DstReg.File == PROGRAM_OUTPUT)
2073 && (inst->DstReg.Index != VERT_RESULT_HPOS)
2074 && c->output_regs[inst->DstReg.Index].used_in_src) {
2075 brw_MOV(p, get_dst(c, inst->DstReg), dst);
2076 }
2077
2078 /* Result color clamping.
2079 *
2080 * When destination register is an output register and
2081 * it's primary/secondary front/back color, we have to clamp
2082 * the result to [0,1]. This is done by enabling the
2083 * saturation bit for the last instruction.
2084 *
2085 * We don't use brw_set_saturate() as it modifies
2086 * p->current->header.saturate, which affects all the subsequent
2087 * instructions. Instead, we directly modify the header
2088 * of the last (already stored) instruction.
2089 */
2090 if (inst->DstReg.File == PROGRAM_OUTPUT) {
2091 if ((inst->DstReg.Index == VERT_RESULT_COL0)
2092 || (inst->DstReg.Index == VERT_RESULT_COL1)
2093 || (inst->DstReg.Index == VERT_RESULT_BFC0)
2094 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
2095 p->store[p->nr_insn-1].header.saturate = 1;
2096 }
2097 }
2098
2099 if (inst->DstReg.RelAddr) {
2100 assert(inst->DstReg.File == PROGRAM_TEMPORARY||
2101 inst->DstReg.File == PROGRAM_OUTPUT);
2102 move_to_reladdr_dst(c, inst, dst);
2103 }
2104
2105 release_tmps(c);
2106 }
2107
2108 brw_resolve_cals(p);
2109
2110 brw_optimize(p);
2111
2112 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
2113 int i;
2114
2115 printf("vs-native:\n");
2116 for (i = 0; i < p->nr_insn; i++)
2117 brw_disasm(stdout, &p->store[i], intel->gen);
2118 printf("\n");
2119 }
2120 }