i965/vs: Use the embedded-comparison SEL on gen6+, like the FS does.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 src_reg::src_reg(dst_reg reg)
34 {
35 init();
36
37 this->file = reg.file;
38 this->reg = reg.reg;
39 this->reg_offset = reg.reg_offset;
40 this->type = reg.type;
41 this->reladdr = reg.reladdr;
42 this->fixed_hw_reg = reg.fixed_hw_reg;
43
44 int swizzles[4];
45 int next_chan = 0;
46 int last = 0;
47
48 for (int i = 0; i < 4; i++) {
49 if (!(reg.writemask & (1 << i)))
50 continue;
51
52 swizzles[next_chan++] = last = i;
53 }
54
55 for (; next_chan < 4; next_chan++) {
56 swizzles[next_chan] = last;
57 }
58
59 this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
60 swizzles[2], swizzles[3]);
61 }
62
63 dst_reg::dst_reg(src_reg reg)
64 {
65 init();
66
67 this->file = reg.file;
68 this->reg = reg.reg;
69 this->reg_offset = reg.reg_offset;
70 this->type = reg.type;
71 this->writemask = WRITEMASK_XYZW;
72 this->reladdr = reg.reladdr;
73 this->fixed_hw_reg = reg.fixed_hw_reg;
74 }
75
76 vec4_instruction::vec4_instruction(vec4_visitor *v,
77 enum opcode opcode, dst_reg dst,
78 src_reg src0, src_reg src1, src_reg src2)
79 {
80 this->opcode = opcode;
81 this->dst = dst;
82 this->src[0] = src0;
83 this->src[1] = src1;
84 this->src[2] = src2;
85 this->ir = v->base_ir;
86 this->annotation = v->current_annotation;
87 }
88
89 vec4_instruction *
90 vec4_visitor::emit(vec4_instruction *inst)
91 {
92 this->instructions.push_tail(inst);
93
94 return inst;
95 }
96
97 vec4_instruction *
98 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
99 {
100 new_inst->ir = inst->ir;
101 new_inst->annotation = inst->annotation;
102
103 inst->insert_before(new_inst);
104
105 return inst;
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
110 src_reg src0, src_reg src1, src_reg src2)
111 {
112 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
113 src0, src1, src2));
114 }
115
116
117 vec4_instruction *
118 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
119 {
120 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
121 }
122
123 vec4_instruction *
124 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
125 {
126 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
127 }
128
129 vec4_instruction *
130 vec4_visitor::emit(enum opcode opcode)
131 {
132 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
133 }
134
135 #define ALU1(op) \
136 vec4_instruction * \
137 vec4_visitor::op(dst_reg dst, src_reg src0) \
138 { \
139 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
140 src0); \
141 }
142
143 #define ALU2(op) \
144 vec4_instruction * \
145 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
146 { \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU2(ADD)
158 ALU2(MUL)
159 ALU2(MACH)
160 ALU2(AND)
161 ALU2(OR)
162 ALU2(XOR)
163 ALU2(DP3)
164 ALU2(DP4)
165
166 /** Gen4 predicated IF. */
167 vec4_instruction *
168 vec4_visitor::IF(uint32_t predicate)
169 {
170 vec4_instruction *inst;
171
172 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
173 inst->predicate = predicate;
174
175 return inst;
176 }
177
178 /** Gen6+ IF with embedded comparison. */
179 vec4_instruction *
180 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
181 {
182 assert(intel->gen >= 6);
183
184 vec4_instruction *inst;
185
186 resolve_ud_negate(&src0);
187 resolve_ud_negate(&src1);
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
190 src0, src1);
191 inst->conditional_mod = condition;
192
193 return inst;
194 }
195
196 /**
197 * CMP: Sets the low bit of the destination channels with the result
198 * of the comparison, while the upper bits are undefined, and updates
199 * the flag register with the packed 16 bits of the result.
200 */
201 vec4_instruction *
202 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
203 {
204 vec4_instruction *inst;
205
206 /* original gen4 does type conversion to the destination type
207 * before comparison, producing garbage results for floating
208 * point comparisons.
209 */
210 if (intel->gen == 4) {
211 dst.type = src0.type;
212 if (dst.file == HW_REG)
213 dst.fixed_hw_reg.type = dst.type;
214 }
215
216 resolve_ud_negate(&src0);
217 resolve_ud_negate(&src1);
218
219 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
220 inst->conditional_mod = condition;
221
222 return inst;
223 }
224
225 vec4_instruction *
226 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
227 {
228 vec4_instruction *inst;
229
230 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
231 dst, index);
232 inst->base_mrf = 14;
233 inst->mlen = 1;
234
235 return inst;
236 }
237
238 vec4_instruction *
239 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
240 {
241 vec4_instruction *inst;
242
243 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
244 dst, src, index);
245 inst->base_mrf = 13;
246 inst->mlen = 2;
247
248 return inst;
249 }
250
251 void
252 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
253 {
254 static enum opcode dot_opcodes[] = {
255 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
256 };
257
258 emit(dot_opcodes[elements - 2], dst, src0, src1);
259 }
260
261 void
262 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
263 {
264 /* The gen6 math instruction ignores the source modifiers --
265 * swizzle, abs, negate, and at least some parts of the register
266 * region description.
267 *
268 * While it would seem that this MOV could be avoided at this point
269 * in the case that the swizzle is matched up with the destination
270 * writemask, note that uniform packing and register allocation
271 * could rearrange our swizzle, so let's leave this matter up to
272 * copy propagation later.
273 */
274 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
275 emit(MOV(dst_reg(temp_src), src));
276
277 if (dst.writemask != WRITEMASK_XYZW) {
278 /* The gen6 math instruction must be align1, so we can't do
279 * writemasks.
280 */
281 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
282
283 emit(opcode, temp_dst, temp_src);
284
285 emit(MOV(dst, src_reg(temp_dst)));
286 } else {
287 emit(opcode, dst, temp_src);
288 }
289 }
290
291 void
292 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
293 {
294 vec4_instruction *inst = emit(opcode, dst, src);
295 inst->base_mrf = 1;
296 inst->mlen = 1;
297 }
298
299 void
300 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
301 {
302 switch (opcode) {
303 case SHADER_OPCODE_RCP:
304 case SHADER_OPCODE_RSQ:
305 case SHADER_OPCODE_SQRT:
306 case SHADER_OPCODE_EXP2:
307 case SHADER_OPCODE_LOG2:
308 case SHADER_OPCODE_SIN:
309 case SHADER_OPCODE_COS:
310 break;
311 default:
312 assert(!"not reached: bad math opcode");
313 return;
314 }
315
316 if (intel->gen >= 6) {
317 return emit_math1_gen6(opcode, dst, src);
318 } else {
319 return emit_math1_gen4(opcode, dst, src);
320 }
321 }
322
323 void
324 vec4_visitor::emit_math2_gen6(enum opcode opcode,
325 dst_reg dst, src_reg src0, src_reg src1)
326 {
327 src_reg expanded;
328
329 /* The gen6 math instruction ignores the source modifiers --
330 * swizzle, abs, negate, and at least some parts of the register
331 * region description. Move the sources to temporaries to make it
332 * generally work.
333 */
334
335 expanded = src_reg(this, glsl_type::vec4_type);
336 expanded.type = src0.type;
337 emit(MOV(dst_reg(expanded), src0));
338 src0 = expanded;
339
340 expanded = src_reg(this, glsl_type::vec4_type);
341 expanded.type = src1.type;
342 emit(MOV(dst_reg(expanded), src1));
343 src1 = expanded;
344
345 if (dst.writemask != WRITEMASK_XYZW) {
346 /* The gen6 math instruction must be align1, so we can't do
347 * writemasks.
348 */
349 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
350 temp_dst.type = dst.type;
351
352 emit(opcode, temp_dst, src0, src1);
353
354 emit(MOV(dst, src_reg(temp_dst)));
355 } else {
356 emit(opcode, dst, src0, src1);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen4(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 vec4_instruction *inst = emit(opcode, dst, src0, src1);
365 inst->base_mrf = 1;
366 inst->mlen = 2;
367 }
368
369 void
370 vec4_visitor::emit_math(enum opcode opcode,
371 dst_reg dst, src_reg src0, src_reg src1)
372 {
373 switch (opcode) {
374 case SHADER_OPCODE_POW:
375 case SHADER_OPCODE_INT_QUOTIENT:
376 case SHADER_OPCODE_INT_REMAINDER:
377 break;
378 default:
379 assert(!"not reached: unsupported binary math opcode");
380 return;
381 }
382
383 if (intel->gen >= 6) {
384 return emit_math2_gen6(opcode, dst, src0, src1);
385 } else {
386 return emit_math2_gen4(opcode, dst, src0, src1);
387 }
388 }
389
390 void
391 vec4_visitor::visit_instructions(const exec_list *list)
392 {
393 foreach_list(node, list) {
394 ir_instruction *ir = (ir_instruction *)node;
395
396 base_ir = ir;
397 ir->accept(this);
398 }
399 }
400
401
402 static int
403 type_size(const struct glsl_type *type)
404 {
405 unsigned int i;
406 int size;
407
408 switch (type->base_type) {
409 case GLSL_TYPE_UINT:
410 case GLSL_TYPE_INT:
411 case GLSL_TYPE_FLOAT:
412 case GLSL_TYPE_BOOL:
413 if (type->is_matrix()) {
414 return type->matrix_columns;
415 } else {
416 /* Regardless of size of vector, it gets a vec4. This is bad
417 * packing for things like floats, but otherwise arrays become a
418 * mess. Hopefully a later pass over the code can pack scalars
419 * down if appropriate.
420 */
421 return 1;
422 }
423 case GLSL_TYPE_ARRAY:
424 assert(type->length > 0);
425 return type_size(type->fields.array) * type->length;
426 case GLSL_TYPE_STRUCT:
427 size = 0;
428 for (i = 0; i < type->length; i++) {
429 size += type_size(type->fields.structure[i].type);
430 }
431 return size;
432 case GLSL_TYPE_SAMPLER:
433 /* Samplers take up one slot in UNIFORMS[], but they're baked in
434 * at link time.
435 */
436 return 1;
437 default:
438 assert(0);
439 return 0;
440 }
441 }
442
443 int
444 vec4_visitor::virtual_grf_alloc(int size)
445 {
446 if (virtual_grf_array_size <= virtual_grf_count) {
447 if (virtual_grf_array_size == 0)
448 virtual_grf_array_size = 16;
449 else
450 virtual_grf_array_size *= 2;
451 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
452 virtual_grf_array_size);
453 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
454 virtual_grf_array_size);
455 }
456 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
457 virtual_grf_reg_count += size;
458 virtual_grf_sizes[virtual_grf_count] = size;
459 return virtual_grf_count++;
460 }
461
462 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
463 {
464 init();
465
466 this->file = GRF;
467 this->reg = v->virtual_grf_alloc(type_size(type));
468
469 if (type->is_array() || type->is_record()) {
470 this->swizzle = BRW_SWIZZLE_NOOP;
471 } else {
472 this->swizzle = swizzle_for_size(type->vector_elements);
473 }
474
475 this->type = brw_type_for_base_type(type);
476 }
477
478 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
479 {
480 init();
481
482 this->file = GRF;
483 this->reg = v->virtual_grf_alloc(type_size(type));
484
485 if (type->is_array() || type->is_record()) {
486 this->writemask = WRITEMASK_XYZW;
487 } else {
488 this->writemask = (1 << type->vector_elements) - 1;
489 }
490
491 this->type = brw_type_for_base_type(type);
492 }
493
494 /* Our support for uniforms is piggy-backed on the struct
495 * gl_vertex_program, because that's where the values actually
496 * get stored, rather than in some global gl_shader_program uniform
497 * store.
498 */
499 int
500 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
501 {
502 unsigned int offset = 0;
503 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
504
505 if (type->is_matrix()) {
506 const glsl_type *column = type->column_type();
507
508 for (unsigned int i = 0; i < type->matrix_columns; i++) {
509 offset += setup_uniform_values(loc + offset, column);
510 }
511
512 return offset;
513 }
514
515 switch (type->base_type) {
516 case GLSL_TYPE_FLOAT:
517 case GLSL_TYPE_UINT:
518 case GLSL_TYPE_INT:
519 case GLSL_TYPE_BOOL:
520 for (unsigned int i = 0; i < type->vector_elements; i++) {
521 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
522 }
523
524 /* Set up pad elements to get things aligned to a vec4 boundary. */
525 for (unsigned int i = type->vector_elements; i < 4; i++) {
526 static float zero = 0;
527
528 c->prog_data.param[this->uniforms * 4 + i] = &zero;
529 }
530
531 /* Track the size of this uniform vector, for future packing of
532 * uniforms.
533 */
534 this->uniform_vector_size[this->uniforms] = type->vector_elements;
535 this->uniforms++;
536
537 return 1;
538
539 case GLSL_TYPE_STRUCT:
540 for (unsigned int i = 0; i < type->length; i++) {
541 offset += setup_uniform_values(loc + offset,
542 type->fields.structure[i].type);
543 }
544 return offset;
545
546 case GLSL_TYPE_ARRAY:
547 for (unsigned int i = 0; i < type->length; i++) {
548 offset += setup_uniform_values(loc + offset, type->fields.array);
549 }
550 return offset;
551
552 case GLSL_TYPE_SAMPLER:
553 /* The sampler takes up a slot, but we don't use any values from it. */
554 return 1;
555
556 default:
557 assert(!"not reached");
558 return 0;
559 }
560 }
561
562 void
563 vec4_visitor::setup_uniform_clipplane_values()
564 {
565 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
566
567 /* Pre-Gen6, we compact clip planes. For example, if the user
568 * enables just clip planes 0, 1, and 3, we will enable clip planes
569 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
570 * plane 2. This simplifies the implementation of the Gen6 clip
571 * thread.
572 *
573 * In Gen6 and later, we don't compact clip planes, because this
574 * simplifies the implementation of gl_ClipDistance.
575 */
576 int compacted_clipplane_index = 0;
577 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
578 if (intel->gen < 6 &&
579 !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
580 continue;
581 }
582 this->uniform_vector_size[this->uniforms] = 4;
583 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
584 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
585 for (int j = 0; j < 4; ++j) {
586 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
587 }
588 ++compacted_clipplane_index;
589 ++this->uniforms;
590 }
591 }
592
593 /* Our support for builtin uniforms is even scarier than non-builtin.
594 * It sits on top of the PROG_STATE_VAR parameters that are
595 * automatically updated from GL context state.
596 */
597 void
598 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
599 {
600 const ir_state_slot *const slots = ir->state_slots;
601 assert(ir->state_slots != NULL);
602
603 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
604 /* This state reference has already been setup by ir_to_mesa,
605 * but we'll get the same index back here. We can reference
606 * ParameterValues directly, since unlike brw_fs.cpp, we never
607 * add new state references during compile.
608 */
609 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
610 (gl_state_index *)slots[i].tokens);
611 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
612
613 this->uniform_vector_size[this->uniforms] = 0;
614 /* Add each of the unique swizzled channels of the element.
615 * This will end up matching the size of the glsl_type of this field.
616 */
617 int last_swiz = -1;
618 for (unsigned int j = 0; j < 4; j++) {
619 int swiz = GET_SWZ(slots[i].swizzle, j);
620 last_swiz = swiz;
621
622 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
623 if (swiz <= last_swiz)
624 this->uniform_vector_size[this->uniforms]++;
625 }
626 this->uniforms++;
627 }
628 }
629
630 dst_reg *
631 vec4_visitor::variable_storage(ir_variable *var)
632 {
633 return (dst_reg *)hash_table_find(this->variable_ht, var);
634 }
635
636 void
637 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
638 {
639 ir_expression *expr = ir->as_expression();
640
641 *predicate = BRW_PREDICATE_NORMAL;
642
643 if (expr) {
644 src_reg op[2];
645 vec4_instruction *inst;
646
647 assert(expr->get_num_operands() <= 2);
648 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
649 expr->operands[i]->accept(this);
650 op[i] = this->result;
651
652 resolve_ud_negate(&op[i]);
653 }
654
655 switch (expr->operation) {
656 case ir_unop_logic_not:
657 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
658 inst->conditional_mod = BRW_CONDITIONAL_Z;
659 break;
660
661 case ir_binop_logic_xor:
662 inst = emit(XOR(dst_null_d(), op[0], op[1]));
663 inst->conditional_mod = BRW_CONDITIONAL_NZ;
664 break;
665
666 case ir_binop_logic_or:
667 inst = emit(OR(dst_null_d(), op[0], op[1]));
668 inst->conditional_mod = BRW_CONDITIONAL_NZ;
669 break;
670
671 case ir_binop_logic_and:
672 inst = emit(AND(dst_null_d(), op[0], op[1]));
673 inst->conditional_mod = BRW_CONDITIONAL_NZ;
674 break;
675
676 case ir_unop_f2b:
677 if (intel->gen >= 6) {
678 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
679 } else {
680 inst = emit(MOV(dst_null_f(), op[0]));
681 inst->conditional_mod = BRW_CONDITIONAL_NZ;
682 }
683 break;
684
685 case ir_unop_i2b:
686 if (intel->gen >= 6) {
687 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
688 } else {
689 inst = emit(MOV(dst_null_d(), op[0]));
690 inst->conditional_mod = BRW_CONDITIONAL_NZ;
691 }
692 break;
693
694 case ir_binop_all_equal:
695 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
696 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
697 break;
698
699 case ir_binop_any_nequal:
700 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
701 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
702 break;
703
704 case ir_unop_any:
705 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
706 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
707 break;
708
709 case ir_binop_greater:
710 case ir_binop_gequal:
711 case ir_binop_less:
712 case ir_binop_lequal:
713 case ir_binop_equal:
714 case ir_binop_nequal:
715 emit(CMP(dst_null_d(), op[0], op[1],
716 brw_conditional_for_comparison(expr->operation)));
717 break;
718
719 default:
720 assert(!"not reached");
721 break;
722 }
723 return;
724 }
725
726 ir->accept(this);
727
728 resolve_ud_negate(&this->result);
729
730 if (intel->gen >= 6) {
731 vec4_instruction *inst = emit(AND(dst_null_d(),
732 this->result, src_reg(1)));
733 inst->conditional_mod = BRW_CONDITIONAL_NZ;
734 } else {
735 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
736 inst->conditional_mod = BRW_CONDITIONAL_NZ;
737 }
738 }
739
740 /**
741 * Emit a gen6 IF statement with the comparison folded into the IF
742 * instruction.
743 */
744 void
745 vec4_visitor::emit_if_gen6(ir_if *ir)
746 {
747 ir_expression *expr = ir->condition->as_expression();
748
749 if (expr) {
750 src_reg op[2];
751 dst_reg temp;
752
753 assert(expr->get_num_operands() <= 2);
754 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
755 expr->operands[i]->accept(this);
756 op[i] = this->result;
757 }
758
759 switch (expr->operation) {
760 case ir_unop_logic_not:
761 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
762 return;
763
764 case ir_binop_logic_xor:
765 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
766 return;
767
768 case ir_binop_logic_or:
769 temp = dst_reg(this, glsl_type::bool_type);
770 emit(OR(temp, op[0], op[1]));
771 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
772 return;
773
774 case ir_binop_logic_and:
775 temp = dst_reg(this, glsl_type::bool_type);
776 emit(AND(temp, op[0], op[1]));
777 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
778 return;
779
780 case ir_unop_f2b:
781 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
782 return;
783
784 case ir_unop_i2b:
785 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
786 return;
787
788 case ir_binop_greater:
789 case ir_binop_gequal:
790 case ir_binop_less:
791 case ir_binop_lequal:
792 case ir_binop_equal:
793 case ir_binop_nequal:
794 emit(IF(op[0], op[1],
795 brw_conditional_for_comparison(expr->operation)));
796 return;
797
798 case ir_binop_all_equal:
799 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
800 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
801 return;
802
803 case ir_binop_any_nequal:
804 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
805 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
806 return;
807
808 case ir_unop_any:
809 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
810 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
811 return;
812
813 default:
814 assert(!"not reached");
815 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
816 return;
817 }
818 return;
819 }
820
821 ir->condition->accept(this);
822
823 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
824 }
825
826 void
827 vec4_visitor::visit(ir_variable *ir)
828 {
829 dst_reg *reg = NULL;
830
831 if (variable_storage(ir))
832 return;
833
834 switch (ir->mode) {
835 case ir_var_in:
836 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
837
838 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
839 * come in as floating point conversions of the integer values.
840 */
841 for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
842 if (!c->key.gl_fixed_input_size[i])
843 continue;
844
845 dst_reg dst = *reg;
846 dst.type = brw_type_for_base_type(ir->type);
847 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
848 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
849 }
850 break;
851
852 case ir_var_out:
853 reg = new(mem_ctx) dst_reg(this, ir->type);
854
855 for (int i = 0; i < type_size(ir->type); i++) {
856 output_reg[ir->location + i] = *reg;
857 output_reg[ir->location + i].reg_offset = i;
858 output_reg[ir->location + i].type =
859 brw_type_for_base_type(ir->type->get_scalar_type());
860 output_reg_annotation[ir->location + i] = ir->name;
861 }
862 break;
863
864 case ir_var_auto:
865 case ir_var_temporary:
866 reg = new(mem_ctx) dst_reg(this, ir->type);
867 break;
868
869 case ir_var_uniform:
870 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
871
872 /* Track how big the whole uniform variable is, in case we need to put a
873 * copy of its data into pull constants for array access.
874 */
875 this->uniform_size[this->uniforms] = type_size(ir->type);
876
877 if (!strncmp(ir->name, "gl_", 3)) {
878 setup_builtin_uniform_values(ir);
879 } else {
880 setup_uniform_values(ir->location, ir->type);
881 }
882 break;
883
884 case ir_var_system_value:
885 /* VertexID is stored by the VF as the last vertex element, but
886 * we don't represent it with a flag in inputs_read, so we call
887 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
888 */
889 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
890 prog_data->uses_vertexid = true;
891
892 switch (ir->location) {
893 case SYSTEM_VALUE_VERTEX_ID:
894 reg->writemask = WRITEMASK_X;
895 break;
896 case SYSTEM_VALUE_INSTANCE_ID:
897 reg->writemask = WRITEMASK_Y;
898 break;
899 default:
900 assert(!"not reached");
901 break;
902 }
903 break;
904
905 default:
906 assert(!"not reached");
907 }
908
909 reg->type = brw_type_for_base_type(ir->type);
910 hash_table_insert(this->variable_ht, reg, ir);
911 }
912
913 void
914 vec4_visitor::visit(ir_loop *ir)
915 {
916 dst_reg counter;
917
918 /* We don't want debugging output to print the whole body of the
919 * loop as the annotation.
920 */
921 this->base_ir = NULL;
922
923 if (ir->counter != NULL) {
924 this->base_ir = ir->counter;
925 ir->counter->accept(this);
926 counter = *(variable_storage(ir->counter));
927
928 if (ir->from != NULL) {
929 this->base_ir = ir->from;
930 ir->from->accept(this);
931
932 emit(MOV(counter, this->result));
933 }
934 }
935
936 emit(BRW_OPCODE_DO);
937
938 if (ir->to) {
939 this->base_ir = ir->to;
940 ir->to->accept(this);
941
942 emit(CMP(dst_null_d(), src_reg(counter), this->result,
943 brw_conditional_for_comparison(ir->cmp)));
944
945 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
946 inst->predicate = BRW_PREDICATE_NORMAL;
947 }
948
949 visit_instructions(&ir->body_instructions);
950
951
952 if (ir->increment) {
953 this->base_ir = ir->increment;
954 ir->increment->accept(this);
955 emit(ADD(counter, src_reg(counter), this->result));
956 }
957
958 emit(BRW_OPCODE_WHILE);
959 }
960
961 void
962 vec4_visitor::visit(ir_loop_jump *ir)
963 {
964 switch (ir->mode) {
965 case ir_loop_jump::jump_break:
966 emit(BRW_OPCODE_BREAK);
967 break;
968 case ir_loop_jump::jump_continue:
969 emit(BRW_OPCODE_CONTINUE);
970 break;
971 }
972 }
973
974
975 void
976 vec4_visitor::visit(ir_function_signature *ir)
977 {
978 assert(0);
979 (void)ir;
980 }
981
982 void
983 vec4_visitor::visit(ir_function *ir)
984 {
985 /* Ignore function bodies other than main() -- we shouldn't see calls to
986 * them since they should all be inlined.
987 */
988 if (strcmp(ir->name, "main") == 0) {
989 const ir_function_signature *sig;
990 exec_list empty;
991
992 sig = ir->matching_signature(&empty);
993
994 assert(sig);
995
996 visit_instructions(&sig->body);
997 }
998 }
999
1000 bool
1001 vec4_visitor::try_emit_sat(ir_expression *ir)
1002 {
1003 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1004 if (!sat_src)
1005 return false;
1006
1007 sat_src->accept(this);
1008 src_reg src = this->result;
1009
1010 this->result = src_reg(this, ir->type);
1011 vec4_instruction *inst;
1012 inst = emit(MOV(dst_reg(this->result), src));
1013 inst->saturate = true;
1014
1015 return true;
1016 }
1017
1018 void
1019 vec4_visitor::emit_bool_comparison(unsigned int op,
1020 dst_reg dst, src_reg src0, src_reg src1)
1021 {
1022 /* original gen4 does destination conversion before comparison. */
1023 if (intel->gen < 5)
1024 dst.type = src0.type;
1025
1026 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1027
1028 dst.type = BRW_REGISTER_TYPE_D;
1029 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1030 }
1031
1032 void
1033 vec4_visitor::visit(ir_expression *ir)
1034 {
1035 unsigned int operand;
1036 src_reg op[Elements(ir->operands)];
1037 src_reg result_src;
1038 dst_reg result_dst;
1039 vec4_instruction *inst;
1040
1041 if (try_emit_sat(ir))
1042 return;
1043
1044 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1045 this->result.file = BAD_FILE;
1046 ir->operands[operand]->accept(this);
1047 if (this->result.file == BAD_FILE) {
1048 printf("Failed to get tree for expression operand:\n");
1049 ir->operands[operand]->print();
1050 exit(1);
1051 }
1052 op[operand] = this->result;
1053
1054 /* Matrix expression operands should have been broken down to vector
1055 * operations already.
1056 */
1057 assert(!ir->operands[operand]->type->is_matrix());
1058 }
1059
1060 int vector_elements = ir->operands[0]->type->vector_elements;
1061 if (ir->operands[1]) {
1062 vector_elements = MAX2(vector_elements,
1063 ir->operands[1]->type->vector_elements);
1064 }
1065
1066 this->result.file = BAD_FILE;
1067
1068 /* Storage for our result. Ideally for an assignment we'd be using
1069 * the actual storage for the result here, instead.
1070 */
1071 result_src = src_reg(this, ir->type);
1072 /* convenience for the emit functions below. */
1073 result_dst = dst_reg(result_src);
1074 /* If nothing special happens, this is the result. */
1075 this->result = result_src;
1076 /* Limit writes to the channels that will be used by result_src later.
1077 * This does limit this temp's use as a temporary for multi-instruction
1078 * sequences.
1079 */
1080 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1081
1082 switch (ir->operation) {
1083 case ir_unop_logic_not:
1084 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1085 * ones complement of the whole register, not just bit 0.
1086 */
1087 emit(XOR(result_dst, op[0], src_reg(1)));
1088 break;
1089 case ir_unop_neg:
1090 op[0].negate = !op[0].negate;
1091 this->result = op[0];
1092 break;
1093 case ir_unop_abs:
1094 op[0].abs = true;
1095 op[0].negate = false;
1096 this->result = op[0];
1097 break;
1098
1099 case ir_unop_sign:
1100 emit(MOV(result_dst, src_reg(0.0f)));
1101
1102 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1103 inst = emit(MOV(result_dst, src_reg(1.0f)));
1104 inst->predicate = BRW_PREDICATE_NORMAL;
1105
1106 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1107 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1108 inst->predicate = BRW_PREDICATE_NORMAL;
1109
1110 break;
1111
1112 case ir_unop_rcp:
1113 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1114 break;
1115
1116 case ir_unop_exp2:
1117 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1118 break;
1119 case ir_unop_log2:
1120 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1121 break;
1122 case ir_unop_exp:
1123 case ir_unop_log:
1124 assert(!"not reached: should be handled by ir_explog_to_explog2");
1125 break;
1126 case ir_unop_sin:
1127 case ir_unop_sin_reduced:
1128 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1129 break;
1130 case ir_unop_cos:
1131 case ir_unop_cos_reduced:
1132 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1133 break;
1134
1135 case ir_unop_dFdx:
1136 case ir_unop_dFdy:
1137 assert(!"derivatives not valid in vertex shader");
1138 break;
1139
1140 case ir_unop_noise:
1141 assert(!"not reached: should be handled by lower_noise");
1142 break;
1143
1144 case ir_binop_add:
1145 emit(ADD(result_dst, op[0], op[1]));
1146 break;
1147 case ir_binop_sub:
1148 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1149 break;
1150
1151 case ir_binop_mul:
1152 if (ir->type->is_integer()) {
1153 /* For integer multiplication, the MUL uses the low 16 bits
1154 * of one of the operands (src0 on gen6, src1 on gen7). The
1155 * MACH accumulates in the contribution of the upper 16 bits
1156 * of that operand.
1157 *
1158 * FINISHME: Emit just the MUL if we know an operand is small
1159 * enough.
1160 */
1161 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1162
1163 emit(MUL(acc, op[0], op[1]));
1164 emit(MACH(dst_null_d(), op[0], op[1]));
1165 emit(MOV(result_dst, src_reg(acc)));
1166 } else {
1167 emit(MUL(result_dst, op[0], op[1]));
1168 }
1169 break;
1170 case ir_binop_div:
1171 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1172 assert(ir->type->is_integer());
1173 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1174 break;
1175 case ir_binop_mod:
1176 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1177 assert(ir->type->is_integer());
1178 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1179 break;
1180
1181 case ir_binop_less:
1182 case ir_binop_greater:
1183 case ir_binop_lequal:
1184 case ir_binop_gequal:
1185 case ir_binop_equal:
1186 case ir_binop_nequal: {
1187 emit(CMP(result_dst, op[0], op[1],
1188 brw_conditional_for_comparison(ir->operation)));
1189 emit(AND(result_dst, result_src, src_reg(0x1)));
1190 break;
1191 }
1192
1193 case ir_binop_all_equal:
1194 /* "==" operator producing a scalar boolean. */
1195 if (ir->operands[0]->type->is_vector() ||
1196 ir->operands[1]->type->is_vector()) {
1197 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1198 emit(MOV(result_dst, src_reg(0)));
1199 inst = emit(MOV(result_dst, src_reg(1)));
1200 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1201 } else {
1202 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1203 emit(AND(result_dst, result_src, src_reg(0x1)));
1204 }
1205 break;
1206 case ir_binop_any_nequal:
1207 /* "!=" operator producing a scalar boolean. */
1208 if (ir->operands[0]->type->is_vector() ||
1209 ir->operands[1]->type->is_vector()) {
1210 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1211
1212 emit(MOV(result_dst, src_reg(0)));
1213 inst = emit(MOV(result_dst, src_reg(1)));
1214 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1215 } else {
1216 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1217 emit(AND(result_dst, result_src, src_reg(0x1)));
1218 }
1219 break;
1220
1221 case ir_unop_any:
1222 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1223 emit(MOV(result_dst, src_reg(0)));
1224
1225 inst = emit(MOV(result_dst, src_reg(1)));
1226 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1227 break;
1228
1229 case ir_binop_logic_xor:
1230 emit(XOR(result_dst, op[0], op[1]));
1231 break;
1232
1233 case ir_binop_logic_or:
1234 emit(OR(result_dst, op[0], op[1]));
1235 break;
1236
1237 case ir_binop_logic_and:
1238 emit(AND(result_dst, op[0], op[1]));
1239 break;
1240
1241 case ir_binop_dot:
1242 assert(ir->operands[0]->type->is_vector());
1243 assert(ir->operands[0]->type == ir->operands[1]->type);
1244 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1245 break;
1246
1247 case ir_unop_sqrt:
1248 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1249 break;
1250 case ir_unop_rsq:
1251 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1252 break;
1253 case ir_unop_i2f:
1254 case ir_unop_i2u:
1255 case ir_unop_u2i:
1256 case ir_unop_u2f:
1257 case ir_unop_b2f:
1258 case ir_unop_b2i:
1259 case ir_unop_f2i:
1260 emit(MOV(result_dst, op[0]));
1261 break;
1262 case ir_unop_f2b:
1263 case ir_unop_i2b: {
1264 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1265 emit(AND(result_dst, result_src, src_reg(1)));
1266 break;
1267 }
1268
1269 case ir_unop_trunc:
1270 emit(RNDZ(result_dst, op[0]));
1271 break;
1272 case ir_unop_ceil:
1273 op[0].negate = !op[0].negate;
1274 inst = emit(RNDD(result_dst, op[0]));
1275 this->result.negate = true;
1276 break;
1277 case ir_unop_floor:
1278 inst = emit(RNDD(result_dst, op[0]));
1279 break;
1280 case ir_unop_fract:
1281 inst = emit(FRC(result_dst, op[0]));
1282 break;
1283 case ir_unop_round_even:
1284 emit(RNDE(result_dst, op[0]));
1285 break;
1286
1287 case ir_binop_min:
1288 if (intel->gen >= 6) {
1289 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1290 inst->conditional_mod = BRW_CONDITIONAL_L;
1291 } else {
1292 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1293
1294 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1295 inst->predicate = BRW_PREDICATE_NORMAL;
1296 }
1297 break;
1298 case ir_binop_max:
1299 if (intel->gen >= 6) {
1300 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1301 inst->conditional_mod = BRW_CONDITIONAL_G;
1302 } else {
1303 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1304
1305 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1306 inst->predicate = BRW_PREDICATE_NORMAL;
1307 }
1308 break;
1309
1310 case ir_binop_pow:
1311 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1312 break;
1313
1314 case ir_unop_bit_not:
1315 inst = emit(NOT(result_dst, op[0]));
1316 break;
1317 case ir_binop_bit_and:
1318 inst = emit(AND(result_dst, op[0], op[1]));
1319 break;
1320 case ir_binop_bit_xor:
1321 inst = emit(XOR(result_dst, op[0], op[1]));
1322 break;
1323 case ir_binop_bit_or:
1324 inst = emit(OR(result_dst, op[0], op[1]));
1325 break;
1326
1327 case ir_binop_lshift:
1328 inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1329 break;
1330
1331 case ir_binop_rshift:
1332 if (ir->type->base_type == GLSL_TYPE_INT)
1333 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1334 else
1335 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1336 break;
1337
1338 case ir_quadop_vector:
1339 assert(!"not reached: should be handled by lower_quadop_vector");
1340 break;
1341 }
1342 }
1343
1344
1345 void
1346 vec4_visitor::visit(ir_swizzle *ir)
1347 {
1348 src_reg src;
1349 int i = 0;
1350 int swizzle[4];
1351
1352 /* Note that this is only swizzles in expressions, not those on the left
1353 * hand side of an assignment, which do write masking. See ir_assignment
1354 * for that.
1355 */
1356
1357 ir->val->accept(this);
1358 src = this->result;
1359 assert(src.file != BAD_FILE);
1360
1361 for (i = 0; i < ir->type->vector_elements; i++) {
1362 switch (i) {
1363 case 0:
1364 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1365 break;
1366 case 1:
1367 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1368 break;
1369 case 2:
1370 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1371 break;
1372 case 3:
1373 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1374 break;
1375 }
1376 }
1377 for (; i < 4; i++) {
1378 /* Replicate the last channel out. */
1379 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1380 }
1381
1382 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1383
1384 this->result = src;
1385 }
1386
1387 void
1388 vec4_visitor::visit(ir_dereference_variable *ir)
1389 {
1390 const struct glsl_type *type = ir->type;
1391 dst_reg *reg = variable_storage(ir->var);
1392
1393 if (!reg) {
1394 fail("Failed to find variable storage for %s\n", ir->var->name);
1395 this->result = src_reg(brw_null_reg());
1396 return;
1397 }
1398
1399 this->result = src_reg(*reg);
1400
1401 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1402 this->result.swizzle = swizzle_for_size(type->vector_elements);
1403 }
1404
1405 void
1406 vec4_visitor::visit(ir_dereference_array *ir)
1407 {
1408 ir_constant *constant_index;
1409 src_reg src;
1410 int element_size = type_size(ir->type);
1411
1412 constant_index = ir->array_index->constant_expression_value();
1413
1414 ir->array->accept(this);
1415 src = this->result;
1416
1417 if (constant_index) {
1418 src.reg_offset += constant_index->value.i[0] * element_size;
1419 } else {
1420 /* Variable index array dereference. It eats the "vec4" of the
1421 * base of the array and an index that offsets the Mesa register
1422 * index.
1423 */
1424 ir->array_index->accept(this);
1425
1426 src_reg index_reg;
1427
1428 if (element_size == 1) {
1429 index_reg = this->result;
1430 } else {
1431 index_reg = src_reg(this, glsl_type::int_type);
1432
1433 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1434 }
1435
1436 if (src.reladdr) {
1437 src_reg temp = src_reg(this, glsl_type::int_type);
1438
1439 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1440
1441 index_reg = temp;
1442 }
1443
1444 src.reladdr = ralloc(mem_ctx, src_reg);
1445 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1446 }
1447
1448 /* If the type is smaller than a vec4, replicate the last channel out. */
1449 if (ir->type->is_scalar() || ir->type->is_vector())
1450 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1451 else
1452 src.swizzle = BRW_SWIZZLE_NOOP;
1453 src.type = brw_type_for_base_type(ir->type);
1454
1455 this->result = src;
1456 }
1457
1458 void
1459 vec4_visitor::visit(ir_dereference_record *ir)
1460 {
1461 unsigned int i;
1462 const glsl_type *struct_type = ir->record->type;
1463 int offset = 0;
1464
1465 ir->record->accept(this);
1466
1467 for (i = 0; i < struct_type->length; i++) {
1468 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1469 break;
1470 offset += type_size(struct_type->fields.structure[i].type);
1471 }
1472
1473 /* If the type is smaller than a vec4, replicate the last channel out. */
1474 if (ir->type->is_scalar() || ir->type->is_vector())
1475 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1476 else
1477 this->result.swizzle = BRW_SWIZZLE_NOOP;
1478 this->result.type = brw_type_for_base_type(ir->type);
1479
1480 this->result.reg_offset += offset;
1481 }
1482
1483 /**
1484 * We want to be careful in assignment setup to hit the actual storage
1485 * instead of potentially using a temporary like we might with the
1486 * ir_dereference handler.
1487 */
1488 static dst_reg
1489 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1490 {
1491 /* The LHS must be a dereference. If the LHS is a variable indexed array
1492 * access of a vector, it must be separated into a series of conditional moves
1493 * before reaching this point (see ir_vec_index_to_cond_assign).
1494 */
1495 assert(ir->as_dereference());
1496 ir_dereference_array *deref_array = ir->as_dereference_array();
1497 if (deref_array) {
1498 assert(!deref_array->array->type->is_vector());
1499 }
1500
1501 /* Use the rvalue deref handler for the most part. We'll ignore
1502 * swizzles in it and write swizzles using writemask, though.
1503 */
1504 ir->accept(v);
1505 return dst_reg(v->result);
1506 }
1507
1508 void
1509 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1510 const struct glsl_type *type, uint32_t predicate)
1511 {
1512 if (type->base_type == GLSL_TYPE_STRUCT) {
1513 for (unsigned int i = 0; i < type->length; i++) {
1514 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1515 }
1516 return;
1517 }
1518
1519 if (type->is_array()) {
1520 for (unsigned int i = 0; i < type->length; i++) {
1521 emit_block_move(dst, src, type->fields.array, predicate);
1522 }
1523 return;
1524 }
1525
1526 if (type->is_matrix()) {
1527 const struct glsl_type *vec_type;
1528
1529 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1530 type->vector_elements, 1);
1531
1532 for (int i = 0; i < type->matrix_columns; i++) {
1533 emit_block_move(dst, src, vec_type, predicate);
1534 }
1535 return;
1536 }
1537
1538 assert(type->is_scalar() || type->is_vector());
1539
1540 dst->type = brw_type_for_base_type(type);
1541 src->type = dst->type;
1542
1543 dst->writemask = (1 << type->vector_elements) - 1;
1544
1545 /* Do we need to worry about swizzling a swizzle? */
1546 assert(src->swizzle == BRW_SWIZZLE_NOOP
1547 || src->swizzle == swizzle_for_size(type->vector_elements));
1548 src->swizzle = swizzle_for_size(type->vector_elements);
1549
1550 vec4_instruction *inst = emit(MOV(*dst, *src));
1551 inst->predicate = predicate;
1552
1553 dst->reg_offset++;
1554 src->reg_offset++;
1555 }
1556
1557
1558 /* If the RHS processing resulted in an instruction generating a
1559 * temporary value, and it would be easy to rewrite the instruction to
1560 * generate its result right into the LHS instead, do so. This ends
1561 * up reliably removing instructions where it can be tricky to do so
1562 * later without real UD chain information.
1563 */
1564 bool
1565 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1566 dst_reg dst,
1567 src_reg src,
1568 vec4_instruction *pre_rhs_inst,
1569 vec4_instruction *last_rhs_inst)
1570 {
1571 /* This could be supported, but it would take more smarts. */
1572 if (ir->condition)
1573 return false;
1574
1575 if (pre_rhs_inst == last_rhs_inst)
1576 return false; /* No instructions generated to work with. */
1577
1578 /* Make sure the last instruction generated our source reg. */
1579 if (src.file != GRF ||
1580 src.file != last_rhs_inst->dst.file ||
1581 src.reg != last_rhs_inst->dst.reg ||
1582 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1583 src.reladdr ||
1584 src.abs ||
1585 src.negate ||
1586 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1587 return false;
1588
1589 /* Check that that last instruction fully initialized the channels
1590 * we want to use, in the order we want to use them. We could
1591 * potentially reswizzle the operands of many instructions so that
1592 * we could handle out of order channels, but don't yet.
1593 */
1594
1595 for (unsigned i = 0; i < 4; i++) {
1596 if (dst.writemask & (1 << i)) {
1597 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1598 return false;
1599
1600 if (BRW_GET_SWZ(src.swizzle, i) != i)
1601 return false;
1602 }
1603 }
1604
1605 /* Success! Rewrite the instruction. */
1606 last_rhs_inst->dst.file = dst.file;
1607 last_rhs_inst->dst.reg = dst.reg;
1608 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1609 last_rhs_inst->dst.reladdr = dst.reladdr;
1610 last_rhs_inst->dst.writemask &= dst.writemask;
1611
1612 return true;
1613 }
1614
1615 void
1616 vec4_visitor::visit(ir_assignment *ir)
1617 {
1618 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1619 uint32_t predicate = BRW_PREDICATE_NONE;
1620
1621 if (!ir->lhs->type->is_scalar() &&
1622 !ir->lhs->type->is_vector()) {
1623 ir->rhs->accept(this);
1624 src_reg src = this->result;
1625
1626 if (ir->condition) {
1627 emit_bool_to_cond_code(ir->condition, &predicate);
1628 }
1629
1630 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1631 return;
1632 }
1633
1634 /* Now we're down to just a scalar/vector with writemasks. */
1635 int i;
1636
1637 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1638 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1639
1640 ir->rhs->accept(this);
1641
1642 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1643
1644 src_reg src = this->result;
1645
1646 int swizzles[4];
1647 int first_enabled_chan = 0;
1648 int src_chan = 0;
1649
1650 assert(ir->lhs->type->is_vector() ||
1651 ir->lhs->type->is_scalar());
1652 dst.writemask = ir->write_mask;
1653
1654 for (int i = 0; i < 4; i++) {
1655 if (dst.writemask & (1 << i)) {
1656 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1657 break;
1658 }
1659 }
1660
1661 /* Swizzle a small RHS vector into the channels being written.
1662 *
1663 * glsl ir treats write_mask as dictating how many channels are
1664 * present on the RHS while in our instructions we need to make
1665 * those channels appear in the slots of the vec4 they're written to.
1666 */
1667 for (int i = 0; i < 4; i++) {
1668 if (dst.writemask & (1 << i))
1669 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1670 else
1671 swizzles[i] = first_enabled_chan;
1672 }
1673 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1674 swizzles[2], swizzles[3]);
1675
1676 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1677 return;
1678 }
1679
1680 if (ir->condition) {
1681 emit_bool_to_cond_code(ir->condition, &predicate);
1682 }
1683
1684 for (i = 0; i < type_size(ir->lhs->type); i++) {
1685 vec4_instruction *inst = emit(MOV(dst, src));
1686 inst->predicate = predicate;
1687
1688 dst.reg_offset++;
1689 src.reg_offset++;
1690 }
1691 }
1692
1693 void
1694 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1695 {
1696 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1697 foreach_list(node, &ir->components) {
1698 ir_constant *field_value = (ir_constant *)node;
1699
1700 emit_constant_values(dst, field_value);
1701 }
1702 return;
1703 }
1704
1705 if (ir->type->is_array()) {
1706 for (unsigned int i = 0; i < ir->type->length; i++) {
1707 emit_constant_values(dst, ir->array_elements[i]);
1708 }
1709 return;
1710 }
1711
1712 if (ir->type->is_matrix()) {
1713 for (int i = 0; i < ir->type->matrix_columns; i++) {
1714 float *vec = &ir->value.f[i * ir->type->vector_elements];
1715
1716 for (int j = 0; j < ir->type->vector_elements; j++) {
1717 dst->writemask = 1 << j;
1718 dst->type = BRW_REGISTER_TYPE_F;
1719
1720 emit(MOV(*dst, src_reg(vec[j])));
1721 }
1722 dst->reg_offset++;
1723 }
1724 return;
1725 }
1726
1727 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1728
1729 for (int i = 0; i < ir->type->vector_elements; i++) {
1730 if (!(remaining_writemask & (1 << i)))
1731 continue;
1732
1733 dst->writemask = 1 << i;
1734 dst->type = brw_type_for_base_type(ir->type);
1735
1736 /* Find other components that match the one we're about to
1737 * write. Emits fewer instructions for things like vec4(0.5,
1738 * 1.5, 1.5, 1.5).
1739 */
1740 for (int j = i + 1; j < ir->type->vector_elements; j++) {
1741 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1742 if (ir->value.b[i] == ir->value.b[j])
1743 dst->writemask |= (1 << j);
1744 } else {
1745 /* u, i, and f storage all line up, so no need for a
1746 * switch case for comparing each type.
1747 */
1748 if (ir->value.u[i] == ir->value.u[j])
1749 dst->writemask |= (1 << j);
1750 }
1751 }
1752
1753 switch (ir->type->base_type) {
1754 case GLSL_TYPE_FLOAT:
1755 emit(MOV(*dst, src_reg(ir->value.f[i])));
1756 break;
1757 case GLSL_TYPE_INT:
1758 emit(MOV(*dst, src_reg(ir->value.i[i])));
1759 break;
1760 case GLSL_TYPE_UINT:
1761 emit(MOV(*dst, src_reg(ir->value.u[i])));
1762 break;
1763 case GLSL_TYPE_BOOL:
1764 emit(MOV(*dst, src_reg(ir->value.b[i])));
1765 break;
1766 default:
1767 assert(!"Non-float/uint/int/bool constant");
1768 break;
1769 }
1770
1771 remaining_writemask &= ~dst->writemask;
1772 }
1773 dst->reg_offset++;
1774 }
1775
1776 void
1777 vec4_visitor::visit(ir_constant *ir)
1778 {
1779 dst_reg dst = dst_reg(this, ir->type);
1780 this->result = src_reg(dst);
1781
1782 emit_constant_values(&dst, ir);
1783 }
1784
1785 void
1786 vec4_visitor::visit(ir_call *ir)
1787 {
1788 assert(!"not reached");
1789 }
1790
1791 void
1792 vec4_visitor::visit(ir_texture *ir)
1793 {
1794 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1795 sampler = vp->Base.SamplerUnits[sampler];
1796
1797 /* Should be lowered by do_lower_texture_projection */
1798 assert(!ir->projector);
1799
1800 vec4_instruction *inst = NULL;
1801 switch (ir->op) {
1802 case ir_tex:
1803 case ir_txl:
1804 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1805 break;
1806 case ir_txd:
1807 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1808 break;
1809 case ir_txf:
1810 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1811 break;
1812 case ir_txs:
1813 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1814 break;
1815 case ir_txb:
1816 assert(!"TXB is not valid for vertex shaders.");
1817 }
1818
1819 /* Texel offsets go in the message header; Gen4 also requires headers. */
1820 inst->header_present = ir->offset || intel->gen < 5;
1821 inst->base_mrf = 2;
1822 inst->mlen = inst->header_present + 1; /* always at least one */
1823 inst->sampler = sampler;
1824 inst->dst = dst_reg(this, ir->type);
1825 inst->shadow_compare = ir->shadow_comparitor != NULL;
1826
1827 if (ir->offset != NULL)
1828 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1829
1830 /* MRF for the first parameter */
1831 int param_base = inst->base_mrf + inst->header_present;
1832
1833 if (ir->op == ir_txs) {
1834 ir->lod_info.lod->accept(this);
1835 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1836 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
1837 this->result));
1838 } else {
1839 int i, coord_mask = 0, zero_mask = 0;
1840 /* Load the coordinate */
1841 /* FINISHME: gl_clamp_mask and saturate */
1842 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1843 coord_mask |= (1 << i);
1844 for (; i < 4; i++)
1845 zero_mask |= (1 << i);
1846
1847 ir->coordinate->accept(this);
1848 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1849 this->result));
1850 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1851 src_reg(0)));
1852 /* Load the shadow comparitor */
1853 if (ir->shadow_comparitor) {
1854 ir->shadow_comparitor->accept(this);
1855 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1856 WRITEMASK_X),
1857 this->result));
1858 inst->mlen++;
1859 }
1860
1861 /* Load the LOD info */
1862 if (ir->op == ir_txl) {
1863 int mrf, writemask;
1864 if (intel->gen >= 5) {
1865 mrf = param_base + 1;
1866 if (ir->shadow_comparitor) {
1867 writemask = WRITEMASK_Y;
1868 /* mlen already incremented */
1869 } else {
1870 writemask = WRITEMASK_X;
1871 inst->mlen++;
1872 }
1873 } else /* intel->gen == 4 */ {
1874 mrf = param_base;
1875 writemask = WRITEMASK_Z;
1876 }
1877 ir->lod_info.lod->accept(this);
1878 emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask),
1879 this->result));
1880 } else if (ir->op == ir_txf) {
1881 ir->lod_info.lod->accept(this);
1882 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
1883 this->result));
1884 } else if (ir->op == ir_txd) {
1885 const glsl_type *type = ir->lod_info.grad.dPdx->type;
1886
1887 ir->lod_info.grad.dPdx->accept(this);
1888 src_reg dPdx = this->result;
1889 ir->lod_info.grad.dPdy->accept(this);
1890 src_reg dPdy = this->result;
1891
1892 if (intel->gen >= 5) {
1893 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1894 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1895 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1896 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1897 inst->mlen++;
1898
1899 if (ir->type->vector_elements == 3) {
1900 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
1901 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
1902 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
1903 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
1904 inst->mlen++;
1905 }
1906 } else /* intel->gen == 4 */ {
1907 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
1908 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
1909 inst->mlen += 2;
1910 }
1911 }
1912 }
1913
1914 emit(inst);
1915
1916 swizzle_result(ir, src_reg(inst->dst), sampler);
1917 }
1918
1919 void
1920 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
1921 {
1922 this->result = orig_val;
1923
1924 int s = c->key.tex.swizzles[sampler];
1925
1926 if (ir->op == ir_txs || ir->type == glsl_type::float_type
1927 || s == SWIZZLE_NOOP)
1928 return;
1929
1930 int zero_mask = 0, one_mask = 0, copy_mask = 0;
1931 int swizzle[4];
1932
1933 for (int i = 0; i < 4; i++) {
1934 switch (GET_SWZ(s, i)) {
1935 case SWIZZLE_ZERO:
1936 zero_mask |= (1 << i);
1937 break;
1938 case SWIZZLE_ONE:
1939 one_mask |= (1 << i);
1940 break;
1941 default:
1942 copy_mask |= (1 << i);
1943 swizzle[i] = GET_SWZ(s, i);
1944 break;
1945 }
1946 }
1947
1948 this->result = src_reg(this, ir->type);
1949 dst_reg swizzled_result(this->result);
1950
1951 if (copy_mask) {
1952 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1953 swizzled_result.writemask = copy_mask;
1954 emit(MOV(swizzled_result, orig_val));
1955 }
1956
1957 if (zero_mask) {
1958 swizzled_result.writemask = zero_mask;
1959 emit(MOV(swizzled_result, src_reg(0.0f)));
1960 }
1961
1962 if (one_mask) {
1963 swizzled_result.writemask = one_mask;
1964 emit(MOV(swizzled_result, src_reg(1.0f)));
1965 }
1966 }
1967
1968 void
1969 vec4_visitor::visit(ir_return *ir)
1970 {
1971 assert(!"not reached");
1972 }
1973
1974 void
1975 vec4_visitor::visit(ir_discard *ir)
1976 {
1977 assert(!"not reached");
1978 }
1979
1980 void
1981 vec4_visitor::visit(ir_if *ir)
1982 {
1983 /* Don't point the annotation at the if statement, because then it plus
1984 * the then and else blocks get printed.
1985 */
1986 this->base_ir = ir->condition;
1987
1988 if (intel->gen == 6) {
1989 emit_if_gen6(ir);
1990 } else {
1991 uint32_t predicate;
1992 emit_bool_to_cond_code(ir->condition, &predicate);
1993 emit(IF(predicate));
1994 }
1995
1996 visit_instructions(&ir->then_instructions);
1997
1998 if (!ir->else_instructions.is_empty()) {
1999 this->base_ir = ir->condition;
2000 emit(BRW_OPCODE_ELSE);
2001
2002 visit_instructions(&ir->else_instructions);
2003 }
2004
2005 this->base_ir = ir->condition;
2006 emit(BRW_OPCODE_ENDIF);
2007 }
2008
2009 void
2010 vec4_visitor::emit_ndc_computation()
2011 {
2012 /* Get the position */
2013 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2014
2015 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2016 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2017 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2018
2019 current_annotation = "NDC";
2020 dst_reg ndc_w = ndc;
2021 ndc_w.writemask = WRITEMASK_W;
2022 src_reg pos_w = pos;
2023 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2024 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2025
2026 dst_reg ndc_xyz = ndc;
2027 ndc_xyz.writemask = WRITEMASK_XYZ;
2028
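/* Note: converting ndc_w (whose writemask is only W) back to a src_reg gives
 * it a .wwww swizzle, so this MUL scales pos.xyz by 1/w.
 */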
2029 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2030 }
2031
2032 void
2033 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2034 {
2035 if (intel->gen < 6 &&
2036 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2037 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2038 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2039 dst_reg header1_w = header1;
2040 header1_w.writemask = WRITEMASK_W;
2041 GLuint i;
2042
2043 emit(MOV(header1, 0u));
2044
2045 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2046 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2047
2048 current_annotation = "Point size";
2049 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2050 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2051 }
2052
2053 current_annotation = "Clipping flags";
2054 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2055 vec4_instruction *inst;
2056
2057 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2058 src_reg(this->userplane[i])));
2059 inst->conditional_mod = BRW_CONDITIONAL_L;
2060
2061 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2062 inst->predicate = BRW_PREDICATE_NORMAL;
2063 }
2064
2065 /* i965 clipping workaround:
2066 * 1) Test for -ve rhw
2067 * 2) If set,
2068 * set ndc = (0,0,0,0)
2069 * set ucp[6] = 1
2070 *
2071 * Later, clipping will detect ucp[6] and ensure the primitive is
2072 * clipped against all fixed planes.
2073 */
2074 if (brw->has_negative_rhw_bug) {
2075 #if 0
2076 /* FINISHME */
2077 brw_CMP(p,
2078 vec8(brw_null_reg()),
2079 BRW_CONDITIONAL_L,
2080 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2081 brw_imm_f(0));
2082
2083 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2084 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2085 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2086 #endif
2087 }
2088
2089 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2090 } else if (intel->gen < 6) {
2091 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2092 } else {
2093 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2094 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2095 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2096 src_reg(output_reg[VERT_RESULT_PSIZ])));
2097 }
2098 }
2099 }
2100
2101 void
2102 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2103 {
2104 if (intel->gen < 6) {
2105 /* Clip distance slots are set aside in gen5, but they are not used. It
2106 * is not clear whether we actually need to set aside space for them,
2107 * but the performance cost is negligible.
2108 */
2109 return;
2110 }
2111
2112 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2113 *
2114 * "If a linked set of shaders forming the vertex stage contains no
2115 * static write to gl_ClipVertex or gl_ClipDistance, but the
2116 * application has requested clipping against user clip planes through
2117 * the API, then the coordinate written to gl_Position is used for
2118 * comparison against the user clip planes."
2119 *
2120 * This function is only called if the shader didn't write to
2121 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2122 * if the user wrote to it; otherwise we use gl_Position.
2123 */
2124 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2125 if (!(c->prog_data.outputs_written
2126 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2127 clip_vertex = VERT_RESULT_HPOS;
2128 }
2129
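/* Each clip-distance slot carries four distances; `offset' selects which
 * group of user planes this slot covers (planes 0-3 for CLIP_DIST0,
 * planes 4-7 for CLIP_DIST1, per the caller in emit_urb_slot()).
 */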
2130 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2131 ++i) {
2132 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2133 src_reg(output_reg[clip_vertex]),
2134 src_reg(this->userplane[i + offset])));
2135 }
2136 }
2137
2138 void
2139 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2140 {
2141 assert (vert_result < VERT_RESULT_MAX);
2142 reg.type = output_reg[vert_result].type;
2143 current_annotation = output_reg_annotation[vert_result];
2144 /* Copy the register, saturating if necessary */
2145 vec4_instruction *inst = emit(MOV(reg,
2146 src_reg(output_reg[vert_result])));
2147 if ((vert_result == VERT_RESULT_COL0 ||
2148 vert_result == VERT_RESULT_COL1 ||
2149 vert_result == VERT_RESULT_BFC0 ||
2150 vert_result == VERT_RESULT_BFC1) &&
2151 c->key.clamp_vertex_color) {
2152 inst->saturate = true;
2153 }
2154 }
2155
2156 void
2157 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2158 {
2159 struct brw_reg hw_reg = brw_message_reg(mrf);
2160 dst_reg reg = dst_reg(MRF, mrf);
2161 reg.type = BRW_REGISTER_TYPE_F;
2162
2163 switch (vert_result) {
2164 case VERT_RESULT_PSIZ:
2165 /* PSIZ is always in slot 0, and is coupled with other flags. */
2166 current_annotation = "indices, point width, clip flags";
2167 emit_psiz_and_flags(hw_reg);
2168 break;
2169 case BRW_VERT_RESULT_NDC:
2170 current_annotation = "NDC";
2171 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2172 break;
2173 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2174 case VERT_RESULT_HPOS:
2175 current_annotation = "gl_Position";
2176 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2177 break;
2178 case VERT_RESULT_CLIP_DIST0:
2179 case VERT_RESULT_CLIP_DIST1:
2180 if (this->c->key.uses_clip_distance) {
2181 emit_generic_urb_slot(reg, vert_result);
2182 } else {
2183 current_annotation = "user clip distances";
2184 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2185 }
2186 break;
2187 case BRW_VERT_RESULT_PAD:
2188 /* No need to write to this slot */
2189 break;
2190 default:
2191 emit_generic_urb_slot(reg, vert_result);
2192 break;
2193 }
2194 }
2195
2196 static int
2197 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2198 {
2199 struct intel_context *intel = &brw->intel;
2200
2201 if (intel->gen >= 6) {
2202 /* URB data written (does not include the message header reg) must
2203 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2204 * section 5.4.3.2.2: URB_INTERLEAVED.
2205 *
2206 * URB entries are allocated on a multiple of 1024 bits, so an
2207 * extra 128 bits written here to make the end align to 256 is
2208 * no problem.
2209 */
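/* A worked example: mlen as passed in from emit_urb_writes() counts the
 * header MRF as well, so forcing mlen to be odd makes the data portion
 * (mlen - 1) an even number of registers.  E.g. mlen = 6 (header plus 5
 * data regs) is bumped to 7 (header plus 6 data regs = 3 x 256 bits).
 */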
2210 if ((mlen % 2) != 1)
2211 mlen++;
2212 }
2213
2214 return mlen;
2215 }
2216
2217 /**
2218 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2219 * complete the VS thread.
2220 *
2221 * The VUE layout is documented in Volume 2a.
2222 */
2223 void
2224 vec4_visitor::emit_urb_writes()
2225 {
2226 /* MRF 0 is reserved for the debugger, so start with message header
2227 * in MRF 1.
2228 */
2229 int base_mrf = 1;
2230 int mrf = base_mrf;
2231 /* In the process of generating our URB write message contents, we
2232 * may need to unspill a register or load from an array. Those
2233 * reads would use MRFs 14-15.
2234 */
2235 int max_usable_mrf = 13;
2236
2237 /* The following assertion verifies that max_usable_mrf causes an
2238 * even-numbered amount of URB write data, which will meet gen6's
2239 * requirements for length alignment.
2240 */
2241 assert ((max_usable_mrf - base_mrf) % 2 == 0);
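/* With base_mrf = 1 and max_usable_mrf = 13 there is room for 12 data MRFs
 * per URB write, i.e. 6 interleaved URB rows, which satisfies the alignment
 * requirement checked above.
 */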
2242
2243 /* FINISHME: edgeflag */
2244
2245 brw_compute_vue_map(&c->vue_map, intel, c->key.userclip_active,
2246 c->prog_data.outputs_written);
2247
2248 /* First mrf is the g0-based message header containing URB handles and such,
2249 * which is implied in VS_OPCODE_URB_WRITE.
2250 */
2251 mrf++;
2252
2253 if (intel->gen < 6) {
2254 emit_ndc_computation();
2255 }
2256
2257 /* Set up the VUE data for the first URB write */
2258 int slot;
2259 for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
2260 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2261
2262 /* If this was max_usable_mrf, we can't fit anything more into this URB
2263 * WRITE.
2264 */
2265 if (mrf > max_usable_mrf) {
2266 slot++;
2267 break;
2268 }
2269 }
2270
2271 current_annotation = "URB write";
2272 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2273 inst->base_mrf = base_mrf;
2274 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2275 inst->eot = (slot >= c->vue_map.num_slots);
2276
2277 /* Optional second URB write */
2278 if (!inst->eot) {
2279 mrf = base_mrf + 1;
2280
2281 for (; slot < c->vue_map.num_slots; ++slot) {
2282 assert(mrf < max_usable_mrf);
2283
2284 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2285 }
2286
2287 current_annotation = "URB write";
2288 inst = emit(VS_OPCODE_URB_WRITE);
2289 inst->base_mrf = base_mrf;
2290 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2291 inst->eot = true;
2292 /* URB destination offset.  In the previous write we used MRFs 1-13;
2293 * the one header MRF doesn't count toward URB data, leaving 12 data
2294 * regs.  URB offset is in URB row increments, and each of our MRFs is
2295 * half of one of those, since we're doing interleaved writes.
2296 */
2297 inst->offset = (max_usable_mrf - base_mrf) / 2;
2298 }
2299 }
2300
2301 src_reg
2302 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2303 src_reg *reladdr, int reg_offset)
2304 {
2305 /* Because we store the values to scratch interleaved like our
2306 * vertex data, we need to scale the vec4 index by 2.
2307 */
2308 int message_header_scale = 2;
2309
2310 /* Pre-gen6, the message header uses byte offsets instead of vec4
2311 * (16-byte) offset units.
2312 */
2313 if (intel->gen < 6)
2314 message_header_scale *= 16;
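/* For example, a constant reg_offset of 3 becomes 3 * 2 = 6 on gen6+
 * (vec4-sized units, doubled for the interleaved layout), or
 * 3 * 2 * 16 = 96 on gen4/5 (byte units).
 */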
2315
2316 if (reladdr) {
2317 src_reg index = src_reg(this, glsl_type::int_type);
2318
2319 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2320 emit_before(inst, MUL(dst_reg(index),
2321 index, src_reg(message_header_scale)));
2322
2323 return index;
2324 } else {
2325 return src_reg(reg_offset * message_header_scale);
2326 }
2327 }
2328
2329 src_reg
2330 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2331 src_reg *reladdr, int reg_offset)
2332 {
2333 if (reladdr) {
2334 src_reg index = src_reg(this, glsl_type::int_type);
2335
2336 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2337
2338 /* Pre-gen6, the message header uses byte offsets instead of vec4
2339 * (16-byte) offset units.
2340 */
2341 if (intel->gen < 6) {
2342 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2343 }
2344
2345 return index;
2346 } else {
2347 int message_header_scale = intel->gen < 6 ? 16 : 1;
2348 return src_reg(reg_offset * message_header_scale);
2349 }
2350 }
2351
2352 /**
2353 * Emits an instruction before @inst to load the value named by @orig_src
2354 * from scratch space at @base_offset to @temp.
2355 */
2356 void
2357 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2358 dst_reg temp, src_reg orig_src,
2359 int base_offset)
2360 {
2361 int reg_offset = base_offset + orig_src.reg_offset;
2362 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2363
2364 emit_before(inst, SCRATCH_READ(temp, index));
2365 }
2366
2367 /**
2368 * Emits an instruction after @inst to store the value to be written
2369 * to @orig_dst to scratch space at @base_offset, from @temp.
2370 */
2371 void
2372 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2373 src_reg temp, dst_reg orig_dst,
2374 int base_offset)
2375 {
2376 int reg_offset = base_offset + orig_dst.reg_offset;
2377 src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2378
2379 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2380 orig_dst.writemask));
2381 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
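/* The scratch write inherits the original instruction's predicate: if the
 * write being rewritten only happens under a predicate, the store to
 * scratch must be guarded the same way.
 */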
2382 write->predicate = inst->predicate;
2383 write->ir = inst->ir;
2384 write->annotation = inst->annotation;
2385 inst->insert_after(write);
2386 }
2387
2388 /**
2389 * We can't generally support array access in GRF space, because a
2390 * single instruction's destination can only span 2 contiguous
2391 * registers. So, we send all GRF arrays that get variable index
2392 * access to scratch space.
2393 */
2394 void
2395 vec4_visitor::move_grf_array_access_to_scratch()
2396 {
2397 int scratch_loc[this->virtual_grf_count];
2398
2399 for (int i = 0; i < this->virtual_grf_count; i++) {
2400 scratch_loc[i] = -1;
2401 }
2402
2403 /* First, calculate the set of virtual GRFs that need to be punted
2404 * to scratch due to having any array access on them, and where in
2405 * scratch they will live.
2406 */
2407 foreach_list(node, &this->instructions) {
2408 vec4_instruction *inst = (vec4_instruction *)node;
2409
2410 if (inst->dst.file == GRF && inst->dst.reladdr &&
2411 scratch_loc[inst->dst.reg] == -1) {
2412 scratch_loc[inst->dst.reg] = c->last_scratch;
2413 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2414 }
2415
2416 for (int i = 0 ; i < 3; i++) {
2417 src_reg *src = &inst->src[i];
2418
2419 if (src->file == GRF && src->reladdr &&
2420 scratch_loc[src->reg] == -1) {
2421 scratch_loc[src->reg] = c->last_scratch;
2422 c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2423 }
2424 }
2425 }
2426
2427 /* Now, for anything that will be accessed through scratch, rewrite
2428 * it to load/store. Note that this is a _safe list walk, because
2429 * we may generate a new scratch_write instruction after the one
2430 * we're processing.
2431 */
2432 foreach_list_safe(node, &this->instructions) {
2433 vec4_instruction *inst = (vec4_instruction *)node;
2434
2435 /* Set up the annotation tracking for newly generated instructions. */
2436 base_ir = inst->ir;
2437 current_annotation = inst->annotation;
2438
2439 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2440 src_reg temp = src_reg(this, glsl_type::vec4_type);
2441
2442 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2443
2444 inst->dst.file = temp.file;
2445 inst->dst.reg = temp.reg;
2446 inst->dst.reg_offset = temp.reg_offset;
2447 inst->dst.reladdr = NULL;
2448 }
2449
2450 for (int i = 0 ; i < 3; i++) {
2451 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2452 continue;
2453
2454 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2455
2456 emit_scratch_read(inst, temp, inst->src[i],
2457 scratch_loc[inst->src[i].reg]);
2458
2459 inst->src[i].file = temp.file;
2460 inst->src[i].reg = temp.reg;
2461 inst->src[i].reg_offset = temp.reg_offset;
2462 inst->src[i].reladdr = NULL;
2463 }
2464 }
2465 }
2466
2467 /**
2468 * Emits an instruction before @inst to load the value named by @orig_src
2469 * from the pull constant buffer (surface) at @base_offset to @temp.
2470 */
2471 void
2472 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2473 dst_reg temp, src_reg orig_src,
2474 int base_offset)
2475 {
2476 int reg_offset = base_offset + orig_src.reg_offset;
2477 src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2478 vec4_instruction *load;
2479
2480 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2481 temp, index);
2482 load->base_mrf = 14;
2483 load->mlen = 1;
2484 emit_before(inst, load);
2485 }
2486
2487 /**
2488 * Implements array access of uniforms by inserting a
2489 * PULL_CONSTANT_LOAD instruction.
2490 *
2491 * Unlike temporary GRF array access (where we don't support it due to
2492 * the difficulty of doing relative addressing on instruction
2493 * destinations), we could potentially do array access of uniforms
2494 * that were loaded in GRF space as push constants. In real-world
2495 * usage we've seen, though, the arrays being used are always larger
2496 * than we could load as push constants, so just always move all
2497 * uniform array access out to a pull constant buffer.
2498 */
2499 void
2500 vec4_visitor::move_uniform_array_access_to_pull_constants()
2501 {
2502 int pull_constant_loc[this->uniforms];
2503
2504 for (int i = 0; i < this->uniforms; i++) {
2505 pull_constant_loc[i] = -1;
2506 }
2507
2508 /* Walk through and find array access of uniforms. Put a copy of that
2509 * uniform in the pull constant buffer.
2510 *
2511 * Note that we don't move constant-indexed accesses to arrays. No
2512 * testing has been done of the performance impact of this choice.
2513 */
2514 foreach_list_safe(node, &this->instructions) {
2515 vec4_instruction *inst = (vec4_instruction *)node;
2516
2517 for (int i = 0 ; i < 3; i++) {
2518 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2519 continue;
2520
2521 int uniform = inst->src[i].reg;
2522
2523 /* If this array isn't already present in the pull constant buffer,
2524 * add it.
2525 */
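/* Rough example: a uniform array occupying 3 vec4 slots appends
 * 3 * 4 = 12 float pointers to pull_param, and pull_constant_loc records
 * the vec4 index at which the array starts.
 */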
2526 if (pull_constant_loc[uniform] == -1) {
2527 const float **values = &prog_data->param[uniform * 4];
2528
2529 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2530
2531 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2532 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2533 }
2534 }
2535
2536 /* Set up the annotation tracking for newly generated instructions. */
2537 base_ir = inst->ir;
2538 current_annotation = inst->annotation;
2539
2540 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2541
2542 emit_pull_constant_load(inst, temp, inst->src[i],
2543 pull_constant_loc[uniform]);
2544
2545 inst->src[i].file = temp.file;
2546 inst->src[i].reg = temp.reg;
2547 inst->src[i].reg_offset = temp.reg_offset;
2548 inst->src[i].reladdr = NULL;
2549 }
2550 }
2551
2552 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2553 * no need to track them as larger-than-vec4 objects. This will be
2554 * relied on in cutting out unused uniform vectors from push
2555 * constants.
2556 */
2557 split_uniform_registers();
2558 }
2559
2560 void
2561 vec4_visitor::resolve_ud_negate(src_reg *reg)
2562 {
2563 if (reg->type != BRW_REGISTER_TYPE_UD ||
2564 !reg->negate)
2565 return;
2566
2567 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2568 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2569 *reg = temp;
2570 }
2571
2572 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2573 struct gl_shader_program *prog,
2574 struct brw_shader *shader)
2575 {
2576 this->c = c;
2577 this->p = &c->func;
2578 this->brw = p->brw;
2579 this->intel = &brw->intel;
2580 this->ctx = &intel->ctx;
2581 this->prog = prog;
2582 this->shader = shader;
2583
2584 this->mem_ctx = ralloc_context(NULL);
2585 this->failed = false;
2586
2587 this->base_ir = NULL;
2588 this->current_annotation = NULL;
2589
2591 this->vp = (struct gl_vertex_program *)
2592 prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2593 this->prog_data = &c->prog_data;
2594
2595 this->variable_ht = hash_table_ctor(0,
2596 hash_table_pointer_hash,
2597 hash_table_pointer_compare);
2598
2599 this->virtual_grf_def = NULL;
2600 this->virtual_grf_use = NULL;
2601 this->virtual_grf_sizes = NULL;
2602 this->virtual_grf_count = 0;
2603 this->virtual_grf_reg_map = NULL;
2604 this->virtual_grf_reg_count = 0;
2605 this->virtual_grf_array_size = 0;
2606 this->live_intervals_valid = false;
2607
2608 this->uniforms = 0;
2609 }
2610
2611 vec4_visitor::~vec4_visitor()
2612 {
2613 ralloc_free(this->mem_ctx);
2614 hash_table_dtor(this->variable_ht);
2615 }
2616
2617
2618 void
2619 vec4_visitor::fail(const char *format, ...)
2620 {
2621 va_list va;
2622 char *msg;
2623
2624 if (failed)
2625 return;
2626
2627 failed = true;
2628
2629 va_start(va, format);
2630 msg = ralloc_vasprintf(mem_ctx, format, va);
2631 va_end(va);
2632 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2633
2634 this->fail_msg = msg;
2635
2636 if (INTEL_DEBUG & DEBUG_VS) {
2637 fprintf(stderr, "%s", msg);
2638 }
2639 }
2640
2641 } /* namespace brw */