i965/vs: Fix leak of an empty hash_table structure per compile.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 src_reg::src_reg(dst_reg reg)
34 {
35 init();
36
37 this->file = reg.file;
38 this->reg = reg.reg;
39 this->reg_offset = reg.reg_offset;
40 this->type = reg.type;
41 this->reladdr = reg.reladdr;
42 this->fixed_hw_reg = reg.fixed_hw_reg;
43
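/* Convert the destination's writemask into a source swizzle: pack the
 * enabled channels to the front and replicate the last enabled channel
 * into any remaining slots.  For example (illustrative), a writemask of
 * .xz yields the swizzle .xzzz, and .w yields .wwww.
 */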
44 int swizzles[4];
45 int next_chan = 0;
46 int last = 0;
47
48 for (int i = 0; i < 4; i++) {
49 if (!(reg.writemask & (1 << i)))
50 continue;
51
52 swizzles[next_chan++] = last = i;
53 }
54
55 for (; next_chan < 4; next_chan++) {
56 swizzles[next_chan] = last;
57 }
58
59 this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
60 swizzles[2], swizzles[3]);
61 }
62
63 dst_reg::dst_reg(src_reg reg)
64 {
65 init();
66
67 this->file = reg.file;
68 this->reg = reg.reg;
69 this->reg_offset = reg.reg_offset;
70 this->type = reg.type;
71 this->writemask = WRITEMASK_XYZW;
72 this->reladdr = reg.reladdr;
73 this->fixed_hw_reg = reg.fixed_hw_reg;
74 }
75
76 vec4_instruction::vec4_instruction(vec4_visitor *v,
77 enum opcode opcode, dst_reg dst,
78 src_reg src0, src_reg src1, src_reg src2)
79 {
80 this->opcode = opcode;
81 this->dst = dst;
82 this->src[0] = src0;
83 this->src[1] = src1;
84 this->src[2] = src2;
85 this->ir = v->base_ir;
86 this->annotation = v->current_annotation;
87 }
88
89 vec4_instruction *
90 vec4_visitor::emit(vec4_instruction *inst)
91 {
92 this->instructions.push_tail(inst);
93
94 return inst;
95 }
96
97 vec4_instruction *
98 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
99 {
100 new_inst->ir = inst->ir;
101 new_inst->annotation = inst->annotation;
102
103 inst->insert_before(new_inst);
104
105 return inst;
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
110 src_reg src0, src_reg src1, src_reg src2)
111 {
112 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
113 src0, src1, src2));
114 }
115
116
117 vec4_instruction *
118 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
119 {
120 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
121 }
122
123 vec4_instruction *
124 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
125 {
126 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
127 }
128
129 vec4_instruction *
130 vec4_visitor::emit(enum opcode opcode)
131 {
132 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
133 }
134
135 #define ALU1(op) \
136 vec4_instruction * \
137 vec4_visitor::op(dst_reg dst, src_reg src0) \
138 { \
139 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
140 src0); \
141 }
142
143 #define ALU2(op) \
144 vec4_instruction * \
145 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
146 { \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU2(ADD)
158 ALU2(MUL)
159 ALU2(MACH)
160 ALU2(AND)
161 ALU2(OR)
162 ALU2(XOR)
163 ALU2(DP3)
164 ALU2(DP4)
165
166 /** Gen4 predicated IF. */
167 vec4_instruction *
168 vec4_visitor::IF(uint32_t predicate)
169 {
170 vec4_instruction *inst;
171
172 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
173 inst->predicate = predicate;
174
175 return inst;
176 }
177
178 /** Gen6+ IF with embedded comparison. */
179 vec4_instruction *
180 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
181 {
182 assert(intel->gen >= 6);
183
184 vec4_instruction *inst;
185
186 resolve_ud_negate(&src0);
187 resolve_ud_negate(&src1);
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
190 src0, src1);
191 inst->conditional_mod = condition;
192
193 return inst;
194 }
195
196 /**
197 * CMP: Sets the low bit of the destination channels with the result
198 * of the comparison, while the upper bits are undefined, and updates
199 * the flag register with the packed 16 bits of the result.
200 */
201 vec4_instruction *
202 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
203 {
204 vec4_instruction *inst;
205
206 /* original gen4 does type conversion to the destination type
207 * before comparison, producing garbage results for floating
208 * point comparisons.
209 */
210 if (intel->gen == 4) {
211 dst.type = src0.type;
212 if (dst.file == HW_REG)
213 dst.fixed_hw_reg.type = dst.type;
214 }
215
216 resolve_ud_negate(&src0);
217 resolve_ud_negate(&src1);
218
219 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
220 inst->conditional_mod = condition;
221
222 return inst;
223 }
224
225 vec4_instruction *
226 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
227 {
228 vec4_instruction *inst;
229
230 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
231 dst, index);
232 inst->base_mrf = 14;
233 inst->mlen = 1;
234
235 return inst;
236 }
237
238 vec4_instruction *
239 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
240 {
241 vec4_instruction *inst;
242
243 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
244 dst, src, index);
245 inst->base_mrf = 13;
246 inst->mlen = 2;
247
248 return inst;
249 }
250
251 void
252 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
253 {
254 static enum opcode dot_opcodes[] = {
255 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
256 };
257
258 emit(dot_opcodes[elements - 2], dst, src0, src1);
259 }
260
261 void
262 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
263 {
264 /* The gen6 math instruction ignores the source modifiers --
265 * swizzle, abs, negate, and at least some parts of the register
266 * region description.
267 *
268 * While it would seem that this MOV could be avoided at this point
269 * in the case that the swizzle is matched up with the destination
270 * writemask, note that uniform packing and register allocation
271 * could rearrange our swizzle, so let's leave this matter up to
272 * copy propagation later.
273 */
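/* Illustrative shape of the emitted code (assuming dst writes only .x):
 *    MOV tmp_src, src        -- strips swizzle/abs/negate for the math op
 *    math tmp_dst, tmp_src   -- full-width align1 math
 *    MOV dst.x, tmp_dst      -- applies the real writemask
 */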
274 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
275 emit(MOV(dst_reg(temp_src), src));
276
277 if (dst.writemask != WRITEMASK_XYZW) {
278 /* The gen6 math instruction must be align1, so we can't do
279 * writemasks.
280 */
281 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
282
283 emit(opcode, temp_dst, temp_src);
284
285 emit(MOV(dst, src_reg(temp_dst)));
286 } else {
287 emit(opcode, dst, temp_src);
288 }
289 }
290
291 void
292 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
293 {
294 vec4_instruction *inst = emit(opcode, dst, src);
295 inst->base_mrf = 1;
296 inst->mlen = 1;
297 }
298
299 void
300 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
301 {
302 switch (opcode) {
303 case SHADER_OPCODE_RCP:
304 case SHADER_OPCODE_RSQ:
305 case SHADER_OPCODE_SQRT:
306 case SHADER_OPCODE_EXP2:
307 case SHADER_OPCODE_LOG2:
308 case SHADER_OPCODE_SIN:
309 case SHADER_OPCODE_COS:
310 break;
311 default:
312 assert(!"not reached: bad math opcode");
313 return;
314 }
315
316 if (intel->gen >= 6) {
317 return emit_math1_gen6(opcode, dst, src);
318 } else {
319 return emit_math1_gen4(opcode, dst, src);
320 }
321 }
322
323 void
324 vec4_visitor::emit_math2_gen6(enum opcode opcode,
325 dst_reg dst, src_reg src0, src_reg src1)
326 {
327 src_reg expanded;
328
329 /* The gen6 math instruction ignores the source modifiers --
330 * swizzle, abs, negate, and at least some parts of the register
331 * region description. Move the sources to temporaries to make it
332 * generally work.
333 */
334
335 expanded = src_reg(this, glsl_type::vec4_type);
336 expanded.type = src0.type;
337 emit(MOV(dst_reg(expanded), src0));
338 src0 = expanded;
339
340 expanded = src_reg(this, glsl_type::vec4_type);
341 expanded.type = src1.type;
342 emit(MOV(dst_reg(expanded), src1));
343 src1 = expanded;
344
345 if (dst.writemask != WRITEMASK_XYZW) {
346 /* The gen6 math instruction must be align1, so we can't do
347 * writemasks.
348 */
349 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
350 temp_dst.type = dst.type;
351
352 emit(opcode, temp_dst, src0, src1);
353
354 emit(MOV(dst, src_reg(temp_dst)));
355 } else {
356 emit(opcode, dst, src0, src1);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen4(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 vec4_instruction *inst = emit(opcode, dst, src0, src1);
365 inst->base_mrf = 1;
366 inst->mlen = 2;
367 }
368
369 void
370 vec4_visitor::emit_math(enum opcode opcode,
371 dst_reg dst, src_reg src0, src_reg src1)
372 {
373 switch (opcode) {
374 case SHADER_OPCODE_POW:
375 case SHADER_OPCODE_INT_QUOTIENT:
376 case SHADER_OPCODE_INT_REMAINDER:
377 break;
378 default:
379 assert(!"not reached: unsupported binary math opcode");
380 return;
381 }
382
383 if (intel->gen >= 6) {
384 return emit_math2_gen6(opcode, dst, src0, src1);
385 } else {
386 return emit_math2_gen4(opcode, dst, src0, src1);
387 }
388 }
389
390 void
391 vec4_visitor::visit_instructions(const exec_list *list)
392 {
393 foreach_list(node, list) {
394 ir_instruction *ir = (ir_instruction *)node;
395
396 base_ir = ir;
397 ir->accept(this);
398 }
399 }
400
401
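/* Returns how many vec4 slots a value of the given type occupies.
 * Worked examples (illustrative): float and vec4 each take one slot,
 * mat4 takes four, vec2[8] takes eight, and struct { vec3 a; float b; }
 * takes two.
 */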
402 static int
403 type_size(const struct glsl_type *type)
404 {
405 unsigned int i;
406 int size;
407
408 switch (type->base_type) {
409 case GLSL_TYPE_UINT:
410 case GLSL_TYPE_INT:
411 case GLSL_TYPE_FLOAT:
412 case GLSL_TYPE_BOOL:
413 if (type->is_matrix()) {
414 return type->matrix_columns;
415 } else {
416 /* Regardless of size of vector, it gets a vec4. This is bad
417 * packing for things like floats, but otherwise arrays become a
418 * mess. Hopefully a later pass over the code can pack scalars
419 * down if appropriate.
420 */
421 return 1;
422 }
423 case GLSL_TYPE_ARRAY:
424 assert(type->length > 0);
425 return type_size(type->fields.array) * type->length;
426 case GLSL_TYPE_STRUCT:
427 size = 0;
428 for (i = 0; i < type->length; i++) {
429 size += type_size(type->fields.structure[i].type);
430 }
431 return size;
432 case GLSL_TYPE_SAMPLER:
433 /* Samplers take up one slot in UNIFORMS[], but they're baked in
434 * at link time.
435 */
436 return 1;
437 default:
438 assert(0);
439 return 0;
440 }
441 }
442
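/* Allocates a new virtual GRF of the given size (in vec4 slots), growing
 * the size and reg-map tracking arrays geometrically as needed, and
 * returns the new register's index.
 */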
443 int
444 vec4_visitor::virtual_grf_alloc(int size)
445 {
446 if (virtual_grf_array_size <= virtual_grf_count) {
447 if (virtual_grf_array_size == 0)
448 virtual_grf_array_size = 16;
449 else
450 virtual_grf_array_size *= 2;
451 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
452 virtual_grf_array_size);
453 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
454 virtual_grf_array_size);
455 }
456 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
457 virtual_grf_reg_count += size;
458 virtual_grf_sizes[virtual_grf_count] = size;
459 return virtual_grf_count++;
460 }
461
462 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
463 {
464 init();
465
466 this->file = GRF;
467 this->reg = v->virtual_grf_alloc(type_size(type));
468
469 if (type->is_array() || type->is_record()) {
470 this->swizzle = BRW_SWIZZLE_NOOP;
471 } else {
472 this->swizzle = swizzle_for_size(type->vector_elements);
473 }
474
475 this->type = brw_type_for_base_type(type);
476 }
477
478 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
479 {
480 init();
481
482 this->file = GRF;
483 this->reg = v->virtual_grf_alloc(type_size(type));
484
485 if (type->is_array() || type->is_record()) {
486 this->writemask = WRITEMASK_XYZW;
487 } else {
488 this->writemask = (1 << type->vector_elements) - 1;
489 }
490
491 this->type = brw_type_for_base_type(type);
492 }
493
494 /* Our support for uniforms is piggy-backed on the struct
495 * gl_fragment_program, because that's where the values actually
496 * get stored, rather than in some global gl_shader_program uniform
497 * store.
498 */
499 int
500 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
501 {
502 unsigned int offset = 0;
503 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
504
505 if (type->is_matrix()) {
506 const glsl_type *column = type->column_type();
507
508 for (unsigned int i = 0; i < type->matrix_columns; i++) {
509 offset += setup_uniform_values(loc + offset, column);
510 }
511
512 return offset;
513 }
514
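/* Illustrative layout for the scalar/vector case below: a vec3 uniform
 * fills one vec4 slot, with the first three param pointers aimed at its
 * components and the fourth at a shared zero pad; a mat3 goes through the
 * matrix case above and occupies three such slots, one per column.
 */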
515 switch (type->base_type) {
516 case GLSL_TYPE_FLOAT:
517 case GLSL_TYPE_UINT:
518 case GLSL_TYPE_INT:
519 case GLSL_TYPE_BOOL:
520 for (unsigned int i = 0; i < type->vector_elements; i++) {
521 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
522 }
523
524 /* Set up pad elements to get things aligned to a vec4 boundary. */
525 for (unsigned int i = type->vector_elements; i < 4; i++) {
526 static float zero = 0;
527
528 c->prog_data.param[this->uniforms * 4 + i] = &zero;
529 }
530
531 /* Track the size of this uniform vector, for future packing of
532 * uniforms.
533 */
534 this->uniform_vector_size[this->uniforms] = type->vector_elements;
535 this->uniforms++;
536
537 return 1;
538
539 case GLSL_TYPE_STRUCT:
540 for (unsigned int i = 0; i < type->length; i++) {
541 offset += setup_uniform_values(loc + offset,
542 type->fields.structure[i].type);
543 }
544 return offset;
545
546 case GLSL_TYPE_ARRAY:
547 for (unsigned int i = 0; i < type->length; i++) {
548 offset += setup_uniform_values(loc + offset, type->fields.array);
549 }
550 return offset;
551
552 case GLSL_TYPE_SAMPLER:
553 /* The sampler takes up a slot, but we don't use any values from it. */
554 return 1;
555
556 default:
557 assert(!"not reached");
558 return 0;
559 }
560 }
561
562 void
563 vec4_visitor::setup_uniform_clipplane_values()
564 {
565 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
566
567 /* Pre-Gen6, we compact clip planes. For example, if the user
568 * enables just clip planes 0, 1, and 3, we will enable clip planes
569 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
570 * plane 2. This simplifies the implementation of the Gen6 clip
571 * thread.
572 *
573 * In Gen6 and later, we don't compact clip planes, because this
574 * simplifies the implementation of gl_ClipDistance.
575 */
576 int compacted_clipplane_index = 0;
577 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
578 if (intel->gen < 6 &&
579 !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
580 continue;
581 }
582 this->uniform_vector_size[this->uniforms] = 4;
583 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
584 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
585 for (int j = 0; j < 4; ++j) {
586 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
587 }
588 ++compacted_clipplane_index;
589 ++this->uniforms;
590 }
591 }
592
593 /* Our support for builtin uniforms is even scarier than non-builtin.
594 * It sits on top of the PROG_STATE_VAR parameters that are
595 * automatically updated from GL context state.
596 */
597 void
598 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
599 {
600 const ir_state_slot *const slots = ir->state_slots;
601 assert(ir->state_slots != NULL);
602
603 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
604 /* This state reference has already been setup by ir_to_mesa,
605 * but we'll get the same index back here. We can reference
606 * ParameterValues directly, since unlike brw_fs.cpp, we never
607 * add new state references during compile.
608 */
609 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
610 (gl_state_index *)slots[i].tokens);
611 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
612
613 this->uniform_vector_size[this->uniforms] = 0;
614 /* Add each of the unique swizzled channels of the element.
615 * This will end up matching the size of the glsl_type of this field.
616 */
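/* For example (illustrative), a scalar state value exposed as .xxxx
 * contributes a single channel and a vector size of 1, while a vec4
 * state value with an .xyzw swizzle contributes all four.
 */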
617 int last_swiz = -1;
618 for (unsigned int j = 0; j < 4; j++) {
619 int swiz = GET_SWZ(slots[i].swizzle, j);
620 if (swiz == last_swiz)
621 break;
622 last_swiz = swiz;
623 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
624 this->uniform_vector_size[this->uniforms]++;
625 }
626 this->uniforms++;
627 }
628 }
629
630 dst_reg *
631 vec4_visitor::variable_storage(ir_variable *var)
632 {
633 return (dst_reg *)hash_table_find(this->variable_ht, var);
634 }
635
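/* Emit instructions that leave the flag register holding the truth value
 * of the given boolean rvalue, and report (via *predicate) which predicate
 * the caller should use: BRW_PREDICATE_NORMAL, or the ALIGN16 ALL4H/ANY4H
 * variants for the vector comparisons handled below.
 */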
636 void
637 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
638 {
639 ir_expression *expr = ir->as_expression();
640
641 *predicate = BRW_PREDICATE_NORMAL;
642
643 if (expr) {
644 src_reg op[2];
645 vec4_instruction *inst;
646
647 assert(expr->get_num_operands() <= 2);
648 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
649 expr->operands[i]->accept(this);
650 op[i] = this->result;
651
652 resolve_ud_negate(&op[i]);
653 }
654
655 switch (expr->operation) {
656 case ir_unop_logic_not:
657 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
658 inst->conditional_mod = BRW_CONDITIONAL_Z;
659 break;
660
661 case ir_binop_logic_xor:
662 inst = emit(XOR(dst_null_d(), op[0], op[1]));
663 inst->conditional_mod = BRW_CONDITIONAL_NZ;
664 break;
665
666 case ir_binop_logic_or:
667 inst = emit(OR(dst_null_d(), op[0], op[1]));
668 inst->conditional_mod = BRW_CONDITIONAL_NZ;
669 break;
670
671 case ir_binop_logic_and:
672 inst = emit(AND(dst_null_d(), op[0], op[1]));
673 inst->conditional_mod = BRW_CONDITIONAL_NZ;
674 break;
675
676 case ir_unop_f2b:
677 if (intel->gen >= 6) {
678 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
679 } else {
680 inst = emit(MOV(dst_null_f(), op[0]));
681 inst->conditional_mod = BRW_CONDITIONAL_NZ;
682 }
683 break;
684
685 case ir_unop_i2b:
686 if (intel->gen >= 6) {
687 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
688 } else {
689 inst = emit(MOV(dst_null_d(), op[0]));
690 inst->conditional_mod = BRW_CONDITIONAL_NZ;
691 }
692 break;
693
694 case ir_binop_all_equal:
695 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
696 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
697 break;
698
699 case ir_binop_any_nequal:
700 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
701 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
702 break;
703
704 case ir_unop_any:
705 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
706 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
707 break;
708
709 case ir_binop_greater:
710 case ir_binop_gequal:
711 case ir_binop_less:
712 case ir_binop_lequal:
713 case ir_binop_equal:
714 case ir_binop_nequal:
715 emit(CMP(dst_null_d(), op[0], op[1],
716 brw_conditional_for_comparison(expr->operation)));
717 break;
718
719 default:
720 assert(!"not reached");
721 break;
722 }
723 return;
724 }
725
726 ir->accept(this);
727
728 resolve_ud_negate(&this->result);
729
730 if (intel->gen >= 6) {
731 vec4_instruction *inst = emit(AND(dst_null_d(),
732 this->result, src_reg(1)));
733 inst->conditional_mod = BRW_CONDITIONAL_NZ;
734 } else {
735 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
736 inst->conditional_mod = BRW_CONDITIONAL_NZ;
737 }
738 }
739
740 /**
741 * Emit a gen6 IF statement with the comparison folded into the IF
742 * instruction.
743 */
744 void
745 vec4_visitor::emit_if_gen6(ir_if *ir)
746 {
747 ir_expression *expr = ir->condition->as_expression();
748
749 if (expr) {
750 src_reg op[2];
751 dst_reg temp;
752
753 assert(expr->get_num_operands() <= 2);
754 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
755 expr->operands[i]->accept(this);
756 op[i] = this->result;
757 }
758
759 switch (expr->operation) {
760 case ir_unop_logic_not:
761 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
762 return;
763
764 case ir_binop_logic_xor:
765 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
766 return;
767
768 case ir_binop_logic_or:
769 temp = dst_reg(this, glsl_type::bool_type);
770 emit(OR(temp, op[0], op[1]));
771 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
772 return;
773
774 case ir_binop_logic_and:
775 temp = dst_reg(this, glsl_type::bool_type);
776 emit(AND(temp, op[0], op[1]));
777 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
778 return;
779
780 case ir_unop_f2b:
781 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
782 return;
783
784 case ir_unop_i2b:
785 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
786 return;
787
788 case ir_binop_greater:
789 case ir_binop_gequal:
790 case ir_binop_less:
791 case ir_binop_lequal:
792 case ir_binop_equal:
793 case ir_binop_nequal:
794 emit(IF(op[0], op[1],
795 brw_conditional_for_comparison(expr->operation)));
796 return;
797
798 case ir_binop_all_equal:
799 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
800 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
801 return;
802
803 case ir_binop_any_nequal:
804 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
805 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
806 return;
807
808 case ir_unop_any:
809 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
810 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
811 return;
812
813 default:
814 assert(!"not reached");
815 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
816 return;
817 }
818 return;
819 }
820
821 ir->condition->accept(this);
822
823 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
824 }
825
826 void
827 vec4_visitor::visit(ir_variable *ir)
828 {
829 dst_reg *reg = NULL;
830
831 if (variable_storage(ir))
832 return;
833
834 switch (ir->mode) {
835 case ir_var_in:
836 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
837
838 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
839 * come in as floating point conversions of the integer values.
840 */
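/* For example (illustrative), a GL_FIXED attribute value of 1.0 arrives
 * as 65536.0f, and the MUL below scales it back to 1.0f.
 */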
841 for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
842 if (!c->key.gl_fixed_input_size[i])
843 continue;
844
845 dst_reg dst = *reg;
846 dst.type = brw_type_for_base_type(ir->type);
847 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
848 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
849 }
850 break;
851
852 case ir_var_out:
853 reg = new(mem_ctx) dst_reg(this, ir->type);
854
855 for (int i = 0; i < type_size(ir->type); i++) {
856 output_reg[ir->location + i] = *reg;
857 output_reg[ir->location + i].reg_offset = i;
858 output_reg[ir->location + i].type =
859 brw_type_for_base_type(ir->type->get_scalar_type());
860 output_reg_annotation[ir->location + i] = ir->name;
861 }
862 break;
863
864 case ir_var_auto:
865 case ir_var_temporary:
866 reg = new(mem_ctx) dst_reg(this, ir->type);
867 break;
868
869 case ir_var_uniform:
870 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
871
872 /* Track how big the whole uniform variable is, in case we need to put a
873 * copy of its data into pull constants for array access.
874 */
875 this->uniform_size[this->uniforms] = type_size(ir->type);
876
877 if (!strncmp(ir->name, "gl_", 3)) {
878 setup_builtin_uniform_values(ir);
879 } else {
880 setup_uniform_values(ir->location, ir->type);
881 }
882 break;
883
884 case ir_var_system_value:
885 /* VertexID is stored by the VF as the last vertex element, but
886 * we don't represent it with a flag in inputs_read, so we call
887 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
888 */
889 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
890 prog_data->uses_vertexid = true;
891
892 switch (ir->location) {
893 case SYSTEM_VALUE_VERTEX_ID:
894 reg->writemask = WRITEMASK_X;
895 break;
896 case SYSTEM_VALUE_INSTANCE_ID:
897 reg->writemask = WRITEMASK_Y;
898 break;
899 default:
900 assert(!"not reached");
901 break;
902 }
903 break;
904
905 default:
906 assert(!"not reached");
907 }
908
909 reg->type = brw_type_for_base_type(ir->type);
910 hash_table_insert(this->variable_ht, reg, ir);
911 }
912
913 void
914 vec4_visitor::visit(ir_loop *ir)
915 {
916 dst_reg counter;
917
918 /* We don't want debugging output to print the whole body of the
919 * loop as the annotation.
920 */
921 this->base_ir = NULL;
922
923 if (ir->counter != NULL) {
924 this->base_ir = ir->counter;
925 ir->counter->accept(this);
926 counter = *(variable_storage(ir->counter));
927
928 if (ir->from != NULL) {
929 this->base_ir = ir->from;
930 ir->from->accept(this);
931
932 emit(MOV(counter, this->result));
933 }
934 }
935
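/* Rough shape of the emitted loop (illustrative):
 *    [MOV counter, from]
 *    DO
 *       [CMP counter, to;  predicated BREAK]
 *       <body>
 *       [ADD counter, counter, increment]
 *    WHILE
 */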
936 emit(BRW_OPCODE_DO);
937
938 if (ir->to) {
939 this->base_ir = ir->to;
940 ir->to->accept(this);
941
942 emit(CMP(dst_null_d(), src_reg(counter), this->result,
943 brw_conditional_for_comparison(ir->cmp)));
944
945 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
946 inst->predicate = BRW_PREDICATE_NORMAL;
947 }
948
949 visit_instructions(&ir->body_instructions);
950
951
952 if (ir->increment) {
953 this->base_ir = ir->increment;
954 ir->increment->accept(this);
955 emit(ADD(counter, src_reg(counter), this->result));
956 }
957
958 emit(BRW_OPCODE_WHILE);
959 }
960
961 void
962 vec4_visitor::visit(ir_loop_jump *ir)
963 {
964 switch (ir->mode) {
965 case ir_loop_jump::jump_break:
966 emit(BRW_OPCODE_BREAK);
967 break;
968 case ir_loop_jump::jump_continue:
969 emit(BRW_OPCODE_CONTINUE);
970 break;
971 }
972 }
973
974
975 void
976 vec4_visitor::visit(ir_function_signature *ir)
977 {
978 assert(0);
979 (void)ir;
980 }
981
982 void
983 vec4_visitor::visit(ir_function *ir)
984 {
985 /* Ignore function bodies other than main() -- we shouldn't see calls to
986 * them since they should all be inlined.
987 */
988 if (strcmp(ir->name, "main") == 0) {
989 const ir_function_signature *sig;
990 exec_list empty;
991
992 sig = ir->matching_signature(&empty);
993
994 assert(sig);
995
996 visit_instructions(&sig->body);
997 }
998 }
999
1000 bool
1001 vec4_visitor::try_emit_sat(ir_expression *ir)
1002 {
1003 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1004 if (!sat_src)
1005 return false;
1006
1007 sat_src->accept(this);
1008 src_reg src = this->result;
1009
1010 this->result = src_reg(this, ir->type);
1011 vec4_instruction *inst;
1012 inst = emit(MOV(dst_reg(this->result), src));
1013 inst->saturate = true;
1014
1015 return true;
1016 }
1017
1018 void
1019 vec4_visitor::emit_bool_comparison(unsigned int op,
1020 dst_reg dst, src_reg src0, src_reg src1)
1021 {
1022 /* original gen4 does destination conversion before comparison. */
1023 if (intel->gen < 5)
1024 dst.type = src0.type;
1025
1026 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1027
1028 dst.type = BRW_REGISTER_TYPE_D;
1029 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1030 }
1031
1032 void
1033 vec4_visitor::visit(ir_expression *ir)
1034 {
1035 unsigned int operand;
1036 src_reg op[Elements(ir->operands)];
1037 src_reg result_src;
1038 dst_reg result_dst;
1039 vec4_instruction *inst;
1040
1041 if (try_emit_sat(ir))
1042 return;
1043
1044 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1045 this->result.file = BAD_FILE;
1046 ir->operands[operand]->accept(this);
1047 if (this->result.file == BAD_FILE) {
1048 printf("Failed to get tree for expression operand:\n");
1049 ir->operands[operand]->print();
1050 exit(1);
1051 }
1052 op[operand] = this->result;
1053
1054 /* Matrix expression operands should have been broken down to vector
1055 * operations already.
1056 */
1057 assert(!ir->operands[operand]->type->is_matrix());
1058 }
1059
1060 int vector_elements = ir->operands[0]->type->vector_elements;
1061 if (ir->operands[1]) {
1062 vector_elements = MAX2(vector_elements,
1063 ir->operands[1]->type->vector_elements);
1064 }
1065
1066 this->result.file = BAD_FILE;
1067
1068 /* Storage for our result. Ideally for an assignment we'd be using
1069 * the actual storage for the result here, instead.
1070 */
1071 result_src = src_reg(this, ir->type);
1072 /* convenience for the emit functions below. */
1073 result_dst = dst_reg(result_src);
1074 /* If nothing special happens, this is the result. */
1075 this->result = result_src;
1076 /* Limit writes to the channels that will be used by result_src later.
1077 * This does limit this temp's use as a temporary for multi-instruction
1078 * sequences.
1079 */
1080 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1081
1082 switch (ir->operation) {
1083 case ir_unop_logic_not:
1084 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1085 * one's complement of the whole register, not just bit 0.
1086 */
1087 emit(XOR(result_dst, op[0], src_reg(1)));
1088 break;
1089 case ir_unop_neg:
1090 op[0].negate = !op[0].negate;
1091 this->result = op[0];
1092 break;
1093 case ir_unop_abs:
1094 op[0].abs = true;
1095 op[0].negate = false;
1096 this->result = op[0];
1097 break;
1098
1099 case ir_unop_sign:
1100 emit(MOV(result_dst, src_reg(0.0f)));
1101
1102 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1103 inst = emit(MOV(result_dst, src_reg(1.0f)));
1104 inst->predicate = BRW_PREDICATE_NORMAL;
1105
1106 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1107 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1108 inst->predicate = BRW_PREDICATE_NORMAL;
1109
1110 break;
1111
1112 case ir_unop_rcp:
1113 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1114 break;
1115
1116 case ir_unop_exp2:
1117 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1118 break;
1119 case ir_unop_log2:
1120 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1121 break;
1122 case ir_unop_exp:
1123 case ir_unop_log:
1124 assert(!"not reached: should be handled by ir_explog_to_explog2");
1125 break;
1126 case ir_unop_sin:
1127 case ir_unop_sin_reduced:
1128 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1129 break;
1130 case ir_unop_cos:
1131 case ir_unop_cos_reduced:
1132 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1133 break;
1134
1135 case ir_unop_dFdx:
1136 case ir_unop_dFdy:
1137 assert(!"derivatives not valid in vertex shader");
1138 break;
1139
1140 case ir_unop_noise:
1141 assert(!"not reached: should be handled by lower_noise");
1142 break;
1143
1144 case ir_binop_add:
1145 emit(ADD(result_dst, op[0], op[1]));
1146 break;
1147 case ir_binop_sub:
1148 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1149 break;
1150
1151 case ir_binop_mul:
1152 if (ir->type->is_integer()) {
1153 /* For integer multiplication, the MUL uses the low 16 bits
1154 * of one of the operands (src0 on gen6, src1 on gen7). The
1155 * MACH accumulates in the contribution of the upper 16 bits
1156 * of that operand.
1157 *
1158 * FINISHME: Emit just the MUL if we know an operand is small
1159 * enough.
1160 */
1161 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1162
1163 emit(MUL(acc, op[0], op[1]));
1164 emit(MACH(dst_null_d(), op[0], op[1]));
1165 emit(MOV(result_dst, src_reg(acc)));
1166 } else {
1167 emit(MUL(result_dst, op[0], op[1]));
1168 }
1169 break;
1170 case ir_binop_div:
1171 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1172 assert(ir->type->is_integer());
1173 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1174 break;
1175 case ir_binop_mod:
1176 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1177 assert(ir->type->is_integer());
1178 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1179 break;
1180
1181 case ir_binop_less:
1182 case ir_binop_greater:
1183 case ir_binop_lequal:
1184 case ir_binop_gequal:
1185 case ir_binop_equal:
1186 case ir_binop_nequal: {
1187 emit(CMP(result_dst, op[0], op[1],
1188 brw_conditional_for_comparison(ir->operation)));
1189 emit(AND(result_dst, result_src, src_reg(0x1)));
1190 break;
1191 }
1192
1193 case ir_binop_all_equal:
1194 /* "==" operator producing a scalar boolean. */
1195 if (ir->operands[0]->type->is_vector() ||
1196 ir->operands[1]->type->is_vector()) {
1197 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1198 emit(MOV(result_dst, src_reg(0)));
1199 inst = emit(MOV(result_dst, src_reg(1)));
1200 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1201 } else {
1202 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1203 emit(AND(result_dst, result_src, src_reg(0x1)));
1204 }
1205 break;
1206 case ir_binop_any_nequal:
1207 /* "!=" operator producing a scalar boolean. */
1208 if (ir->operands[0]->type->is_vector() ||
1209 ir->operands[1]->type->is_vector()) {
1210 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1211
1212 emit(MOV(result_dst, src_reg(0)));
1213 inst = emit(MOV(result_dst, src_reg(1)));
1214 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1215 } else {
1216 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1217 emit(AND(result_dst, result_src, src_reg(0x1)));
1218 }
1219 break;
1220
1221 case ir_unop_any:
1222 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1223 emit(MOV(result_dst, src_reg(0)));
1224
1225 inst = emit(MOV(result_dst, src_reg(1)));
1226 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1227 break;
1228
1229 case ir_binop_logic_xor:
1230 emit(XOR(result_dst, op[0], op[1]));
1231 break;
1232
1233 case ir_binop_logic_or:
1234 emit(OR(result_dst, op[0], op[1]));
1235 break;
1236
1237 case ir_binop_logic_and:
1238 emit(AND(result_dst, op[0], op[1]));
1239 break;
1240
1241 case ir_binop_dot:
1242 assert(ir->operands[0]->type->is_vector());
1243 assert(ir->operands[0]->type == ir->operands[1]->type);
1244 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1245 break;
1246
1247 case ir_unop_sqrt:
1248 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1249 break;
1250 case ir_unop_rsq:
1251 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1252 break;
1253 case ir_unop_i2f:
1254 case ir_unop_i2u:
1255 case ir_unop_u2i:
1256 case ir_unop_u2f:
1257 case ir_unop_b2f:
1258 case ir_unop_b2i:
1259 case ir_unop_f2i:
1260 emit(MOV(result_dst, op[0]));
1261 break;
1262 case ir_unop_f2b:
1263 case ir_unop_i2b: {
1264 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1265 emit(AND(result_dst, result_src, src_reg(1)));
1266 break;
1267 }
1268
1269 case ir_unop_trunc:
1270 emit(RNDZ(result_dst, op[0]));
1271 break;
1272 case ir_unop_ceil:
1273 op[0].negate = !op[0].negate;
1274 inst = emit(RNDD(result_dst, op[0]));
1275 this->result.negate = true;
1276 break;
1277 case ir_unop_floor:
1278 inst = emit(RNDD(result_dst, op[0]));
1279 break;
1280 case ir_unop_fract:
1281 inst = emit(FRC(result_dst, op[0]));
1282 break;
1283 case ir_unop_round_even:
1284 emit(RNDE(result_dst, op[0]));
1285 break;
1286
1287 case ir_binop_min:
1288 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1289
1290 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1291 inst->predicate = BRW_PREDICATE_NORMAL;
1292 break;
1293 case ir_binop_max:
1294 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1295
1296 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1297 inst->predicate = BRW_PREDICATE_NORMAL;
1298 break;
1299
1300 case ir_binop_pow:
1301 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1302 break;
1303
1304 case ir_unop_bit_not:
1305 inst = emit(NOT(result_dst, op[0]));
1306 break;
1307 case ir_binop_bit_and:
1308 inst = emit(AND(result_dst, op[0], op[1]));
1309 break;
1310 case ir_binop_bit_xor:
1311 inst = emit(XOR(result_dst, op[0], op[1]));
1312 break;
1313 case ir_binop_bit_or:
1314 inst = emit(OR(result_dst, op[0], op[1]));
1315 break;
1316
1317 case ir_binop_lshift:
1318 inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1319 break;
1320
1321 case ir_binop_rshift:
1322 if (ir->type->base_type == GLSL_TYPE_INT)
1323 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1324 else
1325 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1326 break;
1327
1328 case ir_quadop_vector:
1329 assert(!"not reached: should be handled by lower_quadop_vector");
1330 break;
1331 }
1332 }
1333
1334
1335 void
1336 vec4_visitor::visit(ir_swizzle *ir)
1337 {
1338 src_reg src;
1339 int i = 0;
1340 int swizzle[4];
1341
1342 /* Note that this is only swizzles in expressions, not those on the left
1343 * hand side of an assignment, which do write masking. See ir_assignment
1344 * for that.
1345 */
1346
1347 ir->val->accept(this);
1348 src = this->result;
1349 assert(src.file != BAD_FILE);
1350
1351 for (i = 0; i < ir->type->vector_elements; i++) {
1352 switch (i) {
1353 case 0:
1354 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1355 break;
1356 case 1:
1357 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1358 break;
1359 case 2:
1360 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1361 break;
1362 case 3:
1363 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1364 break;
1365 }
1366 }
1367 for (; i < 4; i++) {
1368 /* Replicate the last channel out. */
1369 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1370 }
1371
1372 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1373
1374 this->result = src;
1375 }
1376
1377 void
1378 vec4_visitor::visit(ir_dereference_variable *ir)
1379 {
1380 const struct glsl_type *type = ir->type;
1381 dst_reg *reg = variable_storage(ir->var);
1382
1383 if (!reg) {
1384 fail("Failed to find variable storage for %s\n", ir->var->name);
1385 this->result = src_reg(brw_null_reg());
1386 return;
1387 }
1388
1389 this->result = src_reg(*reg);
1390
1391 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1392 this->result.swizzle = swizzle_for_size(type->vector_elements);
1393 }
1394
1395 void
1396 vec4_visitor::visit(ir_dereference_array *ir)
1397 {
1398 ir_constant *constant_index;
1399 src_reg src;
1400 int element_size = type_size(ir->type);
1401
1402 constant_index = ir->array_index->constant_expression_value();
1403
1404 ir->array->accept(this);
1405 src = this->result;
1406
1407 if (constant_index) {
1408 src.reg_offset += constant_index->value.i[0] * element_size;
1409 } else {
1410 /* Variable index array dereference. It eats the "vec4" of the
1411 * base of the array and an index that offsets the Mesa register
1412 * index.
1413 */
1414 ir->array_index->accept(this);
1415
1416 src_reg index_reg;
1417
1418 if (element_size == 1) {
1419 index_reg = this->result;
1420 } else {
1421 index_reg = src_reg(this, glsl_type::int_type);
1422
1423 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1424 }
1425
1426 if (src.reladdr) {
1427 src_reg temp = src_reg(this, glsl_type::int_type);
1428
1429 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1430
1431 index_reg = temp;
1432 }
1433
1434 src.reladdr = ralloc(mem_ctx, src_reg);
1435 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1436 }
1437
1438 /* If the type is smaller than a vec4, replicate the last channel out. */
1439 if (ir->type->is_scalar() || ir->type->is_vector())
1440 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1441 else
1442 src.swizzle = BRW_SWIZZLE_NOOP;
1443 src.type = brw_type_for_base_type(ir->type);
1444
1445 this->result = src;
1446 }
1447
1448 void
1449 vec4_visitor::visit(ir_dereference_record *ir)
1450 {
1451 unsigned int i;
1452 const glsl_type *struct_type = ir->record->type;
1453 int offset = 0;
1454
1455 ir->record->accept(this);
1456
1457 for (i = 0; i < struct_type->length; i++) {
1458 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1459 break;
1460 offset += type_size(struct_type->fields.structure[i].type);
1461 }
1462
1463 /* If the type is smaller than a vec4, replicate the last channel out. */
1464 if (ir->type->is_scalar() || ir->type->is_vector())
1465 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1466 else
1467 this->result.swizzle = BRW_SWIZZLE_NOOP;
1468 this->result.type = brw_type_for_base_type(ir->type);
1469
1470 this->result.reg_offset += offset;
1471 }
1472
1473 /**
1474 * We want to be careful in assignment setup to hit the actual storage
1475 * instead of potentially using a temporary like we might with the
1476 * ir_dereference handler.
1477 */
1478 static dst_reg
1479 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1480 {
1481 /* The LHS must be a dereference. If the LHS is a variable indexed array
1482 * access of a vector, it must be separated into a series of conditional moves
1483 * before reaching this point (see ir_vec_index_to_cond_assign).
1484 */
1485 assert(ir->as_dereference());
1486 ir_dereference_array *deref_array = ir->as_dereference_array();
1487 if (deref_array) {
1488 assert(!deref_array->array->type->is_vector());
1489 }
1490
1491 /* Use the rvalue deref handler for the most part. We'll ignore
1492 * swizzles in it and write swizzles using writemask, though.
1493 */
1494 ir->accept(v);
1495 return dst_reg(v->result);
1496 }
1497
1498 void
1499 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1500 const struct glsl_type *type, uint32_t predicate)
1501 {
1502 if (type->base_type == GLSL_TYPE_STRUCT) {
1503 for (unsigned int i = 0; i < type->length; i++) {
1504 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1505 }
1506 return;
1507 }
1508
1509 if (type->is_array()) {
1510 for (unsigned int i = 0; i < type->length; i++) {
1511 emit_block_move(dst, src, type->fields.array, predicate);
1512 }
1513 return;
1514 }
1515
1516 if (type->is_matrix()) {
1517 const struct glsl_type *vec_type;
1518
1519 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1520 type->vector_elements, 1);
1521
1522 for (int i = 0; i < type->matrix_columns; i++) {
1523 emit_block_move(dst, src, vec_type, predicate);
1524 }
1525 return;
1526 }
1527
1528 assert(type->is_scalar() || type->is_vector());
1529
1530 dst->type = brw_type_for_base_type(type);
1531 src->type = dst->type;
1532
1533 dst->writemask = (1 << type->vector_elements) - 1;
1534
1535 /* Do we need to worry about swizzling a swizzle? */
1536 assert(src->swizzle == BRW_SWIZZLE_NOOP
1537 || src->swizzle == swizzle_for_size(type->vector_elements));
1538 src->swizzle = swizzle_for_size(type->vector_elements);
1539
1540 vec4_instruction *inst = emit(MOV(*dst, *src));
1541 inst->predicate = predicate;
1542
1543 dst->reg_offset++;
1544 src->reg_offset++;
1545 }
1546
1547
1548 /* If the RHS processing resulted in an instruction generating a
1549 * temporary value, and it would be easy to rewrite the instruction to
1550 * generate its result right into the LHS instead, do so. This ends
1551 * up reliably removing instructions where it can be tricky to do so
1552 * later without real UD chain information.
1553 */
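/* For example (illustrative), for "a = b + c;" the ADD that wrote a
 * temporary GRF is retargeted to write a's storage directly, and the
 * trailing MOV is never emitted.
 */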
1554 bool
1555 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1556 dst_reg dst,
1557 src_reg src,
1558 vec4_instruction *pre_rhs_inst,
1559 vec4_instruction *last_rhs_inst)
1560 {
1561 /* This could be supported, but it would take more smarts. */
1562 if (ir->condition)
1563 return false;
1564
1565 if (pre_rhs_inst == last_rhs_inst)
1566 return false; /* No instructions generated to work with. */
1567
1568 /* Make sure the last instruction generated our source reg. */
1569 if (src.file != GRF ||
1570 src.file != last_rhs_inst->dst.file ||
1571 src.reg != last_rhs_inst->dst.reg ||
1572 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1573 src.reladdr ||
1574 src.abs ||
1575 src.negate ||
1576 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1577 return false;
1578
1579 /* Check that the last instruction fully initialized the channels
1580 * we want to use, in the order we want to use them. We could
1581 * potentially reswizzle the operands of many instructions so that
1582 * we could handle out of order channels, but don't yet.
1583 */
1584
1585 for (unsigned i = 0; i < 4; i++) {
1586 if (dst.writemask & (1 << i)) {
1587 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1588 return false;
1589
1590 if (BRW_GET_SWZ(src.swizzle, i) != i)
1591 return false;
1592 }
1593 }
1594
1595 /* Success! Rewrite the instruction. */
1596 last_rhs_inst->dst.file = dst.file;
1597 last_rhs_inst->dst.reg = dst.reg;
1598 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1599 last_rhs_inst->dst.reladdr = dst.reladdr;
1600 last_rhs_inst->dst.writemask &= dst.writemask;
1601
1602 return true;
1603 }
1604
1605 void
1606 vec4_visitor::visit(ir_assignment *ir)
1607 {
1608 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1609 uint32_t predicate = BRW_PREDICATE_NONE;
1610
1611 if (!ir->lhs->type->is_scalar() &&
1612 !ir->lhs->type->is_vector()) {
1613 ir->rhs->accept(this);
1614 src_reg src = this->result;
1615
1616 if (ir->condition) {
1617 emit_bool_to_cond_code(ir->condition, &predicate);
1618 }
1619
1620 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1621 return;
1622 }
1623
1624 /* Now we're down to just a scalar/vector with writemasks. */
1625 int i;
1626
1627 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1628 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1629
1630 ir->rhs->accept(this);
1631
1632 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1633
1634 src_reg src = this->result;
1635
1636 int swizzles[4];
1637 int first_enabled_chan = 0;
1638 int src_chan = 0;
1639
1640 assert(ir->lhs->type->is_vector() ||
1641 ir->lhs->type->is_scalar());
1642 dst.writemask = ir->write_mask;
1643
1644 for (int i = 0; i < 4; i++) {
1645 if (dst.writemask & (1 << i)) {
1646 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1647 break;
1648 }
1649 }
1650
1651 /* Swizzle a small RHS vector into the channels being written.
1652 *
1653 * glsl ir treats write_mask as dictating how many channels are
1654 * present on the RHS while in our instructions we need to make
1655 * those channels appear in the slots of the vec4 they're written to.
1656 */
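/* For example (illustrative), assigning a vec2 RHS into dst.yz turns the
 * RHS's natural .xyyy swizzle into .yxyy, so RHS.x lands in the Y slot and
 * RHS.y in the Z slot; unwritten slots just repeat the first enabled
 * channel's swizzle.
 */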
1657 for (int i = 0; i < 4; i++) {
1658 if (dst.writemask & (1 << i))
1659 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1660 else
1661 swizzles[i] = first_enabled_chan;
1662 }
1663 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1664 swizzles[2], swizzles[3]);
1665
1666 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1667 return;
1668 }
1669
1670 if (ir->condition) {
1671 emit_bool_to_cond_code(ir->condition, &predicate);
1672 }
1673
1674 for (i = 0; i < type_size(ir->lhs->type); i++) {
1675 vec4_instruction *inst = emit(MOV(dst, src));
1676 inst->predicate = predicate;
1677
1678 dst.reg_offset++;
1679 src.reg_offset++;
1680 }
1681 }
1682
1683 void
1684 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1685 {
1686 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1687 foreach_list(node, &ir->components) {
1688 ir_constant *field_value = (ir_constant *)node;
1689
1690 emit_constant_values(dst, field_value);
1691 }
1692 return;
1693 }
1694
1695 if (ir->type->is_array()) {
1696 for (unsigned int i = 0; i < ir->type->length; i++) {
1697 emit_constant_values(dst, ir->array_elements[i]);
1698 }
1699 return;
1700 }
1701
1702 if (ir->type->is_matrix()) {
1703 for (int i = 0; i < ir->type->matrix_columns; i++) {
1704 float *vec = &ir->value.f[i * ir->type->vector_elements];
1705
1706 for (int j = 0; j < ir->type->vector_elements; j++) {
1707 dst->writemask = 1 << j;
1708 dst->type = BRW_REGISTER_TYPE_F;
1709
1710 emit(MOV(*dst, src_reg(vec[j])));
1711 }
1712 dst->reg_offset++;
1713 }
1714 return;
1715 }
1716
1717 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1718
1719 for (int i = 0; i < ir->type->vector_elements; i++) {
1720 if (!(remaining_writemask & (1 << i)))
1721 continue;
1722
1723 dst->writemask = 1 << i;
1724 dst->type = brw_type_for_base_type(ir->type);
1725
1726 /* Find other components that match the one we're about to
1727 * write. Emits fewer instructions for things like vec4(0.5,
1728 * 1.5, 1.5, 1.5).
1729 */
1730 for (int j = i + 1; j < ir->type->vector_elements; j++) {
1731 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1732 if (ir->value.b[i] == ir->value.b[j])
1733 dst->writemask |= (1 << j);
1734 } else {
1735 /* u, i, and f storage all line up, so no need for a
1736 * switch case for comparing each type.
1737 */
1738 if (ir->value.u[i] == ir->value.u[j])
1739 dst->writemask |= (1 << j);
1740 }
1741 }
1742
1743 switch (ir->type->base_type) {
1744 case GLSL_TYPE_FLOAT:
1745 emit(MOV(*dst, src_reg(ir->value.f[i])));
1746 break;
1747 case GLSL_TYPE_INT:
1748 emit(MOV(*dst, src_reg(ir->value.i[i])));
1749 break;
1750 case GLSL_TYPE_UINT:
1751 emit(MOV(*dst, src_reg(ir->value.u[i])));
1752 break;
1753 case GLSL_TYPE_BOOL:
1754 emit(MOV(*dst, src_reg(ir->value.b[i])));
1755 break;
1756 default:
1757 assert(!"Non-float/uint/int/bool constant");
1758 break;
1759 }
1760
1761 remaining_writemask &= ~dst->writemask;
1762 }
1763 dst->reg_offset++;
1764 }
1765
1766 void
1767 vec4_visitor::visit(ir_constant *ir)
1768 {
1769 dst_reg dst = dst_reg(this, ir->type);
1770 this->result = src_reg(dst);
1771
1772 emit_constant_values(&dst, ir);
1773 }
1774
1775 void
1776 vec4_visitor::visit(ir_call *ir)
1777 {
1778 assert(!"not reached");
1779 }
1780
1781 void
1782 vec4_visitor::visit(ir_texture *ir)
1783 {
1784 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1785 sampler = vp->Base.SamplerUnits[sampler];
1786
1787 /* Should be lowered by do_lower_texture_projection */
1788 assert(!ir->projector);
1789
1790 vec4_instruction *inst = NULL;
1791 switch (ir->op) {
1792 case ir_tex:
1793 case ir_txl:
1794 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1795 break;
1796 case ir_txd:
1797 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1798 break;
1799 case ir_txf:
1800 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1801 break;
1802 case ir_txs:
1803 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1804 break;
1805 case ir_txb:
1806 assert(!"TXB is not valid for vertex shaders.");
1807 }
1808
1809 /* Texel offsets go in the message header; Gen4 also requires headers. */
1810 inst->header_present = ir->offset || intel->gen < 5;
1811 inst->base_mrf = 2;
1812 inst->mlen = inst->header_present + 1; /* always at least one */
1813 inst->sampler = sampler;
1814 inst->dst = dst_reg(this, ir->type);
1815 inst->shadow_compare = ir->shadow_comparitor != NULL;
1816
1817 if (ir->offset != NULL)
1818 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1819
1820 /* MRF for the first parameter */
1821 int param_base = inst->base_mrf + inst->header_present;
1822
1823 if (ir->op == ir_txs) {
1824 ir->lod_info.lod->accept(this);
1825 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1826 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
1827 this->result));
1828 } else {
1829 int i, coord_mask = 0, zero_mask = 0;
1830 /* Load the coordinate */
1831 /* FINISHME: gl_clamp_mask and saturate */
1832 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1833 coord_mask |= (1 << i);
1834 for (; i < 4; i++)
1835 zero_mask |= (1 << i);
1836
1837 ir->coordinate->accept(this);
1838 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1839 this->result));
1840 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1841 src_reg(0)));
1842 /* Load the shadow comparitor */
1843 if (ir->shadow_comparitor) {
1844 ir->shadow_comparitor->accept(this);
1845 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1846 WRITEMASK_X),
1847 this->result));
1848 inst->mlen++;
1849 }
1850
1851 /* Load the LOD info */
1852 if (ir->op == ir_txl) {
1853 int mrf, writemask;
1854 if (intel->gen >= 5) {
1855 mrf = param_base + 1;
1856 if (ir->shadow_comparitor) {
1857 writemask = WRITEMASK_Y;
1858 /* mlen already incremented */
1859 } else {
1860 writemask = WRITEMASK_X;
1861 inst->mlen++;
1862 }
1863 } else /* intel->gen == 4 */ {
1864 mrf = param_base;
1865 writemask = WRITEMASK_Z;
1866 }
1867 ir->lod_info.lod->accept(this);
1868 emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask),
1869 this->result));
1870 } else if (ir->op == ir_txf) {
1871 ir->lod_info.lod->accept(this);
1872 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
1873 this->result));
1874 } else if (ir->op == ir_txd) {
1875 const glsl_type *type = ir->lod_info.grad.dPdx->type;
1876
1877 ir->lod_info.grad.dPdx->accept(this);
1878 src_reg dPdx = this->result;
1879 ir->lod_info.grad.dPdy->accept(this);
1880 src_reg dPdy = this->result;
1881
1882 if (intel->gen >= 5) {
1883 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1884 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1885 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1886 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1887 inst->mlen++;
1888
1889 if (ir->type->vector_elements == 3) {
1890 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
1891 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
1892 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
1893 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
1894 inst->mlen++;
1895 }
1896 } else /* intel->gen == 4 */ {
1897 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
1898 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
1899 inst->mlen += 2;
1900 }
1901 }
1902 }
1903
1904 emit(inst);
1905
1906 swizzle_result(ir, src_reg(inst->dst), sampler);
1907 }
1908
1909 void
1910 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
1911 {
1912 this->result = orig_val;
1913
1914 int s = c->key.tex.swizzles[sampler];
1915
1916 if (ir->op == ir_txs || ir->type == glsl_type::float_type
1917 || s == SWIZZLE_NOOP)
1918 return;
1919
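/* Split the GL texture swizzle into channels copied from the sampler
 * result, channels forced to zero, and channels forced to one.  For a
 * GL_LUMINANCE-style swizzle of (R, R, R, ONE), for example, .xyz is
 * copied from the result's X channel and .w is written as 1.0f.
 */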
1920 int zero_mask = 0, one_mask = 0, copy_mask = 0;
1921 int swizzle[4];
1922
1923 for (int i = 0; i < 4; i++) {
1924 switch (GET_SWZ(s, i)) {
1925 case SWIZZLE_ZERO:
1926 zero_mask |= (1 << i);
1927 break;
1928 case SWIZZLE_ONE:
1929 one_mask |= (1 << i);
1930 break;
1931 default:
1932 copy_mask |= (1 << i);
1933 swizzle[i] = GET_SWZ(s, i);
1934 break;
1935 }
1936 }
1937
1938 this->result = src_reg(this, ir->type);
1939 dst_reg swizzled_result(this->result);
1940
1941 if (copy_mask) {
1942 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1943 swizzled_result.writemask = copy_mask;
1944 emit(MOV(swizzled_result, orig_val));
1945 }
1946
1947 if (zero_mask) {
1948 swizzled_result.writemask = zero_mask;
1949 emit(MOV(swizzled_result, src_reg(0.0f)));
1950 }
1951
1952 if (one_mask) {
1953 swizzled_result.writemask = one_mask;
1954 emit(MOV(swizzled_result, src_reg(1.0f)));
1955 }
1956 }
1957
1958 void
1959 vec4_visitor::visit(ir_return *ir)
1960 {
1961 assert(!"not reached");
1962 }
1963
1964 void
1965 vec4_visitor::visit(ir_discard *ir)
1966 {
1967 assert(!"not reached");
1968 }
1969
1970 void
1971 vec4_visitor::visit(ir_if *ir)
1972 {
1973 /* Don't point the annotation at the if statement, because then it plus
1974 * the then and else blocks get printed.
1975 */
1976 this->base_ir = ir->condition;
1977
1978 if (intel->gen == 6) {
1979 emit_if_gen6(ir);
1980 } else {
1981 uint32_t predicate;
1982 emit_bool_to_cond_code(ir->condition, &predicate);
1983 emit(IF(predicate));
1984 }
1985
1986 visit_instructions(&ir->then_instructions);
1987
1988 if (!ir->else_instructions.is_empty()) {
1989 this->base_ir = ir->condition;
1990 emit(BRW_OPCODE_ELSE);
1991
1992 visit_instructions(&ir->else_instructions);
1993 }
1994
1995 this->base_ir = ir->condition;
1996 emit(BRW_OPCODE_ENDIF);
1997 }
1998
1999 void
2000 vec4_visitor::emit_ndc_computation()
2001 {
2002 /* Get the position */
2003 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2004
2005 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2006 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2007 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2008
2009 current_annotation = "NDC";
2010 dst_reg ndc_w = ndc;
2011 ndc_w.writemask = WRITEMASK_W;
2012 src_reg pos_w = pos;
2013 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2014 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2015
2016 dst_reg ndc_xyz = ndc;
2017 ndc_xyz.writemask = WRITEMASK_XYZ;
2018
2019 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
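   /* ndc.w already holds 1/pos.w from the RCP above and is read back with a
    * .wwww swizzle, so this computes ndc.xyz = pos.xyz / pos.w.
    */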
2020 }
2021
2022 void
2023 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2024 {
2025 if (intel->gen < 6 &&
2026 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2027 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2028 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2029 dst_reg header1_w = header1;
2030 header1_w.writemask = WRITEMASK_W;
2031 GLuint i;
2032
2033 emit(MOV(header1, 0u));
2034
2035 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2036 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2037
2038 current_annotation = "Point size";
2039 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2040 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
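         /* The MUL/AND above appear to pack the point size as an unsigned
          * fixed-point value into bits 8..18 of header DWord 0 (hence the
          * scale by 2^11 and the 0x7ff << 8 mask); see the VUE header
          * description in the PRM for the exact field layout.
          */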
2041 }
2042
2043 current_annotation = "Clipping flags";
2044 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2045 vec4_instruction *inst;
2046
2047 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2048 src_reg(this->userplane[i])));
2049 inst->conditional_mod = BRW_CONDITIONAL_L;
2050
2051 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2052 inst->predicate = BRW_PREDICATE_NORMAL;
2053 }
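      /* Each iteration above computes dot(gl_Position, userplane[i]) with a
       * less-than conditional mod, then uses the resulting predicate to OR
       * bit i into the clip-flag field only for vertices outside that plane.
       */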
2054
2055 /* i965 clipping workaround:
2056       * 1) Test for negative rhw
2057 * 2) If set,
2058 * set ndc = (0,0,0,0)
2059 * set ucp[6] = 1
2060 *
2061 * Later, clipping will detect ucp[6] and ensure the primitive is
2062 * clipped against all fixed planes.
2063 */
2064 if (brw->has_negative_rhw_bug) {
2065 #if 0
2066 /* FINISHME */
2067 brw_CMP(p,
2068 vec8(brw_null_reg()),
2069 BRW_CONDITIONAL_L,
2070 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2071 brw_imm_f(0));
2072
2073 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2074 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2075 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2076 #endif
2077 }
2078
2079 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2080 } else if (intel->gen < 6) {
2081 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2082 } else {
2083 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2084 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2085 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2086 src_reg(output_reg[VERT_RESULT_PSIZ])));
2087 }
2088 }
2089 }
2090
2091 void
2092 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2093 {
2094 if (intel->gen < 6) {
2095 /* Clip distance slots are set aside in gen5, but they are not used. It
2096 * is not clear whether we actually need to set aside space for them,
2097 * but the performance cost is negligible.
2098 */
2099 return;
2100 }
2101
2102 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2103 *
2104 * "If a linked set of shaders forming the vertex stage contains no
2105 * static write to gl_ClipVertex or gl_ClipDistance, but the
2106 * application has requested clipping against user clip planes through
2107 * the API, then the coordinate written to gl_Position is used for
2108 * comparison against the user clip planes."
2109 *
2110 * This function is only called if the shader didn't write to
2111 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2112 * if the user wrote to it; otherwise we use gl_Position.
2113 */
2114 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2115 if (!(c->prog_data.outputs_written
2116 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2117 clip_vertex = VERT_RESULT_HPOS;
2118 }
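   /* Each clip-distance slot packs four plane distances: component i of the
    * slot gets dot(clip_vertex, userplane[i + offset]), where offset is 0
    * for CLIP_DIST0 and 4 for CLIP_DIST1 (see emit_urb_slot()).
    */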
2119
2120 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2121 ++i) {
2122 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2123 src_reg(output_reg[clip_vertex]),
2124 src_reg(this->userplane[i + offset])));
2125 }
2126 }
2127
2128 void
2129 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2130 {
2131 assert (vert_result < VERT_RESULT_MAX);
2132 reg.type = output_reg[vert_result].type;
2133 current_annotation = output_reg_annotation[vert_result];
2134 /* Copy the register, saturating if necessary */
2135 vec4_instruction *inst = emit(MOV(reg,
2136 src_reg(output_reg[vert_result])));
2137 if ((vert_result == VERT_RESULT_COL0 ||
2138 vert_result == VERT_RESULT_COL1 ||
2139 vert_result == VERT_RESULT_BFC0 ||
2140 vert_result == VERT_RESULT_BFC1) &&
2141 c->key.clamp_vertex_color) {
2142 inst->saturate = true;
2143 }
2144 }
2145
2146 void
2147 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2148 {
2149 struct brw_reg hw_reg = brw_message_reg(mrf);
2150 dst_reg reg = dst_reg(MRF, mrf);
2151 reg.type = BRW_REGISTER_TYPE_F;
2152
2153 switch (vert_result) {
2154 case VERT_RESULT_PSIZ:
2155 /* PSIZ is always in slot 0, and is coupled with other flags. */
2156 current_annotation = "indices, point width, clip flags";
2157 emit_psiz_and_flags(hw_reg);
2158 break;
2159 case BRW_VERT_RESULT_NDC:
2160 current_annotation = "NDC";
2161 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2162 break;
2163 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2164 case VERT_RESULT_HPOS:
2165 current_annotation = "gl_Position";
2166 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2167 break;
2168 case VERT_RESULT_CLIP_DIST0:
2169 case VERT_RESULT_CLIP_DIST1:
2170 if (this->c->key.uses_clip_distance) {
2171 emit_generic_urb_slot(reg, vert_result);
2172 } else {
2173 current_annotation = "user clip distances";
2174 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2175 }
2176 break;
2177 case BRW_VERT_RESULT_PAD:
2178 /* No need to write to this slot */
2179 break;
2180 default:
2181 emit_generic_urb_slot(reg, vert_result);
2182 break;
2183 }
2184 }
2185
2186 static int
2187 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2188 {
2189 struct intel_context *intel = &brw->intel;
2190
2191 if (intel->gen >= 6) {
2192 /* URB data written (does not include the message header reg) must
2193 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2194 * section 5.4.3.2.2: URB_INTERLEAVED.
2195 *
2196 * URB entries are allocated on a multiple of 1024 bits, so an
2197 * extra 128 bits written here to make the end align to 256 is
2198 * no problem.
2199 */
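      /* mlen includes the single header register, so the data portion is
       * mlen - 1; e.g. a 6-register message (1 header + 5 data) is padded
       * to 7 so that an even 6 data registers are written.
       */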
2200 if ((mlen % 2) != 1)
2201 mlen++;
2202 }
2203
2204 return mlen;
2205 }
2206
2207 /**
2208 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2209 * complete the VS thread.
2210 *
2211 * The VUE layout is documented in Volume 2a.
2212 */
2213 void
2214 vec4_visitor::emit_urb_writes()
2215 {
2216 /* MRF 0 is reserved for the debugger, so start with message header
2217 * in MRF 1.
2218 */
2219 int base_mrf = 1;
2220 int mrf = base_mrf;
2221 /* In the process of generating our URB write message contents, we
2222 * may need to unspill a register or load from an array. Those
2223 * reads would use MRFs 14-15.
2224 */
2225 int max_usable_mrf = 13;
2226
2227    /* The following assertion verifies that max_usable_mrf results in an
2228     * even number of URB data registers being written, which meets gen6's
2229     * message-length alignment requirement.
2230     */
2231 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2232
2233 /* FINISHME: edgeflag */
2234
2235 brw_compute_vue_map(&c->vue_map, intel, c->key.userclip_active,
2236 c->prog_data.outputs_written);
2237
2238 /* First mrf is the g0-based message header containing URB handles and such,
2239 * which is implied in VS_OPCODE_URB_WRITE.
2240 */
2241 mrf++;
2242
2243 if (intel->gen < 6) {
2244 emit_ndc_computation();
2245 }
2246
2247 /* Set up the VUE data for the first URB write */
2248 int slot;
2249 for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
2250 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2251
2252 /* If this was max_usable_mrf, we can't fit anything more into this URB
2253 * WRITE.
2254 */
2255 if (mrf > max_usable_mrf) {
2256 slot++;
2257 break;
2258 }
2259 }
2260
2261 current_annotation = "URB write";
2262 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2263 inst->base_mrf = base_mrf;
2264 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2265 inst->eot = (slot >= c->vue_map.num_slots);
2266
2267 /* Optional second URB write */
2268 if (!inst->eot) {
2269 mrf = base_mrf + 1;
2270
2271 for (; slot < c->vue_map.num_slots; ++slot) {
2272 assert(mrf < max_usable_mrf);
2273
2274 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2275 }
2276
2277 current_annotation = "URB write";
2278 inst = emit(VS_OPCODE_URB_WRITE);
2279 inst->base_mrf = base_mrf;
2280 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2281 inst->eot = true;
2282       /* URB destination offset.  The previous write used MRFs 1-13;
2283        * minus the one header MRF, that is 12 data regs.  URB offset is
2284        * in URB row increments, and each of our MRFs is half of one of
2285        * those, since we're doing interleaved writes.
2286        */
2287 inst->offset = (max_usable_mrf - base_mrf) / 2;
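      /* With base_mrf = 1 and max_usable_mrf = 13 that is 12 data MRFs,
       * i.e. 6 interleaved URB rows, so this second write starts at URB
       * offset 6.
       */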
2288 }
2289
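   /* urb_entry_size is recorded in allocation units: presumably 1024-bit
    * (8-vec4) rows on gen6 and 512-bit (4-vec4) rows on older generations,
    * which is what the ALIGN()/divide below compute.
    */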
2290 if (intel->gen == 6)
2291 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 8) / 8;
2292 else
2293 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 4) / 4;
2294 }
2295
2296 src_reg
2297 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2298 src_reg *reladdr, int reg_offset)
2299 {
2300 /* Because we store the values to scratch interleaved like our
2301 * vertex data, we need to scale the vec4 index by 2.
2302 */
2303 int message_header_scale = 2;
2304
2305 /* Pre-gen6, the message header uses byte offsets instead of vec4
2306 * (16-byte) offset units.
2307 */
2308 if (intel->gen < 6)
2309 message_header_scale *= 16;
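   /* For example, reg_offset 3 becomes scratch offset 6 (in vec4 units) on
    * gen6+, or 96 bytes on earlier generations.
    */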
2310
2311 if (reladdr) {
2312 src_reg index = src_reg(this, glsl_type::int_type);
2313
2314 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2315 emit_before(inst, MUL(dst_reg(index),
2316 index, src_reg(message_header_scale)));
2317
2318 return index;
2319 } else {
2320 return src_reg(reg_offset * message_header_scale);
2321 }
2322 }
2323
2324 src_reg
2325 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2326 src_reg *reladdr, int reg_offset)
2327 {
2328 if (reladdr) {
2329 src_reg index = src_reg(this, glsl_type::int_type);
2330
2331 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2332
2333 /* Pre-gen6, the message header uses byte offsets instead of vec4
2334 * (16-byte) offset units.
2335 */
2336 if (intel->gen < 6) {
2337 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2338 }
2339
2340 return index;
2341 } else {
2342 int message_header_scale = intel->gen < 6 ? 16 : 1;
2343 return src_reg(reg_offset * message_header_scale);
2344 }
2345 }
2346
2347 /**
2348 * Emits an instruction before @inst to load the value named by @orig_src
2349 * from scratch space at @base_offset to @temp.
2350 */
2351 void
2352 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2353 dst_reg temp, src_reg orig_src,
2354 int base_offset)
2355 {
2356 int reg_offset = base_offset + orig_src.reg_offset;
2357 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2358
2359 emit_before(inst, SCRATCH_READ(temp, index));
2360 }
2361
2362 /**
2363 * Emits an instruction after @inst to store the value to be written
2364 * to @orig_dst to scratch space at @base_offset, from @temp.
2365 */
2366 void
2367 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2368 src_reg temp, dst_reg orig_dst,
2369 int base_offset)
2370 {
2371 int reg_offset = base_offset + orig_dst.reg_offset;
2372 src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2373
2374 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2375 orig_dst.writemask));
2376 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2377 write->predicate = inst->predicate;
2378 write->ir = inst->ir;
2379 write->annotation = inst->annotation;
2380 inst->insert_after(write);
2381 }
2382
2383 /**
2384 * We can't generally support array access in GRF space, because a
2385 * single instruction's destination can only span 2 contiguous
2386 * registers. So, we send all GRF arrays that get variable index
2387 * access to scratch space.
2388 */
2389 void
2390 vec4_visitor::move_grf_array_access_to_scratch()
2391 {
2392 int scratch_loc[this->virtual_grf_count];
2393
2394 for (int i = 0; i < this->virtual_grf_count; i++) {
2395 scratch_loc[i] = -1;
2396 }
2397
2398 /* First, calculate the set of virtual GRFs that need to be punted
2399 * to scratch due to having any array access on them, and where in
2400 * scratch.
2401 */
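   /* Each vec4 register of a spilled array takes 8 * 4 = 32 bytes of
    * scratch, matching the interleaved (x2) layout that get_scratch_offset()
    * assumes.
    */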
2402 foreach_list(node, &this->instructions) {
2403 vec4_instruction *inst = (vec4_instruction *)node;
2404
2405 if (inst->dst.file == GRF && inst->dst.reladdr &&
2406 scratch_loc[inst->dst.reg] == -1) {
2407 scratch_loc[inst->dst.reg] = c->last_scratch;
2408 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2409 }
2410
2411       for (int i = 0; i < 3; i++) {
2412 src_reg *src = &inst->src[i];
2413
2414 if (src->file == GRF && src->reladdr &&
2415 scratch_loc[src->reg] == -1) {
2416 scratch_loc[src->reg] = c->last_scratch;
2417 c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2418 }
2419 }
2420 }
2421
2422 /* Now, for anything that will be accessed through scratch, rewrite
2423 * it to load/store. Note that this is a _safe list walk, because
2424 * we may generate a new scratch_write instruction after the one
2425 * we're processing.
2426 */
2427 foreach_list_safe(node, &this->instructions) {
2428 vec4_instruction *inst = (vec4_instruction *)node;
2429
2430       /* Set up the annotation tracking for newly generated instructions. */
2431 base_ir = inst->ir;
2432 current_annotation = inst->annotation;
2433
2434 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2435 src_reg temp = src_reg(this, glsl_type::vec4_type);
2436
2437 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2438
2439 inst->dst.file = temp.file;
2440 inst->dst.reg = temp.reg;
2441 inst->dst.reg_offset = temp.reg_offset;
2442 inst->dst.reladdr = NULL;
2443 }
2444
2445       for (int i = 0; i < 3; i++) {
2446 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2447 continue;
2448
2449 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2450
2451 emit_scratch_read(inst, temp, inst->src[i],
2452 scratch_loc[inst->src[i].reg]);
2453
2454 inst->src[i].file = temp.file;
2455 inst->src[i].reg = temp.reg;
2456 inst->src[i].reg_offset = temp.reg_offset;
2457 inst->src[i].reladdr = NULL;
2458 }
2459 }
2460 }
2461
2462 /**
2463 * Emits an instruction before @inst to load the value named by @orig_src
2464 * from the pull constant buffer (surface) at @base_offset to @temp.
2465 */
2466 void
2467 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2468 dst_reg temp, src_reg orig_src,
2469 int base_offset)
2470 {
2471 int reg_offset = base_offset + orig_src.reg_offset;
2472 src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2473 vec4_instruction *load;
2474
2475 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2476 temp, index);
2477 load->base_mrf = 14;
2478 load->mlen = 1;
2479 emit_before(inst, load);
2480 }
2481
2482 /**
2483 * Implements array access of uniforms by inserting a
2484 * PULL_CONSTANT_LOAD instruction.
2485 *
2486 * Unlike temporary GRF array access (where we don't support it due to
2487 * the difficulty of doing relative addressing on instruction
2488 * destinations), we could potentially do array access of uniforms
2489 * that were loaded in GRF space as push constants. In real-world
2490 * usage we've seen, though, the arrays being used are always larger
2491 * than we could load as push constants, so just always move all
2492 * uniform array access out to a pull constant buffer.
2493 */
2494 void
2495 vec4_visitor::move_uniform_array_access_to_pull_constants()
2496 {
2497 int pull_constant_loc[this->uniforms];
2498
2499 for (int i = 0; i < this->uniforms; i++) {
2500 pull_constant_loc[i] = -1;
2501 }
2502
2503 /* Walk through and find array access of uniforms. Put a copy of that
2504 * uniform in the pull constant buffer.
2505 *
2506 * Note that we don't move constant-indexed accesses to arrays. No
2507 * testing has been done of the performance impact of this choice.
2508 */
2509 foreach_list_safe(node, &this->instructions) {
2510 vec4_instruction *inst = (vec4_instruction *)node;
2511
2512       for (int i = 0; i < 3; i++) {
2513 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2514 continue;
2515
2516 int uniform = inst->src[i].reg;
2517
2518 /* If this array isn't already present in the pull constant buffer,
2519 * add it.
2520 */
2521 if (pull_constant_loc[uniform] == -1) {
2522 const float **values = &prog_data->param[uniform * 4];
2523
2524 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2525
2526 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2527 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2528 }
2529 }
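            /* pull_constant_loc[uniform] now holds the array's starting vec4
             * offset in the pull constant buffer; each vec4 of the array
             * occupies four consecutive pull_param entries.
             */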
2530
2531          /* Set up the annotation tracking for newly generated instructions. */
2532 base_ir = inst->ir;
2533 current_annotation = inst->annotation;
2534
2535 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2536
2537 emit_pull_constant_load(inst, temp, inst->src[i],
2538 pull_constant_loc[uniform]);
2539
2540 inst->src[i].file = temp.file;
2541 inst->src[i].reg = temp.reg;
2542 inst->src[i].reg_offset = temp.reg_offset;
2543 inst->src[i].reladdr = NULL;
2544 }
2545 }
2546
2547 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2548 * no need to track them as larger-than-vec4 objects. This will be
2549 * relied on in cutting out unused uniform vectors from push
2550 * constants.
2551 */
2552 split_uniform_registers();
2553 }
2554
2555 void
2556 vec4_visitor::resolve_ud_negate(src_reg *reg)
2557 {
2558 if (reg->type != BRW_REGISTER_TYPE_UD ||
2559 !reg->negate)
2560 return;
2561
2562 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2563 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2564 *reg = temp;
2565 }
2566
2567 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2568 struct gl_shader_program *prog,
2569 struct brw_shader *shader)
2570 {
2571 this->c = c;
2572 this->p = &c->func;
2573 this->brw = p->brw;
2574 this->intel = &brw->intel;
2575 this->ctx = &intel->ctx;
2576 this->prog = prog;
2577 this->shader = shader;
2578
2579 this->mem_ctx = ralloc_context(NULL);
2580 this->failed = false;
2581
2582 this->base_ir = NULL;
2583 this->current_annotation = NULL;
2584
2586 this->vp = (struct gl_vertex_program *)
2587 prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2588 this->prog_data = &c->prog_data;
2589
2590 this->variable_ht = hash_table_ctor(0,
2591 hash_table_pointer_hash,
2592 hash_table_pointer_compare);
2593
2594 this->virtual_grf_def = NULL;
2595 this->virtual_grf_use = NULL;
2596 this->virtual_grf_sizes = NULL;
2597 this->virtual_grf_count = 0;
2598 this->virtual_grf_reg_map = NULL;
2599 this->virtual_grf_reg_count = 0;
2600 this->virtual_grf_array_size = 0;
2601 this->live_intervals_valid = false;
2602
2603 this->uniforms = 0;
2604 }
2605
2606 vec4_visitor::~vec4_visitor()
2607 {
2608 ralloc_free(this->mem_ctx);
2609 hash_table_dtor(this->variable_ht);
2610 }
2611
2612
2613 void
2614 vec4_visitor::fail(const char *format, ...)
2615 {
2616 va_list va;
2617 char *msg;
2618
2619 if (failed)
2620 return;
2621
2622 failed = true;
2623
2624 va_start(va, format);
2625 msg = ralloc_vasprintf(mem_ctx, format, va);
2626 va_end(va);
2627 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2628
2629 this->fail_msg = msg;
2630
2631 if (INTEL_DEBUG & DEBUG_VS) {
2632 fprintf(stderr, "%s", msg);
2633 }
2634 }
2635
2636 } /* namespace brw */