src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 }
29
30 namespace brw {
31
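/* Convert a dst_reg into an equivalent src_reg.  The destination's
 * writemask becomes a swizzle: the enabled channels are packed into the
 * low components and the last enabled channel is replicated into the
 * rest, so e.g. a writemask of .xz turns into the swizzle .xzzz.
 */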
32 src_reg::src_reg(dst_reg reg)
33 {
34 init();
35
36 this->file = reg.file;
37 this->reg = reg.reg;
38 this->reg_offset = reg.reg_offset;
39 this->type = reg.type;
40 this->reladdr = reg.reladdr;
41 this->fixed_hw_reg = reg.fixed_hw_reg;
42
43 int swizzles[4];
44 int next_chan = 0;
45 int last = 0;
46
47 for (int i = 0; i < 4; i++) {
48 if (!(reg.writemask & (1 << i)))
49 continue;
50
51 swizzles[next_chan++] = last = i;
52 }
53
54 for (; next_chan < 4; next_chan++) {
55 swizzles[next_chan] = last;
56 }
57
58 this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
59 swizzles[2], swizzles[3]);
60 }
61
62 dst_reg::dst_reg(src_reg reg)
63 {
64 init();
65
66 this->file = reg.file;
67 this->reg = reg.reg;
68 this->reg_offset = reg.reg_offset;
69 this->type = reg.type;
70 this->writemask = WRITEMASK_XYZW;
71 this->reladdr = reg.reladdr;
72 this->fixed_hw_reg = reg.fixed_hw_reg;
73 }
74
75 vec4_instruction::vec4_instruction(vec4_visitor *v,
76 enum opcode opcode, dst_reg dst,
77 src_reg src0, src_reg src1, src_reg src2)
78 {
79 this->opcode = opcode;
80 this->dst = dst;
81 this->src[0] = src0;
82 this->src[1] = src1;
83 this->src[2] = src2;
84 this->ir = v->base_ir;
85 this->annotation = v->current_annotation;
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(vec4_instruction *inst)
90 {
91 this->instructions.push_tail(inst);
92
93 return inst;
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
98 {
99 new_inst->ir = inst->ir;
100 new_inst->annotation = inst->annotation;
101
102 inst->insert_before(new_inst);
103
104 return inst;
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
109 src_reg src0, src_reg src1, src_reg src2)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
112 src0, src1, src2));
113 }
114
115
116 vec4_instruction *
117 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
118 {
119 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
120 }
121
122 vec4_instruction *
123 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
124 {
125 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
126 }
127
128 vec4_instruction *
129 vec4_visitor::emit(enum opcode opcode)
130 {
131 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
132 }
133
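/* The ALU1/ALU2 macros below stamp out one small builder method per
 * opcode, so the visitor can write emit(ADD(dst, a, b)) instead of
 * spelling out BRW_OPCODE_ADD.  Note they only allocate the instruction;
 * the caller still has to pass it to emit().
 */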
134 #define ALU1(op) \
135 vec4_instruction * \
136 vec4_visitor::op(dst_reg dst, src_reg src0) \
137 { \
138 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
139 src0); \
140 }
141
142 #define ALU2(op) \
143 vec4_instruction * \
144 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
145 { \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU2(ADD)
157 ALU2(MUL)
158 ALU2(MACH)
159 ALU2(AND)
160 ALU2(OR)
161 ALU2(XOR)
162 ALU2(DP3)
163 ALU2(DP4)
164
165 /** Gen4 predicated IF. */
166 vec4_instruction *
167 vec4_visitor::IF(uint32_t predicate)
168 {
169 vec4_instruction *inst;
170
171 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
172 inst->predicate = predicate;
173
174 return inst;
175 }
176
177 /** Gen6+ IF with embedded comparison. */
178 vec4_instruction *
179 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
180 {
181 assert(intel->gen >= 6);
182
183 vec4_instruction *inst;
184
185 resolve_ud_negate(&src0);
186 resolve_ud_negate(&src1);
187
188 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
189 src0, src1);
190 inst->conditional_mod = condition;
191
192 return inst;
193 }
194
195 /**
196 * CMP: Sets the low bit of the destination channels with the result
197 * of the comparison, while the upper bits are undefined, and updates
198 * the flag register with the packed 16 bits of the result.
199 */
200 vec4_instruction *
201 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
202 {
203 vec4_instruction *inst;
204
205 /* original gen4 does type conversion to the destination type
206 * before comparison, producing garbage results for floating
207 * point comparisons.
208 */
209 if (intel->gen == 4) {
210 dst.type = src0.type;
211 if (dst.file == HW_REG)
212 dst.fixed_hw_reg.type = dst.type;
213 }
214
215 resolve_ud_negate(&src0);
216 resolve_ud_negate(&src1);
217
218 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
219 inst->conditional_mod = condition;
220
221 return inst;
222 }
223
224 vec4_instruction *
225 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
226 {
227 vec4_instruction *inst;
228
229 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
230 dst, index);
231 inst->base_mrf = 14;
232 inst->mlen = 1;
233
234 return inst;
235 }
236
237 vec4_instruction *
238 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
239 {
240 vec4_instruction *inst;
241
242 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
243 dst, src, index);
244 inst->base_mrf = 13;
245 inst->mlen = 2;
246
247 return inst;
248 }
249
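/* Emit a dot product, picking DP2/DP3/DP4 from the number of source
 * components (elements must be in the range 2..4).
 */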
250 void
251 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
252 {
253 static enum opcode dot_opcodes[] = {
254 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
255 };
256
257 emit(dot_opcodes[elements - 2], dst, src0, src1);
258 }
259
260 void
261 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
262 {
263 /* The gen6 math instruction ignores the source modifiers --
264 * swizzle, abs, negate, and at least some parts of the register
265 * region description.
266 *
267 * While it would seem that this MOV could be avoided at this point
268 * in the case that the swizzle is matched up with the destination
269 * writemask, note that uniform packing and register allocation
270 * could rearrange our swizzle, so let's leave this matter up to
271 * copy propagation later.
272 */
273 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
274 emit(MOV(dst_reg(temp_src), src));
275
276 if (dst.writemask != WRITEMASK_XYZW) {
277 /* The gen6 math instruction must be align1, so we can't do
278 * writemasks.
279 */
280 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
281
282 emit(opcode, temp_dst, temp_src);
283
284 emit(MOV(dst, src_reg(temp_dst)));
285 } else {
286 emit(opcode, dst, temp_src);
287 }
288 }
289
290 void
291 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
292 {
293 vec4_instruction *inst = emit(opcode, dst, src);
294 inst->base_mrf = 1;
295 inst->mlen = 1;
296 }
297
298 void
299 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
300 {
301 switch (opcode) {
302 case SHADER_OPCODE_RCP:
303 case SHADER_OPCODE_RSQ:
304 case SHADER_OPCODE_SQRT:
305 case SHADER_OPCODE_EXP2:
306 case SHADER_OPCODE_LOG2:
307 case SHADER_OPCODE_SIN:
308 case SHADER_OPCODE_COS:
309 break;
310 default:
311 assert(!"not reached: bad math opcode");
312 return;
313 }
314
315 if (intel->gen >= 6) {
316 return emit_math1_gen6(opcode, dst, src);
317 } else {
318 return emit_math1_gen4(opcode, dst, src);
319 }
320 }
321
322 void
323 vec4_visitor::emit_math2_gen6(enum opcode opcode,
324 dst_reg dst, src_reg src0, src_reg src1)
325 {
326 src_reg expanded;
327
328 /* The gen6 math instruction ignores the source modifiers --
329 * swizzle, abs, negate, and at least some parts of the register
330 * region description. Move the sources to temporaries to make it
331 * generally work.
332 */
333
334 expanded = src_reg(this, glsl_type::vec4_type);
335 expanded.type = src0.type;
336 emit(MOV(dst_reg(expanded), src0));
337 src0 = expanded;
338
339 expanded = src_reg(this, glsl_type::vec4_type);
340 expanded.type = src1.type;
341 emit(MOV(dst_reg(expanded), src1));
342 src1 = expanded;
343
344 if (dst.writemask != WRITEMASK_XYZW) {
345 /* The gen6 math instruction must be align1, so we can't do
346 * writemasks.
347 */
348 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
349 temp_dst.type = dst.type;
350
351 emit(opcode, temp_dst, src0, src1);
352
353 emit(MOV(dst, src_reg(temp_dst)));
354 } else {
355 emit(opcode, dst, src0, src1);
356 }
357 }
358
359 void
360 vec4_visitor::emit_math2_gen4(enum opcode opcode,
361 dst_reg dst, src_reg src0, src_reg src1)
362 {
363 vec4_instruction *inst = emit(opcode, dst, src0, src1);
364 inst->base_mrf = 1;
365 inst->mlen = 2;
366 }
367
368 void
369 vec4_visitor::emit_math(enum opcode opcode,
370 dst_reg dst, src_reg src0, src_reg src1)
371 {
372 switch (opcode) {
373 case SHADER_OPCODE_POW:
374 case SHADER_OPCODE_INT_QUOTIENT:
375 case SHADER_OPCODE_INT_REMAINDER:
376 break;
377 default:
378 assert(!"not reached: unsupported binary math opcode");
379 return;
380 }
381
382 if (intel->gen >= 6) {
383 return emit_math2_gen6(opcode, dst, src0, src1);
384 } else {
385 return emit_math2_gen4(opcode, dst, src0, src1);
386 }
387 }
388
389 void
390 vec4_visitor::visit_instructions(const exec_list *list)
391 {
392 foreach_list(node, list) {
393 ir_instruction *ir = (ir_instruction *)node;
394
395 base_ir = ir;
396 ir->accept(this);
397 }
398 }
399
400
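/* Size of a GLSL type in vec4 registers as laid out by this backend:
 * scalars and vectors each occupy one full vec4 slot, matrices one slot
 * per column, and arrays/structs the sum of their members -- e.g. a mat3
 * counts as 3 and float[8] counts as 8.
 */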
401 static int
402 type_size(const struct glsl_type *type)
403 {
404 unsigned int i;
405 int size;
406
407 switch (type->base_type) {
408 case GLSL_TYPE_UINT:
409 case GLSL_TYPE_INT:
410 case GLSL_TYPE_FLOAT:
411 case GLSL_TYPE_BOOL:
412 if (type->is_matrix()) {
413 return type->matrix_columns;
414 } else {
415 /* Regardless of size of vector, it gets a vec4. This is bad
416 * packing for things like floats, but otherwise arrays become a
417 * mess. Hopefully a later pass over the code can pack scalars
418 * down if appropriate.
419 */
420 return 1;
421 }
422 case GLSL_TYPE_ARRAY:
423 assert(type->length > 0);
424 return type_size(type->fields.array) * type->length;
425 case GLSL_TYPE_STRUCT:
426 size = 0;
427 for (i = 0; i < type->length; i++) {
428 size += type_size(type->fields.structure[i].type);
429 }
430 return size;
431 case GLSL_TYPE_SAMPLER:
432 /* Samplers take up one slot in UNIFORMS[], but they're baked in
433 * at link time.
434 */
435 return 1;
436 default:
437 assert(0);
438 return 0;
439 }
440 }
441
442 int
443 vec4_visitor::virtual_grf_alloc(int size)
444 {
445 if (virtual_grf_array_size <= virtual_grf_count) {
446 if (virtual_grf_array_size == 0)
447 virtual_grf_array_size = 16;
448 else
449 virtual_grf_array_size *= 2;
450 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
451 virtual_grf_array_size);
452 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
453 virtual_grf_array_size);
454 }
455 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
456 virtual_grf_reg_count += size;
457 virtual_grf_sizes[virtual_grf_count] = size;
458 return virtual_grf_count++;
459 }
460
461 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
462 {
463 init();
464
465 this->file = GRF;
466 this->reg = v->virtual_grf_alloc(type_size(type));
467
468 if (type->is_array() || type->is_record()) {
469 this->swizzle = BRW_SWIZZLE_NOOP;
470 } else {
471 this->swizzle = swizzle_for_size(type->vector_elements);
472 }
473
474 this->type = brw_type_for_base_type(type);
475 }
476
477 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
478 {
479 init();
480
481 this->file = GRF;
482 this->reg = v->virtual_grf_alloc(type_size(type));
483
484 if (type->is_array() || type->is_record()) {
485 this->writemask = WRITEMASK_XYZW;
486 } else {
487 this->writemask = (1 << type->vector_elements) - 1;
488 }
489
490 this->type = brw_type_for_base_type(type);
491 }
492
493 /* Our support for uniforms is piggy-backed on the program's parameter
494 * list (this->vp->Base.Parameters here), because that's where the
495 * values actually get stored, rather than in some global
496 * gl_shader_program uniform store.
497 */
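/* Each uniform vector or scalar is padded out to a full vec4: the live
 * channels point at the parameter's values and the pad channels point at
 * a shared zero, so prog_data.param[] always advances in groups of four.
 */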
498 int
499 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
500 {
501 unsigned int offset = 0;
502 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
503
504 if (type->is_matrix()) {
505 const glsl_type *column = type->column_type();
506
507 for (unsigned int i = 0; i < type->matrix_columns; i++) {
508 offset += setup_uniform_values(loc + offset, column);
509 }
510
511 return offset;
512 }
513
514 switch (type->base_type) {
515 case GLSL_TYPE_FLOAT:
516 case GLSL_TYPE_UINT:
517 case GLSL_TYPE_INT:
518 case GLSL_TYPE_BOOL:
519 for (unsigned int i = 0; i < type->vector_elements; i++) {
520 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
521 }
522
523 /* Set up pad elements to get things aligned to a vec4 boundary. */
524 for (unsigned int i = type->vector_elements; i < 4; i++) {
525 static float zero = 0;
526
527 c->prog_data.param[this->uniforms * 4 + i] = &zero;
528 }
529
530 /* Track the size of this uniform vector, for future packing of
531 * uniforms.
532 */
533 this->uniform_vector_size[this->uniforms] = type->vector_elements;
534 this->uniforms++;
535
536 return 1;
537
538 case GLSL_TYPE_STRUCT:
539 for (unsigned int i = 0; i < type->length; i++) {
540 offset += setup_uniform_values(loc + offset,
541 type->fields.structure[i].type);
542 }
543 return offset;
544
545 case GLSL_TYPE_ARRAY:
546 for (unsigned int i = 0; i < type->length; i++) {
547 offset += setup_uniform_values(loc + offset, type->fields.array);
548 }
549 return offset;
550
551 case GLSL_TYPE_SAMPLER:
552 /* The sampler takes up a slot, but we don't use any values from it. */
553 return 1;
554
555 default:
556 assert(!"not reached");
557 return 0;
558 }
559 }
560
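/* Upload the enabled user clip planes as uniforms and remember where each
 * one landed in this->userplane[], so the clip-distance and header
 * emission code can DP4 vertex positions against them later.
 */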
561 void
562 vec4_visitor::setup_uniform_clipplane_values()
563 {
564 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
565
566 /* Pre-Gen6, we compact clip planes. For example, if the user
567 * enables just clip planes 0, 1, and 3, we will enable clip planes
568 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
569 * plane 2. This simplifies the implementation of the Gen6 clip
570 * thread.
571 *
572 * In Gen6 and later, we don't compact clip planes, because this
573 * simplifies the implementation of gl_ClipDistance.
574 */
575 int compacted_clipplane_index = 0;
576 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
577 if (intel->gen < 6 &&
578 !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
579 continue;
580 }
581 this->uniform_vector_size[this->uniforms] = 4;
582 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
583 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
584 for (int j = 0; j < 4; ++j) {
585 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
586 }
587 ++compacted_clipplane_index;
588 ++this->uniforms;
589 }
590 }
591
592 /* Our support for builtin uniforms is even scarier than non-builtin.
593 * It sits on top of the PROG_STATE_VAR parameters that are
594 * automatically updated from GL context state.
595 */
596 void
597 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
598 {
599 const ir_state_slot *const slots = ir->state_slots;
600 assert(ir->state_slots != NULL);
601
602 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
603 /* This state reference has already been setup by ir_to_mesa,
604 * but we'll get the same index back here. We can reference
605 * ParameterValues directly, since unlike brw_fs.cpp, we never
606 * add new state references during compile.
607 */
608 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
609 (gl_state_index *)slots[i].tokens);
610 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
611
612 this->uniform_vector_size[this->uniforms] = 0;
613 /* Add each of the unique swizzled channels of the element.
614 * This will end up matching the size of the glsl_type of this field.
615 */
616 int last_swiz = -1;
617 for (unsigned int j = 0; j < 4; j++) {
618 int swiz = GET_SWZ(slots[i].swizzle, j);
619 last_swiz = swiz;
620
621 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
622 if (swiz <= last_swiz)
623 this->uniform_vector_size[this->uniforms]++;
624 }
625 this->uniforms++;
626 }
627 }
628
629 dst_reg *
630 vec4_visitor::variable_storage(ir_variable *var)
631 {
632 return (dst_reg *)hash_table_find(this->variable_ht, var);
633 }
634
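/* Evaluate a boolean rvalue into the flag register and report, through
 * *predicate, the predication mode callers should use on the instructions
 * they want to make conditional.  Where possible the comparison is folded
 * into a CMP or logic op with a conditional mod rather than materializing
 * a boolean value first.
 */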
635 void
636 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
637 {
638 ir_expression *expr = ir->as_expression();
639
640 *predicate = BRW_PREDICATE_NORMAL;
641
642 if (expr) {
643 src_reg op[2];
644 vec4_instruction *inst;
645
646 assert(expr->get_num_operands() <= 2);
647 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
648 expr->operands[i]->accept(this);
649 op[i] = this->result;
650
651 resolve_ud_negate(&op[i]);
652 }
653
654 switch (expr->operation) {
655 case ir_unop_logic_not:
656 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
657 inst->conditional_mod = BRW_CONDITIONAL_Z;
658 break;
659
660 case ir_binop_logic_xor:
661 inst = emit(XOR(dst_null_d(), op[0], op[1]));
662 inst->conditional_mod = BRW_CONDITIONAL_NZ;
663 break;
664
665 case ir_binop_logic_or:
666 inst = emit(OR(dst_null_d(), op[0], op[1]));
667 inst->conditional_mod = BRW_CONDITIONAL_NZ;
668 break;
669
670 case ir_binop_logic_and:
671 inst = emit(AND(dst_null_d(), op[0], op[1]));
672 inst->conditional_mod = BRW_CONDITIONAL_NZ;
673 break;
674
675 case ir_unop_f2b:
676 if (intel->gen >= 6) {
677 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
678 } else {
679 inst = emit(MOV(dst_null_f(), op[0]));
680 inst->conditional_mod = BRW_CONDITIONAL_NZ;
681 }
682 break;
683
684 case ir_unop_i2b:
685 if (intel->gen >= 6) {
686 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
687 } else {
688 inst = emit(MOV(dst_null_d(), op[0]));
689 inst->conditional_mod = BRW_CONDITIONAL_NZ;
690 }
691 break;
692
693 case ir_binop_all_equal:
694 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
695 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
696 break;
697
698 case ir_binop_any_nequal:
699 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
700 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
701 break;
702
703 case ir_unop_any:
704 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
705 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
706 break;
707
708 case ir_binop_greater:
709 case ir_binop_gequal:
710 case ir_binop_less:
711 case ir_binop_lequal:
712 case ir_binop_equal:
713 case ir_binop_nequal:
714 emit(CMP(dst_null_d(), op[0], op[1],
715 brw_conditional_for_comparison(expr->operation)));
716 break;
717
718 default:
719 assert(!"not reached");
720 break;
721 }
722 return;
723 }
724
725 ir->accept(this);
726
727 resolve_ud_negate(&this->result);
728
729 if (intel->gen >= 6) {
730 vec4_instruction *inst = emit(AND(dst_null_d(),
731 this->result, src_reg(1)));
732 inst->conditional_mod = BRW_CONDITIONAL_NZ;
733 } else {
734 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
735 inst->conditional_mod = BRW_CONDITIONAL_NZ;
736 }
737 }
738
739 /**
740 * Emit a gen6 IF statement with the comparison folded into the IF
741 * instruction.
742 */
743 void
744 vec4_visitor::emit_if_gen6(ir_if *ir)
745 {
746 ir_expression *expr = ir->condition->as_expression();
747
748 if (expr) {
749 src_reg op[2];
750 dst_reg temp;
751
752 assert(expr->get_num_operands() <= 2);
753 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
754 expr->operands[i]->accept(this);
755 op[i] = this->result;
756 }
757
758 switch (expr->operation) {
759 case ir_unop_logic_not:
760 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
761 return;
762
763 case ir_binop_logic_xor:
764 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
765 return;
766
767 case ir_binop_logic_or:
768 temp = dst_reg(this, glsl_type::bool_type);
769 emit(OR(temp, op[0], op[1]));
770 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
771 return;
772
773 case ir_binop_logic_and:
774 temp = dst_reg(this, glsl_type::bool_type);
775 emit(AND(temp, op[0], op[1]));
776 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
777 return;
778
779 case ir_unop_f2b:
780 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
781 return;
782
783 case ir_unop_i2b:
784 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
785 return;
786
787 case ir_binop_greater:
788 case ir_binop_gequal:
789 case ir_binop_less:
790 case ir_binop_lequal:
791 case ir_binop_equal:
792 case ir_binop_nequal:
793 emit(IF(op[0], op[1],
794 brw_conditional_for_comparison(expr->operation)));
795 return;
796
797 case ir_binop_all_equal:
798 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
799 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
800 return;
801
802 case ir_binop_any_nequal:
803 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
804 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
805 return;
806
807 case ir_unop_any:
808 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
809 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
810 return;
811
812 default:
813 assert(!"not reached");
814 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
815 return;
816 }
817 return;
818 }
819
820 ir->condition->accept(this);
821
822 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
823 }
824
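/* Allocate (or find) backing storage for a variable and record it in
 * variable_ht: attributes map to ATTR registers, outputs get fresh GRFs
 * that are also noted in output_reg[] for the later URB writes,
 * temporaries get GRFs, and uniforms get UNIFORM-file slots whose values
 * are set up here.
 */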
825 void
826 vec4_visitor::visit(ir_variable *ir)
827 {
828 dst_reg *reg = NULL;
829
830 if (variable_storage(ir))
831 return;
832
833 switch (ir->mode) {
834 case ir_var_in:
835 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
836
837 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
838 * come in as floating point conversions of the integer values.
839 */
840 for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
841 if (!c->key.gl_fixed_input_size[i])
842 continue;
843
844 dst_reg dst = *reg;
845 dst.type = brw_type_for_base_type(ir->type);
846 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
847 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
848 }
849 break;
850
851 case ir_var_out:
852 reg = new(mem_ctx) dst_reg(this, ir->type);
853
854 for (int i = 0; i < type_size(ir->type); i++) {
855 output_reg[ir->location + i] = *reg;
856 output_reg[ir->location + i].reg_offset = i;
857 output_reg[ir->location + i].type =
858 brw_type_for_base_type(ir->type->get_scalar_type());
859 output_reg_annotation[ir->location + i] = ir->name;
860 }
861 break;
862
863 case ir_var_auto:
864 case ir_var_temporary:
865 reg = new(mem_ctx) dst_reg(this, ir->type);
866 break;
867
868 case ir_var_uniform:
869 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
870
871 /* Track how big the whole uniform variable is, in case we need to put a
872 * copy of its data into pull constants for array access.
873 */
874 this->uniform_size[this->uniforms] = type_size(ir->type);
875
876 if (!strncmp(ir->name, "gl_", 3)) {
877 setup_builtin_uniform_values(ir);
878 } else {
879 setup_uniform_values(ir->location, ir->type);
880 }
881 break;
882
883 case ir_var_system_value:
884 /* VertexID is stored by the VF as the last vertex element, but
885 * we don't represent it with a flag in inputs_read, so we call
886 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
887 */
888 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
889 prog_data->uses_vertexid = true;
890
891 switch (ir->location) {
892 case SYSTEM_VALUE_VERTEX_ID:
893 reg->writemask = WRITEMASK_X;
894 break;
895 case SYSTEM_VALUE_INSTANCE_ID:
896 reg->writemask = WRITEMASK_Y;
897 break;
898 default:
899 assert(!"not reached");
900 break;
901 }
902 break;
903
904 default:
905 assert(!"not reached");
906 }
907
908 reg->type = brw_type_for_base_type(ir->type);
909 hash_table_insert(this->variable_ht, reg, ir);
910 }
911
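/* Lower a GLSL ir_loop to the hardware DO/WHILE block: the optional
 * counter/from/to/increment fields become an explicit counter register, a
 * predicated BREAK on the exit condition at the top of the body, and an
 * ADD just before the WHILE.
 */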
912 void
913 vec4_visitor::visit(ir_loop *ir)
914 {
915 dst_reg counter;
916
917 /* We don't want debugging output to print the whole body of the
918 * loop as the annotation.
919 */
920 this->base_ir = NULL;
921
922 if (ir->counter != NULL) {
923 this->base_ir = ir->counter;
924 ir->counter->accept(this);
925 counter = *(variable_storage(ir->counter));
926
927 if (ir->from != NULL) {
928 this->base_ir = ir->from;
929 ir->from->accept(this);
930
931 emit(MOV(counter, this->result));
932 }
933 }
934
935 emit(BRW_OPCODE_DO);
936
937 if (ir->to) {
938 this->base_ir = ir->to;
939 ir->to->accept(this);
940
941 emit(CMP(dst_null_d(), src_reg(counter), this->result,
942 brw_conditional_for_comparison(ir->cmp)));
943
944 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
945 inst->predicate = BRW_PREDICATE_NORMAL;
946 }
947
948 visit_instructions(&ir->body_instructions);
949
950
951 if (ir->increment) {
952 this->base_ir = ir->increment;
953 ir->increment->accept(this);
954 emit(ADD(counter, src_reg(counter), this->result));
955 }
956
957 emit(BRW_OPCODE_WHILE);
958 }
959
960 void
961 vec4_visitor::visit(ir_loop_jump *ir)
962 {
963 switch (ir->mode) {
964 case ir_loop_jump::jump_break:
965 emit(BRW_OPCODE_BREAK);
966 break;
967 case ir_loop_jump::jump_continue:
968 emit(BRW_OPCODE_CONTINUE);
969 break;
970 }
971 }
972
973
974 void
975 vec4_visitor::visit(ir_function_signature *ir)
976 {
977 assert(0);
978 (void)ir;
979 }
980
981 void
982 vec4_visitor::visit(ir_function *ir)
983 {
984 /* Ignore function bodies other than main() -- we shouldn't see calls to
985 * them since they should all be inlined.
986 */
987 if (strcmp(ir->name, "main") == 0) {
988 const ir_function_signature *sig;
989 exec_list empty;
990
991 sig = ir->matching_signature(&empty);
992
993 assert(sig);
994
995 visit_instructions(&sig->body);
996 }
997 }
998
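/* If this expression is really a saturate of some other rvalue (as
 * reported by as_rvalue_to_saturate()), emit it as a single saturating
 * MOV and skip the normal expression path.  Returns true if it did so.
 */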
999 bool
1000 vec4_visitor::try_emit_sat(ir_expression *ir)
1001 {
1002 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1003 if (!sat_src)
1004 return false;
1005
1006 sat_src->accept(this);
1007 src_reg src = this->result;
1008
1009 this->result = src_reg(this, ir->type);
1010 vec4_instruction *inst;
1011 inst = emit(MOV(dst_reg(this->result), src));
1012 inst->saturate = true;
1013
1014 return true;
1015 }
1016
1017 void
1018 vec4_visitor::emit_bool_comparison(unsigned int op,
1019 dst_reg dst, src_reg src0, src_reg src1)
1020 {
1021 /* original gen4 does destination conversion before comparison. */
1022 if (intel->gen < 5)
1023 dst.type = src0.type;
1024
1025 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1026
1027 dst.type = BRW_REGISTER_TYPE_D;
1028 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1029 }
1030
1031 void
1032 vec4_visitor::visit(ir_expression *ir)
1033 {
1034 unsigned int operand;
1035 src_reg op[Elements(ir->operands)];
1036 src_reg result_src;
1037 dst_reg result_dst;
1038 vec4_instruction *inst;
1039
1040 if (try_emit_sat(ir))
1041 return;
1042
1043 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1044 this->result.file = BAD_FILE;
1045 ir->operands[operand]->accept(this);
1046 if (this->result.file == BAD_FILE) {
1047 printf("Failed to get tree for expression operand:\n");
1048 ir->operands[operand]->print();
1049 exit(1);
1050 }
1051 op[operand] = this->result;
1052
1053 /* Matrix expression operands should have been broken down to vector
1054 * operations already.
1055 */
1056 assert(!ir->operands[operand]->type->is_matrix());
1057 }
1058
1059 int vector_elements = ir->operands[0]->type->vector_elements;
1060 if (ir->operands[1]) {
1061 vector_elements = MAX2(vector_elements,
1062 ir->operands[1]->type->vector_elements);
1063 }
1064
1065 this->result.file = BAD_FILE;
1066
1067 /* Storage for our result. Ideally for an assignment we'd be using
1068 * the actual storage for the result here, instead.
1069 */
1070 result_src = src_reg(this, ir->type);
1071 /* convenience for the emit functions below. */
1072 result_dst = dst_reg(result_src);
1073 /* If nothing special happens, this is the result. */
1074 this->result = result_src;
1075 /* Limit writes to the channels that will be used by result_src later.
1076 * This does limit this temp's use as a temporary for multi-instruction
1077 * sequences.
1078 */
1079 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1080
1081 switch (ir->operation) {
1082 case ir_unop_logic_not:
1083 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes the
1084 * ones' complement of the whole register, not just bit 0.
1085 */
1086 emit(XOR(result_dst, op[0], src_reg(1)));
1087 break;
1088 case ir_unop_neg:
1089 op[0].negate = !op[0].negate;
1090 this->result = op[0];
1091 break;
1092 case ir_unop_abs:
1093 op[0].abs = true;
1094 op[0].negate = false;
1095 this->result = op[0];
1096 break;
1097
1098 case ir_unop_sign:
1099 emit(MOV(result_dst, src_reg(0.0f)));
1100
1101 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1102 inst = emit(MOV(result_dst, src_reg(1.0f)));
1103 inst->predicate = BRW_PREDICATE_NORMAL;
1104
1105 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1106 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1107 inst->predicate = BRW_PREDICATE_NORMAL;
1108
1109 break;
1110
1111 case ir_unop_rcp:
1112 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1113 break;
1114
1115 case ir_unop_exp2:
1116 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1117 break;
1118 case ir_unop_log2:
1119 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1120 break;
1121 case ir_unop_exp:
1122 case ir_unop_log:
1123 assert(!"not reached: should be handled by ir_explog_to_explog2");
1124 break;
1125 case ir_unop_sin:
1126 case ir_unop_sin_reduced:
1127 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1128 break;
1129 case ir_unop_cos:
1130 case ir_unop_cos_reduced:
1131 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1132 break;
1133
1134 case ir_unop_dFdx:
1135 case ir_unop_dFdy:
1136 assert(!"derivatives not valid in vertex shader");
1137 break;
1138
1139 case ir_unop_noise:
1140 assert(!"not reached: should be handled by lower_noise");
1141 break;
1142
1143 case ir_binop_add:
1144 emit(ADD(result_dst, op[0], op[1]));
1145 break;
1146 case ir_binop_sub:
1147 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1148 break;
1149
1150 case ir_binop_mul:
1151 if (ir->type->is_integer()) {
1152 /* For integer multiplication, the MUL uses the low 16 bits
1153 * of one of the operands (src0 on gen6, src1 on gen7). The
1154 * MACH then adds in the contribution of the upper 16 bits
1155 * of that operand.
1156 *
1157 * FINISHME: Emit just the MUL if we know an operand is small
1158 * enough.
1159 */
1160 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1161
1162 emit(MUL(acc, op[0], op[1]));
1163 emit(MACH(dst_null_d(), op[0], op[1]));
1164 emit(MOV(result_dst, src_reg(acc)));
1165 } else {
1166 emit(MUL(result_dst, op[0], op[1]));
1167 }
1168 break;
1169 case ir_binop_div:
1170 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1171 assert(ir->type->is_integer());
1172 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1173 break;
1174 case ir_binop_mod:
1175 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1176 assert(ir->type->is_integer());
1177 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1178 break;
1179
1180 case ir_binop_less:
1181 case ir_binop_greater:
1182 case ir_binop_lequal:
1183 case ir_binop_gequal:
1184 case ir_binop_equal:
1185 case ir_binop_nequal: {
1186 emit(CMP(result_dst, op[0], op[1],
1187 brw_conditional_for_comparison(ir->operation)));
1188 emit(AND(result_dst, result_src, src_reg(0x1)));
1189 break;
1190 }
1191
1192 case ir_binop_all_equal:
1193 /* "==" operator producing a scalar boolean. */
1194 if (ir->operands[0]->type->is_vector() ||
1195 ir->operands[1]->type->is_vector()) {
1196 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1197 emit(MOV(result_dst, src_reg(0)));
1198 inst = emit(MOV(result_dst, src_reg(1)));
1199 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1200 } else {
1201 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1202 emit(AND(result_dst, result_src, src_reg(0x1)));
1203 }
1204 break;
1205 case ir_binop_any_nequal:
1206 /* "!=" operator producing a scalar boolean. */
1207 if (ir->operands[0]->type->is_vector() ||
1208 ir->operands[1]->type->is_vector()) {
1209 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1210
1211 emit(MOV(result_dst, src_reg(0)));
1212 inst = emit(MOV(result_dst, src_reg(1)));
1213 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1214 } else {
1215 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1216 emit(AND(result_dst, result_src, src_reg(0x1)));
1217 }
1218 break;
1219
1220 case ir_unop_any:
1221 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1222 emit(MOV(result_dst, src_reg(0)));
1223
1224 inst = emit(MOV(result_dst, src_reg(1)));
1225 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1226 break;
1227
1228 case ir_binop_logic_xor:
1229 emit(XOR(result_dst, op[0], op[1]));
1230 break;
1231
1232 case ir_binop_logic_or:
1233 emit(OR(result_dst, op[0], op[1]));
1234 break;
1235
1236 case ir_binop_logic_and:
1237 emit(AND(result_dst, op[0], op[1]));
1238 break;
1239
1240 case ir_binop_dot:
1241 assert(ir->operands[0]->type->is_vector());
1242 assert(ir->operands[0]->type == ir->operands[1]->type);
1243 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1244 break;
1245
1246 case ir_unop_sqrt:
1247 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1248 break;
1249 case ir_unop_rsq:
1250 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1251 break;
1252 case ir_unop_i2f:
1253 case ir_unop_i2u:
1254 case ir_unop_u2i:
1255 case ir_unop_u2f:
1256 case ir_unop_b2f:
1257 case ir_unop_b2i:
1258 case ir_unop_f2i:
1259 emit(MOV(result_dst, op[0]));
1260 break;
1261 case ir_unop_f2b:
1262 case ir_unop_i2b: {
1263 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1264 emit(AND(result_dst, result_src, src_reg(1)));
1265 break;
1266 }
1267
1268 case ir_unop_trunc:
1269 emit(RNDZ(result_dst, op[0]));
1270 break;
1271 case ir_unop_ceil:
1272 op[0].negate = !op[0].negate;
1273 inst = emit(RNDD(result_dst, op[0]));
1274 this->result.negate = true;
1275 break;
1276 case ir_unop_floor:
1277 inst = emit(RNDD(result_dst, op[0]));
1278 break;
1279 case ir_unop_fract:
1280 inst = emit(FRC(result_dst, op[0]));
1281 break;
1282 case ir_unop_round_even:
1283 emit(RNDE(result_dst, op[0]));
1284 break;
1285
1286 case ir_binop_min:
1287 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1288
1289 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1290 inst->predicate = BRW_PREDICATE_NORMAL;
1291 break;
1292 case ir_binop_max:
1293 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1294
1295 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1296 inst->predicate = BRW_PREDICATE_NORMAL;
1297 break;
1298
1299 case ir_binop_pow:
1300 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1301 break;
1302
1303 case ir_unop_bit_not:
1304 inst = emit(NOT(result_dst, op[0]));
1305 break;
1306 case ir_binop_bit_and:
1307 inst = emit(AND(result_dst, op[0], op[1]));
1308 break;
1309 case ir_binop_bit_xor:
1310 inst = emit(XOR(result_dst, op[0], op[1]));
1311 break;
1312 case ir_binop_bit_or:
1313 inst = emit(OR(result_dst, op[0], op[1]));
1314 break;
1315
1316 case ir_binop_lshift:
1317 inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1318 break;
1319
1320 case ir_binop_rshift:
1321 if (ir->type->base_type == GLSL_TYPE_INT)
1322 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1323 else
1324 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1325 break;
1326
1327 case ir_quadop_vector:
1328 assert(!"not reached: should be handled by lower_quadop_vector");
1329 break;
1330 }
1331 }
1332
1333
1334 void
1335 vec4_visitor::visit(ir_swizzle *ir)
1336 {
1337 src_reg src;
1338 int i = 0;
1339 int swizzle[4];
1340
1341 /* Note that this is only swizzles in expressions, not those on the left
1342 * hand side of an assignment, which do write masking. See ir_assignment
1343 * for that.
1344 */
1345
1346 ir->val->accept(this);
1347 src = this->result;
1348 assert(src.file != BAD_FILE);
1349
1350 for (i = 0; i < ir->type->vector_elements; i++) {
1351 switch (i) {
1352 case 0:
1353 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1354 break;
1355 case 1:
1356 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1357 break;
1358 case 2:
1359 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1360 break;
1361 case 3:
1362 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1363 break;
1364 }
1365 }
1366 for (; i < 4; i++) {
1367 /* Replicate the last channel out. */
1368 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1369 }
1370
1371 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1372
1373 this->result = src;
1374 }
1375
1376 void
1377 vec4_visitor::visit(ir_dereference_variable *ir)
1378 {
1379 const struct glsl_type *type = ir->type;
1380 dst_reg *reg = variable_storage(ir->var);
1381
1382 if (!reg) {
1383 fail("Failed to find variable storage for %s\n", ir->var->name);
1384 this->result = src_reg(brw_null_reg());
1385 return;
1386 }
1387
1388 this->result = src_reg(*reg);
1389
1390 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1391 this->result.swizzle = swizzle_for_size(type->vector_elements);
1392 }
1393
1394 void
1395 vec4_visitor::visit(ir_dereference_array *ir)
1396 {
1397 ir_constant *constant_index;
1398 src_reg src;
1399 int element_size = type_size(ir->type);
1400
1401 constant_index = ir->array_index->constant_expression_value();
1402
1403 ir->array->accept(this);
1404 src = this->result;
1405
1406 if (constant_index) {
1407 src.reg_offset += constant_index->value.i[0] * element_size;
1408 } else {
1409 /* Variable index array dereference. It eats the "vec4" of the
1410 * base of the array and an index that offsets the Mesa register
1411 * index.
1412 */
1413 ir->array_index->accept(this);
1414
1415 src_reg index_reg;
1416
1417 if (element_size == 1) {
1418 index_reg = this->result;
1419 } else {
1420 index_reg = src_reg(this, glsl_type::int_type);
1421
1422 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1423 }
1424
1425 if (src.reladdr) {
1426 src_reg temp = src_reg(this, glsl_type::int_type);
1427
1428 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1429
1430 index_reg = temp;
1431 }
1432
1433 src.reladdr = ralloc(mem_ctx, src_reg);
1434 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1435 }
1436
1437 /* If the type is smaller than a vec4, replicate the last channel out. */
1438 if (ir->type->is_scalar() || ir->type->is_vector())
1439 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1440 else
1441 src.swizzle = BRW_SWIZZLE_NOOP;
1442 src.type = brw_type_for_base_type(ir->type);
1443
1444 this->result = src;
1445 }
1446
1447 void
1448 vec4_visitor::visit(ir_dereference_record *ir)
1449 {
1450 unsigned int i;
1451 const glsl_type *struct_type = ir->record->type;
1452 int offset = 0;
1453
1454 ir->record->accept(this);
1455
1456 for (i = 0; i < struct_type->length; i++) {
1457 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1458 break;
1459 offset += type_size(struct_type->fields.structure[i].type);
1460 }
1461
1462 /* If the type is smaller than a vec4, replicate the last channel out. */
1463 if (ir->type->is_scalar() || ir->type->is_vector())
1464 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1465 else
1466 this->result.swizzle = BRW_SWIZZLE_NOOP;
1467 this->result.type = brw_type_for_base_type(ir->type);
1468
1469 this->result.reg_offset += offset;
1470 }
1471
1472 /**
1473 * We want to be careful in assignment setup to hit the actual storage
1474 * instead of potentially using a temporary like we might with the
1475 * ir_dereference handler.
1476 */
1477 static dst_reg
1478 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1479 {
1480 /* The LHS must be a dereference. If the LHS is a variable indexed array
1481 * access of a vector, it must be separated into a series of conditional moves
1482 * before reaching this point (see ir_vec_index_to_cond_assign).
1483 */
1484 assert(ir->as_dereference());
1485 ir_dereference_array *deref_array = ir->as_dereference_array();
1486 if (deref_array) {
1487 assert(!deref_array->array->type->is_vector());
1488 }
1489
1490 /* Use the rvalue deref handler for the most part. We'll ignore
1491 * swizzles in it and write swizzles using writemask, though.
1492 */
1493 ir->accept(v);
1494 return dst_reg(v->result);
1495 }
1496
1497 void
1498 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1499 const struct glsl_type *type, uint32_t predicate)
1500 {
1501 if (type->base_type == GLSL_TYPE_STRUCT) {
1502 for (unsigned int i = 0; i < type->length; i++) {
1503 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1504 }
1505 return;
1506 }
1507
1508 if (type->is_array()) {
1509 for (unsigned int i = 0; i < type->length; i++) {
1510 emit_block_move(dst, src, type->fields.array, predicate);
1511 }
1512 return;
1513 }
1514
1515 if (type->is_matrix()) {
1516 const struct glsl_type *vec_type;
1517
1518 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1519 type->vector_elements, 1);
1520
1521 for (int i = 0; i < type->matrix_columns; i++) {
1522 emit_block_move(dst, src, vec_type, predicate);
1523 }
1524 return;
1525 }
1526
1527 assert(type->is_scalar() || type->is_vector());
1528
1529 dst->type = brw_type_for_base_type(type);
1530 src->type = dst->type;
1531
1532 dst->writemask = (1 << type->vector_elements) - 1;
1533
1534 /* Do we need to worry about swizzling a swizzle? */
1535 assert(src->swizzle == BRW_SWIZZLE_NOOP
1536 || src->swizzle == swizzle_for_size(type->vector_elements));
1537 src->swizzle = swizzle_for_size(type->vector_elements);
1538
1539 vec4_instruction *inst = emit(MOV(*dst, *src));
1540 inst->predicate = predicate;
1541
1542 dst->reg_offset++;
1543 src->reg_offset++;
1544 }
1545
1546
1547 /* If the RHS processing resulted in an instruction generating a
1548 * temporary value, and it would be easy to rewrite the instruction to
1549 * generate its result right into the LHS instead, do so. This ends
1550 * up reliably removing instructions where it can be tricky to do so
1551 * later without real UD chain information.
1552 */
1553 bool
1554 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1555 dst_reg dst,
1556 src_reg src,
1557 vec4_instruction *pre_rhs_inst,
1558 vec4_instruction *last_rhs_inst)
1559 {
1560 /* This could be supported, but it would take more smarts. */
1561 if (ir->condition)
1562 return false;
1563
1564 if (pre_rhs_inst == last_rhs_inst)
1565 return false; /* No instructions generated to work with. */
1566
1567 /* Make sure the last instruction generated our source reg. */
1568 if (src.file != GRF ||
1569 src.file != last_rhs_inst->dst.file ||
1570 src.reg != last_rhs_inst->dst.reg ||
1571 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1572 src.reladdr ||
1573 src.abs ||
1574 src.negate ||
1575 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1576 return false;
1577
1578 /* Check that that last instruction fully initialized the channels
1579 * we want to use, in the order we want to use them. We could
1580 * potentially reswizzle the operands of many instructions so that
1581 * we could handle out of order channels, but don't yet.
1582 */
1583
1584 for (unsigned i = 0; i < 4; i++) {
1585 if (dst.writemask & (1 << i)) {
1586 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1587 return false;
1588
1589 if (BRW_GET_SWZ(src.swizzle, i) != i)
1590 return false;
1591 }
1592 }
1593
1594 /* Success! Rewrite the instruction. */
1595 last_rhs_inst->dst.file = dst.file;
1596 last_rhs_inst->dst.reg = dst.reg;
1597 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1598 last_rhs_inst->dst.reladdr = dst.reladdr;
1599 last_rhs_inst->dst.writemask &= dst.writemask;
1600
1601 return true;
1602 }
1603
1604 void
1605 vec4_visitor::visit(ir_assignment *ir)
1606 {
1607 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1608 uint32_t predicate = BRW_PREDICATE_NONE;
1609
1610 if (!ir->lhs->type->is_scalar() &&
1611 !ir->lhs->type->is_vector()) {
1612 ir->rhs->accept(this);
1613 src_reg src = this->result;
1614
1615 if (ir->condition) {
1616 emit_bool_to_cond_code(ir->condition, &predicate);
1617 }
1618
1619 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1620 return;
1621 }
1622
1623 /* Now we're down to just a scalar/vector with writemasks. */
1624 int i;
1625
1626 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1627 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1628
1629 ir->rhs->accept(this);
1630
1631 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1632
1633 src_reg src = this->result;
1634
1635 int swizzles[4];
1636 int first_enabled_chan = 0;
1637 int src_chan = 0;
1638
1639 assert(ir->lhs->type->is_vector() ||
1640 ir->lhs->type->is_scalar());
1641 dst.writemask = ir->write_mask;
1642
1643 for (int i = 0; i < 4; i++) {
1644 if (dst.writemask & (1 << i)) {
1645 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1646 break;
1647 }
1648 }
1649
1650 /* Swizzle a small RHS vector into the channels being written.
1651 *
1652 * glsl ir treats write_mask as dictating how many channels are
1653 * present on the RHS while in our instructions we need to make
1654 * those channels appear in the slots of the vec4 they're written to.
1655 */
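/* For example, a vec2 RHS (swizzle .xyyy) assigned to dst.zw comes out as
 * the swizzle .yyxy: the written channels z and w read the RHS's x and y,
 * and the unwritten channels just replicate a don't-care component.
 */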
1656 for (int i = 0; i < 4; i++) {
1657 if (dst.writemask & (1 << i))
1658 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1659 else
1660 swizzles[i] = first_enabled_chan;
1661 }
1662 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1663 swizzles[2], swizzles[3]);
1664
1665 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1666 return;
1667 }
1668
1669 if (ir->condition) {
1670 emit_bool_to_cond_code(ir->condition, &predicate);
1671 }
1672
1673 for (i = 0; i < type_size(ir->lhs->type); i++) {
1674 vec4_instruction *inst = emit(MOV(dst, src));
1675 inst->predicate = predicate;
1676
1677 dst.reg_offset++;
1678 src.reg_offset++;
1679 }
1680 }
1681
1682 void
1683 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1684 {
1685 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1686 foreach_list(node, &ir->components) {
1687 ir_constant *field_value = (ir_constant *)node;
1688
1689 emit_constant_values(dst, field_value);
1690 }
1691 return;
1692 }
1693
1694 if (ir->type->is_array()) {
1695 for (unsigned int i = 0; i < ir->type->length; i++) {
1696 emit_constant_values(dst, ir->array_elements[i]);
1697 }
1698 return;
1699 }
1700
1701 if (ir->type->is_matrix()) {
1702 for (int i = 0; i < ir->type->matrix_columns; i++) {
1703 for (int j = 0; j < ir->type->vector_elements; j++) {
1704 dst->writemask = 1 << j;
1705 dst->type = BRW_REGISTER_TYPE_F;
1706
1707 emit(MOV(*dst,
1708 src_reg(ir->value.f[i * ir->type->vector_elements + j])));
1709 }
1710 dst->reg_offset++;
1711 }
1712 return;
1713 }
1714
1715 for (int i = 0; i < ir->type->vector_elements; i++) {
1716 dst->writemask = 1 << i;
1717 dst->type = brw_type_for_base_type(ir->type);
1718
1719 switch (ir->type->base_type) {
1720 case GLSL_TYPE_FLOAT:
1721 emit(MOV(*dst, src_reg(ir->value.f[i])));
1722 break;
1723 case GLSL_TYPE_INT:
1724 emit(MOV(*dst, src_reg(ir->value.i[i])));
1725 break;
1726 case GLSL_TYPE_UINT:
1727 emit(MOV(*dst, src_reg(ir->value.u[i])));
1728 break;
1729 case GLSL_TYPE_BOOL:
1730 emit(MOV(*dst, src_reg(ir->value.b[i])));
1731 break;
1732 default:
1733 assert(!"Non-float/uint/int/bool constant");
1734 break;
1735 }
1736 }
1737 dst->reg_offset++;
1738 }
1739
1740 void
1741 vec4_visitor::visit(ir_constant *ir)
1742 {
1743 dst_reg dst = dst_reg(this, ir->type);
1744 this->result = src_reg(dst);
1745
1746 emit_constant_values(&dst, ir);
1747 }
1748
1749 void
1750 vec4_visitor::visit(ir_call *ir)
1751 {
1752 assert(!"not reached");
1753 }
1754
1755 void
1756 vec4_visitor::visit(ir_texture *ir)
1757 {
1758 /* FINISHME: Implement vertex texturing.
1759 *
1760 * With 0 vertex samplers available, the linker will reject
1761 * programs that do vertex texturing, but after our visitor has
1762 * run.
1763 */
1764 this->result = src_reg(this, glsl_type::vec4_type);
1765 }
1766
1767 void
1768 vec4_visitor::visit(ir_return *ir)
1769 {
1770 assert(!"not reached");
1771 }
1772
1773 void
1774 vec4_visitor::visit(ir_discard *ir)
1775 {
1776 assert(!"not reached");
1777 }
1778
1779 void
1780 vec4_visitor::visit(ir_if *ir)
1781 {
1782 /* Don't point the annotation at the if statement, because then it plus
1783 * the then and else blocks get printed.
1784 */
1785 this->base_ir = ir->condition;
1786
1787 if (intel->gen == 6) {
1788 emit_if_gen6(ir);
1789 } else {
1790 uint32_t predicate;
1791 emit_bool_to_cond_code(ir->condition, &predicate);
1792 emit(IF(predicate));
1793 }
1794
1795 visit_instructions(&ir->then_instructions);
1796
1797 if (!ir->else_instructions.is_empty()) {
1798 this->base_ir = ir->condition;
1799 emit(BRW_OPCODE_ELSE);
1800
1801 visit_instructions(&ir->else_instructions);
1802 }
1803
1804 this->base_ir = ir->condition;
1805 emit(BRW_OPCODE_ENDIF);
1806 }
1807
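/* Pre-gen6 only: compute the normalized device coordinates the fixed
 * function expects in the VUE header, i.e. (x/w, y/w, z/w, 1/w) of
 * gl_Position.
 */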
1808 void
1809 vec4_visitor::emit_ndc_computation()
1810 {
1811 /* Get the position */
1812 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
1813
1814 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1815 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1816 output_reg[BRW_VERT_RESULT_NDC] = ndc;
1817
1818 current_annotation = "NDC";
1819 dst_reg ndc_w = ndc;
1820 ndc_w.writemask = WRITEMASK_W;
1821 src_reg pos_w = pos;
1822 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1823 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1824
1825 dst_reg ndc_xyz = ndc;
1826 ndc_xyz.writemask = WRITEMASK_XYZ;
1827
1828 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1829 }
1830
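/* Fill the VUE header slot that carries the point size and, on pre-gen6
 * hardware, the per-vertex clip flags: each user clip plane gets its bit
 * set when the DP4 of the position against that plane is negative.
 */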
1831 void
1832 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
1833 {
1834 if (intel->gen < 6 &&
1835 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1836 c->key.userclip_active || brw->has_negative_rhw_bug)) {
1837 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1838 dst_reg header1_w = header1;
1839 header1_w.writemask = WRITEMASK_W;
1840 GLuint i;
1841
1842 emit(MOV(header1, 0u));
1843
1844 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1845 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
1846
1847 current_annotation = "Point size";
1848 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1849 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1850 }
1851
1852 current_annotation = "Clipping flags";
1853 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
1854 vec4_instruction *inst;
1855
1856 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
1857 src_reg(this->userplane[i])));
1858 inst->conditional_mod = BRW_CONDITIONAL_L;
1859
1860 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
1861 inst->predicate = BRW_PREDICATE_NORMAL;
1862 }
1863
1864 /* i965 clipping workaround:
1865 * 1) Test for -ve rhw
1866 * 2) If set,
1867 * set ndc = (0,0,0,0)
1868 * set ucp[6] = 1
1869 *
1870 * Later, clipping will detect ucp[6] and ensure the primitive is
1871 * clipped against all fixed planes.
1872 */
1873 if (brw->has_negative_rhw_bug) {
1874 #if 0
1875 /* FINISHME */
1876 brw_CMP(p,
1877 vec8(brw_null_reg()),
1878 BRW_CONDITIONAL_L,
1879 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
1880 brw_imm_f(0));
1881
1882 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1883 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
1884 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1885 #endif
1886 }
1887
1888 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1889 } else if (intel->gen < 6) {
1890 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1891 } else {
1892 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1893 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1894 emit(MOV(brw_writemask(reg, WRITEMASK_W),
1895 src_reg(output_reg[VERT_RESULT_PSIZ])));
1896 }
1897 }
1898 }
1899
1900 void
1901 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
1902 {
1903 if (intel->gen < 6) {
1904 /* Clip distance slots are set aside in gen5, but they are not used. It
1905 * is not clear whether we actually need to set aside space for them,
1906 * but the performance cost is negligible.
1907 */
1908 return;
1909 }
1910
1911 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
1912 *
1913 * "If a linked set of shaders forming the vertex stage contains no
1914 * static write to gl_ClipVertex or gl_ClipDistance, but the
1915 * application has requested clipping against user clip planes through
1916 * the API, then the coordinate written to gl_Position is used for
1917 * comparison against the user clip planes."
1918 *
1919 * This function is only called if the shader didn't write to
1920 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
1921 * if the user wrote to it; otherwise we use gl_Position.
1922 */
1923 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
1924 if (!(c->prog_data.outputs_written
1925 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
1926 clip_vertex = VERT_RESULT_HPOS;
1927 }
1928
1929 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
1930 ++i) {
1931 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
1932 src_reg(output_reg[clip_vertex]),
1933 src_reg(this->userplane[i + offset])));
1934 }
1935 }
1936
1937 void
1938 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
1939 {
1940 assert (vert_result < VERT_RESULT_MAX);
1941 reg.type = output_reg[vert_result].type;
1942 current_annotation = output_reg_annotation[vert_result];
1943 /* Copy the register, saturating if necessary */
1944 vec4_instruction *inst = emit(MOV(reg,
1945 src_reg(output_reg[vert_result])));
1946 if ((vert_result == VERT_RESULT_COL0 ||
1947 vert_result == VERT_RESULT_COL1 ||
1948 vert_result == VERT_RESULT_BFC0 ||
1949 vert_result == VERT_RESULT_BFC1) &&
1950 c->key.clamp_vertex_color) {
1951 inst->saturate = true;
1952 }
1953 }
1954
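/* Write one VUE slot's worth of data into the given MRF, dispatching on
 * which vert_result the slot holds (position, NDC, point size and flags,
 * clip distances, or a generic varying).
 */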
1955 void
1956 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
1957 {
1958 struct brw_reg hw_reg = brw_message_reg(mrf);
1959 dst_reg reg = dst_reg(MRF, mrf);
1960 reg.type = BRW_REGISTER_TYPE_F;
1961
1962 switch (vert_result) {
1963 case VERT_RESULT_PSIZ:
1964 /* PSIZ is always in slot 0, and is coupled with other flags. */
1965 current_annotation = "indices, point width, clip flags";
1966 emit_psiz_and_flags(hw_reg);
1967 break;
1968 case BRW_VERT_RESULT_NDC:
1969 current_annotation = "NDC";
1970 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
1971 break;
1972 case BRW_VERT_RESULT_HPOS_DUPLICATE:
1973 case VERT_RESULT_HPOS:
1974 current_annotation = "gl_Position";
1975 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
1976 break;
1977 case VERT_RESULT_CLIP_DIST0:
1978 case VERT_RESULT_CLIP_DIST1:
1979 if (this->c->key.uses_clip_distance) {
1980 emit_generic_urb_slot(reg, vert_result);
1981 } else {
1982 current_annotation = "user clip distances";
1983 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
1984 }
1985 break;
1986 case BRW_VERT_RESULT_PAD:
1987 /* No need to write to this slot */
1988 break;
1989 default:
1990 emit_generic_urb_slot(reg, vert_result);
1991 break;
1992 }
1993 }
1994
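/* mlen here counts the message header register as well, so rounding it up
 * to an odd value keeps the URB data portion (mlen - 1) an even number of
 * registers, i.e. the multiple of 256 bits that gen6+ requires.
 */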
1995 static int
1996 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1997 {
1998 struct intel_context *intel = &brw->intel;
1999
2000 if (intel->gen >= 6) {
2001 /* URB data written (does not include the message header reg) must
2002 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2003 * section 5.4.3.2.2: URB_INTERLEAVED.
2004 *
2005 * URB entries are allocated on a multiple of 1024 bits, so an
2006 * extra 128 bits written here to make the end align to 256 is
2007 * no problem.
2008 */
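/* mlen includes the one message header register, so the data portion
 * is (mlen - 1) regs; keeping mlen odd keeps that data portion a
 * multiple of two regs.
 */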
2009 if ((mlen % 2) != 1)
2010 mlen++;
2011 }
2012
2013 return mlen;
2014 }
2015
2016 /**
2017 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2018 * complete the VS thread.
2019 *
2020 * The VUE layout is documented in Volume 2a.
2021 */
2022 void
2023 vec4_visitor::emit_urb_writes()
2024 {
2025 /* MRF 0 is reserved for the debugger, so start with the message
2026 * header in MRF 1.
2027 */
2028 int base_mrf = 1;
2029 int mrf = base_mrf;
2030 /* In the process of generating our URB write message contents, we
2031 * may need to unspill a register or load from an array. Those
2032 * reads would use MRFs 14-15.
2033 */
2034 int max_usable_mrf = 13;
2035
2036 /* The following assertion verifies that max_usable_mrf leaves an even
2037 * number of URB write data registers, which meets gen6's requirement
2038 * that the data length be a multiple of two registers.
2039 */
2040 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2041
2042 /* FINISHME: edgeflag */
2043
2044 brw_compute_vue_map(&c->vue_map, intel, c->key.userclip_active,
2045 c->prog_data.outputs_written);
2046
2047 /* First mrf is the g0-based message header containing URB handles and such,
2048 * which is implied in VS_OPCODE_URB_WRITE.
2049 */
2050 mrf++;
2051
2052 if (intel->gen < 6) {
2053 emit_ndc_computation();
2054 }
2055
2056 /* Set up the VUE data for the first URB write */
2057 int slot;
2058 for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
2059 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2060
2061 /* If this was max_usable_mrf, we can't fit anything more into this URB
2062 * WRITE.
2063 */
2064 if (mrf > max_usable_mrf) {
2065 slot++;
2066 break;
2067 }
2068 }
2069
2070 current_annotation = "URB write";
2071 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2072 inst->base_mrf = base_mrf;
2073 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2074 inst->eot = (slot >= c->vue_map.num_slots);
2075
2076 /* Optional second URB write */
2077 if (!inst->eot) {
2078 mrf = base_mrf + 1;
2079
2080 for (; slot < c->vue_map.num_slots; ++slot) {
2081 assert(mrf < max_usable_mrf);
2082
2083 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2084 }
2085
2086 current_annotation = "URB write";
2087 inst = emit(VS_OPCODE_URB_WRITE);
2088 inst->base_mrf = base_mrf;
2089 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2090 inst->eot = true;
2091 /* URB destination offset.  The previous write used MRFs 1-13; minus
2092 * the one header MRF, that's 12 regs of data.  The URB offset is in
2093 * URB row increments, and each of our MRFs is half of a row, since
2094 * we're doing interleaved writes.
2095 */
2096 inst->offset = (max_usable_mrf - base_mrf) / 2;
2097 }
2098
2099 if (intel->gen == 6)
2100 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 8) / 8;
2101 else
2102 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 4) / 4;
2103 }
2104
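/**
 * Computes the scratch-space offset source for vec4 slot @reg_offset,
 * scaled into message header units, emitting any relative-address
 * arithmetic before @inst when @reladdr is set.
 */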
2105 src_reg
2106 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2107 src_reg *reladdr, int reg_offset)
2108 {
2109 /* Because we store the values to scratch interleaved like our
2110 * vertex data, we need to scale the vec4 index by 2.
2111 */
2112 int message_header_scale = 2;
2113
2114 /* Pre-gen6, the message header uses byte offsets instead of vec4
2115 * (16-byte) offset units.
2116 */
2117 if (intel->gen < 6)
2118 message_header_scale *= 16;
2119
2120 if (reladdr) {
2121 src_reg index = src_reg(this, glsl_type::int_type);
2122
2123 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2124 emit_before(inst, MUL(dst_reg(index),
2125 index, src_reg(message_header_scale)));
2126
2127 return index;
2128 } else {
2129 return src_reg(reg_offset * message_header_scale);
2130 }
2131 }
2132
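/**
 * Computes the pull constant buffer offset source for vec4 slot
 * @reg_offset, applying the pre-gen6 byte-offset scaling and emitting
 * any relative-address arithmetic before @inst when @reladdr is set.
 */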
2133 src_reg
2134 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2135 src_reg *reladdr, int reg_offset)
2136 {
2137 if (reladdr) {
2138 src_reg index = src_reg(this, glsl_type::int_type);
2139
2140 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2141
2142 /* Pre-gen6, the message header uses byte offsets instead of vec4
2143 * (16-byte) offset units.
2144 */
2145 if (intel->gen < 6) {
2146 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2147 }
2148
2149 return index;
2150 } else {
2151 int message_header_scale = intel->gen < 6 ? 16 : 1;
2152 return src_reg(reg_offset * message_header_scale);
2153 }
2154 }
2155
2156 /**
2157 * Emits an instruction before @inst to load the value named by @orig_src
2158 * from scratch space at @base_offset to @temp.
2159 */
2160 void
2161 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2162 dst_reg temp, src_reg orig_src,
2163 int base_offset)
2164 {
2165 int reg_offset = base_offset + orig_src.reg_offset;
2166 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2167
2168 emit_before(inst, SCRATCH_READ(temp, index));
2169 }
2170
2171 /**
2172 * Emits an instruction after @inst to store the value to be written
2173 * to @orig_dst to scratch space at @base_offset, from @temp.
2174 */
2175 void
2176 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2177 src_reg temp, dst_reg orig_dst,
2178 int base_offset)
2179 {
2180 int reg_offset = base_offset + orig_dst.reg_offset;
2181 src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2182
2183 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2184 orig_dst.writemask));
2185 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2186 write->predicate = inst->predicate;
2187 write->ir = inst->ir;
2188 write->annotation = inst->annotation;
2189 inst->insert_after(write);
2190 }
2191
2192 /**
2193 * We can't generally support array access in GRF space, because a
2194 * single instruction's destination can only span 2 contiguous
2195 * registers. So, we send all GRF arrays that get variable index
2196 * access to scratch space.
2197 */
2198 void
2199 vec4_visitor::move_grf_array_access_to_scratch()
2200 {
2201 int scratch_loc[this->virtual_grf_count];
2202
2203 for (int i = 0; i < this->virtual_grf_count; i++) {
2204 scratch_loc[i] = -1;
2205 }
2206
2207 /* First, calculate the set of virtual GRFs that need to be punted
2208 * to scratch due to having any array access on them, and where in
2209 * scratch.
2210 */
2211 foreach_list(node, &this->instructions) {
2212 vec4_instruction *inst = (vec4_instruction *)node;
2213
2214 if (inst->dst.file == GRF && inst->dst.reladdr &&
2215 scratch_loc[inst->dst.reg] == -1) {
2216 scratch_loc[inst->dst.reg] = c->last_scratch;
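/* Scratch stores each vec4 interleaved like vertex data, so every
 * vec4 slot of the array takes a full 8-dword register (8 * 4 bytes).
 */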
2217 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2218 }
2219
2220 for (int i = 0 ; i < 3; i++) {
2221 src_reg *src = &inst->src[i];
2222
2223 if (src->file == GRF && src->reladdr &&
2224 scratch_loc[src->reg] == -1) {
2225 scratch_loc[src->reg] = c->last_scratch;
2226 c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2227 }
2228 }
2229 }
2230
2231 /* Now, for anything that will be accessed through scratch, rewrite
2232 * it to load/store. Note that this is a _safe list walk, because
2233 * we may generate a new scratch_write instruction after the one
2234 * we're processing.
2235 */
2236 foreach_list_safe(node, &this->instructions) {
2237 vec4_instruction *inst = (vec4_instruction *)node;
2238
2239 /* Set up the annotation tracking for newly generated instructions. */
2240 base_ir = inst->ir;
2241 current_annotation = inst->annotation;
2242
2243 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2244 src_reg temp = src_reg(this, glsl_type::vec4_type);
2245
2246 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2247
2248 inst->dst.file = temp.file;
2249 inst->dst.reg = temp.reg;
2250 inst->dst.reg_offset = temp.reg_offset;
2251 inst->dst.reladdr = NULL;
2252 }
2253
2254 for (int i = 0 ; i < 3; i++) {
2255 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2256 continue;
2257
2258 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2259
2260 emit_scratch_read(inst, temp, inst->src[i],
2261 scratch_loc[inst->src[i].reg]);
2262
2263 inst->src[i].file = temp.file;
2264 inst->src[i].reg = temp.reg;
2265 inst->src[i].reg_offset = temp.reg_offset;
2266 inst->src[i].reladdr = NULL;
2267 }
2268 }
2269 }
2270
2271 /**
2272 * Emits an instruction before @inst to load the value named by @orig_src
2273 * from the pull constant buffer (surface) at @base_offset to @temp.
2274 */
2275 void
2276 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2277 dst_reg temp, src_reg orig_src,
2278 int base_offset)
2279 {
2280 int reg_offset = base_offset + orig_src.reg_offset;
2281 src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2282 vec4_instruction *load;
2283
2284 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2285 temp, index);
2286 load->base_mrf = 14;
2287 load->mlen = 1;
2288 emit_before(inst, load);
2289 }
2290
2291 /**
2292 * Implements array access of uniforms by inserting a
2293 * PULL_CONSTANT_LOAD instruction.
2294 *
2295 * Unlike temporary GRF array access (where we don't support it due to
2296 * the difficulty of doing relative addressing on instruction
2297 * destinations), we could potentially do array access of uniforms
2298 * that were loaded in GRF space as push constants. In real-world
2299 * usage we've seen, though, the arrays being used are always larger
2300 * than we could load as push constants, so just always move all
2301 * uniform array access out to a pull constant buffer.
2302 */
2303 void
2304 vec4_visitor::move_uniform_array_access_to_pull_constants()
2305 {
2306 int pull_constant_loc[this->uniforms];
2307
2308 for (int i = 0; i < this->uniforms; i++) {
2309 pull_constant_loc[i] = -1;
2310 }
2311
2312 /* Walk through and find array access of uniforms. Put a copy of that
2313 * uniform in the pull constant buffer.
2314 *
2315 * Note that we don't move constant-indexed accesses to arrays. No
2316 * testing has been done of the performance impact of this choice.
2317 */
2318 foreach_list_safe(node, &this->instructions) {
2319 vec4_instruction *inst = (vec4_instruction *)node;
2320
2321 for (int i = 0 ; i < 3; i++) {
2322 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2323 continue;
2324
2325 int uniform = inst->src[i].reg;
2326
2327 /* If this array isn't already present in the pull constant buffer,
2328 * add it.
2329 */
2330 if (pull_constant_loc[uniform] == -1) {
2331 const float **values = &prog_data->param[uniform * 4];
2332
2333 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2334
2335 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2336 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2337 }
2338 }
2339
2340 /* Set up the annotation tracking for newly generated instructions. */
2341 base_ir = inst->ir;
2342 current_annotation = inst->annotation;
2343
2344 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2345
2346 emit_pull_constant_load(inst, temp, inst->src[i],
2347 pull_constant_loc[uniform]);
2348
2349 inst->src[i].file = temp.file;
2350 inst->src[i].reg = temp.reg;
2351 inst->src[i].reg_offset = temp.reg_offset;
2352 inst->src[i].reladdr = NULL;
2353 }
2354 }
2355
2356 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2357 * no need to track them as larger-than-vec4 objects. This will be
2358 * relied on in cutting out unused uniform vectors from push
2359 * constants.
2360 */
2361 split_uniform_registers();
2362 }
2363
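/**
 * If a UD-typed source carries a negate modifier, emit an explicit MOV
 * into a temporary and use the temporary instead, so the consuming
 * instruction never sees a negated unsigned source.
 */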
2364 void
2365 vec4_visitor::resolve_ud_negate(src_reg *reg)
2366 {
2367 if (reg->type != BRW_REGISTER_TYPE_UD ||
2368 !reg->negate)
2369 return;
2370
2371 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2372 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2373 *reg = temp;
2374 }
2375
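/**
 * Sets up a visitor for one vertex shader compile: compiler and context
 * pointers, a fresh ralloc memory context, empty virtual GRF tracking,
 * and the variable hash table.
 */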
2376 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2377 struct gl_shader_program *prog,
2378 struct brw_shader *shader)
2379 {
2380 this->c = c;
2381 this->p = &c->func;
2382 this->brw = p->brw;
2383 this->intel = &brw->intel;
2384 this->ctx = &intel->ctx;
2385 this->prog = prog;
2386 this->shader = shader;
2387
2388 this->mem_ctx = ralloc_context(NULL);
2389 this->failed = false;
2390
2391 this->base_ir = NULL;
2392 this->current_annotation = NULL;
2393
2395 this->vp = (struct gl_vertex_program *)
2396 prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2397 this->prog_data = &c->prog_data;
2398
2399 this->variable_ht = hash_table_ctor(0,
2400 hash_table_pointer_hash,
2401 hash_table_pointer_compare);
2402
2403 this->virtual_grf_def = NULL;
2404 this->virtual_grf_use = NULL;
2405 this->virtual_grf_sizes = NULL;
2406 this->virtual_grf_count = 0;
2407 this->virtual_grf_reg_map = NULL;
2408 this->virtual_grf_reg_count = 0;
2409 this->virtual_grf_array_size = 0;
2410 this->live_intervals_valid = false;
2411
2412 this->uniforms = 0;
2417 }
2418
2419 vec4_visitor::~vec4_visitor()
2420 {
2421 ralloc_free(this->mem_ctx);
2422 hash_table_dtor(this->variable_ht);
2423 }
2424
2425
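/**
 * Records the first failure of the compile along with a formatted
 * message, printing it to stderr when VS debugging is enabled.
 */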
2426 void
2427 vec4_visitor::fail(const char *format, ...)
2428 {
2429 va_list va;
2430 char *msg;
2431
2432 if (failed)
2433 return;
2434
2435 failed = true;
2436
2437 va_start(va, format);
2438 msg = ralloc_vasprintf(mem_ctx, format, va);
2439 va_end(va);
2440 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2441
2442 this->fail_msg = msg;
2443
2444 if (INTEL_DEBUG & DEBUG_VS) {
2445 fprintf(stderr, "%s", msg);
2446 }
2447 }
2448
2449 } /* namespace brw */