[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 }
29
30 namespace brw {
31
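/**
 * Construct a src_reg that reads back what a dst_reg wrote: the register
 * identity is copied, and a read swizzle is built from the writemask by
 * packing the enabled channels first and then replicating the last
 * enabled channel into the remaining slots (e.g. writemask .xz yields
 * swizzle XZZZ).
 */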
32 src_reg::src_reg(dst_reg reg)
33 {
34 init();
35
36 this->file = reg.file;
37 this->reg = reg.reg;
38 this->reg_offset = reg.reg_offset;
39 this->type = reg.type;
40 this->reladdr = reg.reladdr;
41 this->fixed_hw_reg = reg.fixed_hw_reg;
42
43 int swizzles[4];
44 int next_chan = 0;
45 int last = 0;
46
47 for (int i = 0; i < 4; i++) {
48 if (!(reg.writemask & (1 << i)))
49 continue;
50
51 swizzles[next_chan++] = last = i;
52 }
53
54 for (; next_chan < 4; next_chan++) {
55 swizzles[next_chan] = last;
56 }
57
58 this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
59 swizzles[2], swizzles[3]);
60 }
61
62 dst_reg::dst_reg(src_reg reg)
63 {
64 init();
65
66 this->file = reg.file;
67 this->reg = reg.reg;
68 this->reg_offset = reg.reg_offset;
69 this->type = reg.type;
70 this->writemask = WRITEMASK_XYZW;
71 this->reladdr = reg.reladdr;
72 this->fixed_hw_reg = reg.fixed_hw_reg;
73 }
74
75 vec4_instruction::vec4_instruction(vec4_visitor *v,
76 enum opcode opcode, dst_reg dst,
77 src_reg src0, src_reg src1, src_reg src2)
78 {
79 this->opcode = opcode;
80 this->dst = dst;
81 this->src[0] = src0;
82 this->src[1] = src1;
83 this->src[2] = src2;
84 this->ir = v->base_ir;
85 this->annotation = v->current_annotation;
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(vec4_instruction *inst)
90 {
91 this->instructions.push_tail(inst);
92
93 return inst;
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
98 {
99 new_inst->ir = inst->ir;
100 new_inst->annotation = inst->annotation;
101
102 inst->insert_before(new_inst);
103
104 return inst;
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
109 src_reg src0, src_reg src1, src_reg src2)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
112 src0, src1, src2));
113 }
114
115
116 vec4_instruction *
117 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
118 {
119 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
120 }
121
122 vec4_instruction *
123 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
124 {
125 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
126 }
127
128 vec4_instruction *
129 vec4_visitor::emit(enum opcode opcode)
130 {
131 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
132 }
133
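/* These ALU helpers only construct the instruction in the visitor's
 * mem_ctx; callers are expected to hand the result to emit() (or
 * emit_before()) to actually add it to the instruction stream.
 */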
134 #define ALU1(op) \
135 vec4_instruction * \
136 vec4_visitor::op(dst_reg dst, src_reg src0) \
137 { \
138 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
139 src0); \
140 }
141
142 #define ALU2(op) \
143 vec4_instruction * \
144 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
145 { \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU2(ADD)
157 ALU2(MUL)
158 ALU2(MACH)
159 ALU2(AND)
160 ALU2(OR)
161 ALU2(XOR)
162 ALU2(DP3)
163 ALU2(DP4)
164
165 /** Gen4 predicated IF. */
166 vec4_instruction *
167 vec4_visitor::IF(uint32_t predicate)
168 {
169 vec4_instruction *inst;
170
171 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
172 inst->predicate = predicate;
173
174 return inst;
175 }
176
177 /** Gen6+ IF with embedded comparison. */
178 vec4_instruction *
179 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
180 {
181 assert(intel->gen >= 6);
182
183 vec4_instruction *inst;
184
185 resolve_ud_negate(&src0);
186 resolve_ud_negate(&src1);
187
188 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
189 src0, src1);
190 inst->conditional_mod = condition;
191
192 return inst;
193 }
194
195 /**
196 * CMP: Sets the low bit of the destination channels with the result
197 * of the comparison, while the upper bits are undefined, and updates
198 * the flag register with the packed 16 bits of the result.
199 */
200 vec4_instruction *
201 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
202 {
203 vec4_instruction *inst;
204
205 /* original gen4 does type conversion to the destination type
206 * before comparison, producing garbage results for floating
207 * point comparisons.
208 */
209 if (intel->gen == 4) {
210 dst.type = src0.type;
211 if (dst.file == HW_REG)
212 dst.fixed_hw_reg.type = dst.type;
213 }
214
215 resolve_ud_negate(&src0);
216 resolve_ud_negate(&src1);
217
218 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
219 inst->conditional_mod = condition;
220
221 return inst;
222 }
223
224 vec4_instruction *
225 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
226 {
227 vec4_instruction *inst;
228
229 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
230 dst, index);
231 inst->base_mrf = 14;
232 inst->mlen = 1;
233
234 return inst;
235 }
236
237 vec4_instruction *
238 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
239 {
240 vec4_instruction *inst;
241
242 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
243 dst, src, index);
244 inst->base_mrf = 13;
245 inst->mlen = 2;
246
247 return inst;
248 }
249
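/** Emit a DP2/DP3/DP4, selected by how many components participate. */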
250 void
251 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
252 {
253 static enum opcode dot_opcodes[] = {
254 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
255 };
256
257 emit(dot_opcodes[elements - 2], dst, src0, src1);
258 }
259
260 void
261 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
262 {
263 /* The gen6 math instruction ignores the source modifiers --
264 * swizzle, abs, negate, and at least some parts of the register
265 * region description.
266 *
267 * While it would seem that this MOV could be avoided at this point
268 * in the case that the swizzle is matched up with the destination
269 * writemask, note that uniform packing and register allocation
270 * could rearrange our swizzle, so let's leave this matter up to
271 * copy propagation later.
272 */
273 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
274 emit(MOV(dst_reg(temp_src), src));
275
276 if (dst.writemask != WRITEMASK_XYZW) {
277 /* The gen6 math instruction must be align1, so we can't do
278 * writemasks.
279 */
280 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
281
282 emit(opcode, temp_dst, temp_src);
283
284 emit(MOV(dst, src_reg(temp_dst)));
285 } else {
286 emit(opcode, dst, temp_src);
287 }
288 }
289
290 void
291 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
292 {
293 vec4_instruction *inst = emit(opcode, dst, src);
294 inst->base_mrf = 1;
295 inst->mlen = 1;
296 }
297
298 void
299 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
300 {
301 switch (opcode) {
302 case SHADER_OPCODE_RCP:
303 case SHADER_OPCODE_RSQ:
304 case SHADER_OPCODE_SQRT:
305 case SHADER_OPCODE_EXP2:
306 case SHADER_OPCODE_LOG2:
307 case SHADER_OPCODE_SIN:
308 case SHADER_OPCODE_COS:
309 break;
310 default:
311 assert(!"not reached: bad math opcode");
312 return;
313 }
314
315 if (intel->gen >= 6) {
316 return emit_math1_gen6(opcode, dst, src);
317 } else {
318 return emit_math1_gen4(opcode, dst, src);
319 }
320 }
321
322 void
323 vec4_visitor::emit_math2_gen6(enum opcode opcode,
324 dst_reg dst, src_reg src0, src_reg src1)
325 {
326 src_reg expanded;
327
328 /* The gen6 math instruction ignores the source modifiers --
329 * swizzle, abs, negate, and at least some parts of the register
330 * region description. Move the sources to temporaries to make it
331 * generally work.
332 */
333
334 expanded = src_reg(this, glsl_type::vec4_type);
335 expanded.type = src0.type;
336 emit(MOV(dst_reg(expanded), src0));
337 src0 = expanded;
338
339 expanded = src_reg(this, glsl_type::vec4_type);
340 expanded.type = src1.type;
341 emit(MOV(dst_reg(expanded), src1));
342 src1 = expanded;
343
344 if (dst.writemask != WRITEMASK_XYZW) {
345 /* The gen6 math instruction must be align1, so we can't do
346 * writemasks.
347 */
348 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
349 temp_dst.type = dst.type;
350
351 emit(opcode, temp_dst, src0, src1);
352
353 emit(MOV(dst, src_reg(temp_dst)));
354 } else {
355 emit(opcode, dst, src0, src1);
356 }
357 }
358
359 void
360 vec4_visitor::emit_math2_gen4(enum opcode opcode,
361 dst_reg dst, src_reg src0, src_reg src1)
362 {
363 vec4_instruction *inst = emit(opcode, dst, src0, src1);
364 inst->base_mrf = 1;
365 inst->mlen = 2;
366 }
367
368 void
369 vec4_visitor::emit_math(enum opcode opcode,
370 dst_reg dst, src_reg src0, src_reg src1)
371 {
372 switch (opcode) {
373 case SHADER_OPCODE_POW:
374 case SHADER_OPCODE_INT_QUOTIENT:
375 case SHADER_OPCODE_INT_REMAINDER:
376 break;
377 default:
378 assert(!"not reached: unsupported binary math opcode");
379 return;
380 }
381
382 if (intel->gen >= 6) {
383 return emit_math2_gen6(opcode, dst, src0, src1);
384 } else {
385 return emit_math2_gen4(opcode, dst, src0, src1);
386 }
387 }
388
389 void
390 vec4_visitor::visit_instructions(const exec_list *list)
391 {
392 foreach_list(node, list) {
393 ir_instruction *ir = (ir_instruction *)node;
394
395 base_ir = ir;
396 ir->accept(this);
397 }
398 }
399
400
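/**
 * Returns the size of a GLSL type in vec4 slots, the granularity the
 * vec4 backend uses for register allocation and uniform storage.  Every
 * scalar or vector value takes a whole slot; matrices take one slot per
 * column.
 */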
401 static int
402 type_size(const struct glsl_type *type)
403 {
404 unsigned int i;
405 int size;
406
407 switch (type->base_type) {
408 case GLSL_TYPE_UINT:
409 case GLSL_TYPE_INT:
410 case GLSL_TYPE_FLOAT:
411 case GLSL_TYPE_BOOL:
412 if (type->is_matrix()) {
413 return type->matrix_columns;
414 } else {
415 /* Regardless of size of vector, it gets a vec4. This is bad
416 * packing for things like floats, but otherwise arrays become a
417 * mess. Hopefully a later pass over the code can pack scalars
418 * down if appropriate.
419 */
420 return 1;
421 }
422 case GLSL_TYPE_ARRAY:
423 assert(type->length > 0);
424 return type_size(type->fields.array) * type->length;
425 case GLSL_TYPE_STRUCT:
426 size = 0;
427 for (i = 0; i < type->length; i++) {
428 size += type_size(type->fields.structure[i].type);
429 }
430 return size;
431 case GLSL_TYPE_SAMPLER:
432 /* Samplers take up one slot in UNIFORMS[], but they're baked in
433 * at link time.
434 */
435 return 1;
436 default:
437 assert(0);
438 return 0;
439 }
440 }
441
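/**
 * Allocates a new virtual GRF of the given size (in vec4 registers),
 * growing the bookkeeping arrays if needed, and returns its index.
 */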
442 int
443 vec4_visitor::virtual_grf_alloc(int size)
444 {
445 if (virtual_grf_array_size <= virtual_grf_count) {
446 if (virtual_grf_array_size == 0)
447 virtual_grf_array_size = 16;
448 else
449 virtual_grf_array_size *= 2;
450 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
451 virtual_grf_array_size);
452 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
453 virtual_grf_array_size);
454 }
455 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
456 virtual_grf_reg_count += size;
457 virtual_grf_sizes[virtual_grf_count] = size;
458 return virtual_grf_count++;
459 }
460
461 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
462 {
463 init();
464
465 this->file = GRF;
466 this->reg = v->virtual_grf_alloc(type_size(type));
467
468 if (type->is_array() || type->is_record()) {
469 this->swizzle = BRW_SWIZZLE_NOOP;
470 } else {
471 this->swizzle = swizzle_for_size(type->vector_elements);
472 }
473
474 this->type = brw_type_for_base_type(type);
475 }
476
477 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
478 {
479 init();
480
481 this->file = GRF;
482 this->reg = v->virtual_grf_alloc(type_size(type));
483
484 if (type->is_array() || type->is_record()) {
485 this->writemask = WRITEMASK_XYZW;
486 } else {
487 this->writemask = (1 << type->vector_elements) - 1;
488 }
489
490 this->type = brw_type_for_base_type(type);
491 }
492
493 /* Our support for uniforms is piggy-backed on the struct
494 * gl_fragment_program, because that's where the values actually
495 * get stored, rather than in some global gl_shader_program uniform
496 * store.
497 */
498 int
499 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
500 {
501 unsigned int offset = 0;
502 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
503
504 if (type->is_matrix()) {
505 const glsl_type *column = type->column_type();
506
507 for (unsigned int i = 0; i < type->matrix_columns; i++) {
508 offset += setup_uniform_values(loc + offset, column);
509 }
510
511 return offset;
512 }
513
514 switch (type->base_type) {
515 case GLSL_TYPE_FLOAT:
516 case GLSL_TYPE_UINT:
517 case GLSL_TYPE_INT:
518 case GLSL_TYPE_BOOL:
519 for (unsigned int i = 0; i < type->vector_elements; i++) {
520 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
521 }
522
523 /* Set up pad elements to get things aligned to a vec4 boundary. */
524 for (unsigned int i = type->vector_elements; i < 4; i++) {
525 static float zero = 0;
526
527 c->prog_data.param[this->uniforms * 4 + i] = &zero;
528 }
529
530 /* Track the size of this uniform vector, for future packing of
531 * uniforms.
532 */
533 this->uniform_vector_size[this->uniforms] = type->vector_elements;
534 this->uniforms++;
535
536 return 1;
537
538 case GLSL_TYPE_STRUCT:
539 for (unsigned int i = 0; i < type->length; i++) {
540 offset += setup_uniform_values(loc + offset,
541 type->fields.structure[i].type);
542 }
543 return offset;
544
545 case GLSL_TYPE_ARRAY:
546 for (unsigned int i = 0; i < type->length; i++) {
547 offset += setup_uniform_values(loc + offset, type->fields.array);
548 }
549 return offset;
550
551 case GLSL_TYPE_SAMPLER:
552 /* The sampler takes up a slot, but we don't use any values from it. */
553 return 1;
554
555 default:
556 assert(!"not reached");
557 return 0;
558 }
559 }
560
561 void
562 vec4_visitor::setup_uniform_clipplane_values()
563 {
564 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
565
566 /* Pre-Gen6, we compact clip planes. For example, if the user
567 * enables just clip planes 0, 1, and 3, we will enable clip planes
568 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
569 * plane 2. This simplifies the implementation of the Gen6 clip
570 * thread.
571 *
572 * In Gen6 and later, we don't compact clip planes, because this
573 * simplifies the implementation of gl_ClipDistance.
574 */
575 int compacted_clipplane_index = 0;
576 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
577 if (intel->gen < 6 &&
578 !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
579 continue;
580 }
581 this->uniform_vector_size[this->uniforms] = 4;
582 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
583 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
584 for (int j = 0; j < 4; ++j) {
585 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
586 }
587 ++compacted_clipplane_index;
588 ++this->uniforms;
589 }
590 }
591
592 /* Our support for builtin uniforms is even scarier than non-builtin.
593 * It sits on top of the PROG_STATE_VAR parameters that are
594 * automatically updated from GL context state.
595 */
596 void
597 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
598 {
599 const ir_state_slot *const slots = ir->state_slots;
600 assert(ir->state_slots != NULL);
601
602 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
603 /* This state reference has already been setup by ir_to_mesa,
604 * but we'll get the same index back here. We can reference
605 * ParameterValues directly, since unlike brw_fs.cpp, we never
606 * add new state references during compile.
607 */
608 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
609 (gl_state_index *)slots[i].tokens);
610 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
611
612 this->uniform_vector_size[this->uniforms] = 0;
613 /* Add each of the unique swizzled channels of the element.
614 * This will end up matching the size of the glsl_type of this field.
615 */
616 int last_swiz = -1;
617 for (unsigned int j = 0; j < 4; j++) {
618 int swiz = GET_SWZ(slots[i].swizzle, j);
619 last_swiz = swiz;
620
621 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
622 if (swiz <= last_swiz)
623 this->uniform_vector_size[this->uniforms]++;
624 }
625 this->uniforms++;
626 }
627 }
628
629 dst_reg *
630 vec4_visitor::variable_storage(ir_variable *var)
631 {
632 return (dst_reg *)hash_table_find(this->variable_ht, var);
633 }
634
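/**
 * Emit instructions that leave the flag register set according to a
 * boolean condition, and store in *predicate the predication mode that a
 * following predicated instruction should use (BRW_PREDICATE_NORMAL, or
 * the ALL4H/ANY4H modes for the vector comparisons).
 */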
635 void
636 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
637 {
638 ir_expression *expr = ir->as_expression();
639
640 *predicate = BRW_PREDICATE_NORMAL;
641
642 if (expr) {
643 src_reg op[2];
644 vec4_instruction *inst;
645
646 assert(expr->get_num_operands() <= 2);
647 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
648 expr->operands[i]->accept(this);
649 op[i] = this->result;
650
651 resolve_ud_negate(&op[i]);
652 }
653
654 switch (expr->operation) {
655 case ir_unop_logic_not:
656 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
657 inst->conditional_mod = BRW_CONDITIONAL_Z;
658 break;
659
660 case ir_binop_logic_xor:
661 inst = emit(XOR(dst_null_d(), op[0], op[1]));
662 inst->conditional_mod = BRW_CONDITIONAL_NZ;
663 break;
664
665 case ir_binop_logic_or:
666 inst = emit(OR(dst_null_d(), op[0], op[1]));
667 inst->conditional_mod = BRW_CONDITIONAL_NZ;
668 break;
669
670 case ir_binop_logic_and:
671 inst = emit(AND(dst_null_d(), op[0], op[1]));
672 inst->conditional_mod = BRW_CONDITIONAL_NZ;
673 break;
674
675 case ir_unop_f2b:
676 if (intel->gen >= 6) {
677 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
678 } else {
679 inst = emit(MOV(dst_null_f(), op[0]));
680 inst->conditional_mod = BRW_CONDITIONAL_NZ;
681 }
682 break;
683
684 case ir_unop_i2b:
685 if (intel->gen >= 6) {
686 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
687 } else {
688 inst = emit(MOV(dst_null_d(), op[0]));
689 inst->conditional_mod = BRW_CONDITIONAL_NZ;
690 }
691 break;
692
693 case ir_binop_all_equal:
694 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
695 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
696 break;
697
698 case ir_binop_any_nequal:
699 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
700 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
701 break;
702
703 case ir_unop_any:
704 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
705 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
706 break;
707
708 case ir_binop_greater:
709 case ir_binop_gequal:
710 case ir_binop_less:
711 case ir_binop_lequal:
712 case ir_binop_equal:
713 case ir_binop_nequal:
714 emit(CMP(dst_null_d(), op[0], op[1],
715 brw_conditional_for_comparison(expr->operation)));
716 break;
717
718 default:
719 assert(!"not reached");
720 break;
721 }
722 return;
723 }
724
725 ir->accept(this);
726
727 resolve_ud_negate(&this->result);
728
729 if (intel->gen >= 6) {
730 vec4_instruction *inst = emit(AND(dst_null_d(),
731 this->result, src_reg(1)));
732 inst->conditional_mod = BRW_CONDITIONAL_NZ;
733 } else {
734 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
735 inst->conditional_mod = BRW_CONDITIONAL_NZ;
736 }
737 }
738
739 /**
740 * Emit a gen6 IF statement with the comparison folded into the IF
741 * instruction.
742 */
743 void
744 vec4_visitor::emit_if_gen6(ir_if *ir)
745 {
746 ir_expression *expr = ir->condition->as_expression();
747
748 if (expr) {
749 src_reg op[2];
750 dst_reg temp;
751
752 assert(expr->get_num_operands() <= 2);
753 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
754 expr->operands[i]->accept(this);
755 op[i] = this->result;
756 }
757
758 switch (expr->operation) {
759 case ir_unop_logic_not:
760 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
761 return;
762
763 case ir_binop_logic_xor:
764 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
765 return;
766
767 case ir_binop_logic_or:
768 temp = dst_reg(this, glsl_type::bool_type);
769 emit(OR(temp, op[0], op[1]));
770 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
771 return;
772
773 case ir_binop_logic_and:
774 temp = dst_reg(this, glsl_type::bool_type);
775 emit(AND(temp, op[0], op[1]));
776 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
777 return;
778
779 case ir_unop_f2b:
780 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
781 return;
782
783 case ir_unop_i2b:
784 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
785 return;
786
787 case ir_binop_greater:
788 case ir_binop_gequal:
789 case ir_binop_less:
790 case ir_binop_lequal:
791 case ir_binop_equal:
792 case ir_binop_nequal:
793 emit(IF(op[0], op[1],
794 brw_conditional_for_comparison(expr->operation)));
795 return;
796
797 case ir_binop_all_equal:
798 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
799 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
800 return;
801
802 case ir_binop_any_nequal:
803 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
804 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
805 return;
806
807 case ir_unop_any:
808 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
809 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
810 return;
811
812 default:
813 assert(!"not reached");
814 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
815 return;
816 }
817 return;
818 }
819
820 ir->condition->accept(this);
821
822 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
823 }
824
825 void
826 vec4_visitor::visit(ir_variable *ir)
827 {
828 dst_reg *reg = NULL;
829
830 if (variable_storage(ir))
831 return;
832
833 switch (ir->mode) {
834 case ir_var_in:
835 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
836
837 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
838 * come in as floating point conversions of the integer values.
839 */
840 for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
841 if (!c->key.gl_fixed_input_size[i])
842 continue;
843
844 dst_reg dst = *reg;
845 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
846 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
847 }
848 break;
849
850 case ir_var_out:
851 reg = new(mem_ctx) dst_reg(this, ir->type);
852
853 for (int i = 0; i < type_size(ir->type); i++) {
854 output_reg[ir->location + i] = *reg;
855 output_reg[ir->location + i].reg_offset = i;
856 output_reg[ir->location + i].type = BRW_REGISTER_TYPE_F;
857 output_reg_annotation[ir->location + i] = ir->name;
858 }
859 break;
860
861 case ir_var_auto:
862 case ir_var_temporary:
863 reg = new(mem_ctx) dst_reg(this, ir->type);
864 break;
865
866 case ir_var_uniform:
867 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
868
869 /* Track how big the whole uniform variable is, in case we need to put a
870 * copy of its data into pull constants for array access.
871 */
872 this->uniform_size[this->uniforms] = type_size(ir->type);
873
874 if (!strncmp(ir->name, "gl_", 3)) {
875 setup_builtin_uniform_values(ir);
876 } else {
877 setup_uniform_values(ir->location, ir->type);
878 }
879 break;
880
881 default:
882 assert(!"not reached");
883 }
884
885 reg->type = brw_type_for_base_type(ir->type);
886 hash_table_insert(this->variable_ht, reg, ir);
887 }
888
889 void
890 vec4_visitor::visit(ir_loop *ir)
891 {
892 dst_reg counter;
893
894 /* We don't want debugging output to print the whole body of the
895 * loop as the annotation.
896 */
897 this->base_ir = NULL;
898
899 if (ir->counter != NULL) {
900 this->base_ir = ir->counter;
901 ir->counter->accept(this);
902 counter = *(variable_storage(ir->counter));
903
904 if (ir->from != NULL) {
905 this->base_ir = ir->from;
906 ir->from->accept(this);
907
908 emit(MOV(counter, this->result));
909 }
910 }
911
912 emit(BRW_OPCODE_DO);
913
914 if (ir->to) {
915 this->base_ir = ir->to;
916 ir->to->accept(this);
917
918 emit(CMP(dst_null_d(), src_reg(counter), this->result,
919 brw_conditional_for_comparison(ir->cmp)));
920
921 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
922 inst->predicate = BRW_PREDICATE_NORMAL;
923 }
924
925 visit_instructions(&ir->body_instructions);
926
927
928 if (ir->increment) {
929 this->base_ir = ir->increment;
930 ir->increment->accept(this);
931 emit(ADD(counter, src_reg(counter), this->result));
932 }
933
934 emit(BRW_OPCODE_WHILE);
935 }
936
937 void
938 vec4_visitor::visit(ir_loop_jump *ir)
939 {
940 switch (ir->mode) {
941 case ir_loop_jump::jump_break:
942 emit(BRW_OPCODE_BREAK);
943 break;
944 case ir_loop_jump::jump_continue:
945 emit(BRW_OPCODE_CONTINUE);
946 break;
947 }
948 }
949
950
951 void
952 vec4_visitor::visit(ir_function_signature *ir)
953 {
954 assert(0);
955 (void)ir;
956 }
957
958 void
959 vec4_visitor::visit(ir_function *ir)
960 {
961 /* Ignore function bodies other than main() -- we shouldn't see calls to
962 * them since they should all be inlined.
963 */
964 if (strcmp(ir->name, "main") == 0) {
965 const ir_function_signature *sig;
966 exec_list empty;
967
968 sig = ir->matching_signature(&empty);
969
970 assert(sig);
971
972 visit_instructions(&sig->body);
973 }
974 }
975
976 bool
977 vec4_visitor::try_emit_sat(ir_expression *ir)
978 {
979 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
980 if (!sat_src)
981 return false;
982
983 sat_src->accept(this);
984 src_reg src = this->result;
985
986 this->result = src_reg(this, ir->type);
987 vec4_instruction *inst;
988 inst = emit(MOV(dst_reg(this->result), src));
989 inst->saturate = true;
990
991 return true;
992 }
993
994 void
995 vec4_visitor::emit_bool_comparison(unsigned int op,
996 dst_reg dst, src_reg src0, src_reg src1)
997 {
998 /* original gen4 does destination conversion before comparison. */
999 if (intel->gen < 5)
1000 dst.type = src0.type;
1001
1002 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1003
1004 dst.type = BRW_REGISTER_TYPE_D;
1005 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1006 }
1007
1008 void
1009 vec4_visitor::visit(ir_expression *ir)
1010 {
1011 unsigned int operand;
1012 src_reg op[Elements(ir->operands)];
1013 src_reg result_src;
1014 dst_reg result_dst;
1015 vec4_instruction *inst;
1016
1017 if (try_emit_sat(ir))
1018 return;
1019
1020 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1021 this->result.file = BAD_FILE;
1022 ir->operands[operand]->accept(this);
1023 if (this->result.file == BAD_FILE) {
1024 printf("Failed to get tree for expression operand:\n");
1025 ir->operands[operand]->print();
1026 exit(1);
1027 }
1028 op[operand] = this->result;
1029
1030 /* Matrix expression operands should have been broken down to vector
1031 * operations already.
1032 */
1033 assert(!ir->operands[operand]->type->is_matrix());
1034 }
1035
1036 int vector_elements = ir->operands[0]->type->vector_elements;
1037 if (ir->operands[1]) {
1038 vector_elements = MAX2(vector_elements,
1039 ir->operands[1]->type->vector_elements);
1040 }
1041
1042 this->result.file = BAD_FILE;
1043
1044 /* Storage for our result. Ideally for an assignment we'd be using
1045 * the actual storage for the result here, instead.
1046 */
1047 result_src = src_reg(this, ir->type);
1048 /* convenience for the emit functions below. */
1049 result_dst = dst_reg(result_src);
1050 /* If nothing special happens, this is the result. */
1051 this->result = result_src;
1052 /* Limit writes to the channels that will be used by result_src later.
1053 * This does limit this temp's use as a temporary for multi-instruction
1054 * sequences.
1055 */
1056 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1057
1058 switch (ir->operation) {
1059 case ir_unop_logic_not:
1060 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1061 * the ones complement of the whole register, not just bit 0.
1062 */
1063 emit(XOR(result_dst, op[0], src_reg(1)));
1064 break;
1065 case ir_unop_neg:
1066 op[0].negate = !op[0].negate;
1067 this->result = op[0];
1068 break;
1069 case ir_unop_abs:
1070 op[0].abs = true;
1071 op[0].negate = false;
1072 this->result = op[0];
1073 break;
1074
1075 case ir_unop_sign:
1076 emit(MOV(result_dst, src_reg(0.0f)));
1077
1078 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1079 inst = emit(MOV(result_dst, src_reg(1.0f)));
1080 inst->predicate = BRW_PREDICATE_NORMAL;
1081
1082 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1083 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1084 inst->predicate = BRW_PREDICATE_NORMAL;
1085
1086 break;
1087
1088 case ir_unop_rcp:
1089 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1090 break;
1091
1092 case ir_unop_exp2:
1093 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1094 break;
1095 case ir_unop_log2:
1096 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1097 break;
1098 case ir_unop_exp:
1099 case ir_unop_log:
1100 assert(!"not reached: should be handled by ir_explog_to_explog2");
1101 break;
1102 case ir_unop_sin:
1103 case ir_unop_sin_reduced:
1104 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1105 break;
1106 case ir_unop_cos:
1107 case ir_unop_cos_reduced:
1108 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1109 break;
1110
1111 case ir_unop_dFdx:
1112 case ir_unop_dFdy:
1113 assert(!"derivatives not valid in vertex shader");
1114 break;
1115
1116 case ir_unop_noise:
1117 assert(!"not reached: should be handled by lower_noise");
1118 break;
1119
1120 case ir_binop_add:
1121 emit(ADD(result_dst, op[0], op[1]));
1122 break;
1123 case ir_binop_sub:
1124 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1125 break;
1126
1127 case ir_binop_mul:
1128 if (ir->type->is_integer()) {
1129 /* For integer multiplication, the MUL uses the low 16 bits
1130 * of one of the operands (src0 on gen6, src1 on gen7). The
1131 * MACH accumulates in the contribution of the upper 16 bits
1132 * of that operand.
1133 *
1134 * FINISHME: Emit just the MUL if we know an operand is small
1135 * enough.
1136 */
1137 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1138
1139 emit(MUL(acc, op[0], op[1]));
1140 emit(MACH(dst_null_d(), op[0], op[1]));
1141 emit(MOV(result_dst, src_reg(acc)));
1142 } else {
1143 emit(MUL(result_dst, op[0], op[1]));
1144 }
1145 break;
1146 case ir_binop_div:
1147 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1148 assert(ir->type->is_integer());
1149 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1150 break;
1151 case ir_binop_mod:
1152 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1153 assert(ir->type->is_integer());
1154 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1155 break;
1156
1157 case ir_binop_less:
1158 case ir_binop_greater:
1159 case ir_binop_lequal:
1160 case ir_binop_gequal:
1161 case ir_binop_equal:
1162 case ir_binop_nequal: {
1163 emit(CMP(result_dst, op[0], op[1],
1164 brw_conditional_for_comparison(ir->operation)));
1165 emit(AND(result_dst, result_src, src_reg(0x1)));
1166 break;
1167 }
1168
1169 case ir_binop_all_equal:
1170 /* "==" operator producing a scalar boolean. */
1171 if (ir->operands[0]->type->is_vector() ||
1172 ir->operands[1]->type->is_vector()) {
1173 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1174 emit(MOV(result_dst, src_reg(0)));
1175 inst = emit(MOV(result_dst, src_reg(1)));
1176 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1177 } else {
1178 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1179 emit(AND(result_dst, result_src, src_reg(0x1)));
1180 }
1181 break;
1182 case ir_binop_any_nequal:
1183 /* "!=" operator producing a scalar boolean. */
1184 if (ir->operands[0]->type->is_vector() ||
1185 ir->operands[1]->type->is_vector()) {
1186 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1187
1188 emit(MOV(result_dst, src_reg(0)));
1189 inst = emit(MOV(result_dst, src_reg(1)));
1190 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1191 } else {
1192 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1193 emit(AND(result_dst, result_src, src_reg(0x1)));
1194 }
1195 break;
1196
1197 case ir_unop_any:
1198 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1199 emit(MOV(result_dst, src_reg(0)));
1200
1201 inst = emit(MOV(result_dst, src_reg(1)));
1202 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1203 break;
1204
1205 case ir_binop_logic_xor:
1206 emit(XOR(result_dst, op[0], op[1]));
1207 break;
1208
1209 case ir_binop_logic_or:
1210 emit(OR(result_dst, op[0], op[1]));
1211 break;
1212
1213 case ir_binop_logic_and:
1214 emit(AND(result_dst, op[0], op[1]));
1215 break;
1216
1217 case ir_binop_dot:
1218 assert(ir->operands[0]->type->is_vector());
1219 assert(ir->operands[0]->type == ir->operands[1]->type);
1220 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1221 break;
1222
1223 case ir_unop_sqrt:
1224 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1225 break;
1226 case ir_unop_rsq:
1227 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1228 break;
1229 case ir_unop_i2f:
1230 case ir_unop_i2u:
1231 case ir_unop_u2i:
1232 case ir_unop_u2f:
1233 case ir_unop_b2f:
1234 case ir_unop_b2i:
1235 case ir_unop_f2i:
1236 emit(MOV(result_dst, op[0]));
1237 break;
1238 case ir_unop_f2b:
1239 case ir_unop_i2b: {
1240 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1241 emit(AND(result_dst, result_src, src_reg(1)));
1242 break;
1243 }
1244
1245 case ir_unop_trunc:
1246 emit(RNDZ(result_dst, op[0]));
1247 break;
1248 case ir_unop_ceil:
1249 op[0].negate = !op[0].negate;
1250 inst = emit(RNDD(result_dst, op[0]));
1251 this->result.negate = true;
1252 break;
1253 case ir_unop_floor:
1254 inst = emit(RNDD(result_dst, op[0]));
1255 break;
1256 case ir_unop_fract:
1257 inst = emit(FRC(result_dst, op[0]));
1258 break;
1259 case ir_unop_round_even:
1260 emit(RNDE(result_dst, op[0]));
1261 break;
1262
1263 case ir_binop_min:
1264 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1265
1266 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 break;
1269 case ir_binop_max:
1270 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1271
1272 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1273 inst->predicate = BRW_PREDICATE_NORMAL;
1274 break;
1275
1276 case ir_binop_pow:
1277 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1278 break;
1279
1280 case ir_unop_bit_not:
1281 inst = emit(NOT(result_dst, op[0]));
1282 break;
1283 case ir_binop_bit_and:
1284 inst = emit(AND(result_dst, op[0], op[1]));
1285 break;
1286 case ir_binop_bit_xor:
1287 inst = emit(XOR(result_dst, op[0], op[1]));
1288 break;
1289 case ir_binop_bit_or:
1290 inst = emit(OR(result_dst, op[0], op[1]));
1291 break;
1292
1293 case ir_binop_lshift:
1294 inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1295 break;
1296
1297 case ir_binop_rshift:
1298 if (ir->type->base_type == GLSL_TYPE_INT)
1299 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1300 else
1301 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1302 break;
1303
1304 case ir_quadop_vector:
1305 assert(!"not reached: should be handled by lower_quadop_vector");
1306 break;
1307 }
1308 }
1309
1310
1311 void
1312 vec4_visitor::visit(ir_swizzle *ir)
1313 {
1314 src_reg src;
1315 int i = 0;
1316 int swizzle[4];
1317
1318 /* Note that this is only swizzles in expressions, not those on the left
1319 * hand side of an assignment, which do write masking. See ir_assignment
1320 * for that.
1321 */
1322
1323 ir->val->accept(this);
1324 src = this->result;
1325 assert(src.file != BAD_FILE);
1326
1327 for (i = 0; i < ir->type->vector_elements; i++) {
1328 switch (i) {
1329 case 0:
1330 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1331 break;
1332 case 1:
1333 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1334 break;
1335 case 2:
1336 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1337 break;
1338 case 3:
1339 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1340 break;
1341 }
1342 }
1343 for (; i < 4; i++) {
1344 /* Replicate the last channel out. */
1345 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1346 }
1347
1348 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1349
1350 this->result = src;
1351 }
1352
1353 void
1354 vec4_visitor::visit(ir_dereference_variable *ir)
1355 {
1356 const struct glsl_type *type = ir->type;
1357 dst_reg *reg = variable_storage(ir->var);
1358
1359 if (!reg) {
1360 fail("Failed to find variable storage for %s\n", ir->var->name);
1361 this->result = src_reg(brw_null_reg());
1362 return;
1363 }
1364
1365 this->result = src_reg(*reg);
1366
1367 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1368 this->result.swizzle = swizzle_for_size(type->vector_elements);
1369 }
1370
1371 void
1372 vec4_visitor::visit(ir_dereference_array *ir)
1373 {
1374 ir_constant *constant_index;
1375 src_reg src;
1376 int element_size = type_size(ir->type);
1377
1378 constant_index = ir->array_index->constant_expression_value();
1379
1380 ir->array->accept(this);
1381 src = this->result;
1382
1383 if (constant_index) {
1384 src.reg_offset += constant_index->value.i[0] * element_size;
1385 } else {
1386 /* Variable index array dereference. It eats the "vec4" of the
1387 * base of the array and an index that offsets the Mesa register
1388 * index.
1389 */
1390 ir->array_index->accept(this);
1391
1392 src_reg index_reg;
1393
1394 if (element_size == 1) {
1395 index_reg = this->result;
1396 } else {
1397 index_reg = src_reg(this, glsl_type::int_type);
1398
1399 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1400 }
1401
1402 if (src.reladdr) {
1403 src_reg temp = src_reg(this, glsl_type::int_type);
1404
1405 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1406
1407 index_reg = temp;
1408 }
1409
1410 src.reladdr = ralloc(mem_ctx, src_reg);
1411 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1412 }
1413
1414 /* If the type is smaller than a vec4, replicate the last channel out. */
1415 if (ir->type->is_scalar() || ir->type->is_vector())
1416 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1417 else
1418 src.swizzle = BRW_SWIZZLE_NOOP;
1419 src.type = brw_type_for_base_type(ir->type);
1420
1421 this->result = src;
1422 }
1423
1424 void
1425 vec4_visitor::visit(ir_dereference_record *ir)
1426 {
1427 unsigned int i;
1428 const glsl_type *struct_type = ir->record->type;
1429 int offset = 0;
1430
1431 ir->record->accept(this);
1432
1433 for (i = 0; i < struct_type->length; i++) {
1434 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1435 break;
1436 offset += type_size(struct_type->fields.structure[i].type);
1437 }
1438
1439 /* If the type is smaller than a vec4, replicate the last channel out. */
1440 if (ir->type->is_scalar() || ir->type->is_vector())
1441 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1442 else
1443 this->result.swizzle = BRW_SWIZZLE_NOOP;
1444 this->result.type = brw_type_for_base_type(ir->type);
1445
1446 this->result.reg_offset += offset;
1447 }
1448
1449 /**
1450 * We want to be careful in assignment setup to hit the actual storage
1451 * instead of potentially using a temporary like we might with the
1452 * ir_dereference handler.
1453 */
1454 static dst_reg
1455 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1456 {
1457 /* The LHS must be a dereference. If the LHS is a variable indexed array
1458 * access of a vector, it must be separated into a series of conditional moves
1459 * before reaching this point (see ir_vec_index_to_cond_assign).
1460 */
1461 assert(ir->as_dereference());
1462 ir_dereference_array *deref_array = ir->as_dereference_array();
1463 if (deref_array) {
1464 assert(!deref_array->array->type->is_vector());
1465 }
1466
1467 /* Use the rvalue deref handler for the most part. We'll ignore
1468 * swizzles in it and write swizzles using writemask, though.
1469 */
1470 ir->accept(v);
1471 return dst_reg(v->result);
1472 }
1473
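/**
 * Emit a (possibly predicated) MOV for each scalar/vector component of an
 * aggregate, recursing through struct fields, array elements and matrix
 * columns and advancing dst/src by one vec4 per move.
 */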
1474 void
1475 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1476 const struct glsl_type *type, uint32_t predicate)
1477 {
1478 if (type->base_type == GLSL_TYPE_STRUCT) {
1479 for (unsigned int i = 0; i < type->length; i++) {
1480 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1481 }
1482 return;
1483 }
1484
1485 if (type->is_array()) {
1486 for (unsigned int i = 0; i < type->length; i++) {
1487 emit_block_move(dst, src, type->fields.array, predicate);
1488 }
1489 return;
1490 }
1491
1492 if (type->is_matrix()) {
1493 const struct glsl_type *vec_type;
1494
1495 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1496 type->vector_elements, 1);
1497
1498 for (int i = 0; i < type->matrix_columns; i++) {
1499 emit_block_move(dst, src, vec_type, predicate);
1500 }
1501 return;
1502 }
1503
1504 assert(type->is_scalar() || type->is_vector());
1505
1506 dst->type = brw_type_for_base_type(type);
1507 src->type = dst->type;
1508
1509 dst->writemask = (1 << type->vector_elements) - 1;
1510
1511 /* Do we need to worry about swizzling a swizzle? */
1512 assert(src->swizzle == BRW_SWIZZLE_NOOP
1513 || src->swizzle == swizzle_for_size(type->vector_elements));
1514 src->swizzle = swizzle_for_size(type->vector_elements);
1515
1516 vec4_instruction *inst = emit(MOV(*dst, *src));
1517 inst->predicate = predicate;
1518
1519 dst->reg_offset++;
1520 src->reg_offset++;
1521 }
1522
1523
1524 /* If the RHS processing resulted in an instruction generating a
1525 * temporary value, and it would be easy to rewrite the instruction to
1526 * generate its result right into the LHS instead, do so. This ends
1527 * up reliably removing instructions where it can be tricky to do so
1528 * later without real UD chain information.
1529 */
1530 bool
1531 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1532 dst_reg dst,
1533 src_reg src,
1534 vec4_instruction *pre_rhs_inst,
1535 vec4_instruction *last_rhs_inst)
1536 {
1537 /* This could be supported, but it would take more smarts. */
1538 if (ir->condition)
1539 return false;
1540
1541 if (pre_rhs_inst == last_rhs_inst)
1542 return false; /* No instructions generated to work with. */
1543
1544 /* Make sure the last instruction generated our source reg. */
1545 if (src.file != GRF ||
1546 src.file != last_rhs_inst->dst.file ||
1547 src.reg != last_rhs_inst->dst.reg ||
1548 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1549 src.reladdr ||
1550 src.abs ||
1551 src.negate ||
1552 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1553 return false;
1554
1555 /* Check that that last instruction fully initialized the channels
1556 * we want to use, in the order we want to use them. We could
1557 * potentially reswizzle the operands of many instructions so that
1558 * we could handle out of order channels, but don't yet.
1559 */
1560
1561 for (unsigned i = 0; i < 4; i++) {
1562 if (dst.writemask & (1 << i)) {
1563 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1564 return false;
1565
1566 if (BRW_GET_SWZ(src.swizzle, i) != i)
1567 return false;
1568 }
1569 }
1570
1571 /* Success! Rewrite the instruction. */
1572 last_rhs_inst->dst.file = dst.file;
1573 last_rhs_inst->dst.reg = dst.reg;
1574 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1575 last_rhs_inst->dst.reladdr = dst.reladdr;
1576 last_rhs_inst->dst.writemask &= dst.writemask;
1577
1578 return true;
1579 }
1580
1581 void
1582 vec4_visitor::visit(ir_assignment *ir)
1583 {
1584 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1585 uint32_t predicate = BRW_PREDICATE_NONE;
1586
1587 if (!ir->lhs->type->is_scalar() &&
1588 !ir->lhs->type->is_vector()) {
1589 ir->rhs->accept(this);
1590 src_reg src = this->result;
1591
1592 if (ir->condition) {
1593 emit_bool_to_cond_code(ir->condition, &predicate);
1594 }
1595
1596 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1597 return;
1598 }
1599
1600 /* Now we're down to just a scalar/vector with writemasks. */
1601 int i;
1602
1603 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1604 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1605
1606 ir->rhs->accept(this);
1607
1608 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1609
1610 src_reg src = this->result;
1611
1612 int swizzles[4];
1613 int first_enabled_chan = 0;
1614 int src_chan = 0;
1615
1616 assert(ir->lhs->type->is_vector() ||
1617 ir->lhs->type->is_scalar());
1618 dst.writemask = ir->write_mask;
1619
1620 for (int i = 0; i < 4; i++) {
1621 if (dst.writemask & (1 << i)) {
1622 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1623 break;
1624 }
1625 }
1626
1627 /* Swizzle a small RHS vector into the channels being written.
1628 *
1629 * glsl ir treats write_mask as dictating how many channels are
1630 * present on the RHS while in our instructions we need to make
1631 * those channels appear in the slots of the vec4 they're written to.
1632 */
1633 for (int i = 0; i < 4; i++) {
1634 if (dst.writemask & (1 << i))
1635 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1636 else
1637 swizzles[i] = first_enabled_chan;
1638 }
1639 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1640 swizzles[2], swizzles[3]);
1641
1642 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1643 return;
1644 }
1645
1646 if (ir->condition) {
1647 emit_bool_to_cond_code(ir->condition, &predicate);
1648 }
1649
1650 for (i = 0; i < type_size(ir->lhs->type); i++) {
1651 vec4_instruction *inst = emit(MOV(dst, src));
1652 inst->predicate = predicate;
1653
1654 dst.reg_offset++;
1655 src.reg_offset++;
1656 }
1657 }
1658
1659 void
1660 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1661 {
1662 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1663 foreach_list(node, &ir->components) {
1664 ir_constant *field_value = (ir_constant *)node;
1665
1666 emit_constant_values(dst, field_value);
1667 }
1668 return;
1669 }
1670
1671 if (ir->type->is_array()) {
1672 for (unsigned int i = 0; i < ir->type->length; i++) {
1673 emit_constant_values(dst, ir->array_elements[i]);
1674 }
1675 return;
1676 }
1677
1678 if (ir->type->is_matrix()) {
1679 for (int i = 0; i < ir->type->matrix_columns; i++) {
1680 for (int j = 0; j < ir->type->vector_elements; j++) {
1681 dst->writemask = 1 << j;
1682 dst->type = BRW_REGISTER_TYPE_F;
1683
1684 emit(MOV(*dst,
1685 src_reg(ir->value.f[i * ir->type->vector_elements + j])));
1686 }
1687 dst->reg_offset++;
1688 }
1689 return;
1690 }
1691
1692 for (int i = 0; i < ir->type->vector_elements; i++) {
1693 dst->writemask = 1 << i;
1694 dst->type = brw_type_for_base_type(ir->type);
1695
1696 switch (ir->type->base_type) {
1697 case GLSL_TYPE_FLOAT:
1698 emit(MOV(*dst, src_reg(ir->value.f[i])));
1699 break;
1700 case GLSL_TYPE_INT:
1701 emit(MOV(*dst, src_reg(ir->value.i[i])));
1702 break;
1703 case GLSL_TYPE_UINT:
1704 emit(MOV(*dst, src_reg(ir->value.u[i])));
1705 break;
1706 case GLSL_TYPE_BOOL:
1707 emit(MOV(*dst, src_reg(ir->value.b[i])));
1708 break;
1709 default:
1710 assert(!"Non-float/uint/int/bool constant");
1711 break;
1712 }
1713 }
1714 dst->reg_offset++;
1715 }
1716
1717 void
1718 vec4_visitor::visit(ir_constant *ir)
1719 {
1720 dst_reg dst = dst_reg(this, ir->type);
1721 this->result = src_reg(dst);
1722
1723 emit_constant_values(&dst, ir);
1724 }
1725
1726 void
1727 vec4_visitor::visit(ir_call *ir)
1728 {
1729 assert(!"not reached");
1730 }
1731
1732 void
1733 vec4_visitor::visit(ir_texture *ir)
1734 {
1735 /* FINISHME: Implement vertex texturing.
1736 *
1737 * With 0 vertex samplers available, the linker will reject
1738 * programs that do vertex texturing, but after our visitor has
1739 * run.
1740 */
1741 this->result = src_reg(this, glsl_type::vec4_type);
1742 }
1743
1744 void
1745 vec4_visitor::visit(ir_return *ir)
1746 {
1747 assert(!"not reached");
1748 }
1749
1750 void
1751 vec4_visitor::visit(ir_discard *ir)
1752 {
1753 assert(!"not reached");
1754 }
1755
1756 void
1757 vec4_visitor::visit(ir_if *ir)
1758 {
1759 /* Don't point the annotation at the if statement, because then it plus
1760 * the then and else blocks get printed.
1761 */
1762 this->base_ir = ir->condition;
1763
1764 if (intel->gen == 6) {
1765 emit_if_gen6(ir);
1766 } else {
1767 uint32_t predicate;
1768 emit_bool_to_cond_code(ir->condition, &predicate);
1769 emit(IF(predicate));
1770 }
1771
1772 visit_instructions(&ir->then_instructions);
1773
1774 if (!ir->else_instructions.is_empty()) {
1775 this->base_ir = ir->condition;
1776 emit(BRW_OPCODE_ELSE);
1777
1778 visit_instructions(&ir->else_instructions);
1779 }
1780
1781 this->base_ir = ir->condition;
1782 emit(BRW_OPCODE_ENDIF);
1783 }
1784
1785 void
1786 vec4_visitor::emit_ndc_computation()
1787 {
1788 /* Get the position */
1789 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
1790
1791 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1792 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1793 output_reg[BRW_VERT_RESULT_NDC] = ndc;
1794
1795 current_annotation = "NDC";
1796 dst_reg ndc_w = ndc;
1797 ndc_w.writemask = WRITEMASK_W;
1798 src_reg pos_w = pos;
1799 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1800 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1801
1802 dst_reg ndc_xyz = ndc;
1803 ndc_xyz.writemask = WRITEMASK_XYZ;
1804
1805 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1806 }
1807
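/**
 * Writes the VUE header dword carrying the point size and clip flags,
 * packing them together on pre-gen6 hardware and falling back to a
 * simple zero-fill (plus point size in the W channel on gen6+) when no
 * packing is required.
 */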
1808 void
1809 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
1810 {
1811 if (intel->gen < 6 &&
1812 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1813 c->key.userclip_active || brw->has_negative_rhw_bug)) {
1814 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1815 dst_reg header1_w = header1;
1816 header1_w.writemask = WRITEMASK_W;
1817 GLuint i;
1818
1819 emit(MOV(header1, 0u));
1820
1821 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1822 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
1823
1824 current_annotation = "Point size";
1825 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1826 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1827 }
1828
1829 current_annotation = "Clipping flags";
1830 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
1831 vec4_instruction *inst;
1832
1833 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
1834 src_reg(this->userplane[i])));
1835 inst->conditional_mod = BRW_CONDITIONAL_L;
1836
1837 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
1838 inst->predicate = BRW_PREDICATE_NORMAL;
1839 }
1840
1841 /* i965 clipping workaround:
1842 * 1) Test for -ve rhw
1843 * 2) If set,
1844 * set ndc = (0,0,0,0)
1845 * set ucp[6] = 1
1846 *
1847 * Later, clipping will detect ucp[6] and ensure the primitive is
1848 * clipped against all fixed planes.
1849 */
1850 if (brw->has_negative_rhw_bug) {
1851 #if 0
1852 /* FINISHME */
1853 brw_CMP(p,
1854 vec8(brw_null_reg()),
1855 BRW_CONDITIONAL_L,
1856 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
1857 brw_imm_f(0));
1858
1859 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1860 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
1861 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1862 #endif
1863 }
1864
1865 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1866 } else if (intel->gen < 6) {
1867 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1868 } else {
1869 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1870 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1871 emit(MOV(brw_writemask(reg, WRITEMASK_W),
1872 src_reg(output_reg[VERT_RESULT_PSIZ])));
1873 }
1874 }
1875 }
1876
1877 void
1878 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
1879 {
1880 if (intel->gen < 6) {
1881 /* Clip distance slots are set aside in gen5, but they are not used. It
1882 * is not clear whether we actually need to set aside space for them,
1883 * but the performance cost is negligible.
1884 */
1885 return;
1886 }
1887
1888 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
1889 *
1890 * "If a linked set of shaders forming the vertex stage contains no
1891 * static write to gl_ClipVertex or gl_ClipDistance, but the
1892 * application has requested clipping against user clip planes through
1893 * the API, then the coordinate written to gl_Position is used for
1894 * comparison against the user clip planes."
1895 *
1896 * This function is only called if the shader didn't write to
1897 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
1898 * if the user wrote to it; otherwise we use gl_Position.
1899 */
1900 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
1901 if (!(c->prog_data.outputs_written
1902 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
1903 clip_vertex = VERT_RESULT_HPOS;
1904 }
1905
1906 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
1907 ++i) {
1908 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
1909 src_reg(output_reg[clip_vertex]),
1910 src_reg(this->userplane[i + offset])));
1911 }
1912 }
1913
1914 void
1915 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
1916 {
1917 assert (vert_result < VERT_RESULT_MAX);
1918 current_annotation = output_reg_annotation[vert_result];
1919 /* Copy the register, saturating if necessary */
1920 vec4_instruction *inst = emit(MOV(reg,
1921 src_reg(output_reg[vert_result])));
1922 if ((vert_result == VERT_RESULT_COL0 ||
1923 vert_result == VERT_RESULT_COL1 ||
1924 vert_result == VERT_RESULT_BFC0 ||
1925 vert_result == VERT_RESULT_BFC1) &&
1926 c->key.clamp_vertex_color) {
1927 inst->saturate = true;
1928 }
1929 }
1930
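/**
 * Emits the data for a single VUE slot into the given MRF, handling the
 * slots with dedicated layouts (the PSIZ/flags header, NDC, gl_Position,
 * clip distances) specially and falling back to a plain copy for the
 * rest.
 */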
1931 void
1932 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
1933 {
1934 struct brw_reg hw_reg = brw_message_reg(mrf);
1935 dst_reg reg = dst_reg(MRF, mrf);
1936 reg.type = BRW_REGISTER_TYPE_F;
1937
1938 switch (vert_result) {
1939 case VERT_RESULT_PSIZ:
1940 /* PSIZ is always in slot 0, and is coupled with other flags. */
1941 current_annotation = "indices, point width, clip flags";
1942 emit_psiz_and_flags(hw_reg);
1943 break;
1944 case BRW_VERT_RESULT_NDC:
1945 current_annotation = "NDC";
1946 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
1947 break;
1948 case BRW_VERT_RESULT_HPOS_DUPLICATE:
1949 case VERT_RESULT_HPOS:
1950 current_annotation = "gl_Position";
1951 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
1952 break;
1953 case VERT_RESULT_CLIP_DIST0:
1954 case VERT_RESULT_CLIP_DIST1:
1955 if (this->c->key.uses_clip_distance) {
1956 emit_generic_urb_slot(reg, vert_result);
1957 } else {
1958 current_annotation = "user clip distances";
1959 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
1960 }
1961 break;
1962 case BRW_VERT_RESULT_PAD:
1963 /* No need to write to this slot */
1964 break;
1965 default:
1966 emit_generic_urb_slot(reg, vert_result);
1967 break;
1968 }
1969 }
1970
1971 static int
1972 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1973 {
1974 struct intel_context *intel = &brw->intel;
1975
1976 if (intel->gen >= 6) {
1977 /* URB data written (does not include the message header reg) must
1978 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1979 * section 5.4.3.2.2: URB_INTERLEAVED.
1980 *
1981 * URB entries are allocated on a multiple of 1024 bits, so an
1982 * extra 128 bits written here to make the end align to 256 is
1983 * no problem.
1984 */
1985 if ((mlen % 2) != 1)
1986 mlen++;
1987 }
1988
1989 return mlen;
1990 }
1991
1992 /**
1993 * Generates the VUE payload plus the 1 or 2 URB write instructions to
1994 * complete the VS thread.
1995 *
1996 * The VUE layout is documented in Volume 2a.
1997 */
1998 void
1999 vec4_visitor::emit_urb_writes()
2000 {
2001 /* MRF 0 is reserved for the debugger, so start with message header
2002 * in MRF 1.
2003 */
2004 int base_mrf = 1;
2005 int mrf = base_mrf;
2006 /* In the process of generating our URB write message contents, we
2007 * may need to unspill a register or load from an array. Those
2008 * reads would use MRFs 14-15.
2009 */
2010 int max_usable_mrf = 13;
2011
2012 /* The following assertion verifies that filling the VUE up to
2013 * max_usable_mrf produces an even number of URB write data registers,
2014 * which meets gen6's requirement for message length alignment.
2015 */
2016 assert((max_usable_mrf - base_mrf) % 2 == 0);
2017
2018 /* FINISHME: edgeflag */
2019
2020 brw_compute_vue_map(&c->vue_map, intel, c->key.userclip_active,
2021 c->prog_data.outputs_written);
2022
2023 /* First mrf is the g0-based message header containing URB handles and such,
2024 * which is implied in VS_OPCODE_URB_WRITE.
2025 */
2026 mrf++;
2027
2028 if (intel->gen < 6) {
2029 emit_ndc_computation();
2030 }
2031
2032 /* Set up the VUE data for the first URB write */
2033 int slot;
2034 for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
2035 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2036
2037 /* If this was max_usable_mrf, we can't fit anything more into this URB
2038 * WRITE.
2039 */
2040 if (mrf > max_usable_mrf) {
2041 slot++;
2042 break;
2043 }
2044 }
2045
2046 current_annotation = "URB write";
2047 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2048 inst->base_mrf = base_mrf;
2049 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2050 inst->eot = (slot >= c->vue_map.num_slots);
2051
2052 /* Optional second URB write */
2053 if (!inst->eot) {
2054 mrf = base_mrf + 1;
2055
2056 for (; slot < c->vue_map.num_slots; ++slot) {
2057 assert(mrf < max_usable_mrf);
2058
2059 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2060 }
2061
2062 current_annotation = "URB write";
2063 inst = emit(VS_OPCODE_URB_WRITE);
2064 inst->base_mrf = base_mrf;
2065 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2066 inst->eot = true;
2067 /* URB destination offset.  The previous write used MRFs 1-13; minus
2068 * the one header MRF, that's 12 data regs.  URB offset is in URB row
2069 * increments, and each of our MRFs is half of one of those, since
2070 * we're doing interleaved writes.
2071 */
2072 inst->offset = (max_usable_mrf - base_mrf) / 2;
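/* With base_mrf == 1 and max_usable_mrf == 13, that's (13 - 1) / 2 = 6
* URB rows already written by the first message.
*/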
2073 }
2074
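/* Record the VUE size, rounded up to whole allocation blocks: 8 slots
* per block on gen6, 4 per block otherwise.
*/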
2075 if (intel->gen == 6)
2076 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 8) / 8;
2077 else
2078 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 4) / 4;
2079 }
2080
2081 src_reg
2082 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2083 src_reg *reladdr, int reg_offset)
2084 {
2085 /* Because we store the values to scratch interleaved like our
2086 * vertex data, we need to scale the vec4 index by 2.
2087 */
2088 int message_header_scale = 2;
2089
2090 /* Pre-gen6, the message header uses byte offsets instead of vec4
2091 * (16-byte) offset units.
2092 */
2093 if (intel->gen < 6)
2094 message_header_scale *= 16;
2095
2096 if (reladdr) {
2097 src_reg index = src_reg(this, glsl_type::int_type);
2098
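/* index = (reladdr + reg_offset) * message_header_scale */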
2099 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2100 emit_before(inst, MUL(dst_reg(index),
2101 index, src_reg(message_header_scale)));
2102
2103 return index;
2104 } else {
2105 return src_reg(reg_offset * message_header_scale);
2106 }
2107 }
2108
2109 src_reg
2110 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2111 src_reg *reladdr, int reg_offset)
2112 {
2113 if (reladdr) {
2114 src_reg index = src_reg(this, glsl_type::int_type);
2115
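/* index = reladdr + reg_offset */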
2116 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2117
2118 /* Pre-gen6, the message header uses byte offsets instead of vec4
2119 * (16-byte) offset units.
2120 */
2121 if (intel->gen < 6) {
2122 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2123 }
2124
2125 return index;
2126 } else {
2127 int message_header_scale = intel->gen < 6 ? 16 : 1;
2128 return src_reg(reg_offset * message_header_scale);
2129 }
2130 }
2131
2132 /**
2133 * Emits an instruction before @inst to load the value named by @orig_src
2134 * from scratch space at @base_offset to @temp.
2135 */
2136 void
2137 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2138 dst_reg temp, src_reg orig_src,
2139 int base_offset)
2140 {
2141 int reg_offset = base_offset + orig_src.reg_offset;
2142 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2143
2144 emit_before(inst, SCRATCH_READ(temp, index));
2145 }
2146
2147 /**
2148 * Emits an instruction after @inst to store @temp (the value that was
2149 * to be written to @orig_dst) into scratch space at @base_offset.
2150 */
2151 void
2152 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2153 src_reg temp, dst_reg orig_dst,
2154 int base_offset)
2155 {
2156 int reg_offset = base_offset + orig_dst.reg_offset;
2157 src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2158
2159 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2160 orig_dst.writemask));
2161 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2162 write->predicate = inst->predicate;
2163 write->ir = inst->ir;
2164 write->annotation = inst->annotation;
2165 inst->insert_after(write);
2166 }
2167
2168 /**
2169 * We can't generally support array access in GRF space, because a
2170 * single instruction's destination can only span 2 contiguous
2171 * registers. So, we send all GRF arrays that get variable index
2172 * access to scratch space.
2173 */
2174 void
2175 vec4_visitor::move_grf_array_access_to_scratch()
2176 {
2177 int scratch_loc[this->virtual_grf_count];
2178
2179 for (int i = 0; i < this->virtual_grf_count; i++) {
2180 scratch_loc[i] = -1;
2181 }
2182
2183 /* First, calculate the set of virtual GRFs that need to be punted
2184 * to scratch due to having any array access on them, and where in
2185 * scratch.
2186 */
2187 foreach_list(node, &this->instructions) {
2188 vec4_instruction *inst = (vec4_instruction *)node;
2189
2190 if (inst->dst.file == GRF && inst->dst.reladdr &&
2191 scratch_loc[inst->dst.reg] == -1) {
2192 scratch_loc[inst->dst.reg] = c->last_scratch;
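/* Each vec4 slot occupies 8 floats * 4 bytes = 32 bytes of scratch,
* since values are stored interleaved like vertex data (see
* get_scratch_offset()).
*/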
2193 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2194 }
2195
2196 for (int i = 0; i < 3; i++) {
2197 src_reg *src = &inst->src[i];
2198
2199 if (src->file == GRF && src->reladdr &&
2200 scratch_loc[src->reg] == -1) {
2201 scratch_loc[src->reg] = c->last_scratch;
2202 c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2203 }
2204 }
2205 }
2206
2207 /* Now, for anything that will be accessed through scratch, rewrite
2208 * it to load/store. Note that this is a _safe list walk, because
2209 * we may generate a new scratch_write instruction after the one
2210 * we're processing.
2211 */
2212 foreach_list_safe(node, &this->instructions) {
2213 vec4_instruction *inst = (vec4_instruction *)node;
2214
2215 /* Set up the annotation tracking for newly generated instructions. */
2216 base_ir = inst->ir;
2217 current_annotation = inst->annotation;
2218
2219 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2220 src_reg temp = src_reg(this, glsl_type::vec4_type);
2221
2222 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2223
2224 inst->dst.file = temp.file;
2225 inst->dst.reg = temp.reg;
2226 inst->dst.reg_offset = temp.reg_offset;
2227 inst->dst.reladdr = NULL;
2228 }
2229
2230 for (int i = 0; i < 3; i++) {
2231 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2232 continue;
2233
2234 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2235
2236 emit_scratch_read(inst, temp, inst->src[i],
2237 scratch_loc[inst->src[i].reg]);
2238
2239 inst->src[i].file = temp.file;
2240 inst->src[i].reg = temp.reg;
2241 inst->src[i].reg_offset = temp.reg_offset;
2242 inst->src[i].reladdr = NULL;
2243 }
2244 }
2245 }
2246
2247 /**
2248 * Emits an instruction before @inst to load the value named by @orig_src
2249 * from the pull constant buffer (surface) at @base_offset to @temp.
2250 */
2251 void
2252 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2253 dst_reg temp, src_reg orig_src,
2254 int base_offset)
2255 {
2256 int reg_offset = base_offset + orig_src.reg_offset;
2257 src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2258 vec4_instruction *load;
2259
2260 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2261 temp, index);
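/* Use MRF 14, one of the MRFs that emit_urb_writes() leaves free for
* scratch and pull constant reads.
*/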
2262 load->base_mrf = 14;
2263 load->mlen = 1;
2264 emit_before(inst, load);
2265 }
2266
2267 /**
2268 * Implements array access of uniforms by inserting a
2269 * PULL_CONSTANT_LOAD instruction.
2270 *
2271 * Unlike temporary GRF array access (where we don't support it due to
2272 * the difficulty of doing relative addressing on instruction
2273 * destinations), we could potentially do array access of uniforms
2274 * that were loaded in GRF space as push constants. In real-world
2275 * usage we've seen, though, the arrays being used are always larger
2276 * than we could load as push constants, so just always move all
2277 * uniform array access out to a pull constant buffer.
2278 */
2279 void
2280 vec4_visitor::move_uniform_array_access_to_pull_constants()
2281 {
2282 int pull_constant_loc[this->uniforms];
2283
2284 for (int i = 0; i < this->uniforms; i++) {
2285 pull_constant_loc[i] = -1;
2286 }
2287
2288 /* Walk through and find array access of uniforms. Put a copy of that
2289 * uniform in the pull constant buffer.
2290 *
2291 * Note that we don't move constant-indexed accesses to arrays. No
2292 * testing has been done of the performance impact of this choice.
2293 */
2294 foreach_list_safe(node, &this->instructions) {
2295 vec4_instruction *inst = (vec4_instruction *)node;
2296
2297 for (int i = 0; i < 3; i++) {
2298 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2299 continue;
2300
2301 int uniform = inst->src[i].reg;
2302
2303 /* If this array isn't already present in the pull constant buffer,
2304 * add it.
2305 */
2306 if (pull_constant_loc[uniform] == -1) {
2307 const float **values = &prog_data->param[uniform * 4];
2308
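/* nr_pull_params counts floats, but pull constant locations are in
* vec4 units, hence the divide by 4.
*/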
2309 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2310
2311 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2312 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2313 }
2314 }
2315
2316 /* Set up the annotation tracking for newly generated instructions. */
2317 base_ir = inst->ir;
2318 current_annotation = inst->annotation;
2319
2320 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2321
2322 emit_pull_constant_load(inst, temp, inst->src[i],
2323 pull_constant_loc[uniform]);
2324
2325 inst->src[i].file = temp.file;
2326 inst->src[i].reg = temp.reg;
2327 inst->src[i].reg_offset = temp.reg_offset;
2328 inst->src[i].reladdr = NULL;
2329 }
2330 }
2331
2332 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2333 * no need to track them as larger-than-vec4 objects. This will be
2334 * relied on in cutting out unused uniform vectors from push
2335 * constants.
2336 */
2337 split_uniform_registers();
2338 }
2339
2340 void
2341 vec4_visitor::resolve_ud_negate(src_reg *reg)
2342 {
2343 if (reg->type != BRW_REGISTER_TYPE_UD ||
2344 !reg->negate)
2345 return;
2346
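/* Copy the negated value into a temporary and use that instead, so no
* later instruction is left with source negation on an unsigned-dword
* operand.
*/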
2347 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2348 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2349 *reg = temp;
2350 }
2351
2352 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2353 struct gl_shader_program *prog,
2354 struct brw_shader *shader)
2355 {
2356 this->c = c;
2357 this->p = &c->func;
2358 this->brw = p->brw;
2359 this->intel = &brw->intel;
2360 this->ctx = &intel->ctx;
2361 this->prog = prog;
2362 this->shader = shader;
2363
2364 this->mem_ctx = ralloc_context(NULL);
2365 this->failed = false;
2366
2367 this->base_ir = NULL;
2368 this->current_annotation = NULL;
2369
2371 this->vp = (struct gl_vertex_program *)
2372 prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2373 this->prog_data = &c->prog_data;
2374
2375 this->variable_ht = hash_table_ctor(0,
2376 hash_table_pointer_hash,
2377 hash_table_pointer_compare);
2378
2379 this->virtual_grf_def = NULL;
2380 this->virtual_grf_use = NULL;
2381 this->virtual_grf_sizes = NULL;
2382 this->virtual_grf_count = 0;
2383 this->virtual_grf_reg_map = NULL;
2384 this->virtual_grf_reg_count = 0;
2385 this->virtual_grf_array_size = 0;
2386 this->live_intervals_valid = false;
2387
2388 this->uniforms = 0;
2393 }
2394
2395 vec4_visitor::~vec4_visitor()
2396 {
2397 ralloc_free(this->mem_ctx);
2398 hash_table_dtor(this->variable_ht);
2399 }
2400
2401
2402 void
2403 vec4_visitor::fail(const char *format, ...)
2404 {
2405 va_list va;
2406 char *msg;
2407
2408 if (failed)
2409 return;
2410
2411 failed = true;
2412
2413 va_start(va, format);
2414 msg = ralloc_vasprintf(mem_ctx, format, va);
2415 va_end(va);
2416 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2417
2418 this->fail_msg = msg;
2419
2420 if (INTEL_DEBUG & DEBUG_VS) {
2421 fprintf(stderr, "%s", msg);
2422 }
2423 }
2424
2425 } /* namespace brw */