i965/vp: Fix crashes with INTEL_DEBUG=vs.
[mesa.git] / src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/context.h"
27 #include "main/macros.h"
28 #include "program/prog_parameter.h"
29 #include "program/sampler.h"
30 }
31
32 namespace brw {
33
34 vec4_instruction::vec4_instruction(vec4_visitor *v,
35 enum opcode opcode, dst_reg dst,
36 src_reg src0, src_reg src1, src_reg src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->ir = v->base_ir;
44 this->annotation = v->current_annotation;
45 }
46
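/**
 * Appends the given instruction to the end of the instruction stream
 * being built for the current shader.
 */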
47 vec4_instruction *
48 vec4_visitor::emit(vec4_instruction *inst)
49 {
50 this->instructions.push_tail(inst);
51
52 return inst;
53 }
54
55 vec4_instruction *
56 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
57 {
58 new_inst->ir = inst->ir;
59 new_inst->annotation = inst->annotation;
60
61 inst->insert_before(new_inst);
62
63 return inst;
64 }
65
66 vec4_instruction *
67 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
68 src_reg src0, src_reg src1, src_reg src2)
69 {
70 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
71 src0, src1, src2));
72 }
73
74
75 vec4_instruction *
76 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
77 {
78 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
79 }
80
81 vec4_instruction *
82 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
85 }
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
91 }
92
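/* The ALU1/ALU2 macros below generate convenience methods (MOV, ADD, ...)
 * that construct a vec4_instruction without emitting it, so callers can set
 * fields such as predicate or conditional_mod before passing it to emit().
 */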
93 #define ALU1(op) \
94 vec4_instruction * \
95 vec4_visitor::op(dst_reg dst, src_reg src0) \
96 { \
97 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
98 src0); \
99 }
100
101 #define ALU2(op) \
102 vec4_instruction * \
103 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
104 { \
105 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
106 src0, src1); \
107 }
108
109 ALU1(NOT)
110 ALU1(MOV)
111 ALU1(FRC)
112 ALU1(RNDD)
113 ALU1(RNDE)
114 ALU1(RNDZ)
115 ALU2(ADD)
116 ALU2(MUL)
117 ALU2(MACH)
118 ALU2(AND)
119 ALU2(OR)
120 ALU2(XOR)
121 ALU2(DP3)
122 ALU2(DP4)
123 ALU2(DPH)
124 ALU2(SHL)
125 ALU2(SHR)
126 ALU2(ASR)
127
128 /** Gen4 predicated IF. */
129 vec4_instruction *
130 vec4_visitor::IF(uint32_t predicate)
131 {
132 vec4_instruction *inst;
133
134 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
135 inst->predicate = predicate;
136
137 return inst;
138 }
139
140 /** Gen6+ IF with embedded comparison. */
141 vec4_instruction *
142 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
143 {
144 assert(intel->gen >= 6);
145
146 vec4_instruction *inst;
147
148 resolve_ud_negate(&src0);
149 resolve_ud_negate(&src1);
150
151 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
152 src0, src1);
153 inst->conditional_mod = condition;
154
155 return inst;
156 }
157
158 /**
159 * CMP: Sets the low bit of the destination channels with the result
160 * of the comparison, while the upper bits are undefined, and updates
161 * the flag register with the packed 16 bits of the result.
162 */
163 vec4_instruction *
164 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
165 {
166 vec4_instruction *inst;
167
168 /* original gen4 does type conversion to the destination type
169 * before comparison, producing garbage results for floating
170 * point comparisons.
171 */
172 if (intel->gen == 4) {
173 dst.type = src0.type;
174 if (dst.file == HW_REG)
175 dst.fixed_hw_reg.type = dst.type;
176 }
177
178 resolve_ud_negate(&src0);
179 resolve_ud_negate(&src1);
180
181 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
182 inst->conditional_mod = condition;
183
184 return inst;
185 }
186
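/**
 * Builds (but does not emit) a read of one vec4 from the scratch buffer,
 * which holds spilled registers and variably-indexed arrays.  base_mrf and
 * mlen describe the message payload the code generator will assemble.
 */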
187 vec4_instruction *
188 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
189 {
190 vec4_instruction *inst;
191
192 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
193 dst, index);
194 inst->base_mrf = 14;
195 inst->mlen = 2;
196
197 return inst;
198 }
199
200 vec4_instruction *
201 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
202 {
203 vec4_instruction *inst;
204
205 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
206 dst, src, index);
207 inst->base_mrf = 13;
208 inst->mlen = 3;
209
210 return inst;
211 }
212
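/** Emits a dot product of 2-4 components using DP2/DP3/DP4. */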
213 void
214 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
215 {
216 static enum opcode dot_opcodes[] = {
217 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
218 };
219
220 emit(dot_opcodes[elements - 2], dst, src0, src1);
221 }
222
223 void
224 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
225 {
226 /* The gen6 math instruction ignores the source modifiers --
227 * swizzle, abs, negate, and at least some parts of the register
228 * region description.
229 *
230 * While it would seem that this MOV could be avoided at this point
231 * in the case that the swizzle is matched up with the destination
232 * writemask, note that uniform packing and register allocation
233 * could rearrange our swizzle, so let's leave this matter up to
234 * copy propagation later.
235 */
236 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
237 emit(MOV(dst_reg(temp_src), src));
238
239 if (dst.writemask != WRITEMASK_XYZW) {
240 /* The gen6 math instruction must be align1, so we can't do
241 * writemasks.
242 */
243 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
244
245 emit(opcode, temp_dst, temp_src);
246
247 emit(MOV(dst, src_reg(temp_dst)));
248 } else {
249 emit(opcode, dst, temp_src);
250 }
251 }
252
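/* Before gen6, math is a send to the shared math unit, so the instruction
 * also needs a message register (base_mrf) and message length (mlen).
 */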
253 void
254 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
255 {
256 vec4_instruction *inst = emit(opcode, dst, src);
257 inst->base_mrf = 1;
258 inst->mlen = 1;
259 }
260
261 void
262 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
263 {
264 switch (opcode) {
265 case SHADER_OPCODE_RCP:
266 case SHADER_OPCODE_RSQ:
267 case SHADER_OPCODE_SQRT:
268 case SHADER_OPCODE_EXP2:
269 case SHADER_OPCODE_LOG2:
270 case SHADER_OPCODE_SIN:
271 case SHADER_OPCODE_COS:
272 break;
273 default:
274 assert(!"not reached: bad math opcode");
275 return;
276 }
277
278 if (intel->gen >= 7) {
279 emit(opcode, dst, src);
280 } else if (intel->gen == 6) {
281 return emit_math1_gen6(opcode, dst, src);
282 } else {
283 return emit_math1_gen4(opcode, dst, src);
284 }
285 }
286
287 void
288 vec4_visitor::emit_math2_gen6(enum opcode opcode,
289 dst_reg dst, src_reg src0, src_reg src1)
290 {
291 src_reg expanded;
292
293 /* The gen6 math instruction ignores the source modifiers --
294 * swizzle, abs, negate, and at least some parts of the register
295 * region description. Move the sources to temporaries to make it
296 * generally work.
297 */
298
299 expanded = src_reg(this, glsl_type::vec4_type);
300 expanded.type = src0.type;
301 emit(MOV(dst_reg(expanded), src0));
302 src0 = expanded;
303
304 expanded = src_reg(this, glsl_type::vec4_type);
305 expanded.type = src1.type;
306 emit(MOV(dst_reg(expanded), src1));
307 src1 = expanded;
308
309 if (dst.writemask != WRITEMASK_XYZW) {
310 /* The gen6 math instruction must be align1, so we can't do
311 * writemasks.
312 */
313 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
314 temp_dst.type = dst.type;
315
316 emit(opcode, temp_dst, src0, src1);
317
318 emit(MOV(dst, src_reg(temp_dst)));
319 } else {
320 emit(opcode, dst, src0, src1);
321 }
322 }
323
324 void
325 vec4_visitor::emit_math2_gen4(enum opcode opcode,
326 dst_reg dst, src_reg src0, src_reg src1)
327 {
328 vec4_instruction *inst = emit(opcode, dst, src0, src1);
329 inst->base_mrf = 1;
330 inst->mlen = 2;
331 }
332
333 void
334 vec4_visitor::emit_math(enum opcode opcode,
335 dst_reg dst, src_reg src0, src_reg src1)
336 {
337 switch (opcode) {
338 case SHADER_OPCODE_POW:
339 case SHADER_OPCODE_INT_QUOTIENT:
340 case SHADER_OPCODE_INT_REMAINDER:
341 break;
342 default:
343 assert(!"not reached: unsupported binary math opcode");
344 return;
345 }
346
347 if (intel->gen >= 7) {
348 emit(opcode, dst, src0, src1);
349 } else if (intel->gen == 6) {
350 return emit_math2_gen6(opcode, dst, src0, src1);
351 } else {
352 return emit_math2_gen4(opcode, dst, src0, src1);
353 }
354 }
355
356 void
357 vec4_visitor::visit_instructions(const exec_list *list)
358 {
359 foreach_list(node, list) {
360 ir_instruction *ir = (ir_instruction *)node;
361
362 base_ir = ir;
363 ir->accept(this);
364 }
365 }
366
367
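/**
 * Returns the number of vec4-sized slots needed to store a value of the
 * given GLSL type.
 */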
368 static int
369 type_size(const struct glsl_type *type)
370 {
371 unsigned int i;
372 int size;
373
374 switch (type->base_type) {
375 case GLSL_TYPE_UINT:
376 case GLSL_TYPE_INT:
377 case GLSL_TYPE_FLOAT:
378 case GLSL_TYPE_BOOL:
379 if (type->is_matrix()) {
380 return type->matrix_columns;
381 } else {
382 /* Regardless of size of vector, it gets a vec4. This is bad
383 * packing for things like floats, but otherwise arrays become a
384 * mess. Hopefully a later pass over the code can pack scalars
385 * down if appropriate.
386 */
387 return 1;
388 }
389 case GLSL_TYPE_ARRAY:
390 assert(type->length > 0);
391 return type_size(type->fields.array) * type->length;
392 case GLSL_TYPE_STRUCT:
393 size = 0;
394 for (i = 0; i < type->length; i++) {
395 size += type_size(type->fields.structure[i].type);
396 }
397 return size;
398 case GLSL_TYPE_SAMPLER:
399 /* Samplers take up one slot in UNIFORMS[], but they're baked in
400 * at link time.
401 */
402 return 1;
403 default:
404 assert(0);
405 return 0;
406 }
407 }
408
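/**
 * Allocates a new virtual GRF of \p size vec4 slots and returns its index,
 * growing the size and register-map tracking arrays as needed.
 */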
409 int
410 vec4_visitor::virtual_grf_alloc(int size)
411 {
412 if (virtual_grf_array_size <= virtual_grf_count) {
413 if (virtual_grf_array_size == 0)
414 virtual_grf_array_size = 16;
415 else
416 virtual_grf_array_size *= 2;
417 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
418 virtual_grf_array_size);
419 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
420 virtual_grf_array_size);
421 }
422 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
423 virtual_grf_reg_count += size;
424 virtual_grf_sizes[virtual_grf_count] = size;
425 return virtual_grf_count++;
426 }
427
428 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
429 {
430 init();
431
432 this->file = GRF;
433 this->reg = v->virtual_grf_alloc(type_size(type));
434
435 if (type->is_array() || type->is_record()) {
436 this->swizzle = BRW_SWIZZLE_NOOP;
437 } else {
438 this->swizzle = swizzle_for_size(type->vector_elements);
439 }
440
441 this->type = brw_type_for_base_type(type);
442 }
443
444 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
445 {
446 init();
447
448 this->file = GRF;
449 this->reg = v->virtual_grf_alloc(type_size(type));
450
451 if (type->is_array() || type->is_record()) {
452 this->writemask = WRITEMASK_XYZW;
453 } else {
454 this->writemask = (1 << type->vector_elements) - 1;
455 }
456
457 this->type = brw_type_for_base_type(type);
458 }
459
460 /* Our support for uniforms is piggy-backed on the vertex program's
461  * parameter list (this->vp->Base.Parameters), because that's where the
462  * values actually get stored, rather than in some global
463  * gl_shader_program uniform store.
464  */
465 int
466 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
467 {
468 unsigned int offset = 0;
469 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
470
471 if (type->is_matrix()) {
472 const glsl_type *column = type->column_type();
473
474 for (unsigned int i = 0; i < type->matrix_columns; i++) {
475 offset += setup_uniform_values(loc + offset, column);
476 }
477
478 return offset;
479 }
480
481 switch (type->base_type) {
482 case GLSL_TYPE_FLOAT:
483 case GLSL_TYPE_UINT:
484 case GLSL_TYPE_INT:
485 case GLSL_TYPE_BOOL:
486 for (unsigned int i = 0; i < type->vector_elements; i++) {
487 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
488 }
489
490 /* Set up pad elements to get things aligned to a vec4 boundary. */
491 for (unsigned int i = type->vector_elements; i < 4; i++) {
492 static float zero = 0;
493
494 c->prog_data.param[this->uniforms * 4 + i] = &zero;
495 }
496
497 /* Track the size of this uniform vector, for future packing of
498 * uniforms.
499 */
500 this->uniform_vector_size[this->uniforms] = type->vector_elements;
501 this->uniforms++;
502
503 return 1;
504
505 case GLSL_TYPE_STRUCT:
506 for (unsigned int i = 0; i < type->length; i++) {
507 offset += setup_uniform_values(loc + offset,
508 type->fields.structure[i].type);
509 }
510 return offset;
511
512 case GLSL_TYPE_ARRAY:
513 for (unsigned int i = 0; i < type->length; i++) {
514 offset += setup_uniform_values(loc + offset, type->fields.array);
515 }
516 return offset;
517
518 case GLSL_TYPE_SAMPLER:
519 /* The sampler takes up a slot, but we don't use any values from it. */
520 return 1;
521
522 default:
523 assert(!"not reached");
524 return 0;
525 }
526 }
527
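/**
 * Sets up uniform slots that alias the GL user clip plane values and
 * records them in this->userplane[] for use by the clipping code.
 */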
528 void
529 vec4_visitor::setup_uniform_clipplane_values()
530 {
531 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
532
533 if (intel->gen < 6) {
534 /* Pre-Gen6, we compact clip planes. For example, if the user
535 * enables just clip planes 0, 1, and 3, we will enable clip planes
536 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
537 * plane 2. This simplifies the implementation of the Gen6 clip
538 * thread.
539 */
540 int compacted_clipplane_index = 0;
541 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
542 if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
543 continue;
544
545 this->uniform_vector_size[this->uniforms] = 4;
546 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
547 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
548 for (int j = 0; j < 4; ++j) {
549 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
550 }
551 ++compacted_clipplane_index;
552 ++this->uniforms;
553 }
554 } else {
555 /* In Gen6 and later, we don't compact clip planes, because this
556 * simplifies the implementation of gl_ClipDistance.
557 */
558 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
559 this->uniform_vector_size[this->uniforms] = 4;
560 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
561 this->userplane[i].type = BRW_REGISTER_TYPE_F;
562 for (int j = 0; j < 4; ++j) {
563 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
564 }
565 ++this->uniforms;
566 }
567 }
568 }
569
570 /* Our support for builtin uniforms is even scarier than non-builtin.
571 * It sits on top of the PROG_STATE_VAR parameters that are
572 * automatically updated from GL context state.
573 */
574 void
575 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
576 {
577 const ir_state_slot *const slots = ir->state_slots;
578 assert(ir->state_slots != NULL);
579
580 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
581 /* This state reference has already been setup by ir_to_mesa,
582 * but we'll get the same index back here. We can reference
583 * ParameterValues directly, since unlike brw_fs.cpp, we never
584 * add new state references during compile.
585 */
586 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
587 (gl_state_index *)slots[i].tokens);
588 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
589
590 this->uniform_vector_size[this->uniforms] = 0;
591 /* Add each of the unique swizzled channels of the element.
592 * This will end up matching the size of the glsl_type of this field.
593 */
594 int last_swiz = -1;
595 for (unsigned int j = 0; j < 4; j++) {
596 int swiz = GET_SWZ(slots[i].swizzle, j);
597 last_swiz = swiz;
598
599 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
600 if (swiz <= last_swiz)
601 this->uniform_vector_size[this->uniforms]++;
602 }
603 this->uniforms++;
604 }
605 }
606
607 dst_reg *
608 vec4_visitor::variable_storage(ir_variable *var)
609 {
610 return (dst_reg *)hash_table_find(this->variable_ht, var);
611 }
612
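/**
 * Evaluates a boolean rvalue so that the flag register holds its result,
 * and returns in *predicate how a following instruction should be
 * predicated on it (including the ALL4H/ANY4H cases for vector compares).
 */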
613 void
614 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
615 {
616 ir_expression *expr = ir->as_expression();
617
618 *predicate = BRW_PREDICATE_NORMAL;
619
620 if (expr) {
621 src_reg op[2];
622 vec4_instruction *inst;
623
624 assert(expr->get_num_operands() <= 2);
625 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
626 expr->operands[i]->accept(this);
627 op[i] = this->result;
628
629 resolve_ud_negate(&op[i]);
630 }
631
632 switch (expr->operation) {
633 case ir_unop_logic_not:
634 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
635 inst->conditional_mod = BRW_CONDITIONAL_Z;
636 break;
637
638 case ir_binop_logic_xor:
639 inst = emit(XOR(dst_null_d(), op[0], op[1]));
640 inst->conditional_mod = BRW_CONDITIONAL_NZ;
641 break;
642
643 case ir_binop_logic_or:
644 inst = emit(OR(dst_null_d(), op[0], op[1]));
645 inst->conditional_mod = BRW_CONDITIONAL_NZ;
646 break;
647
648 case ir_binop_logic_and:
649 inst = emit(AND(dst_null_d(), op[0], op[1]));
650 inst->conditional_mod = BRW_CONDITIONAL_NZ;
651 break;
652
653 case ir_unop_f2b:
654 if (intel->gen >= 6) {
655 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
656 } else {
657 inst = emit(MOV(dst_null_f(), op[0]));
658 inst->conditional_mod = BRW_CONDITIONAL_NZ;
659 }
660 break;
661
662 case ir_unop_i2b:
663 if (intel->gen >= 6) {
664 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
665 } else {
666 inst = emit(MOV(dst_null_d(), op[0]));
667 inst->conditional_mod = BRW_CONDITIONAL_NZ;
668 }
669 break;
670
671 case ir_binop_all_equal:
672 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
673 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
674 break;
675
676 case ir_binop_any_nequal:
677 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
678 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
679 break;
680
681 case ir_unop_any:
682 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
683 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
684 break;
685
686 case ir_binop_greater:
687 case ir_binop_gequal:
688 case ir_binop_less:
689 case ir_binop_lequal:
690 case ir_binop_equal:
691 case ir_binop_nequal:
692 emit(CMP(dst_null_d(), op[0], op[1],
693 brw_conditional_for_comparison(expr->operation)));
694 break;
695
696 default:
697 assert(!"not reached");
698 break;
699 }
700 return;
701 }
702
703 ir->accept(this);
704
705 resolve_ud_negate(&this->result);
706
707 if (intel->gen >= 6) {
708 vec4_instruction *inst = emit(AND(dst_null_d(),
709 this->result, src_reg(1)));
710 inst->conditional_mod = BRW_CONDITIONAL_NZ;
711 } else {
712 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
713 inst->conditional_mod = BRW_CONDITIONAL_NZ;
714 }
715 }
716
717 /**
718 * Emit a gen6 IF statement with the comparison folded into the IF
719 * instruction.
720 */
721 void
722 vec4_visitor::emit_if_gen6(ir_if *ir)
723 {
724 ir_expression *expr = ir->condition->as_expression();
725
726 if (expr) {
727 src_reg op[2];
728 dst_reg temp;
729
730 assert(expr->get_num_operands() <= 2);
731 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
732 expr->operands[i]->accept(this);
733 op[i] = this->result;
734 }
735
736 switch (expr->operation) {
737 case ir_unop_logic_not:
738 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
739 return;
740
741 case ir_binop_logic_xor:
742 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
743 return;
744
745 case ir_binop_logic_or:
746 temp = dst_reg(this, glsl_type::bool_type);
747 emit(OR(temp, op[0], op[1]));
748 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
749 return;
750
751 case ir_binop_logic_and:
752 temp = dst_reg(this, glsl_type::bool_type);
753 emit(AND(temp, op[0], op[1]));
754 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
755 return;
756
757 case ir_unop_f2b:
758 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
759 return;
760
761 case ir_unop_i2b:
762 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
763 return;
764
765 case ir_binop_greater:
766 case ir_binop_gequal:
767 case ir_binop_less:
768 case ir_binop_lequal:
769 case ir_binop_equal:
770 case ir_binop_nequal:
771 emit(IF(op[0], op[1],
772 brw_conditional_for_comparison(expr->operation)));
773 return;
774
775 case ir_binop_all_equal:
776 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
777 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
778 return;
779
780 case ir_binop_any_nequal:
781 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
782 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
783 return;
784
785 case ir_unop_any:
786 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
787 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
788 return;
789
790 default:
791 assert(!"not reached");
792 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
793 return;
794 }
795 return;
796 }
797
798 ir->condition->accept(this);
799
800 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
801 }
802
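/** Returns a copy of \p r with its writemask replaced by \p mask. */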
803 static dst_reg
804 with_writemask(dst_reg const & r, int mask)
805 {
806 dst_reg result = r;
807 result.writemask = mask;
808 return result;
809 }
810
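/**
 * Emits per-attribute workarounds (GL_FIXED scaling, 2101010 sign recovery,
 * BGRA swizzling, normalization and scaling) that the vertex fetcher can't
 * apply itself, according to the key's gl_attrib_wa_flags.
 */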
811 void
812 vec4_visitor::emit_attribute_fixups()
813 {
814 dst_reg sign_recovery_shift;
815 dst_reg normalize_factor;
816 dst_reg es3_normalize_factor;
817
818 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
819 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
820 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
821 dst_reg reg(ATTR, i);
822 dst_reg reg_d = reg;
823 reg_d.type = BRW_REGISTER_TYPE_D;
824 dst_reg reg_ud = reg;
825 reg_ud.type = BRW_REGISTER_TYPE_UD;
826
827 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
828 * come in as floating point conversions of the integer values.
829 */
830 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
831 dst_reg dst = reg;
832 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
833 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
834 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
835 }
836
837 /* Do sign recovery for 2101010 formats if required. */
838 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
839 if (sign_recovery_shift.file == BAD_FILE) {
840 /* shift constant: <22,22,22,30> */
841 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
842 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
843 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
844 }
845
846 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
847 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
848 }
849
850 /* Apply BGRA swizzle if required. */
851 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
852 src_reg temp = src_reg(reg);
853 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
854 emit(MOV(reg, temp));
855 }
856
857 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
858 /* ES 3.0 has different rules for converting signed normalized
859 * fixed-point numbers than desktop GL.
860 */
861 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
862 /* According to equation 2.2 of the ES 3.0 specification,
863 * signed normalization conversion is done by:
864 *
865 * f = c / (2^(b-1)-1)
866 */
867 if (es3_normalize_factor.file == BAD_FILE) {
868 /* mul constant: 1 / (2^(b-1) - 1) */
869 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
870 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
871 src_reg(1.0f / ((1<<9) - 1))));
872 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
873 src_reg(1.0f / ((1<<1) - 1))));
874 }
875
876 dst_reg dst = reg;
877 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
878 emit(MOV(dst, src_reg(reg_d)));
879 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
880 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
881 } else {
882 /* The following equations are from the OpenGL 3.2 specification:
883 *
884 * 2.1 unsigned normalization
885 * f = c/(2^n-1)
886 *
887 * 2.2 signed normalization
888 * f = (2c+1)/(2^n-1)
889 *
890 * Both of these share a common divisor, which is represented by
891 * "normalize_factor" in the code below.
892 */
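/* For example, a signed 10-bit component c = 511 becomes
 * (2*511 + 1) / (2^10 - 1) = 1023/1023 = 1.0.
 */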
893 if (normalize_factor.file == BAD_FILE) {
894 /* 1 / (2^b - 1) for b=<10,10,10,2> */
895 normalize_factor = dst_reg(this, glsl_type::vec4_type);
896 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
897 src_reg(1.0f / ((1<<10) - 1))));
898 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
899 src_reg(1.0f / ((1<<2) - 1))));
900 }
901
902 dst_reg dst = reg;
903 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
904 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
905
906 /* For signed normalization, we want the numerator to be 2c+1. */
907 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
908 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
909 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
910 }
911
912 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
913 }
914 }
915
916 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
917 dst_reg dst = reg;
918 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
919 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
920 }
921 }
922 }
923 }
924
925 void
926 vec4_visitor::visit(ir_variable *ir)
927 {
928 dst_reg *reg = NULL;
929
930 if (variable_storage(ir))
931 return;
932
933 switch (ir->mode) {
934 case ir_var_in:
935 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
936 break;
937
938 case ir_var_out:
939 reg = new(mem_ctx) dst_reg(this, ir->type);
940
941 for (int i = 0; i < type_size(ir->type); i++) {
942 output_reg[ir->location + i] = *reg;
943 output_reg[ir->location + i].reg_offset = i;
944 output_reg[ir->location + i].type =
945 brw_type_for_base_type(ir->type->get_scalar_type());
946 output_reg_annotation[ir->location + i] = ir->name;
947 }
948 break;
949
950 case ir_var_auto:
951 case ir_var_temporary:
952 reg = new(mem_ctx) dst_reg(this, ir->type);
953 break;
954
955 case ir_var_uniform:
956 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
957
958 /* Thanks to the lower_ubo_reference pass, we will see only
959 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
960 * variables, so no need for them to be in variable_ht.
961 */
962 if (ir->uniform_block != -1)
963 return;
964
965 /* Track how big the whole uniform variable is, in case we need to put a
966 * copy of its data into pull constants for array access.
967 */
968 this->uniform_size[this->uniforms] = type_size(ir->type);
969
970 if (!strncmp(ir->name, "gl_", 3)) {
971 setup_builtin_uniform_values(ir);
972 } else {
973 setup_uniform_values(ir->location, ir->type);
974 }
975 break;
976
977 case ir_var_system_value:
978 /* VertexID is stored by the VF as the last vertex element, but
979 * we don't represent it with a flag in inputs_read, so we call
980 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
981 */
982 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
983 prog_data->uses_vertexid = true;
984
985 switch (ir->location) {
986 case SYSTEM_VALUE_VERTEX_ID:
987 reg->writemask = WRITEMASK_X;
988 break;
989 case SYSTEM_VALUE_INSTANCE_ID:
990 reg->writemask = WRITEMASK_Y;
991 break;
992 default:
993 assert(!"not reached");
994 break;
995 }
996 break;
997
998 default:
999 assert(!"not reached");
1000 }
1001
1002 reg->type = brw_type_for_base_type(ir->type);
1003 hash_table_insert(this->variable_ht, reg, ir);
1004 }
1005
1006 void
1007 vec4_visitor::visit(ir_loop *ir)
1008 {
1009 dst_reg counter;
1010
1011 /* We don't want debugging output to print the whole body of the
1012 * loop as the annotation.
1013 */
1014 this->base_ir = NULL;
1015
1016 if (ir->counter != NULL) {
1017 this->base_ir = ir->counter;
1018 ir->counter->accept(this);
1019 counter = *(variable_storage(ir->counter));
1020
1021 if (ir->from != NULL) {
1022 this->base_ir = ir->from;
1023 ir->from->accept(this);
1024
1025 emit(MOV(counter, this->result));
1026 }
1027 }
1028
1029 emit(BRW_OPCODE_DO);
1030
1031 if (ir->to) {
1032 this->base_ir = ir->to;
1033 ir->to->accept(this);
1034
1035 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1036 brw_conditional_for_comparison(ir->cmp)));
1037
1038 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1039 inst->predicate = BRW_PREDICATE_NORMAL;
1040 }
1041
1042 visit_instructions(&ir->body_instructions);
1043
1044
1045 if (ir->increment) {
1046 this->base_ir = ir->increment;
1047 ir->increment->accept(this);
1048 emit(ADD(counter, src_reg(counter), this->result));
1049 }
1050
1051 emit(BRW_OPCODE_WHILE);
1052 }
1053
1054 void
1055 vec4_visitor::visit(ir_loop_jump *ir)
1056 {
1057 switch (ir->mode) {
1058 case ir_loop_jump::jump_break:
1059 emit(BRW_OPCODE_BREAK);
1060 break;
1061 case ir_loop_jump::jump_continue:
1062 emit(BRW_OPCODE_CONTINUE);
1063 break;
1064 }
1065 }
1066
1067
1068 void
1069 vec4_visitor::visit(ir_function_signature *ir)
1070 {
1071 assert(0);
1072 (void)ir;
1073 }
1074
1075 void
1076 vec4_visitor::visit(ir_function *ir)
1077 {
1078 /* Ignore function bodies other than main() -- we shouldn't see calls to
1079 * them since they should all be inlined.
1080 */
1081 if (strcmp(ir->name, "main") == 0) {
1082 const ir_function_signature *sig;
1083 exec_list empty;
1084
1085 sig = ir->matching_signature(&empty);
1086
1087 assert(sig);
1088
1089 visit_instructions(&sig->body);
1090 }
1091 }
1092
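/**
 * If this expression is just a saturate of another rvalue, emits it as a
 * saturating MOV and returns true so the caller can skip normal codegen.
 */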
1093 bool
1094 vec4_visitor::try_emit_sat(ir_expression *ir)
1095 {
1096 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1097 if (!sat_src)
1098 return false;
1099
1100 sat_src->accept(this);
1101 src_reg src = this->result;
1102
1103 this->result = src_reg(this, ir->type);
1104 vec4_instruction *inst;
1105 inst = emit(MOV(dst_reg(this->result), src));
1106 inst->saturate = true;
1107
1108 return true;
1109 }
1110
1111 void
1112 vec4_visitor::emit_bool_comparison(unsigned int op,
1113 dst_reg dst, src_reg src0, src_reg src1)
1114 {
1115 /* original gen4 does destination conversion before comparison. */
1116 if (intel->gen < 5)
1117 dst.type = src0.type;
1118
1119 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1120
1121 dst.type = BRW_REGISTER_TYPE_D;
1122 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1123 }
1124
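/* On gen6+, MIN/MAX is a single SEL with a conditional mod; earlier
 * generations need a CMP to set the flag followed by a predicated SEL.
 */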
1125 void
1126 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1127 src_reg src0, src_reg src1)
1128 {
1129 vec4_instruction *inst;
1130
1131 if (intel->gen >= 6) {
1132 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1133 inst->conditional_mod = conditionalmod;
1134 } else {
1135 emit(CMP(dst, src0, src1, conditionalmod));
1136
1137 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1138 inst->predicate = BRW_PREDICATE_NORMAL;
1139 }
1140 }
1141
1142 void
1143 vec4_visitor::visit(ir_expression *ir)
1144 {
1145 unsigned int operand;
1146 src_reg op[Elements(ir->operands)];
1147 src_reg result_src;
1148 dst_reg result_dst;
1149 vec4_instruction *inst;
1150
1151 if (try_emit_sat(ir))
1152 return;
1153
1154 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1155 this->result.file = BAD_FILE;
1156 ir->operands[operand]->accept(this);
1157 if (this->result.file == BAD_FILE) {
1158 printf("Failed to get tree for expression operand:\n");
1159 ir->operands[operand]->print();
1160 exit(1);
1161 }
1162 op[operand] = this->result;
1163
1164 /* Matrix expression operands should have been broken down to vector
1165 * operations already.
1166 */
1167 assert(!ir->operands[operand]->type->is_matrix());
1168 }
1169
1170 int vector_elements = ir->operands[0]->type->vector_elements;
1171 if (ir->operands[1]) {
1172 vector_elements = MAX2(vector_elements,
1173 ir->operands[1]->type->vector_elements);
1174 }
1175
1176 this->result.file = BAD_FILE;
1177
1178 /* Storage for our result. Ideally for an assignment we'd be using
1179 * the actual storage for the result here, instead.
1180 */
1181 result_src = src_reg(this, ir->type);
1182 /* convenience for the emit functions below. */
1183 result_dst = dst_reg(result_src);
1184 /* If nothing special happens, this is the result. */
1185 this->result = result_src;
1186 /* Limit writes to the channels that will be used by result_src later.
1187 * This does limit this temp's use as a temporary for multi-instruction
1188 * sequences.
1189 */
1190 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1191
1192 switch (ir->operation) {
1193 case ir_unop_logic_not:
1194 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes
1195  * the ones' complement of the whole register, not just bit 0.
1196 */
1197 emit(XOR(result_dst, op[0], src_reg(1)));
1198 break;
1199 case ir_unop_neg:
1200 op[0].negate = !op[0].negate;
1201 this->result = op[0];
1202 break;
1203 case ir_unop_abs:
1204 op[0].abs = true;
1205 op[0].negate = false;
1206 this->result = op[0];
1207 break;
1208
1209 case ir_unop_sign:
1210 emit(MOV(result_dst, src_reg(0.0f)));
1211
1212 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1213 inst = emit(MOV(result_dst, src_reg(1.0f)));
1214 inst->predicate = BRW_PREDICATE_NORMAL;
1215
1216 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1217 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1218 inst->predicate = BRW_PREDICATE_NORMAL;
1219
1220 break;
1221
1222 case ir_unop_rcp:
1223 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1224 break;
1225
1226 case ir_unop_exp2:
1227 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1228 break;
1229 case ir_unop_log2:
1230 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1231 break;
1232 case ir_unop_exp:
1233 case ir_unop_log:
1234 assert(!"not reached: should be handled by ir_explog_to_explog2");
1235 break;
1236 case ir_unop_sin:
1237 case ir_unop_sin_reduced:
1238 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1239 break;
1240 case ir_unop_cos:
1241 case ir_unop_cos_reduced:
1242 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1243 break;
1244
1245 case ir_unop_dFdx:
1246 case ir_unop_dFdy:
1247 assert(!"derivatives not valid in vertex shader");
1248 break;
1249
1250 case ir_unop_noise:
1251 assert(!"not reached: should be handled by lower_noise");
1252 break;
1253
1254 case ir_binop_add:
1255 emit(ADD(result_dst, op[0], op[1]));
1256 break;
1257 case ir_binop_sub:
1258 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1259 break;
1260
1261 case ir_binop_mul:
1262 if (ir->type->is_integer()) {
1263 /* For integer multiplication, the MUL uses the low 16 bits
1264 * of one of the operands (src0 on gen6, src1 on gen7). The
1265 * MACH accumulates in the contribution of the upper 16 bits
1266 * of that operand.
1267 *
1268 * FINISHME: Emit just the MUL if we know an operand is small
1269 * enough.
1270 */
1271 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1272
1273 emit(MUL(acc, op[0], op[1]));
1274 emit(MACH(dst_null_d(), op[0], op[1]));
1275 emit(MOV(result_dst, src_reg(acc)));
1276 } else {
1277 emit(MUL(result_dst, op[0], op[1]));
1278 }
1279 break;
1280 case ir_binop_div:
1281 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1282 assert(ir->type->is_integer());
1283 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1284 break;
1285 case ir_binop_mod:
1286 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1287 assert(ir->type->is_integer());
1288 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1289 break;
1290
1291 case ir_binop_less:
1292 case ir_binop_greater:
1293 case ir_binop_lequal:
1294 case ir_binop_gequal:
1295 case ir_binop_equal:
1296 case ir_binop_nequal: {
1297 emit(CMP(result_dst, op[0], op[1],
1298 brw_conditional_for_comparison(ir->operation)));
1299 emit(AND(result_dst, result_src, src_reg(0x1)));
1300 break;
1301 }
1302
1303 case ir_binop_all_equal:
1304 /* "==" operator producing a scalar boolean. */
1305 if (ir->operands[0]->type->is_vector() ||
1306 ir->operands[1]->type->is_vector()) {
1307 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1308 emit(MOV(result_dst, src_reg(0)));
1309 inst = emit(MOV(result_dst, src_reg(1)));
1310 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1311 } else {
1312 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1313 emit(AND(result_dst, result_src, src_reg(0x1)));
1314 }
1315 break;
1316 case ir_binop_any_nequal:
1317 /* "!=" operator producing a scalar boolean. */
1318 if (ir->operands[0]->type->is_vector() ||
1319 ir->operands[1]->type->is_vector()) {
1320 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1321
1322 emit(MOV(result_dst, src_reg(0)));
1323 inst = emit(MOV(result_dst, src_reg(1)));
1324 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1325 } else {
1326 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1327 emit(AND(result_dst, result_src, src_reg(0x1)));
1328 }
1329 break;
1330
1331 case ir_unop_any:
1332 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1333 emit(MOV(result_dst, src_reg(0)));
1334
1335 inst = emit(MOV(result_dst, src_reg(1)));
1336 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1337 break;
1338
1339 case ir_binop_logic_xor:
1340 emit(XOR(result_dst, op[0], op[1]));
1341 break;
1342
1343 case ir_binop_logic_or:
1344 emit(OR(result_dst, op[0], op[1]));
1345 break;
1346
1347 case ir_binop_logic_and:
1348 emit(AND(result_dst, op[0], op[1]));
1349 break;
1350
1351 case ir_binop_dot:
1352 assert(ir->operands[0]->type->is_vector());
1353 assert(ir->operands[0]->type == ir->operands[1]->type);
1354 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1355 break;
1356
1357 case ir_unop_sqrt:
1358 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1359 break;
1360 case ir_unop_rsq:
1361 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1362 break;
1363
1364 case ir_unop_bitcast_i2f:
1365 case ir_unop_bitcast_u2f:
1366 this->result = op[0];
1367 this->result.type = BRW_REGISTER_TYPE_F;
1368 break;
1369
1370 case ir_unop_bitcast_f2i:
1371 this->result = op[0];
1372 this->result.type = BRW_REGISTER_TYPE_D;
1373 break;
1374
1375 case ir_unop_bitcast_f2u:
1376 this->result = op[0];
1377 this->result.type = BRW_REGISTER_TYPE_UD;
1378 break;
1379
1380 case ir_unop_i2f:
1381 case ir_unop_i2u:
1382 case ir_unop_u2i:
1383 case ir_unop_u2f:
1384 case ir_unop_b2f:
1385 case ir_unop_b2i:
1386 case ir_unop_f2i:
1387 case ir_unop_f2u:
1388 emit(MOV(result_dst, op[0]));
1389 break;
1390 case ir_unop_f2b:
1391 case ir_unop_i2b: {
1392 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1393 emit(AND(result_dst, result_src, src_reg(1)));
1394 break;
1395 }
1396
1397 case ir_unop_trunc:
1398 emit(RNDZ(result_dst, op[0]));
1399 break;
1400 case ir_unop_ceil:
1401 op[0].negate = !op[0].negate;
1402 inst = emit(RNDD(result_dst, op[0]));
1403 this->result.negate = true;
1404 break;
1405 case ir_unop_floor:
1406 inst = emit(RNDD(result_dst, op[0]));
1407 break;
1408 case ir_unop_fract:
1409 inst = emit(FRC(result_dst, op[0]));
1410 break;
1411 case ir_unop_round_even:
1412 emit(RNDE(result_dst, op[0]));
1413 break;
1414
1415 case ir_binop_min:
1416 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1417 break;
1418 case ir_binop_max:
1419 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1420 break;
1421
1422 case ir_binop_pow:
1423 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1424 break;
1425
1426 case ir_unop_bit_not:
1427 inst = emit(NOT(result_dst, op[0]));
1428 break;
1429 case ir_binop_bit_and:
1430 inst = emit(AND(result_dst, op[0], op[1]));
1431 break;
1432 case ir_binop_bit_xor:
1433 inst = emit(XOR(result_dst, op[0], op[1]));
1434 break;
1435 case ir_binop_bit_or:
1436 inst = emit(OR(result_dst, op[0], op[1]));
1437 break;
1438
1439 case ir_binop_lshift:
1440 inst = emit(SHL(result_dst, op[0], op[1]));
1441 break;
1442
1443 case ir_binop_rshift:
1444 if (ir->type->base_type == GLSL_TYPE_INT)
1445 inst = emit(ASR(result_dst, op[0], op[1]));
1446 else
1447 inst = emit(SHR(result_dst, op[0], op[1]));
1448 break;
1449
1450 case ir_binop_ubo_load: {
1451 ir_constant *uniform_block = ir->operands[0]->as_constant();
1452 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1453 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1454 src_reg offset = op[1];
1455
1456 /* Now, load the vector from that offset. */
1457 assert(ir->type->is_vector() || ir->type->is_scalar());
1458
1459 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1460 packed_consts.type = result.type;
1461 src_reg surf_index =
1462 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1463 if (const_offset_ir) {
1464 offset = src_reg(const_offset / 16);
1465 } else {
1466 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1467 }
1468
1469 vec4_instruction *pull =
1470 emit(new(mem_ctx) vec4_instruction(this,
1471 VS_OPCODE_PULL_CONSTANT_LOAD,
1472 dst_reg(packed_consts),
1473 surf_index,
1474 offset));
1475 pull->base_mrf = 14;
1476 pull->mlen = 1;
1477
1478 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1479 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1480 const_offset % 16 / 4,
1481 const_offset % 16 / 4,
1482 const_offset % 16 / 4);
1483
1484 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1485 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1486 emit(CMP(result_dst, packed_consts, src_reg(0u),
1487 BRW_CONDITIONAL_NZ));
1488 emit(AND(result_dst, result, src_reg(0x1)));
1489 } else {
1490 emit(MOV(result_dst, packed_consts));
1491 }
1492 break;
1493 }
1494
1495 case ir_quadop_vector:
1496 assert(!"not reached: should be handled by lower_quadop_vector");
1497 break;
1498 }
1499 }
1500
1501
1502 void
1503 vec4_visitor::visit(ir_swizzle *ir)
1504 {
1505 src_reg src;
1506 int i = 0;
1507 int swizzle[4];
1508
1509 /* Note that this is only swizzles in expressions, not those on the left
1510 * hand side of an assignment, which do write masking. See ir_assignment
1511 * for that.
1512 */
1513
1514 ir->val->accept(this);
1515 src = this->result;
1516 assert(src.file != BAD_FILE);
1517
1518 for (i = 0; i < ir->type->vector_elements; i++) {
1519 switch (i) {
1520 case 0:
1521 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1522 break;
1523 case 1:
1524 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1525 break;
1526 case 2:
1527 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1528 break;
1529 case 3:
1530 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1531 break;
1532 }
1533 }
1534 for (; i < 4; i++) {
1535 /* Replicate the last channel out. */
1536 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1537 }
1538
1539 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1540
1541 this->result = src;
1542 }
1543
1544 void
1545 vec4_visitor::visit(ir_dereference_variable *ir)
1546 {
1547 const struct glsl_type *type = ir->type;
1548 dst_reg *reg = variable_storage(ir->var);
1549
1550 if (!reg) {
1551 fail("Failed to find variable storage for %s\n", ir->var->name);
1552 this->result = src_reg(brw_null_reg());
1553 return;
1554 }
1555
1556 this->result = src_reg(*reg);
1557
1558 /* System values get their swizzle from the dst_reg writemask */
1559 if (ir->var->mode == ir_var_system_value)
1560 return;
1561
1562 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1563 this->result.swizzle = swizzle_for_size(type->vector_elements);
1564 }
1565
1566 void
1567 vec4_visitor::visit(ir_dereference_array *ir)
1568 {
1569 ir_constant *constant_index;
1570 src_reg src;
1571 int element_size = type_size(ir->type);
1572
1573 constant_index = ir->array_index->constant_expression_value();
1574
1575 ir->array->accept(this);
1576 src = this->result;
1577
1578 if (constant_index) {
1579 src.reg_offset += constant_index->value.i[0] * element_size;
1580 } else {
1581 /* Variable index array dereference. It eats the "vec4" of the
1582 * base of the array and an index that offsets the Mesa register
1583 * index.
1584 */
1585 ir->array_index->accept(this);
1586
1587 src_reg index_reg;
1588
1589 if (element_size == 1) {
1590 index_reg = this->result;
1591 } else {
1592 index_reg = src_reg(this, glsl_type::int_type);
1593
1594 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1595 }
1596
1597 if (src.reladdr) {
1598 src_reg temp = src_reg(this, glsl_type::int_type);
1599
1600 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1601
1602 index_reg = temp;
1603 }
1604
1605 src.reladdr = ralloc(mem_ctx, src_reg);
1606 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1607 }
1608
1609 /* If the type is smaller than a vec4, replicate the last channel out. */
1610 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1611 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1612 else
1613 src.swizzle = BRW_SWIZZLE_NOOP;
1614 src.type = brw_type_for_base_type(ir->type);
1615
1616 this->result = src;
1617 }
1618
1619 void
1620 vec4_visitor::visit(ir_dereference_record *ir)
1621 {
1622 unsigned int i;
1623 const glsl_type *struct_type = ir->record->type;
1624 int offset = 0;
1625
1626 ir->record->accept(this);
1627
1628 for (i = 0; i < struct_type->length; i++) {
1629 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1630 break;
1631 offset += type_size(struct_type->fields.structure[i].type);
1632 }
1633
1634 /* If the type is smaller than a vec4, replicate the last channel out. */
1635 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1636 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1637 else
1638 this->result.swizzle = BRW_SWIZZLE_NOOP;
1639 this->result.type = brw_type_for_base_type(ir->type);
1640
1641 this->result.reg_offset += offset;
1642 }
1643
1644 /**
1645 * We want to be careful in assignment setup to hit the actual storage
1646 * instead of potentially using a temporary like we might with the
1647 * ir_dereference handler.
1648 */
1649 static dst_reg
1650 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1651 {
1652 /* The LHS must be a dereference. If the LHS is a variable indexed array
1653 * access of a vector, it must be separated into a series of conditional
1654 * moves before reaching this point (see ir_vec_index_to_cond_assign).
1655 */
1656 assert(ir->as_dereference());
1657 ir_dereference_array *deref_array = ir->as_dereference_array();
1658 if (deref_array) {
1659 assert(!deref_array->array->type->is_vector());
1660 }
1661
1662 /* Use the rvalue deref handler for the most part. We'll ignore
1663 * swizzles in it and write swizzles using writemask, though.
1664 */
1665 ir->accept(v);
1666 return dst_reg(v->result);
1667 }
1668
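/**
 * Copies a whole aggregate (struct, array or matrix) by recursing down to
 * scalar/vector MOVs, advancing the dst and src reg_offsets as it goes.
 */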
1669 void
1670 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1671 const struct glsl_type *type, uint32_t predicate)
1672 {
1673 if (type->base_type == GLSL_TYPE_STRUCT) {
1674 for (unsigned int i = 0; i < type->length; i++) {
1675 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1676 }
1677 return;
1678 }
1679
1680 if (type->is_array()) {
1681 for (unsigned int i = 0; i < type->length; i++) {
1682 emit_block_move(dst, src, type->fields.array, predicate);
1683 }
1684 return;
1685 }
1686
1687 if (type->is_matrix()) {
1688 const struct glsl_type *vec_type;
1689
1690 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1691 type->vector_elements, 1);
1692
1693 for (int i = 0; i < type->matrix_columns; i++) {
1694 emit_block_move(dst, src, vec_type, predicate);
1695 }
1696 return;
1697 }
1698
1699 assert(type->is_scalar() || type->is_vector());
1700
1701 dst->type = brw_type_for_base_type(type);
1702 src->type = dst->type;
1703
1704 dst->writemask = (1 << type->vector_elements) - 1;
1705
1706 src->swizzle = swizzle_for_size(type->vector_elements);
1707
1708 vec4_instruction *inst = emit(MOV(*dst, *src));
1709 inst->predicate = predicate;
1710
1711 dst->reg_offset++;
1712 src->reg_offset++;
1713 }
1714
1715
1716 /* If the RHS processing resulted in an instruction generating a
1717 * temporary value, and it would be easy to rewrite the instruction to
1718 * generate its result right into the LHS instead, do so. This ends
1719 * up reliably removing instructions where it can be tricky to do so
1720 * later without real UD chain information.
1721 */
1722 bool
1723 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1724 dst_reg dst,
1725 src_reg src,
1726 vec4_instruction *pre_rhs_inst,
1727 vec4_instruction *last_rhs_inst)
1728 {
1729 /* This could be supported, but it would take more smarts. */
1730 if (ir->condition)
1731 return false;
1732
1733 if (pre_rhs_inst == last_rhs_inst)
1734 return false; /* No instructions generated to work with. */
1735
1736 /* Make sure the last instruction generated our source reg. */
1737 if (src.file != GRF ||
1738 src.file != last_rhs_inst->dst.file ||
1739 src.reg != last_rhs_inst->dst.reg ||
1740 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1741 src.reladdr ||
1742 src.abs ||
1743 src.negate ||
1744 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1745 return false;
1746
1747 /* Check that the last instruction fully initialized the channels
1748 * we want to use, in the order we want to use them. We could
1749 * potentially reswizzle the operands of many instructions so that
1750 * we could handle out of order channels, but don't yet.
1751 */
1752
1753 for (unsigned i = 0; i < 4; i++) {
1754 if (dst.writemask & (1 << i)) {
1755 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1756 return false;
1757
1758 if (BRW_GET_SWZ(src.swizzle, i) != i)
1759 return false;
1760 }
1761 }
1762
1763 /* Success! Rewrite the instruction. */
1764 last_rhs_inst->dst.file = dst.file;
1765 last_rhs_inst->dst.reg = dst.reg;
1766 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1767 last_rhs_inst->dst.reladdr = dst.reladdr;
1768 last_rhs_inst->dst.writemask &= dst.writemask;
1769
1770 return true;
1771 }
1772
1773 void
1774 vec4_visitor::visit(ir_assignment *ir)
1775 {
1776 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1777 uint32_t predicate = BRW_PREDICATE_NONE;
1778
1779 if (!ir->lhs->type->is_scalar() &&
1780 !ir->lhs->type->is_vector()) {
1781 ir->rhs->accept(this);
1782 src_reg src = this->result;
1783
1784 if (ir->condition) {
1785 emit_bool_to_cond_code(ir->condition, &predicate);
1786 }
1787
1788 /* emit_block_move doesn't account for swizzles in the source register.
1789 * This should be ok, since the source register is a structure or an
1790 * array, and those can't be swizzled. But double-check to be sure.
1791 */
1792 assert(src.swizzle ==
1793 (ir->rhs->type->is_matrix()
1794 ? swizzle_for_size(ir->rhs->type->vector_elements)
1795 : BRW_SWIZZLE_NOOP));
1796
1797 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1798 return;
1799 }
1800
1801 /* Now we're down to just a scalar/vector with writemasks. */
1802 int i;
1803
1804 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1805 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1806
1807 ir->rhs->accept(this);
1808
1809 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1810
1811 src_reg src = this->result;
1812
1813 int swizzles[4];
1814 int first_enabled_chan = 0;
1815 int src_chan = 0;
1816
1817 assert(ir->lhs->type->is_vector() ||
1818 ir->lhs->type->is_scalar());
1819 dst.writemask = ir->write_mask;
1820
1821 for (int i = 0; i < 4; i++) {
1822 if (dst.writemask & (1 << i)) {
1823 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1824 break;
1825 }
1826 }
1827
1828 /* Swizzle a small RHS vector into the channels being written.
1829 *
1830 * glsl ir treats write_mask as dictating how many channels are
1831 * present on the RHS while in our instructions we need to make
1832 * those channels appear in the slots of the vec4 they're written to.
1833 */
1834 for (int i = 0; i < 4; i++) {
1835 if (dst.writemask & (1 << i))
1836 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1837 else
1838 swizzles[i] = first_enabled_chan;
1839 }
1840 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1841 swizzles[2], swizzles[3]);
1842
1843 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1844 return;
1845 }
1846
1847 if (ir->condition) {
1848 emit_bool_to_cond_code(ir->condition, &predicate);
1849 }
1850
1851 for (i = 0; i < type_size(ir->lhs->type); i++) {
1852 vec4_instruction *inst = emit(MOV(dst, src));
1853 inst->predicate = predicate;
1854
1855 dst.reg_offset++;
1856 src.reg_offset++;
1857 }
1858 }
1859
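/**
 * Writes an ir_constant into the registers starting at *dst, grouping
 * identical components into a single writemasked MOV where possible.
 */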
1860 void
1861 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1862 {
1863 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1864 foreach_list(node, &ir->components) {
1865 ir_constant *field_value = (ir_constant *)node;
1866
1867 emit_constant_values(dst, field_value);
1868 }
1869 return;
1870 }
1871
1872 if (ir->type->is_array()) {
1873 for (unsigned int i = 0; i < ir->type->length; i++) {
1874 emit_constant_values(dst, ir->array_elements[i]);
1875 }
1876 return;
1877 }
1878
1879 if (ir->type->is_matrix()) {
1880 for (int i = 0; i < ir->type->matrix_columns; i++) {
1881 float *vec = &ir->value.f[i * ir->type->vector_elements];
1882
1883 for (int j = 0; j < ir->type->vector_elements; j++) {
1884 dst->writemask = 1 << j;
1885 dst->type = BRW_REGISTER_TYPE_F;
1886
1887 emit(MOV(*dst, src_reg(vec[j])));
1888 }
1889 dst->reg_offset++;
1890 }
1891 return;
1892 }
1893
1894 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1895
1896 for (int i = 0; i < ir->type->vector_elements; i++) {
1897 if (!(remaining_writemask & (1 << i)))
1898 continue;
1899
1900 dst->writemask = 1 << i;
1901 dst->type = brw_type_for_base_type(ir->type);
1902
1903 /* Find other components that match the one we're about to
1904 * write. Emits fewer instructions for things like vec4(0.5,
1905 * 1.5, 1.5, 1.5).
1906 */
1907 for (int j = i + 1; j < ir->type->vector_elements; j++) {
1908 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1909 if (ir->value.b[i] == ir->value.b[j])
1910 dst->writemask |= (1 << j);
1911 } else {
1912 /* u, i, and f storage all line up, so no need for a
1913 * switch case for comparing each type.
1914 */
1915 if (ir->value.u[i] == ir->value.u[j])
1916 dst->writemask |= (1 << j);
1917 }
1918 }
1919
1920 switch (ir->type->base_type) {
1921 case GLSL_TYPE_FLOAT:
1922 emit(MOV(*dst, src_reg(ir->value.f[i])));
1923 break;
1924 case GLSL_TYPE_INT:
1925 emit(MOV(*dst, src_reg(ir->value.i[i])));
1926 break;
1927 case GLSL_TYPE_UINT:
1928 emit(MOV(*dst, src_reg(ir->value.u[i])));
1929 break;
1930 case GLSL_TYPE_BOOL:
1931 emit(MOV(*dst, src_reg(ir->value.b[i])));
1932 break;
1933 default:
1934 assert(!"Non-float/uint/int/bool constant");
1935 break;
1936 }
1937
1938 remaining_writemask &= ~dst->writemask;
1939 }
1940 dst->reg_offset++;
1941 }
1942
1943 void
1944 vec4_visitor::visit(ir_constant *ir)
1945 {
1946 dst_reg dst = dst_reg(this, ir->type);
1947 this->result = src_reg(dst);
1948
1949 emit_constant_values(&dst, ir);
1950 }
1951
1952 void
1953 vec4_visitor::visit(ir_call *ir)
1954 {
1955 assert(!"not reached");
1956 }
1957
1958 void
1959 vec4_visitor::visit(ir_texture *ir)
1960 {
1961 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1962
1963 /* Should be lowered by do_lower_texture_projection */
1964 assert(!ir->projector);
1965
1966 /* Generate code to compute all the subexpression trees. This has to be
1967 * done before loading any values into MRFs for the sampler message since
1968 * generating these values may involve SEND messages that need the MRFs.
1969 */
1970 src_reg coordinate;
1971 if (ir->coordinate) {
1972 ir->coordinate->accept(this);
1973 coordinate = this->result;
1974 }
1975
1976 src_reg shadow_comparitor;
1977 if (ir->shadow_comparitor) {
1978 ir->shadow_comparitor->accept(this);
1979 shadow_comparitor = this->result;
1980 }
1981
1982 src_reg lod, dPdx, dPdy;
1983 switch (ir->op) {
1984 case ir_txf:
1985 case ir_txl:
1986 case ir_txs:
1987 ir->lod_info.lod->accept(this);
1988 lod = this->result;
1989 break;
1990 case ir_txd:
1991 ir->lod_info.grad.dPdx->accept(this);
1992 dPdx = this->result;
1993
1994 ir->lod_info.grad.dPdy->accept(this);
1995 dPdy = this->result;
1996 break;
1997 case ir_tex:
1998 case ir_txb:
1999 break;
2000 }
2001
2002 vec4_instruction *inst = NULL;
2003 switch (ir->op) {
2004 case ir_tex:
2005 case ir_txl:
2006 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2007 break;
2008 case ir_txd:
2009 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2010 break;
2011 case ir_txf:
2012 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2013 break;
2014 case ir_txs:
2015 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2016 break;
2017 case ir_txb:
2018 assert(!"TXB is not valid for vertex shaders.");
2019 }
2020
2021 /* Texel offsets go in the message header; Gen4 also requires headers. */
2022 inst->header_present = ir->offset || intel->gen < 5;
2023 inst->base_mrf = 2;
2024 inst->mlen = inst->header_present + 1; /* always at least one */
2025 inst->sampler = sampler;
2026 inst->dst = dst_reg(this, ir->type);
2027 inst->shadow_compare = ir->shadow_comparitor != NULL;
2028
2029 if (ir->offset != NULL && ir->op != ir_txf)
2030 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2031
2032 /* MRF for the first parameter */
2033 int param_base = inst->base_mrf + inst->header_present;
2034
2035 if (ir->op == ir_txs) {
2036 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2037 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
2038 lod));
2039 } else {
2040 int i, coord_mask = 0, zero_mask = 0;
2041 /* Load the coordinate */
2042 /* FINISHME: gl_clamp_mask and saturate */
2043 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2044 coord_mask |= (1 << i);
2045 for (; i < 4; i++)
2046 zero_mask |= (1 << i);
2047
2048 if (ir->offset && ir->op == ir_txf) {
2049 /* It appears that the ld instruction used for txf does its
2050 * address bounds check before adding in the offset. To work
2051 * around this, just add the integer offset to the integer
2052 * texel coordinate, and don't put the offset in the header.
2053 */
2054 ir_constant *offset = ir->offset->as_constant();
2055 assert(offset);
2056
2057 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2058 src_reg src = coordinate;
2059 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2060 BRW_GET_SWZ(src.swizzle, j),
2061 BRW_GET_SWZ(src.swizzle, j),
2062 BRW_GET_SWZ(src.swizzle, j));
2063 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2064 src, offset->value.i[j]));
2065 }
2066 } else {
2067 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2068 coordinate));
2069 }
2070 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2071 src_reg(0)));
2072 /* Load the shadow comparator */
2072 /* Load the shadow comparator */
2073 if (ir->shadow_comparitor) {
2074 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2075 WRITEMASK_X),
2076 shadow_comparitor));
2077 inst->mlen++;
2078 }
2079
2080 /* Load the LOD info */
2081 if (ir->op == ir_txl) {
2082 int mrf, writemask;
2083 if (intel->gen >= 5) {
2084 mrf = param_base + 1;
2085 if (ir->shadow_comparitor) {
2086 writemask = WRITEMASK_Y;
2087 /* mlen already incremented */
2088 } else {
2089 writemask = WRITEMASK_X;
2090 inst->mlen++;
2091 }
2092 } else /* intel->gen == 4 */ {
2093 mrf = param_base;
2094 writemask = WRITEMASK_Z;
2095 }
2096 emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask), lod));
2097 } else if (ir->op == ir_txf) {
2098 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
2099 lod));
2100 } else if (ir->op == ir_txd) {
2101 const glsl_type *type = ir->lod_info.grad.dPdx->type;
2102
2103 if (intel->gen >= 5) {
2104 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2105 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2106 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2107 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2108 inst->mlen++;
2109
2110 if (ir->type->vector_elements == 3) {
2111 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2112 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2113 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2114 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2115 inst->mlen++;
2116 }
2117 } else /* intel->gen == 4 */ {
2118 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2119 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2120 inst->mlen += 2;
2121 }
2122 }
2123 }
2124
2125 emit(inst);
2126
2127 swizzle_result(ir, src_reg(inst->dst), sampler);
2128 }
2129
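/**
 * Applies the per-sampler swizzle from the program key (e.g. from
 * GL_EXT_texture_swizzle or depth texture modes) to a sampler result,
 * emitting immediate 0.0/1.0 MOVs for SWIZZLE_ZERO/SWIZZLE_ONE channels.
 */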
2130 void
2131 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2132 {
2133 this->result = orig_val;
2134
2135 int s = c->key.tex.swizzles[sampler];
2136
2137 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2138 || s == SWIZZLE_NOOP)
2139 return;
2140
2141 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2142 int swizzle[4];
2143
2144 for (int i = 0; i < 4; i++) {
2145 switch (GET_SWZ(s, i)) {
2146 case SWIZZLE_ZERO:
2147 zero_mask |= (1 << i);
2148 break;
2149 case SWIZZLE_ONE:
2150 one_mask |= (1 << i);
2151 break;
2152 default:
2153 copy_mask |= (1 << i);
2154 swizzle[i] = GET_SWZ(s, i);
2155 break;
2156 }
2157 }
2158
2159 this->result = src_reg(this, ir->type);
2160 dst_reg swizzled_result(this->result);
2161
2162 if (copy_mask) {
2163 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2164 swizzled_result.writemask = copy_mask;
2165 emit(MOV(swizzled_result, orig_val));
2166 }
2167
2168 if (zero_mask) {
2169 swizzled_result.writemask = zero_mask;
2170 emit(MOV(swizzled_result, src_reg(0.0f)));
2171 }
2172
2173 if (one_mask) {
2174 swizzled_result.writemask = one_mask;
2175 emit(MOV(swizzled_result, src_reg(1.0f)));
2176 }
2177 }
2178
2179 void
2180 vec4_visitor::visit(ir_return *ir)
2181 {
2182 assert(!"not reached");
2183 }
2184
2185 void
2186 vec4_visitor::visit(ir_discard *ir)
2187 {
2188 assert(!"not reached");
2189 }
2190
2191 void
2192 vec4_visitor::visit(ir_if *ir)
2193 {
2194 /* Don't point the annotation at the if statement, because then it plus
2195 * the then and else blocks get printed.
2196 */
2197 this->base_ir = ir->condition;
2198
2199 if (intel->gen == 6) {
2200 emit_if_gen6(ir);
2201 } else {
2202 uint32_t predicate;
2203 emit_bool_to_cond_code(ir->condition, &predicate);
2204 emit(IF(predicate));
2205 }
2206
2207 visit_instructions(&ir->then_instructions);
2208
2209 if (!ir->else_instructions.is_empty()) {
2210 this->base_ir = ir->condition;
2211 emit(BRW_OPCODE_ELSE);
2212
2213 visit_instructions(&ir->else_instructions);
2214 }
2215
2216 this->base_ir = ir->condition;
2217 emit(BRW_OPCODE_ENDIF);
2218 }
2219
2220 void
2221 vec4_visitor::emit_ndc_computation()
2222 {
2223 /* Get the position */
2224 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2225
2226 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2227 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2228 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2229
2230 current_annotation = "NDC";
2231 dst_reg ndc_w = ndc;
2232 ndc_w.writemask = WRITEMASK_W;
2233 src_reg pos_w = pos;
2234 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2235 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2236
2237 dst_reg ndc_xyz = ndc;
2238 ndc_xyz.writemask = WRITEMASK_XYZ;
2239
2240 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2241 }
2242
2243 void
2244 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2245 {
2246 if (intel->gen < 6 &&
2247 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2248 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2249 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2250 dst_reg header1_w = header1;
2251 header1_w.writemask = WRITEMASK_W;
2252 GLuint i;
2253
2254 emit(MOV(header1, 0u));
2255
2256 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2257 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2258
2259 current_annotation = "Point size";
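/* The MUL/AND pair below converts the float point size into the
 * header's fixed-point field: scaling by 1 << 11 and masking with
 * 0x7ff << 8 leaves an 11-bit value with 3 fractional bits starting
 * at bit 8 of the header word.
 */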
2260 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2261 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2262 }
2263
2264 current_annotation = "Clipping flags";
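/* Each DP4 below takes dot(gl_Position, userplane[i]) with a less-than
 * conditional mod, so the following predicated OR sets clip flag bit i
 * only for vertices on the negative side of user clip plane i.
 */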
2265 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2266 vec4_instruction *inst;
2267
2268 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2269 src_reg(this->userplane[i])));
2270 inst->conditional_mod = BRW_CONDITIONAL_L;
2271
2272 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2273 inst->predicate = BRW_PREDICATE_NORMAL;
2274 }
2275
2276 /* i965 clipping workaround:
2277 * 1) Test for -ve rhw
2278 * 2) If set,
2279 * set ndc = (0,0,0,0)
2280 * set ucp[6] = 1
2281 *
2282 * Later, clipping will detect ucp[6] and ensure the primitive is
2283 * clipped against all fixed planes.
2284 */
2285 if (brw->has_negative_rhw_bug) {
2286 #if 0
2287 /* FINISHME */
2288 brw_CMP(p,
2289 vec8(brw_null_reg()),
2290 BRW_CONDITIONAL_L,
2291 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2292 brw_imm_f(0));
2293
2294 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2295 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2296 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2297 #endif
2298 }
2299
2300 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2301 } else if (intel->gen < 6) {
2302 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2303 } else {
2304 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2305 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2306 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2307 src_reg(output_reg[VERT_RESULT_PSIZ])));
2308 }
2309 }
2310 }
2311
2312 void
2313 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2314 {
2315 if (intel->gen < 6) {
2316 /* Clip distance slots are set aside in gen5, but they are not used. It
2317 * is not clear whether we actually need to set aside space for them,
2318 * but the performance cost is negligible.
2319 */
2320 return;
2321 }
2322
2323 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2324 *
2325 * "If a linked set of shaders forming the vertex stage contains no
2326 * static write to gl_ClipVertex or gl_ClipDistance, but the
2327 * application has requested clipping against user clip planes through
2328 * the API, then the coordinate written to gl_Position is used for
2329 * comparison against the user clip planes."
2330 *
2331 * This function is only called if the shader didn't write to
2332 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2333 * if the user wrote to it; otherwise we use gl_Position.
2334 */
2335 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2336 if (!(c->prog_data.outputs_written
2337 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2338 clip_vertex = VERT_RESULT_HPOS;
2339 }
2340
2341 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2342 ++i) {
2343 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2344 src_reg(output_reg[clip_vertex]),
2345 src_reg(this->userplane[i + offset])));
2346 }
2347 }
2348
2349 void
2350 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2351 {
2352 assert(vert_result < VERT_RESULT_MAX);
2353 reg.type = output_reg[vert_result].type;
2354 current_annotation = output_reg_annotation[vert_result];
2355 /* Copy the register, saturating if necessary */
2356 vec4_instruction *inst = emit(MOV(reg,
2357 src_reg(output_reg[vert_result])));
2358 if ((vert_result == VERT_RESULT_COL0 ||
2359 vert_result == VERT_RESULT_COL1 ||
2360 vert_result == VERT_RESULT_BFC0 ||
2361 vert_result == VERT_RESULT_BFC1) &&
2362 c->key.clamp_vertex_color) {
2363 inst->saturate = true;
2364 }
2365 }
2366
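/**
 * Writes the value for one VUE slot into message register @mrf, handling
 * the slots that need special packing (point size and flags, NDC, position,
 * clip distances, edge flag) and falling back to emit_generic_urb_slot()
 * for everything else.
 */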
2367 void
2368 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2369 {
2370 struct brw_reg hw_reg = brw_message_reg(mrf);
2371 dst_reg reg = dst_reg(MRF, mrf);
2372 reg.type = BRW_REGISTER_TYPE_F;
2373
2374 switch (vert_result) {
2375 case VERT_RESULT_PSIZ:
2376 /* PSIZ is always in slot 0, and is coupled with other flags. */
2377 current_annotation = "indices, point width, clip flags";
2378 emit_psiz_and_flags(hw_reg);
2379 break;
2380 case BRW_VERT_RESULT_NDC:
2381 current_annotation = "NDC";
2382 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2383 break;
2384 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2385 case VERT_RESULT_HPOS:
2386 current_annotation = "gl_Position";
2387 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2388 break;
2389 case VERT_RESULT_CLIP_DIST0:
2390 case VERT_RESULT_CLIP_DIST1:
2391 if (this->c->key.uses_clip_distance) {
2392 emit_generic_urb_slot(reg, vert_result);
2393 } else {
2394 current_annotation = "user clip distances";
2395 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2396 }
2397 break;
2398 case VERT_RESULT_EDGE:
2399 /* This is present when doing unfilled polygons. We copy the edge flag
2400 * from the user-provided vertex array (glEdgeFlagPointer) when one is
2401 * supplied, and otherwise from the current value of that attribute
2402 * (which starts as 1.0f). Clipping then uses it to determine which
2403 * edges should be drawn as wireframe.
2404 */
2405 current_annotation = "edge flag";
2406 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2407 glsl_type::float_type, WRITEMASK_XYZW))));
2408 break;
2409 case BRW_VERT_RESULT_PAD:
2410 /* No need to write to this slot */
2411 break;
2412 default:
2413 emit_generic_urb_slot(reg, vert_result);
2414 break;
2415 }
2416 }
2417
2418 static int
2419 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2420 {
2421 struct intel_context *intel = &brw->intel;
2422
2423 if (intel->gen >= 6) {
2424 /* URB data written (does not include the message header reg) must
2425 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2426 * section 5.4.3.2.2: URB_INTERLEAVED.
2427 *
2428 * URB entries are allocated on a multiple of 1024 bits, so an
2429 * extra 128 bits written here to make the end align to 256 is
2430 * no problem.
2431 */
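/* For example, a header plus 3 data regs (mlen 4) gets padded to
 * mlen 5 so that 4 data regs are written; a header plus 4 data regs
 * (mlen 5) is already aligned and left alone.
 */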
2432 if ((mlen % 2) != 1)
2433 mlen++;
2434 }
2435
2436 return mlen;
2437 }
2438
2439 /**
2440 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2441 * complete the VS thread.
2442 *
2443 * The VUE layout is documented in Volume 2a.
2444 */
2445 void
2446 vec4_visitor::emit_urb_writes()
2447 {
2448 /* MRF 0 is reserved for the debugger, so start with message header
2449 * in MRF 1.
2450 */
2451 int base_mrf = 1;
2452 int mrf = base_mrf;
2453 /* In the process of generating our URB write message contents, we
2454 * may need to unspill a register or load from an array. Those
2455 * reads would use MRFs 14-15.
2456 */
2457 int max_usable_mrf = 13;
2458
2459 /* The following assertion verifies that max_usable_mrf causes an
2460 * even-numbered amount of URB write data, which will meet gen6's
2461 * requirements for length alignment.
2462 */
2463 assert((max_usable_mrf - base_mrf) % 2 == 0);
2464
2465 /* The first MRF is the g0-based message header containing URB handles and such,
2466 * which is implied in VS_OPCODE_URB_WRITE.
2467 */
2468 mrf++;
2469
2470 if (intel->gen < 6) {
2471 emit_ndc_computation();
2472 }
2473
2474 /* Set up the VUE data for the first URB write */
2475 int slot;
2476 for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2477 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2478
2479 /* If this was max_usable_mrf, we can't fit anything more into this URB
2480 * WRITE.
2481 */
2482 if (mrf > max_usable_mrf) {
2483 slot++;
2484 break;
2485 }
2486 }
2487
2488 current_annotation = "URB write";
2489 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2490 inst->base_mrf = base_mrf;
2491 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2492 inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2493
2494 /* Optional second URB write */
2495 if (!inst->eot) {
2496 mrf = base_mrf + 1;
2497
2498 for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2499 assert(mrf < max_usable_mrf);
2500
2501 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2502 }
2503
2504 current_annotation = "URB write";
2505 inst = emit(VS_OPCODE_URB_WRITE);
2506 inst->base_mrf = base_mrf;
2507 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2508 inst->eot = true;
2509 /* URB destination offset. In the previous write, we got MRFs
2510 * 2-13 minus the one header MRF, so 12 regs. URB offset is in
2511 * URB row increments, and each of our MRFs is half of one of
2512 * those, since we're doing interleaved writes.
2513 */
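/* With base_mrf 1 and max_usable_mrf 13, that works out to
 * (13 - 1) / 2 = 6 URB rows.
 */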
2514 inst->offset = (max_usable_mrf - base_mrf) / 2;
2515 }
2516 }
2517
2518 src_reg
2519 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2520 src_reg *reladdr, int reg_offset)
2521 {
2522 /* Because we store the values to scratch interleaved like our
2523 * vertex data, we need to scale the vec4 index by 2.
2524 */
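/* For example, reg_offset 3 becomes scratch row 6 on gen6+, or byte
 * offset 96 on pre-gen6 parts, where the scale below is further
 * multiplied by 16.
 */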
2525 int message_header_scale = 2;
2526
2527 /* Pre-gen6, the message header uses byte offsets instead of vec4
2528 * (16-byte) offset units.
2529 */
2530 if (intel->gen < 6)
2531 message_header_scale *= 16;
2532
2533 if (reladdr) {
2534 src_reg index = src_reg(this, glsl_type::int_type);
2535
2536 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2537 emit_before(inst, MUL(dst_reg(index),
2538 index, src_reg(message_header_scale)));
2539
2540 return index;
2541 } else {
2542 return src_reg(reg_offset * message_header_scale);
2543 }
2544 }
2545
2546 src_reg
2547 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2548 src_reg *reladdr, int reg_offset)
2549 {
2550 if (reladdr) {
2551 src_reg index = src_reg(this, glsl_type::int_type);
2552
2553 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2554
2555 /* Pre-gen6, the message header uses byte offsets instead of vec4
2556 * (16-byte) offset units.
2557 */
2558 if (intel->gen < 6) {
2559 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2560 }
2561
2562 return index;
2563 } else {
2564 int message_header_scale = intel->gen < 6 ? 16 : 1;
2565 return src_reg(reg_offset * message_header_scale);
2566 }
2567 }
2568
2569 /**
2570 * Emits an instruction before @inst to load the value named by @orig_src
2571 * from scratch space at @base_offset to @temp.
2572 *
2573 * @base_offset is measured in 32-byte units (the size of a register).
2574 */
2575 void
2576 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2577 dst_reg temp, src_reg orig_src,
2578 int base_offset)
2579 {
2580 int reg_offset = base_offset + orig_src.reg_offset;
2581 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2582
2583 emit_before(inst, SCRATCH_READ(temp, index));
2584 }
2585
2586 /**
2587 * Emits an instruction after @inst to store the value to be written
2588 * to @orig_dst to scratch space at @base_offset, from @temp.
2589 *
2590 * @base_offset is measured in 32-byte units (the size of a register).
2591 */
2592 void
2593 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2594 {
2595 int reg_offset = base_offset + inst->dst.reg_offset;
2596 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2597
2598 /* Create a temporary register to store *inst's result in.
2599 *
2600 * We have to be careful in MOVing from our temporary result register in
2601 * the scratch write. If we swizzle from channels of the temporary that
2602 * weren't initialized, it will confuse live interval analysis, which will
2603 * make spilling fail to make progress.
2604 */
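/* For example, if inst only writes .zw, temp is read back with swizzle
 * .zzzw so the uninitialized .xy channels are never sourced.
 */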
2605 src_reg temp = src_reg(this, glsl_type::vec4_type);
2606 temp.type = inst->dst.type;
2607 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2608 int swizzles[4];
2609 for (int i = 0; i < 4; i++)
2610 if (inst->dst.writemask & (1 << i))
2611 swizzles[i] = i;
2612 else
2613 swizzles[i] = first_writemask_chan;
2614 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2615 swizzles[2], swizzles[3]);
2616
2617 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2618 inst->dst.writemask));
2619 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2620 write->predicate = inst->predicate;
2621 write->ir = inst->ir;
2622 write->annotation = inst->annotation;
2623 inst->insert_after(write);
2624
2625 inst->dst.file = temp.file;
2626 inst->dst.reg = temp.reg;
2627 inst->dst.reg_offset = temp.reg_offset;
2628 inst->dst.reladdr = NULL;
2629 }
2630
2631 /**
2632 * We can't generally support array access in GRF space, because a
2633 * single instruction's destination can only span 2 contiguous
2634 * registers. So, we send all GRF arrays that get variable index
2635 * access to scratch space.
2636 */
2637 void
2638 vec4_visitor::move_grf_array_access_to_scratch()
2639 {
2640 int scratch_loc[this->virtual_grf_count];
2641
2642 for (int i = 0; i < this->virtual_grf_count; i++) {
2643 scratch_loc[i] = -1;
2644 }
2645
2646 /* First, calculate the set of virtual GRFs that need to be punted
2647 * to scratch due to having any array access on them, and where in
2648 * scratch.
2649 */
2650 foreach_list(node, &this->instructions) {
2651 vec4_instruction *inst = (vec4_instruction *)node;
2652
2653 if (inst->dst.file == GRF && inst->dst.reladdr &&
2654 scratch_loc[inst->dst.reg] == -1) {
2655 scratch_loc[inst->dst.reg] = c->last_scratch;
2656 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2657 }
2658
2659 for (int i = 0 ; i < 3; i++) {
2660 src_reg *src = &inst->src[i];
2661
2662 if (src->file == GRF && src->reladdr &&
2663 scratch_loc[src->reg] == -1) {
2664 scratch_loc[src->reg] = c->last_scratch;
2665 c->last_scratch += this->virtual_grf_sizes[src->reg];
2666 }
2667 }
2668 }
2669
2670 /* Now, for anything that will be accessed through scratch, rewrite
2671 * it to load/store. Note that this is a _safe list walk, because
2672 * we may generate a new scratch_write instruction after the one
2673 * we're processing.
2674 */
2675 foreach_list_safe(node, &this->instructions) {
2676 vec4_instruction *inst = (vec4_instruction *)node;
2677
2678 /* Set up the annotation tracking for newly generated instructions. */
2679 base_ir = inst->ir;
2680 current_annotation = inst->annotation;
2681
2682 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2683 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2684 }
2685
2686 for (int i = 0 ; i < 3; i++) {
2687 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2688 continue;
2689
2690 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2691
2692 emit_scratch_read(inst, temp, inst->src[i],
2693 scratch_loc[inst->src[i].reg]);
2694
2695 inst->src[i].file = temp.file;
2696 inst->src[i].reg = temp.reg;
2697 inst->src[i].reg_offset = temp.reg_offset;
2698 inst->src[i].reladdr = NULL;
2699 }
2700 }
2701 }
2702
2703 /**
2704 * Emits an instruction before @inst to load the value named by @orig_src
2705 * from the pull constant buffer (surface) at @base_offset to @temp.
2706 */
2707 void
2708 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2709 dst_reg temp, src_reg orig_src,
2710 int base_offset)
2711 {
2712 int reg_offset = base_offset + orig_src.reg_offset;
2713 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2714 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2715 vec4_instruction *load;
2716
2717 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2718 temp, index, offset);
2719 load->base_mrf = 14;
2720 load->mlen = 1;
2721 emit_before(inst, load);
2722 }
2723
2724 /**
2725 * Implements array access of uniforms by inserting a
2726 * PULL_CONSTANT_LOAD instruction.
2727 *
2728 * Unlike temporary GRF array access (where we don't support it due to
2729 * the difficulty of doing relative addressing on instruction
2730 * destinations), we could potentially do array access of uniforms
2731 * that were loaded in GRF space as push constants. In real-world
2732 * usage we've seen, though, the arrays being used are always larger
2733 * than we could load as push constants, so just always move all
2734 * uniform array access out to a pull constant buffer.
2735 */
2736 void
2737 vec4_visitor::move_uniform_array_access_to_pull_constants()
2738 {
2739 int pull_constant_loc[this->uniforms];
2740
2741 for (int i = 0; i < this->uniforms; i++) {
2742 pull_constant_loc[i] = -1;
2743 }
2744
2745 /* Walk through and find array access of uniforms. Put a copy of that
2746 * uniform in the pull constant buffer.
2747 *
2748 * Note that we don't move constant-indexed accesses to arrays. No
2749 * testing has been done of the performance impact of this choice.
2750 */
2751 foreach_list_safe(node, &this->instructions) {
2752 vec4_instruction *inst = (vec4_instruction *)node;
2753
2754 for (int i = 0 ; i < 3; i++) {
2755 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2756 continue;
2757
2758 int uniform = inst->src[i].reg;
2759
2760 /* If this array isn't already present in the pull constant buffer,
2761 * add it.
2762 */
2763 if (pull_constant_loc[uniform] == -1) {
2764 const float **values = &prog_data->param[uniform * 4];
2765
2766 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2767
2768 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2769 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2770 }
2771 }
2772
2773 /* Set up the annotation tracking for newly generated instructions. */
2774 base_ir = inst->ir;
2775 current_annotation = inst->annotation;
2776
2777 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2778
2779 emit_pull_constant_load(inst, temp, inst->src[i],
2780 pull_constant_loc[uniform]);
2781
2782 inst->src[i].file = temp.file;
2783 inst->src[i].reg = temp.reg;
2784 inst->src[i].reg_offset = temp.reg_offset;
2785 inst->src[i].reladdr = NULL;
2786 }
2787 }
2788
2789 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2790 * no need to track them as larger-than-vec4 objects. This will be
2791 * relied on in cutting out unused uniform vectors from push
2792 * constants.
2793 */
2794 split_uniform_registers();
2795 }
2796
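/**
 * The negate source modifier doesn't do what we need on unsigned (UD)
 * operands in at least some cases, so when one is present we resolve it
 * by MOVing the negated value into a temporary and using that instead.
 */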
2797 void
2798 vec4_visitor::resolve_ud_negate(src_reg *reg)
2799 {
2800 if (reg->type != BRW_REGISTER_TYPE_UD ||
2801 !reg->negate)
2802 return;
2803
2804 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2805 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2806 *reg = temp;
2807 }
2808
2809 vec4_visitor::vec4_visitor(struct brw_context *brw,
2810 struct brw_vs_compile *c,
2811 struct gl_shader_program *prog,
2812 struct brw_shader *shader,
2813 void *mem_ctx)
2814 {
2815 this->c = c;
2816 this->brw = brw;
2817 this->intel = &brw->intel;
2818 this->ctx = &intel->ctx;
2819 this->prog = prog;
2820 this->shader = shader;
2821
2822 this->mem_ctx = mem_ctx;
2823 this->failed = false;
2824
2825 this->base_ir = NULL;
2826 this->current_annotation = NULL;
2827 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
2828
2829 this->c = c;
2830 this->vp = &c->vp->program;
2831 this->prog_data = &c->prog_data;
2832
2833 this->variable_ht = hash_table_ctor(0,
2834 hash_table_pointer_hash,
2835 hash_table_pointer_compare);
2836
2837 this->virtual_grf_def = NULL;
2838 this->virtual_grf_use = NULL;
2839 this->virtual_grf_sizes = NULL;
2840 this->virtual_grf_count = 0;
2841 this->virtual_grf_reg_map = NULL;
2842 this->virtual_grf_reg_count = 0;
2843 this->virtual_grf_array_size = 0;
2844 this->live_intervals_valid = false;
2845
2846 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2847
2848 this->uniforms = 0;
2849 }
2850
2851 vec4_visitor::~vec4_visitor()
2852 {
2853 hash_table_dtor(this->variable_ht);
2854 }
2855
2856
2857 void
2858 vec4_visitor::fail(const char *format, ...)
2859 {
2860 va_list va;
2861 char *msg;
2862
2863 if (failed)
2864 return;
2865
2866 failed = true;
2867
2868 va_start(va, format);
2869 msg = ralloc_vasprintf(mem_ctx, format, va);
2870 va_end(va);
2871 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2872
2873 this->fail_msg = msg;
2874
2875 if (INTEL_DEBUG & DEBUG_VS) {
2876 fprintf(stderr, "%s", msg);
2877 }
2878 }
2879
2880 } /* namespace brw */