glsl: Add a "ubo_load" expression type for fetches from UBOs.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, dst_reg dst,
35 src_reg src0, src_reg src1, src_reg src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->ir = v->base_ir;
43 this->annotation = v->current_annotation;
44 }
45
46 vec4_instruction *
47 vec4_visitor::emit(vec4_instruction *inst)
48 {
49 this->instructions.push_tail(inst);
50
51 return inst;
52 }
53
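/**
 * Insert new_inst into the instruction stream immediately before inst,
 * copying inst's source IR pointer and annotation so debug output stays
 * attached to the right statement.
 */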
54 vec4_instruction *
55 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
56 {
57 new_inst->ir = inst->ir;
58 new_inst->annotation = inst->annotation;
59
60 inst->insert_before(new_inst);
61
62 return inst;
63 }
64
65 vec4_instruction *
66 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
67 src_reg src0, src_reg src1, src_reg src2)
68 {
69 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
70 src0, src1, src2));
71 }
72
73
74 vec4_instruction *
75 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
76 {
77 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode)
88 {
89 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
90 }
91
92 #define ALU1(op) \
93 vec4_instruction * \
94 vec4_visitor::op(dst_reg dst, src_reg src0) \
95 { \
96 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
97 src0); \
98 }
99
100 #define ALU2(op) \
101 vec4_instruction * \
102 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
103 { \
104 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
105 src0, src1); \
106 }
107
108 ALU1(NOT)
109 ALU1(MOV)
110 ALU1(FRC)
111 ALU1(RNDD)
112 ALU1(RNDE)
113 ALU1(RNDZ)
114 ALU2(ADD)
115 ALU2(MUL)
116 ALU2(MACH)
117 ALU2(AND)
118 ALU2(OR)
119 ALU2(XOR)
120 ALU2(DP3)
121 ALU2(DP4)
122
123 /** Gen4 predicated IF. */
124 vec4_instruction *
125 vec4_visitor::IF(uint32_t predicate)
126 {
127 vec4_instruction *inst;
128
129 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
130 inst->predicate = predicate;
131
132 return inst;
133 }
134
135 /** Gen6+ IF with embedded comparison. */
136 vec4_instruction *
137 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
138 {
139 assert(intel->gen >= 6);
140
141 vec4_instruction *inst;
142
143 resolve_ud_negate(&src0);
144 resolve_ud_negate(&src1);
145
146 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
147 src0, src1);
148 inst->conditional_mod = condition;
149
150 return inst;
151 }
152
153 /**
154 * CMP: Sets the low bit of the destination channels with the result
155 * of the comparison, while the upper bits are undefined, and updates
156 * the flag register with the packed 16 bits of the result.
157 */
158 vec4_instruction *
159 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
160 {
161 vec4_instruction *inst;
162
163 /* original gen4 does type conversion to the destination type
164 * before comparison, producing garbage results for floating
165 * point comparisons.
166 */
167 if (intel->gen == 4) {
168 dst.type = src0.type;
169 if (dst.file == HW_REG)
170 dst.fixed_hw_reg.type = dst.type;
171 }
172
173 resolve_ud_negate(&src0);
174 resolve_ud_negate(&src1);
175
176 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
177 inst->conditional_mod = condition;
178
179 return inst;
180 }
181
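/**
 * Scratch reads and writes are send messages; these helpers reserve the
 * MRF payload registers and message lengths they will use.
 */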
182 vec4_instruction *
183 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
184 {
185 vec4_instruction *inst;
186
187 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
188 dst, index);
189 inst->base_mrf = 14;
190 inst->mlen = 1;
191
192 return inst;
193 }
194
195 vec4_instruction *
196 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
197 {
198 vec4_instruction *inst;
199
200 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
201 dst, src, index);
202 inst->base_mrf = 13;
203 inst->mlen = 2;
204
205 return inst;
206 }
207
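/** Emit a DP2, DP3 or DP4 covering the given number of source components. */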
208 void
209 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
210 {
211 static enum opcode dot_opcodes[] = {
212 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
213 };
214
215 emit(dot_opcodes[elements - 2], dst, src0, src1);
216 }
217
218 void
219 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
220 {
221 /* The gen6 math instruction ignores the source modifiers --
222 * swizzle, abs, negate, and at least some parts of the register
223 * region description.
224 *
225 * While it would seem that this MOV could be avoided at this point
226 * in the case that the swizzle is matched up with the destination
227 * writemask, note that uniform packing and register allocation
228 * could rearrange our swizzle, so let's leave this matter up to
229 * copy propagation later.
230 */
231 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
232 emit(MOV(dst_reg(temp_src), src));
233
234 if (dst.writemask != WRITEMASK_XYZW) {
235 /* The gen6 math instruction must be align1, so we can't do
236 * writemasks.
237 */
238 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
239
240 emit(opcode, temp_dst, temp_src);
241
242 emit(MOV(dst, src_reg(temp_dst)));
243 } else {
244 emit(opcode, dst, temp_src);
245 }
246 }
247
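/**
 * On Gen4/5 the math functions are messages to the shared math unit, so
 * reserve the single-register MRF payload the send will use.
 */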
248 void
249 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
250 {
251 vec4_instruction *inst = emit(opcode, dst, src);
252 inst->base_mrf = 1;
253 inst->mlen = 1;
254 }
255
256 void
257 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
258 {
259 switch (opcode) {
260 case SHADER_OPCODE_RCP:
261 case SHADER_OPCODE_RSQ:
262 case SHADER_OPCODE_SQRT:
263 case SHADER_OPCODE_EXP2:
264 case SHADER_OPCODE_LOG2:
265 case SHADER_OPCODE_SIN:
266 case SHADER_OPCODE_COS:
267 break;
268 default:
269 assert(!"not reached: bad math opcode");
270 return;
271 }
272
273 if (intel->gen >= 7) {
274 emit(opcode, dst, src);
275 } else if (intel->gen == 6) {
276 return emit_math1_gen6(opcode, dst, src);
277 } else {
278 return emit_math1_gen4(opcode, dst, src);
279 }
280 }
281
282 void
283 vec4_visitor::emit_math2_gen6(enum opcode opcode,
284 dst_reg dst, src_reg src0, src_reg src1)
285 {
286 src_reg expanded;
287
288 /* The gen6 math instruction ignores the source modifiers --
289 * swizzle, abs, negate, and at least some parts of the register
290 * region description. Move the sources to temporaries to make it
291 * generally work.
292 */
293
294 expanded = src_reg(this, glsl_type::vec4_type);
295 expanded.type = src0.type;
296 emit(MOV(dst_reg(expanded), src0));
297 src0 = expanded;
298
299 expanded = src_reg(this, glsl_type::vec4_type);
300 expanded.type = src1.type;
301 emit(MOV(dst_reg(expanded), src1));
302 src1 = expanded;
303
304 if (dst.writemask != WRITEMASK_XYZW) {
305 /* The gen6 math instruction must be align1, so we can't do
306 * writemasks.
307 */
308 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
309 temp_dst.type = dst.type;
310
311 emit(opcode, temp_dst, src0, src1);
312
313 emit(MOV(dst, src_reg(temp_dst)));
314 } else {
315 emit(opcode, dst, src0, src1);
316 }
317 }
318
319 void
320 vec4_visitor::emit_math2_gen4(enum opcode opcode,
321 dst_reg dst, src_reg src0, src_reg src1)
322 {
323 vec4_instruction *inst = emit(opcode, dst, src0, src1);
324 inst->base_mrf = 1;
325 inst->mlen = 2;
326 }
327
328 void
329 vec4_visitor::emit_math(enum opcode opcode,
330 dst_reg dst, src_reg src0, src_reg src1)
331 {
332 switch (opcode) {
333 case SHADER_OPCODE_POW:
334 case SHADER_OPCODE_INT_QUOTIENT:
335 case SHADER_OPCODE_INT_REMAINDER:
336 break;
337 default:
338 assert(!"not reached: unsupported binary math opcode");
339 return;
340 }
341
342 if (intel->gen >= 7) {
343 emit(opcode, dst, src0, src1);
344 } else if (intel->gen == 6) {
345 return emit_math2_gen6(opcode, dst, src0, src1);
346 } else {
347 return emit_math2_gen4(opcode, dst, src0, src1);
348 }
349 }
350
351 void
352 vec4_visitor::visit_instructions(const exec_list *list)
353 {
354 foreach_list(node, list) {
355 ir_instruction *ir = (ir_instruction *)node;
356
357 base_ir = ir;
358 ir->accept(this);
359 }
360 }
361
362
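/**
 * Returns the size of the given GLSL type in vec4 slots, which is the unit
 * of register allocation in the vec4 backend.
 */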
363 static int
364 type_size(const struct glsl_type *type)
365 {
366 unsigned int i;
367 int size;
368
369 switch (type->base_type) {
370 case GLSL_TYPE_UINT:
371 case GLSL_TYPE_INT:
372 case GLSL_TYPE_FLOAT:
373 case GLSL_TYPE_BOOL:
374 if (type->is_matrix()) {
375 return type->matrix_columns;
376 } else {
377 /* Regardless of size of vector, it gets a vec4. This is bad
378 * packing for things like floats, but otherwise arrays become a
379 * mess. Hopefully a later pass over the code can pack scalars
380 * down if appropriate.
381 */
382 return 1;
383 }
384 case GLSL_TYPE_ARRAY:
385 assert(type->length > 0);
386 return type_size(type->fields.array) * type->length;
387 case GLSL_TYPE_STRUCT:
388 size = 0;
389 for (i = 0; i < type->length; i++) {
390 size += type_size(type->fields.structure[i].type);
391 }
392 return size;
393 case GLSL_TYPE_SAMPLER:
394 /* Samplers take up one slot in UNIFORMS[], but they're baked in
395 * at link time.
396 */
397 return 1;
398 default:
399 assert(0);
400 return 0;
401 }
402 }
403
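/**
 * Allocate a virtual GRF of the given size in vec4 registers, growing the
 * bookkeeping arrays as needed, and return its index.
 */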
404 int
405 vec4_visitor::virtual_grf_alloc(int size)
406 {
407 if (virtual_grf_array_size <= virtual_grf_count) {
408 if (virtual_grf_array_size == 0)
409 virtual_grf_array_size = 16;
410 else
411 virtual_grf_array_size *= 2;
412 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
413 virtual_grf_array_size);
414 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
415 virtual_grf_array_size);
416 }
417 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
418 virtual_grf_reg_count += size;
419 virtual_grf_sizes[virtual_grf_count] = size;
420 return virtual_grf_count++;
421 }
422
423 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
424 {
425 init();
426
427 this->file = GRF;
428 this->reg = v->virtual_grf_alloc(type_size(type));
429
430 if (type->is_array() || type->is_record()) {
431 this->swizzle = BRW_SWIZZLE_NOOP;
432 } else {
433 this->swizzle = swizzle_for_size(type->vector_elements);
434 }
435
436 this->type = brw_type_for_base_type(type);
437 }
438
439 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
440 {
441 init();
442
443 this->file = GRF;
444 this->reg = v->virtual_grf_alloc(type_size(type));
445
446 if (type->is_array() || type->is_record()) {
447 this->writemask = WRITEMASK_XYZW;
448 } else {
449 this->writemask = (1 << type->vector_elements) - 1;
450 }
451
452 this->type = brw_type_for_base_type(type);
453 }
454
455 /* Our support for uniforms is piggy-backed on the struct
456 * gl_vertex_program, because that's where the values actually
457 * get stored, rather than in some global gl_shader_program uniform
458 * store.
459 */
460 int
461 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
462 {
463 unsigned int offset = 0;
464 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
465
466 if (type->is_matrix()) {
467 const glsl_type *column = type->column_type();
468
469 for (unsigned int i = 0; i < type->matrix_columns; i++) {
470 offset += setup_uniform_values(loc + offset, column);
471 }
472
473 return offset;
474 }
475
476 switch (type->base_type) {
477 case GLSL_TYPE_FLOAT:
478 case GLSL_TYPE_UINT:
479 case GLSL_TYPE_INT:
480 case GLSL_TYPE_BOOL:
481 for (unsigned int i = 0; i < type->vector_elements; i++) {
482 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
483 }
484
485 /* Set up pad elements to get things aligned to a vec4 boundary. */
486 for (unsigned int i = type->vector_elements; i < 4; i++) {
487 static float zero = 0;
488
489 c->prog_data.param[this->uniforms * 4 + i] = &zero;
490 }
491
492 /* Track the size of this uniform vector, for future packing of
493 * uniforms.
494 */
495 this->uniform_vector_size[this->uniforms] = type->vector_elements;
496 this->uniforms++;
497
498 return 1;
499
500 case GLSL_TYPE_STRUCT:
501 for (unsigned int i = 0; i < type->length; i++) {
502 offset += setup_uniform_values(loc + offset,
503 type->fields.structure[i].type);
504 }
505 return offset;
506
507 case GLSL_TYPE_ARRAY:
508 for (unsigned int i = 0; i < type->length; i++) {
509 offset += setup_uniform_values(loc + offset, type->fields.array);
510 }
511 return offset;
512
513 case GLSL_TYPE_SAMPLER:
514 /* The sampler takes up a slot, but we don't use any values from it. */
515 return 1;
516
517 default:
518 assert(!"not reached");
519 return 0;
520 }
521 }
522
523 void
524 vec4_visitor::setup_uniform_clipplane_values()
525 {
526 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
527
528 /* Pre-Gen6, we compact clip planes. For example, if the user
529 * enables just clip planes 0, 1, and 3, we will enable clip planes
530 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
531 * plane 2. This simplifies the implementation of the Gen6 clip
532 * thread.
533 *
534 * In Gen6 and later, we don't compact clip planes, because this
535 * simplifies the implementation of gl_ClipDistance.
536 */
537 int compacted_clipplane_index = 0;
538 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
539 if (intel->gen < 6 &&
540 !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
541 continue;
542 }
543 this->uniform_vector_size[this->uniforms] = 4;
544 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
545 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
546 for (int j = 0; j < 4; ++j) {
547 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
548 }
549 ++compacted_clipplane_index;
550 ++this->uniforms;
551 }
552 }
553
554 /* Our support for builtin uniforms is even scarier than non-builtin.
555 * It sits on top of the PROG_STATE_VAR parameters that are
556 * automatically updated from GL context state.
557 */
558 void
559 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
560 {
561 const ir_state_slot *const slots = ir->state_slots;
562 assert(ir->state_slots != NULL);
563
564 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
565 /* This state reference has already been set up by ir_to_mesa,
566 * but we'll get the same index back here. We can reference
567 * ParameterValues directly, since unlike brw_fs.cpp, we never
568 * add new state references during compile.
569 */
570 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
571 (gl_state_index *)slots[i].tokens);
572 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
573
574 this->uniform_vector_size[this->uniforms] = 0;
575 /* Add each of the unique swizzled channels of the element.
576 * This will end up matching the size of the glsl_type of this field.
577 */
578 int last_swiz = -1;
579 for (unsigned int j = 0; j < 4; j++) {
580 int swiz = GET_SWZ(slots[i].swizzle, j);
581 last_swiz = swiz;
582
583 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
584 if (swiz <= last_swiz)
585 this->uniform_vector_size[this->uniforms]++;
586 }
587 this->uniforms++;
588 }
589 }
590
591 dst_reg *
592 vec4_visitor::variable_storage(ir_variable *var)
593 {
594 return (dst_reg *)hash_table_find(this->variable_ht, var);
595 }
596
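/**
 * Emit instructions that evaluate a boolean rvalue into the flag register,
 * and return in *predicate which predication mode (normal, or an
 * ALL4H/ANY4H reduction) the consumer of the flag should use.
 */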
597 void
598 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
599 {
600 ir_expression *expr = ir->as_expression();
601
602 *predicate = BRW_PREDICATE_NORMAL;
603
604 if (expr) {
605 src_reg op[2];
606 vec4_instruction *inst;
607
608 assert(expr->get_num_operands() <= 2);
609 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
610 expr->operands[i]->accept(this);
611 op[i] = this->result;
612
613 resolve_ud_negate(&op[i]);
614 }
615
616 switch (expr->operation) {
617 case ir_unop_logic_not:
618 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
619 inst->conditional_mod = BRW_CONDITIONAL_Z;
620 break;
621
622 case ir_binop_logic_xor:
623 inst = emit(XOR(dst_null_d(), op[0], op[1]));
624 inst->conditional_mod = BRW_CONDITIONAL_NZ;
625 break;
626
627 case ir_binop_logic_or:
628 inst = emit(OR(dst_null_d(), op[0], op[1]));
629 inst->conditional_mod = BRW_CONDITIONAL_NZ;
630 break;
631
632 case ir_binop_logic_and:
633 inst = emit(AND(dst_null_d(), op[0], op[1]));
634 inst->conditional_mod = BRW_CONDITIONAL_NZ;
635 break;
636
637 case ir_unop_f2b:
638 if (intel->gen >= 6) {
639 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
640 } else {
641 inst = emit(MOV(dst_null_f(), op[0]));
642 inst->conditional_mod = BRW_CONDITIONAL_NZ;
643 }
644 break;
645
646 case ir_unop_i2b:
647 if (intel->gen >= 6) {
648 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
649 } else {
650 inst = emit(MOV(dst_null_d(), op[0]));
651 inst->conditional_mod = BRW_CONDITIONAL_NZ;
652 }
653 break;
654
655 case ir_binop_all_equal:
656 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
657 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
658 break;
659
660 case ir_binop_any_nequal:
661 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
662 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
663 break;
664
665 case ir_unop_any:
666 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
667 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
668 break;
669
670 case ir_binop_greater:
671 case ir_binop_gequal:
672 case ir_binop_less:
673 case ir_binop_lequal:
674 case ir_binop_equal:
675 case ir_binop_nequal:
676 emit(CMP(dst_null_d(), op[0], op[1],
677 brw_conditional_for_comparison(expr->operation)));
678 break;
679
680 default:
681 assert(!"not reached");
682 break;
683 }
684 return;
685 }
686
687 ir->accept(this);
688
689 resolve_ud_negate(&this->result);
690
691 if (intel->gen >= 6) {
692 vec4_instruction *inst = emit(AND(dst_null_d(),
693 this->result, src_reg(1)));
694 inst->conditional_mod = BRW_CONDITIONAL_NZ;
695 } else {
696 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
697 inst->conditional_mod = BRW_CONDITIONAL_NZ;
698 }
699 }
700
701 /**
702 * Emit a gen6 IF statement with the comparison folded into the IF
703 * instruction.
704 */
705 void
706 vec4_visitor::emit_if_gen6(ir_if *ir)
707 {
708 ir_expression *expr = ir->condition->as_expression();
709
710 if (expr) {
711 src_reg op[2];
712 dst_reg temp;
713
714 assert(expr->get_num_operands() <= 2);
715 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
716 expr->operands[i]->accept(this);
717 op[i] = this->result;
718 }
719
720 switch (expr->operation) {
721 case ir_unop_logic_not:
722 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
723 return;
724
725 case ir_binop_logic_xor:
726 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
727 return;
728
729 case ir_binop_logic_or:
730 temp = dst_reg(this, glsl_type::bool_type);
731 emit(OR(temp, op[0], op[1]));
732 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
733 return;
734
735 case ir_binop_logic_and:
736 temp = dst_reg(this, glsl_type::bool_type);
737 emit(AND(temp, op[0], op[1]));
738 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
739 return;
740
741 case ir_unop_f2b:
742 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
743 return;
744
745 case ir_unop_i2b:
746 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
747 return;
748
749 case ir_binop_greater:
750 case ir_binop_gequal:
751 case ir_binop_less:
752 case ir_binop_lequal:
753 case ir_binop_equal:
754 case ir_binop_nequal:
755 emit(IF(op[0], op[1],
756 brw_conditional_for_comparison(expr->operation)));
757 return;
758
759 case ir_binop_all_equal:
760 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
761 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
762 return;
763
764 case ir_binop_any_nequal:
765 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
766 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
767 return;
768
769 case ir_unop_any:
770 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
771 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
772 return;
773
774 default:
775 assert(!"not reached");
776 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
777 return;
778 }
779 return;
780 }
781
782 ir->condition->accept(this);
783
784 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
785 }
786
787 void
788 vec4_visitor::visit(ir_variable *ir)
789 {
790 dst_reg *reg = NULL;
791
792 if (variable_storage(ir))
793 return;
794
795 switch (ir->mode) {
796 case ir_var_in:
797 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
798
799 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
800 * come in as floating point conversions of the integer values.
801 */
802 for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
803 if (!c->key.gl_fixed_input_size[i])
804 continue;
805
806 dst_reg dst = *reg;
807 dst.type = brw_type_for_base_type(ir->type);
808 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
809 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
810 }
811 break;
812
813 case ir_var_out:
814 reg = new(mem_ctx) dst_reg(this, ir->type);
815
816 for (int i = 0; i < type_size(ir->type); i++) {
817 output_reg[ir->location + i] = *reg;
818 output_reg[ir->location + i].reg_offset = i;
819 output_reg[ir->location + i].type =
820 brw_type_for_base_type(ir->type->get_scalar_type());
821 output_reg_annotation[ir->location + i] = ir->name;
822 }
823 break;
824
825 case ir_var_auto:
826 case ir_var_temporary:
827 reg = new(mem_ctx) dst_reg(this, ir->type);
828 break;
829
830 case ir_var_uniform:
831 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
832
833 /* Track how big the whole uniform variable is, in case we need to put a
834 * copy of its data into pull constants for array access.
835 */
836 this->uniform_size[this->uniforms] = type_size(ir->type);
837
838 if (!strncmp(ir->name, "gl_", 3)) {
839 setup_builtin_uniform_values(ir);
840 } else {
841 setup_uniform_values(ir->location, ir->type);
842 }
843 break;
844
845 case ir_var_system_value:
846 /* VertexID is stored by the VF as the last vertex element, but
847 * we don't represent it with a flag in inputs_read, so we call
848 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
849 */
850 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
851 prog_data->uses_vertexid = true;
852
853 switch (ir->location) {
854 case SYSTEM_VALUE_VERTEX_ID:
855 reg->writemask = WRITEMASK_X;
856 break;
857 case SYSTEM_VALUE_INSTANCE_ID:
858 reg->writemask = WRITEMASK_Y;
859 break;
860 default:
861 assert(!"not reached");
862 break;
863 }
864 break;
865
866 default:
867 assert(!"not reached");
868 }
869
870 reg->type = brw_type_for_base_type(ir->type);
871 hash_table_insert(this->variable_ht, reg, ir);
872 }
873
874 void
875 vec4_visitor::visit(ir_loop *ir)
876 {
877 dst_reg counter;
878
879 /* We don't want debugging output to print the whole body of the
880 * loop as the annotation.
881 */
882 this->base_ir = NULL;
883
884 if (ir->counter != NULL) {
885 this->base_ir = ir->counter;
886 ir->counter->accept(this);
887 counter = *(variable_storage(ir->counter));
888
889 if (ir->from != NULL) {
890 this->base_ir = ir->from;
891 ir->from->accept(this);
892
893 emit(MOV(counter, this->result));
894 }
895 }
896
897 emit(BRW_OPCODE_DO);
898
899 if (ir->to) {
900 this->base_ir = ir->to;
901 ir->to->accept(this);
902
903 emit(CMP(dst_null_d(), src_reg(counter), this->result,
904 brw_conditional_for_comparison(ir->cmp)));
905
906 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
907 inst->predicate = BRW_PREDICATE_NORMAL;
908 }
909
910 visit_instructions(&ir->body_instructions);
911
912
913 if (ir->increment) {
914 this->base_ir = ir->increment;
915 ir->increment->accept(this);
916 emit(ADD(counter, src_reg(counter), this->result));
917 }
918
919 emit(BRW_OPCODE_WHILE);
920 }
921
922 void
923 vec4_visitor::visit(ir_loop_jump *ir)
924 {
925 switch (ir->mode) {
926 case ir_loop_jump::jump_break:
927 emit(BRW_OPCODE_BREAK);
928 break;
929 case ir_loop_jump::jump_continue:
930 emit(BRW_OPCODE_CONTINUE);
931 break;
932 }
933 }
934
935
936 void
937 vec4_visitor::visit(ir_function_signature *ir)
938 {
939 assert(0);
940 (void)ir;
941 }
942
943 void
944 vec4_visitor::visit(ir_function *ir)
945 {
946 /* Ignore function bodies other than main() -- we shouldn't see calls to
947 * them since they should all be inlined.
948 */
949 if (strcmp(ir->name, "main") == 0) {
950 const ir_function_signature *sig;
951 exec_list empty;
952
953 sig = ir->matching_signature(&empty);
954
955 assert(sig);
956
957 visit_instructions(&sig->body);
958 }
959 }
960
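/**
 * If the expression is just a saturate of another rvalue, emit that value
 * through a saturating MOV and return true so the caller can skip the
 * normal expression handling.
 */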
961 bool
962 vec4_visitor::try_emit_sat(ir_expression *ir)
963 {
964 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
965 if (!sat_src)
966 return false;
967
968 sat_src->accept(this);
969 src_reg src = this->result;
970
971 this->result = src_reg(this, ir->type);
972 vec4_instruction *inst;
973 inst = emit(MOV(dst_reg(this->result), src));
974 inst->saturate = true;
975
976 return true;
977 }
978
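/** Emit a comparison producing a 0/1 boolean value in dst. */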
979 void
980 vec4_visitor::emit_bool_comparison(unsigned int op,
981 dst_reg dst, src_reg src0, src_reg src1)
982 {
983 /* original gen4 does destination conversion before comparison. */
984 if (intel->gen < 5)
985 dst.type = src0.type;
986
987 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
988
989 dst.type = BRW_REGISTER_TYPE_D;
990 emit(AND(dst, src_reg(dst), src_reg(0x1)));
991 }
992
993 void
994 vec4_visitor::visit(ir_expression *ir)
995 {
996 unsigned int operand;
997 src_reg op[Elements(ir->operands)];
998 src_reg result_src;
999 dst_reg result_dst;
1000 vec4_instruction *inst;
1001
1002 if (try_emit_sat(ir))
1003 return;
1004
1005 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1006 this->result.file = BAD_FILE;
1007 ir->operands[operand]->accept(this);
1008 if (this->result.file == BAD_FILE) {
1009 printf("Failed to get tree for expression operand:\n");
1010 ir->operands[operand]->print();
1011 exit(1);
1012 }
1013 op[operand] = this->result;
1014
1015 /* Matrix expression operands should have been broken down to vector
1016 * operations already.
1017 */
1018 assert(!ir->operands[operand]->type->is_matrix());
1019 }
1020
1021 int vector_elements = ir->operands[0]->type->vector_elements;
1022 if (ir->operands[1]) {
1023 vector_elements = MAX2(vector_elements,
1024 ir->operands[1]->type->vector_elements);
1025 }
1026
1027 this->result.file = BAD_FILE;
1028
1029 /* Storage for our result. Ideally for an assignment we'd be using
1030 * the actual storage for the result here, instead.
1031 */
1032 result_src = src_reg(this, ir->type);
1033 /* convenience for the emit functions below. */
1034 result_dst = dst_reg(result_src);
1035 /* If nothing special happens, this is the result. */
1036 this->result = result_src;
1037 /* Limit writes to the channels that will be used by result_src later.
1038 * This does limit this temp's use as a temporary for multi-instruction
1039 * sequences.
1040 */
1041 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1042
1043 switch (ir->operation) {
1044 case ir_unop_logic_not:
1045 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1046 * ones complement of the whole register, not just bit 0.
1047 */
1048 emit(XOR(result_dst, op[0], src_reg(1)));
1049 break;
1050 case ir_unop_neg:
1051 op[0].negate = !op[0].negate;
1052 this->result = op[0];
1053 break;
1054 case ir_unop_abs:
1055 op[0].abs = true;
1056 op[0].negate = false;
1057 this->result = op[0];
1058 break;
1059
1060 case ir_unop_sign:
1061 emit(MOV(result_dst, src_reg(0.0f)));
1062
1063 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1064 inst = emit(MOV(result_dst, src_reg(1.0f)));
1065 inst->predicate = BRW_PREDICATE_NORMAL;
1066
1067 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1068 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1069 inst->predicate = BRW_PREDICATE_NORMAL;
1070
1071 break;
1072
1073 case ir_unop_rcp:
1074 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1075 break;
1076
1077 case ir_unop_exp2:
1078 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1079 break;
1080 case ir_unop_log2:
1081 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1082 break;
1083 case ir_unop_exp:
1084 case ir_unop_log:
1085 assert(!"not reached: should be handled by ir_explog_to_explog2");
1086 break;
1087 case ir_unop_sin:
1088 case ir_unop_sin_reduced:
1089 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1090 break;
1091 case ir_unop_cos:
1092 case ir_unop_cos_reduced:
1093 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1094 break;
1095
1096 case ir_unop_dFdx:
1097 case ir_unop_dFdy:
1098 assert(!"derivatives not valid in vertex shader");
1099 break;
1100
1101 case ir_unop_noise:
1102 assert(!"not reached: should be handled by lower_noise");
1103 break;
1104
1105 case ir_binop_add:
1106 emit(ADD(result_dst, op[0], op[1]));
1107 break;
1108 case ir_binop_sub:
1109 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1110 break;
1111
1112 case ir_binop_mul:
1113 if (ir->type->is_integer()) {
1114 /* For integer multiplication, the MUL uses the low 16 bits
1115 * of one of the operands (src0 on gen6, src1 on gen7). The
1116 * MACH accumulates in the contribution of the upper 16 bits
1117 * of that operand.
1118 *
1119 * FINISHME: Emit just the MUL if we know an operand is small
1120 * enough.
1121 */
1122 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1123
1124 emit(MUL(acc, op[0], op[1]));
1125 emit(MACH(dst_null_d(), op[0], op[1]));
1126 emit(MOV(result_dst, src_reg(acc)));
1127 } else {
1128 emit(MUL(result_dst, op[0], op[1]));
1129 }
1130 break;
1131 case ir_binop_div:
1132 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1133 assert(ir->type->is_integer());
1134 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1135 break;
1136 case ir_binop_mod:
1137 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1138 assert(ir->type->is_integer());
1139 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1140 break;
1141
1142 case ir_binop_less:
1143 case ir_binop_greater:
1144 case ir_binop_lequal:
1145 case ir_binop_gequal:
1146 case ir_binop_equal:
1147 case ir_binop_nequal: {
1148 emit(CMP(result_dst, op[0], op[1],
1149 brw_conditional_for_comparison(ir->operation)));
1150 emit(AND(result_dst, result_src, src_reg(0x1)));
1151 break;
1152 }
1153
1154 case ir_binop_all_equal:
1155 /* "==" operator producing a scalar boolean. */
1156 if (ir->operands[0]->type->is_vector() ||
1157 ir->operands[1]->type->is_vector()) {
1158 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1159 emit(MOV(result_dst, src_reg(0)));
1160 inst = emit(MOV(result_dst, src_reg(1)));
1161 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1162 } else {
1163 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1164 emit(AND(result_dst, result_src, src_reg(0x1)));
1165 }
1166 break;
1167 case ir_binop_any_nequal:
1168 /* "!=" operator producing a scalar boolean. */
1169 if (ir->operands[0]->type->is_vector() ||
1170 ir->operands[1]->type->is_vector()) {
1171 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1172
1173 emit(MOV(result_dst, src_reg(0)));
1174 inst = emit(MOV(result_dst, src_reg(1)));
1175 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1176 } else {
1177 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1178 emit(AND(result_dst, result_src, src_reg(0x1)));
1179 }
1180 break;
1181
1182 case ir_unop_any:
1183 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1184 emit(MOV(result_dst, src_reg(0)));
1185
1186 inst = emit(MOV(result_dst, src_reg(1)));
1187 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1188 break;
1189
1190 case ir_binop_logic_xor:
1191 emit(XOR(result_dst, op[0], op[1]));
1192 break;
1193
1194 case ir_binop_logic_or:
1195 emit(OR(result_dst, op[0], op[1]));
1196 break;
1197
1198 case ir_binop_logic_and:
1199 emit(AND(result_dst, op[0], op[1]));
1200 break;
1201
1202 case ir_binop_dot:
1203 assert(ir->operands[0]->type->is_vector());
1204 assert(ir->operands[0]->type == ir->operands[1]->type);
1205 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1206 break;
1207
1208 case ir_unop_sqrt:
1209 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1210 break;
1211 case ir_unop_rsq:
1212 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1213 break;
1214
1215 case ir_unop_bitcast_i2f:
1216 case ir_unop_bitcast_u2f:
1217 this->result = op[0];
1218 this->result.type = BRW_REGISTER_TYPE_F;
1219 break;
1220
1221 case ir_unop_bitcast_f2i:
1222 this->result = op[0];
1223 this->result.type = BRW_REGISTER_TYPE_D;
1224 break;
1225
1226 case ir_unop_bitcast_f2u:
1227 this->result = op[0];
1228 this->result.type = BRW_REGISTER_TYPE_UD;
1229 break;
1230
1231 case ir_unop_i2f:
1232 case ir_unop_i2u:
1233 case ir_unop_u2i:
1234 case ir_unop_u2f:
1235 case ir_unop_b2f:
1236 case ir_unop_b2i:
1237 case ir_unop_f2i:
1238 case ir_unop_f2u:
1239 emit(MOV(result_dst, op[0]));
1240 break;
1241 case ir_unop_f2b:
1242 case ir_unop_i2b: {
1243 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1244 emit(AND(result_dst, result_src, src_reg(1)));
1245 break;
1246 }
1247
1248 case ir_unop_trunc:
1249 emit(RNDZ(result_dst, op[0]));
1250 break;
1251 case ir_unop_ceil:
1252 op[0].negate = !op[0].negate;
1253 inst = emit(RNDD(result_dst, op[0]));
1254 this->result.negate = true;
1255 break;
1256 case ir_unop_floor:
1257 inst = emit(RNDD(result_dst, op[0]));
1258 break;
1259 case ir_unop_fract:
1260 inst = emit(FRC(result_dst, op[0]));
1261 break;
1262 case ir_unop_round_even:
1263 emit(RNDE(result_dst, op[0]));
1264 break;
1265
1266 case ir_binop_min:
1267 if (intel->gen >= 6) {
1268 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1269 inst->conditional_mod = BRW_CONDITIONAL_L;
1270 } else {
1271 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1272
1273 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1274 inst->predicate = BRW_PREDICATE_NORMAL;
1275 }
1276 break;
1277 case ir_binop_max:
1278 if (intel->gen >= 6) {
1279 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1280 inst->conditional_mod = BRW_CONDITIONAL_G;
1281 } else {
1282 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1283
1284 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1285 inst->predicate = BRW_PREDICATE_NORMAL;
1286 }
1287 break;
1288
1289 case ir_binop_pow:
1290 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1291 break;
1292
1293 case ir_unop_bit_not:
1294 inst = emit(NOT(result_dst, op[0]));
1295 break;
1296 case ir_binop_bit_and:
1297 inst = emit(AND(result_dst, op[0], op[1]));
1298 break;
1299 case ir_binop_bit_xor:
1300 inst = emit(XOR(result_dst, op[0], op[1]));
1301 break;
1302 case ir_binop_bit_or:
1303 inst = emit(OR(result_dst, op[0], op[1]));
1304 break;
1305
1306 case ir_binop_lshift:
1307 inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1308 break;
1309
1310 case ir_binop_rshift:
1311 if (ir->type->base_type == GLSL_TYPE_INT)
1312 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1313 else
1314 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1315 break;
1316
1317 case ir_binop_ubo_load:
1318 assert(!"not yet supported");
1319 break;
1320
1321 case ir_quadop_vector:
1322 assert(!"not reached: should be handled by lower_quadop_vector");
1323 break;
1324 }
1325 }
1326
1327
1328 void
1329 vec4_visitor::visit(ir_swizzle *ir)
1330 {
1331 src_reg src;
1332 int i = 0;
1333 int swizzle[4];
1334
1335 /* Note that this is only swizzles in expressions, not those on the left
1336 * hand side of an assignment, which do write masking. See ir_assignment
1337 * for that.
1338 */
1339
1340 ir->val->accept(this);
1341 src = this->result;
1342 assert(src.file != BAD_FILE);
1343
1344 for (i = 0; i < ir->type->vector_elements; i++) {
1345 switch (i) {
1346 case 0:
1347 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1348 break;
1349 case 1:
1350 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1351 break;
1352 case 2:
1353 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1354 break;
1355 case 3:
1356 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1357 break;
1358 }
1359 }
1360 for (; i < 4; i++) {
1361 /* Replicate the last channel out. */
1362 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1363 }
1364
1365 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1366
1367 this->result = src;
1368 }
1369
1370 void
1371 vec4_visitor::visit(ir_dereference_variable *ir)
1372 {
1373 const struct glsl_type *type = ir->type;
1374 dst_reg *reg = variable_storage(ir->var);
1375
1376 if (!reg) {
1377 fail("Failed to find variable storage for %s\n", ir->var->name);
1378 this->result = src_reg(brw_null_reg());
1379 return;
1380 }
1381
1382 this->result = src_reg(*reg);
1383
1384 /* System values get their swizzle from the dst_reg writemask */
1385 if (ir->var->mode == ir_var_system_value)
1386 return;
1387
1388 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1389 this->result.swizzle = swizzle_for_size(type->vector_elements);
1390 }
1391
1392 void
1393 vec4_visitor::visit(ir_dereference_array *ir)
1394 {
1395 ir_constant *constant_index;
1396 src_reg src;
1397 int element_size = type_size(ir->type);
1398
1399 constant_index = ir->array_index->constant_expression_value();
1400
1401 ir->array->accept(this);
1402 src = this->result;
1403
1404 if (constant_index) {
1405 src.reg_offset += constant_index->value.i[0] * element_size;
1406 } else {
1407 /* Variable index array dereference. It eats the "vec4" of the
1408 * base of the array and an index that offsets the Mesa register
1409 * index.
1410 */
1411 ir->array_index->accept(this);
1412
1413 src_reg index_reg;
1414
1415 if (element_size == 1) {
1416 index_reg = this->result;
1417 } else {
1418 index_reg = src_reg(this, glsl_type::int_type);
1419
1420 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1421 }
1422
1423 if (src.reladdr) {
1424 src_reg temp = src_reg(this, glsl_type::int_type);
1425
1426 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1427
1428 index_reg = temp;
1429 }
1430
1431 src.reladdr = ralloc(mem_ctx, src_reg);
1432 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1433 }
1434
1435 /* If the type is smaller than a vec4, replicate the last channel out. */
1436 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1437 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1438 else
1439 src.swizzle = BRW_SWIZZLE_NOOP;
1440 src.type = brw_type_for_base_type(ir->type);
1441
1442 this->result = src;
1443 }
1444
1445 void
1446 vec4_visitor::visit(ir_dereference_record *ir)
1447 {
1448 unsigned int i;
1449 const glsl_type *struct_type = ir->record->type;
1450 int offset = 0;
1451
1452 ir->record->accept(this);
1453
1454 for (i = 0; i < struct_type->length; i++) {
1455 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1456 break;
1457 offset += type_size(struct_type->fields.structure[i].type);
1458 }
1459
1460 /* If the type is smaller than a vec4, replicate the last channel out. */
1461 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1462 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1463 else
1464 this->result.swizzle = BRW_SWIZZLE_NOOP;
1465 this->result.type = brw_type_for_base_type(ir->type);
1466
1467 this->result.reg_offset += offset;
1468 }
1469
1470 /**
1471 * We want to be careful in assignment setup to hit the actual storage
1472 * instead of potentially using a temporary like we might with the
1473 * ir_dereference handler.
1474 */
1475 static dst_reg
1476 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1477 {
1478 /* The LHS must be a dereference. If the LHS is a variable indexed array
1479 * access of a vector, it must be separated into a series conditional moves
1480 * before reaching this point (see ir_vec_index_to_cond_assign).
1481 */
1482 assert(ir->as_dereference());
1483 ir_dereference_array *deref_array = ir->as_dereference_array();
1484 if (deref_array) {
1485 assert(!deref_array->array->type->is_vector());
1486 }
1487
1488 /* Use the rvalue deref handler for the most part. We'll ignore
1489 * swizzles in it and write swizzles using writemask, though.
1490 */
1491 ir->accept(v);
1492 return dst_reg(v->result);
1493 }
1494
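/**
 * Copy a structure, array, or matrix value by recursing down to its
 * scalar/vector members and emitting one (possibly predicated) MOV per
 * vec4, advancing the reg_offset of both dst and src as it goes.
 */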
1495 void
1496 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1497 const struct glsl_type *type, uint32_t predicate)
1498 {
1499 if (type->base_type == GLSL_TYPE_STRUCT) {
1500 for (unsigned int i = 0; i < type->length; i++) {
1501 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1502 }
1503 return;
1504 }
1505
1506 if (type->is_array()) {
1507 for (unsigned int i = 0; i < type->length; i++) {
1508 emit_block_move(dst, src, type->fields.array, predicate);
1509 }
1510 return;
1511 }
1512
1513 if (type->is_matrix()) {
1514 const struct glsl_type *vec_type;
1515
1516 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1517 type->vector_elements, 1);
1518
1519 for (int i = 0; i < type->matrix_columns; i++) {
1520 emit_block_move(dst, src, vec_type, predicate);
1521 }
1522 return;
1523 }
1524
1525 assert(type->is_scalar() || type->is_vector());
1526
1527 dst->type = brw_type_for_base_type(type);
1528 src->type = dst->type;
1529
1530 dst->writemask = (1 << type->vector_elements) - 1;
1531
1532 src->swizzle = swizzle_for_size(type->vector_elements);
1533
1534 vec4_instruction *inst = emit(MOV(*dst, *src));
1535 inst->predicate = predicate;
1536
1537 dst->reg_offset++;
1538 src->reg_offset++;
1539 }
1540
1541
1542 /* If the RHS processing resulted in an instruction generating a
1543 * temporary value, and it would be easy to rewrite the instruction to
1544 * generate its result right into the LHS instead, do so. This ends
1545 * up reliably removing instructions where it can be tricky to do so
1546 * later without real UD chain information.
1547 */
1548 bool
1549 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1550 dst_reg dst,
1551 src_reg src,
1552 vec4_instruction *pre_rhs_inst,
1553 vec4_instruction *last_rhs_inst)
1554 {
1555 /* This could be supported, but it would take more smarts. */
1556 if (ir->condition)
1557 return false;
1558
1559 if (pre_rhs_inst == last_rhs_inst)
1560 return false; /* No instructions generated to work with. */
1561
1562 /* Make sure the last instruction generated our source reg. */
1563 if (src.file != GRF ||
1564 src.file != last_rhs_inst->dst.file ||
1565 src.reg != last_rhs_inst->dst.reg ||
1566 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1567 src.reladdr ||
1568 src.abs ||
1569 src.negate ||
1570 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1571 return false;
1572
1573 /* Check that that last instruction fully initialized the channels
1574 * we want to use, in the order we want to use them. We could
1575 * potentially reswizzle the operands of many instructions so that
1576 * we could handle out of order channels, but don't yet.
1577 */
1578
1579 for (unsigned i = 0; i < 4; i++) {
1580 if (dst.writemask & (1 << i)) {
1581 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1582 return false;
1583
1584 if (BRW_GET_SWZ(src.swizzle, i) != i)
1585 return false;
1586 }
1587 }
1588
1589 /* Success! Rewrite the instruction. */
1590 last_rhs_inst->dst.file = dst.file;
1591 last_rhs_inst->dst.reg = dst.reg;
1592 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1593 last_rhs_inst->dst.reladdr = dst.reladdr;
1594 last_rhs_inst->dst.writemask &= dst.writemask;
1595
1596 return true;
1597 }
1598
1599 void
1600 vec4_visitor::visit(ir_assignment *ir)
1601 {
1602 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1603 uint32_t predicate = BRW_PREDICATE_NONE;
1604
1605 if (!ir->lhs->type->is_scalar() &&
1606 !ir->lhs->type->is_vector()) {
1607 ir->rhs->accept(this);
1608 src_reg src = this->result;
1609
1610 if (ir->condition) {
1611 emit_bool_to_cond_code(ir->condition, &predicate);
1612 }
1613
1614 /* emit_block_move doesn't account for swizzles in the source register.
1615 * This should be ok, since the source register is a structure or an
1616 * array, and those can't be swizzled. But double-check to be sure.
1617 */
1618 assert(src.swizzle ==
1619 (ir->rhs->type->is_matrix()
1620 ? swizzle_for_size(ir->rhs->type->vector_elements)
1621 : BRW_SWIZZLE_NOOP));
1622
1623 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1624 return;
1625 }
1626
1627 /* Now we're down to just a scalar/vector with writemasks. */
1628 int i;
1629
1630 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1631 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1632
1633 ir->rhs->accept(this);
1634
1635 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1636
1637 src_reg src = this->result;
1638
1639 int swizzles[4];
1640 int first_enabled_chan = 0;
1641 int src_chan = 0;
1642
1643 assert(ir->lhs->type->is_vector() ||
1644 ir->lhs->type->is_scalar());
1645 dst.writemask = ir->write_mask;
1646
1647 for (int i = 0; i < 4; i++) {
1648 if (dst.writemask & (1 << i)) {
1649 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1650 break;
1651 }
1652 }
1653
1654 /* Swizzle a small RHS vector into the channels being written.
1655 *
1656 * glsl ir treats write_mask as dictating how many channels are
1657 * present on the RHS while in our instructions we need to make
1658 * those channels appear in the slots of the vec4 they're written to.
1659 */
1660 for (int i = 0; i < 4; i++) {
1661 if (dst.writemask & (1 << i))
1662 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1663 else
1664 swizzles[i] = first_enabled_chan;
1665 }
1666 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1667 swizzles[2], swizzles[3]);
1668
1669 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1670 return;
1671 }
1672
1673 if (ir->condition) {
1674 emit_bool_to_cond_code(ir->condition, &predicate);
1675 }
1676
1677 for (i = 0; i < type_size(ir->lhs->type); i++) {
1678 vec4_instruction *inst = emit(MOV(dst, src));
1679 inst->predicate = predicate;
1680
1681 dst.reg_offset++;
1682 src.reg_offset++;
1683 }
1684 }
1685
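/**
 * Write an ir_constant's values into dst one vec4 at a time, using the
 * writemask to coalesce identical components into a single MOV.
 */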
1686 void
1687 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1688 {
1689 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1690 foreach_list(node, &ir->components) {
1691 ir_constant *field_value = (ir_constant *)node;
1692
1693 emit_constant_values(dst, field_value);
1694 }
1695 return;
1696 }
1697
1698 if (ir->type->is_array()) {
1699 for (unsigned int i = 0; i < ir->type->length; i++) {
1700 emit_constant_values(dst, ir->array_elements[i]);
1701 }
1702 return;
1703 }
1704
1705 if (ir->type->is_matrix()) {
1706 for (int i = 0; i < ir->type->matrix_columns; i++) {
1707 float *vec = &ir->value.f[i * ir->type->vector_elements];
1708
1709 for (int j = 0; j < ir->type->vector_elements; j++) {
1710 dst->writemask = 1 << j;
1711 dst->type = BRW_REGISTER_TYPE_F;
1712
1713 emit(MOV(*dst, src_reg(vec[j])));
1714 }
1715 dst->reg_offset++;
1716 }
1717 return;
1718 }
1719
1720 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1721
1722 for (int i = 0; i < ir->type->vector_elements; i++) {
1723 if (!(remaining_writemask & (1 << i)))
1724 continue;
1725
1726 dst->writemask = 1 << i;
1727 dst->type = brw_type_for_base_type(ir->type);
1728
1729 /* Find other components that match the one we're about to
1730 * write. Emits fewer instructions for things like vec4(0.5,
1731 * 1.5, 1.5, 1.5).
1732 */
1733 for (int j = i + 1; j < ir->type->vector_elements; j++) {
1734 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1735 if (ir->value.b[i] == ir->value.b[j])
1736 dst->writemask |= (1 << j);
1737 } else {
1738 /* u, i, and f storage all line up, so no need for a
1739 * switch case for comparing each type.
1740 */
1741 if (ir->value.u[i] == ir->value.u[j])
1742 dst->writemask |= (1 << j);
1743 }
1744 }
1745
1746 switch (ir->type->base_type) {
1747 case GLSL_TYPE_FLOAT:
1748 emit(MOV(*dst, src_reg(ir->value.f[i])));
1749 break;
1750 case GLSL_TYPE_INT:
1751 emit(MOV(*dst, src_reg(ir->value.i[i])));
1752 break;
1753 case GLSL_TYPE_UINT:
1754 emit(MOV(*dst, src_reg(ir->value.u[i])));
1755 break;
1756 case GLSL_TYPE_BOOL:
1757 emit(MOV(*dst, src_reg(ir->value.b[i])));
1758 break;
1759 default:
1760 assert(!"Non-float/uint/int/bool constant");
1761 break;
1762 }
1763
1764 remaining_writemask &= ~dst->writemask;
1765 }
1766 dst->reg_offset++;
1767 }
1768
1769 void
1770 vec4_visitor::visit(ir_constant *ir)
1771 {
1772 dst_reg dst = dst_reg(this, ir->type);
1773 this->result = src_reg(dst);
1774
1775 emit_constant_values(&dst, ir);
1776 }
1777
1778 void
1779 vec4_visitor::visit(ir_call *ir)
1780 {
1781 assert(!"not reached");
1782 }
1783
1784 void
1785 vec4_visitor::visit(ir_texture *ir)
1786 {
1787 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1788 sampler = vp->Base.SamplerUnits[sampler];
1789
1790 /* Should be lowered by do_lower_texture_projection */
1791 assert(!ir->projector);
1792
1793 /* Generate code to compute all the subexpression trees. This has to be
1794 * done before loading any values into MRFs for the sampler message since
1795 * generating these values may involve SEND messages that need the MRFs.
1796 */
1797 src_reg coordinate;
1798 if (ir->coordinate) {
1799 ir->coordinate->accept(this);
1800 coordinate = this->result;
1801 }
1802
1803 src_reg shadow_comparitor;
1804 if (ir->shadow_comparitor) {
1805 ir->shadow_comparitor->accept(this);
1806 shadow_comparitor = this->result;
1807 }
1808
1809 src_reg lod, dPdx, dPdy;
1810 switch (ir->op) {
1811 case ir_txf:
1812 case ir_txl:
1813 case ir_txs:
1814 ir->lod_info.lod->accept(this);
1815 lod = this->result;
1816 break;
1817 case ir_txd:
1818 ir->lod_info.grad.dPdx->accept(this);
1819 dPdx = this->result;
1820
1821 ir->lod_info.grad.dPdy->accept(this);
1822 dPdy = this->result;
1823 break;
1824 case ir_tex:
1825 case ir_txb:
1826 break;
1827 }
1828
1829 vec4_instruction *inst = NULL;
1830 switch (ir->op) {
1831 case ir_tex:
1832 case ir_txl:
1833 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1834 break;
1835 case ir_txd:
1836 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1837 break;
1838 case ir_txf:
1839 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1840 break;
1841 case ir_txs:
1842 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1843 break;
1844 case ir_txb:
1845 assert(!"TXB is not valid for vertex shaders.");
1846 }
1847
1848 /* Texel offsets go in the message header; Gen4 also requires headers. */
1849 inst->header_present = ir->offset || intel->gen < 5;
1850 inst->base_mrf = 2;
1851 inst->mlen = inst->header_present + 1; /* always at least one */
1852 inst->sampler = sampler;
1853 inst->dst = dst_reg(this, ir->type);
1854 inst->shadow_compare = ir->shadow_comparitor != NULL;
1855
1856 if (ir->offset != NULL && ir->op != ir_txf)
1857 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1858
1859 /* MRF for the first parameter */
1860 int param_base = inst->base_mrf + inst->header_present;
1861
1862 if (ir->op == ir_txs) {
1863 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1864 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
1865 lod));
1866 } else {
1867 int i, coord_mask = 0, zero_mask = 0;
1868 /* Load the coordinate */
1869 /* FINISHME: gl_clamp_mask and saturate */
1870 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1871 coord_mask |= (1 << i);
1872 for (; i < 4; i++)
1873 zero_mask |= (1 << i);
1874
1875 if (ir->offset && ir->op == ir_txf) {
1876 /* It appears that the ld instruction used for txf does its
1877 * address bounds check before adding in the offset. To work
1878 * around this, just add the integer offset to the integer
1879 * texel coordinate, and don't put the offset in the header.
1880 */
1881 ir_constant *offset = ir->offset->as_constant();
1882 assert(offset);
1883
1884 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
1885 src_reg src = coordinate;
1886 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
1887 BRW_GET_SWZ(src.swizzle, j),
1888 BRW_GET_SWZ(src.swizzle, j),
1889 BRW_GET_SWZ(src.swizzle, j));
1890 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
1891 src, offset->value.i[j]));
1892 }
1893 } else {
1894 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1895 coordinate));
1896 }
1897 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1898 src_reg(0)));
1899 /* Load the shadow comparitor */
1900 if (ir->shadow_comparitor) {
1901 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1902 WRITEMASK_X),
1903 shadow_comparitor));
1904 inst->mlen++;
1905 }
1906
1907 /* Load the LOD info */
1908 if (ir->op == ir_txl) {
1909 int mrf, writemask;
1910 if (intel->gen >= 5) {
1911 mrf = param_base + 1;
1912 if (ir->shadow_comparitor) {
1913 writemask = WRITEMASK_Y;
1914 /* mlen already incremented */
1915 } else {
1916 writemask = WRITEMASK_X;
1917 inst->mlen++;
1918 }
1919 } else /* intel->gen == 4 */ {
1920 mrf = param_base;
1921 writemask = WRITEMASK_Z;
1922 }
1923 emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask), lod));
1924 } else if (ir->op == ir_txf) {
1925 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
1926 lod));
1927 } else if (ir->op == ir_txd) {
1928 const glsl_type *type = ir->lod_info.grad.dPdx->type;
1929
1930 if (intel->gen >= 5) {
1931 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1932 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1933 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1934 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1935 inst->mlen++;
1936
1937 if (ir->type->vector_elements == 3) {
1938 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
1939 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
1940 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
1941 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
1942 inst->mlen++;
1943 }
1944 } else /* intel->gen == 4 */ {
1945 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
1946 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
1947 inst->mlen += 2;
1948 }
1949 }
1950 }
1951
1952 emit(inst);
1953
1954 swizzle_result(ir, src_reg(inst->dst), sampler);
1955 }
1956
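/**
 * Apply the texture swizzle from the program key to a sampler result,
 * emitting MOVs for the copied channels and immediate 0.0/1.0 for
 * SWIZZLE_ZERO/SWIZZLE_ONE channels.
 */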
1957 void
1958 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
1959 {
1960 this->result = orig_val;
1961
1962 int s = c->key.tex.swizzles[sampler];
1963
1964 if (ir->op == ir_txs || ir->type == glsl_type::float_type
1965 || s == SWIZZLE_NOOP)
1966 return;
1967
1968 int zero_mask = 0, one_mask = 0, copy_mask = 0;
1969 int swizzle[4];
1970
1971 for (int i = 0; i < 4; i++) {
1972 switch (GET_SWZ(s, i)) {
1973 case SWIZZLE_ZERO:
1974 zero_mask |= (1 << i);
1975 break;
1976 case SWIZZLE_ONE:
1977 one_mask |= (1 << i);
1978 break;
1979 default:
1980 copy_mask |= (1 << i);
1981 swizzle[i] = GET_SWZ(s, i);
1982 break;
1983 }
1984 }
1985
1986 this->result = src_reg(this, ir->type);
1987 dst_reg swizzled_result(this->result);
1988
1989 if (copy_mask) {
1990 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1991 swizzled_result.writemask = copy_mask;
1992 emit(MOV(swizzled_result, orig_val));
1993 }
1994
1995 if (zero_mask) {
1996 swizzled_result.writemask = zero_mask;
1997 emit(MOV(swizzled_result, src_reg(0.0f)));
1998 }
1999
2000 if (one_mask) {
2001 swizzled_result.writemask = one_mask;
2002 emit(MOV(swizzled_result, src_reg(1.0f)));
2003 }
2004 }
2005
2006 void
2007 vec4_visitor::visit(ir_return *ir)
2008 {
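/* By this point earlier GLSL IR passes should have inlined all functions
 * and lowered any returns, so no ir_return should survive to the backend.
 */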
2009 assert(!"not reached");
2010 }
2011
2012 void
2013 vec4_visitor::visit(ir_discard *ir)
2014 {
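/* discard is only valid in fragment shaders, so it can never show up in
 * the vertex shader IR visited here.
 */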
2015 assert(!"not reached");
2016 }
2017
2018 void
2019 vec4_visitor::visit(ir_if *ir)
2020 {
2021 /* Don't point the annotation at the if statement itself, because then
2022 * the annotation would be printed for the if plus the whole then and else blocks.
2023 */
2024 this->base_ir = ir->condition;
2025
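/* gen6's IF instruction can evaluate the condition itself (see
 * emit_if_gen6); on other generations we first reduce the condition to a
 * predicate and emit IF using that predicate.
 */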
2026 if (intel->gen == 6) {
2027 emit_if_gen6(ir);
2028 } else {
2029 uint32_t predicate;
2030 emit_bool_to_cond_code(ir->condition, &predicate);
2031 emit(IF(predicate));
2032 }
2033
2034 visit_instructions(&ir->then_instructions);
2035
2036 if (!ir->else_instructions.is_empty()) {
2037 this->base_ir = ir->condition;
2038 emit(BRW_OPCODE_ELSE);
2039
2040 visit_instructions(&ir->else_instructions);
2041 }
2042
2043 this->base_ir = ir->condition;
2044 emit(BRW_OPCODE_ENDIF);
2045 }
2046
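/**
 * Compute the NDC output (x/w, y/w, z/w, 1/w) from gl_Position.
 *
 * This is only done pre-gen6 (see emit_urb_writes), where a
 * BRW_VERT_RESULT_NDC slot is written into the VUE.
 */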
2047 void
2048 vec4_visitor::emit_ndc_computation()
2049 {
2050 /* Get the position */
2051 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2052
2053 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2054 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2055 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2056
2057 current_annotation = "NDC";
2058 dst_reg ndc_w = ndc;
2059 ndc_w.writemask = WRITEMASK_W;
2060 src_reg pos_w = pos;
2061 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2062 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2063
2064 dst_reg ndc_xyz = ndc;
2065 ndc_xyz.writemask = WRITEMASK_XYZ;
2066
2067 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2068 }
2069
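/**
 * Fill the PSIZ/flags VUE header dword.
 *
 * Pre-gen6 this packs gl_PointSize into the header's point-width field and
 * sets one clip-flag bit for each user clip plane whose DP4 against
 * gl_Position is negative; on gen6+ the slot is zeroed and only
 * gl_PointSize is written (when the shader writes it).
 */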
2070 void
2071 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2072 {
2073 if (intel->gen < 6 &&
2074 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2075 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2076 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2077 dst_reg header1_w = header1;
2078 header1_w.writemask = WRITEMASK_W;
2079 GLuint i;
2080
2081 emit(MOV(header1, 0u));
2082
2083 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2084 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2085
2086 current_annotation = "Point size";
2087 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2088 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2089 }
2090
2091 current_annotation = "Clipping flags";
2092 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2093 vec4_instruction *inst;
2094
2095 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2096 src_reg(this->userplane[i])));
2097 inst->conditional_mod = BRW_CONDITIONAL_L;
2098
2099 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2100 inst->predicate = BRW_PREDICATE_NORMAL;
2101 }
2102
2103 /* i965 clipping workaround:
2104 * 1) Test for -ve rhw
2105 * 2) If set,
2106 * set ndc = (0,0,0,0)
2107 * set ucp[6] = 1
2108 *
2109 * Later, clipping will detect ucp[6] and ensure the primitive is
2110 * clipped against all fixed planes.
2111 */
2112 if (brw->has_negative_rhw_bug) {
2113 #if 0
2114 /* FINISHME */
2115 brw_CMP(p,
2116 vec8(brw_null_reg()),
2117 BRW_CONDITIONAL_L,
2118 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2119 brw_imm_f(0));
2120
2121 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2122 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2123 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2124 #endif
2125 }
2126
2127 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2128 } else if (intel->gen < 6) {
2129 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2130 } else {
2131 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2132 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2133 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2134 src_reg(output_reg[VERT_RESULT_PSIZ])));
2135 }
2136 }
2137 }
2138
2139 void
2140 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2141 {
2142 if (intel->gen < 6) {
2143 /* Clip distance slots are set aside in gen5, but they are not used. It
2144 * is not clear whether we actually need to set aside space for them,
2145 * but the performance cost is negligible.
2146 */
2147 return;
2148 }
2149
2150 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2151 *
2152 * "If a linked set of shaders forming the vertex stage contains no
2153 * static write to gl_ClipVertex or gl_ClipDistance, but the
2154 * application has requested clipping against user clip planes through
2155 * the API, then the coordinate written to gl_Position is used for
2156 * comparison against the user clip planes."
2157 *
2158 * This function is only called if the shader didn't write to
2159 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2160 * if the user wrote to it; otherwise we use gl_Position.
2161 */
2162 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2163 if (!(c->prog_data.outputs_written
2164 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2165 clip_vertex = VERT_RESULT_HPOS;
2166 }
2167
2168 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2169 ++i) {
2170 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2171 src_reg(output_reg[clip_vertex]),
2172 src_reg(this->userplane[i + offset])));
2173 }
2174 }
2175
2176 void
2177 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2178 {
2179 assert (vert_result < VERT_RESULT_MAX);
2180 reg.type = output_reg[vert_result].type;
2181 current_annotation = output_reg_annotation[vert_result];
2182 /* Copy the register, saturating if necessary */
2183 vec4_instruction *inst = emit(MOV(reg,
2184 src_reg(output_reg[vert_result])));
2185 if ((vert_result == VERT_RESULT_COL0 ||
2186 vert_result == VERT_RESULT_COL1 ||
2187 vert_result == VERT_RESULT_BFC0 ||
2188 vert_result == VERT_RESULT_BFC1) &&
2189 c->key.clamp_vertex_color) {
2190 inst->saturate = true;
2191 }
2192 }
2193
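/**
 * Write the contents of one VUE slot into the given MRF, dispatching to
 * the specialized paths for the header, NDC, position and clip-distance
 * slots and falling back to emit_generic_urb_slot() otherwise.
 */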
2194 void
2195 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2196 {
2197 struct brw_reg hw_reg = brw_message_reg(mrf);
2198 dst_reg reg = dst_reg(MRF, mrf);
2199 reg.type = BRW_REGISTER_TYPE_F;
2200
2201 switch (vert_result) {
2202 case VERT_RESULT_PSIZ:
2203 /* PSIZ is always in slot 0, and is coupled with other flags. */
2204 current_annotation = "indices, point width, clip flags";
2205 emit_psiz_and_flags(hw_reg);
2206 break;
2207 case BRW_VERT_RESULT_NDC:
2208 current_annotation = "NDC";
2209 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2210 break;
2211 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2212 case VERT_RESULT_HPOS:
2213 current_annotation = "gl_Position";
2214 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2215 break;
2216 case VERT_RESULT_CLIP_DIST0:
2217 case VERT_RESULT_CLIP_DIST1:
2218 if (this->c->key.uses_clip_distance) {
2219 emit_generic_urb_slot(reg, vert_result);
2220 } else {
2221 current_annotation = "user clip distances";
2222 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2223 }
2224 break;
2225 case BRW_VERT_RESULT_PAD:
2226 /* No need to write to this slot */
2227 break;
2228 default:
2229 emit_generic_urb_slot(reg, vert_result);
2230 break;
2231 }
2232 }
2233
2234 static int
2235 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2236 {
2237 struct intel_context *intel = &brw->intel;
2238
2239 if (intel->gen >= 6) {
2240 /* URB data written (does not include the message header reg) must
2241 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2242 * section 5.4.3.2.2: URB_INTERLEAVED.
2243 *
2244 * URB entries are allocated on a multiple of 1024 bits, so an
2245 * extra 128 bits written here to make the end align to 256 is
2246 * no problem.
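*
* Note that the mlen passed in includes the one message header register,
* so keeping mlen odd here keeps the data payload (mlen - 1) even.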
2247 */
2248 if ((mlen % 2) != 1)
2249 mlen++;
2250 }
2251
2252 return mlen;
2253 }
2254
2255 /**
2256 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2257 * complete the VS thread.
2258 *
2259 * The VUE layout is documented in Volume 2a.
2260 */
2261 void
2262 vec4_visitor::emit_urb_writes()
2263 {
2264 /* MRF 0 is reserved for the debugger, so start with message header
2265 * in MRF 1.
2266 */
2267 int base_mrf = 1;
2268 int mrf = base_mrf;
2269 /* In the process of generating our URB write message contents, we
2270 * may need to unspill a register or load from an array. Those
2271 * reads would use MRFs 14-15.
2272 */
2273 int max_usable_mrf = 13;
2274
2275 /* The following assertion verifies that max_usable_mrf leaves an even
2276 * number of URB write data registers, which meets gen6's length
2277 * alignment requirement.
2278 */
2279 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2280
2281 /* FINISHME: edgeflag */
2282
2283 /* First mrf is the g0-based message header containing URB handles and such,
2284 * which is implied in VS_OPCODE_URB_WRITE.
2285 */
2286 mrf++;
2287
2288 if (intel->gen < 6) {
2289 emit_ndc_computation();
2290 }
2291
2292 /* Set up the VUE data for the first URB write */
2293 int slot;
2294 for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2295 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2296
2297 /* If this was max_usable_mrf, we can't fit anything more into this URB
2298 * WRITE.
2299 */
2300 if (mrf > max_usable_mrf) {
2301 slot++;
2302 break;
2303 }
2304 }
2305
2306 current_annotation = "URB write";
2307 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2308 inst->base_mrf = base_mrf;
2309 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2310 inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2311
2312 /* Optional second URB write */
2313 if (!inst->eot) {
2314 mrf = base_mrf + 1;
2315
2316 for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2317 assert(mrf < max_usable_mrf);
2318
2319 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2320 }
2321
2322 current_annotation = "URB write";
2323 inst = emit(VS_OPCODE_URB_WRITE);
2324 inst->base_mrf = base_mrf;
2325 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2326 inst->eot = true;
2327 /* URB destination offset. The previous write used MRFs 1-13; excluding
2328 * the one header MRF, that's 12 data regs. The URB offset is in URB
2329 * row increments, and each of our MRFs is half of one of those, since
2330 * we're doing interleaved writes.
2331 */
2332 inst->offset = (max_usable_mrf - base_mrf) / 2;
2333 }
2334 }
2335
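/**
 * Build the offset source for a scratch read/write message from a vec4
 * index, applying any relative addressing and scaling the index into the
 * units the message header expects.
 */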
2336 src_reg
2337 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2338 src_reg *reladdr, int reg_offset)
2339 {
2340 /* Because we store the values to scratch interleaved like our
2341 * vertex data, we need to scale the vec4 index by 2.
2342 */
2343 int message_header_scale = 2;
2344
2345 /* Pre-gen6, the message header uses byte offsets instead of vec4
2346 * (16-byte) offset units.
2347 */
2348 if (intel->gen < 6)
2349 message_header_scale *= 16;
2350
2351 if (reladdr) {
2352 src_reg index = src_reg(this, glsl_type::int_type);
2353
2354 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2355 emit_before(inst, MUL(dst_reg(index),
2356 index, src_reg(message_header_scale)));
2357
2358 return index;
2359 } else {
2360 return src_reg(reg_offset * message_header_scale);
2361 }
2362 }
2363
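/**
 * Build the offset source for a pull constant load, applying any relative
 * addressing; pre-gen6 the offset is converted to bytes, while later
 * generations use vec4 (16-byte) units directly.
 */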
2364 src_reg
2365 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2366 src_reg *reladdr, int reg_offset)
2367 {
2368 if (reladdr) {
2369 src_reg index = src_reg(this, glsl_type::int_type);
2370
2371 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2372
2373 /* Pre-gen6, the message header uses byte offsets instead of vec4
2374 * (16-byte) offset units.
2375 */
2376 if (intel->gen < 6) {
2377 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2378 }
2379
2380 return index;
2381 } else {
2382 int message_header_scale = intel->gen < 6 ? 16 : 1;
2383 return src_reg(reg_offset * message_header_scale);
2384 }
2385 }
2386
2387 /**
2388 * Emits an instruction before @inst to load the value named by @orig_src
2389 * from scratch space at @base_offset to @temp.
2390 */
2391 void
2392 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2393 dst_reg temp, src_reg orig_src,
2394 int base_offset)
2395 {
2396 int reg_offset = base_offset + orig_src.reg_offset;
2397 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2398
2399 emit_before(inst, SCRATCH_READ(temp, index));
2400 }
2401
2402 /**
2403 * Emits an instruction after @inst to store the value to be written
2404 * to @orig_dst to scratch space at @base_offset, from @temp.
2405 */
2406 void
2407 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2408 src_reg temp, dst_reg orig_dst,
2409 int base_offset)
2410 {
2411 int reg_offset = base_offset + orig_dst.reg_offset;
2412 src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2413
2414 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2415 orig_dst.writemask));
2416 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2417 write->predicate = inst->predicate;
2418 write->ir = inst->ir;
2419 write->annotation = inst->annotation;
2420 inst->insert_after(write);
2421 }
2422
2423 /**
2424 * We can't generally support array access in GRF space, because a
2425 * single instruction's destination can only span 2 contiguous
2426 * registers. So, we send all GRF arrays that get variable index
2427 * access to scratch space.
2428 */
2429 void
2430 vec4_visitor::move_grf_array_access_to_scratch()
2431 {
2432 int scratch_loc[this->virtual_grf_count];
2433
2434 for (int i = 0; i < this->virtual_grf_count; i++) {
2435 scratch_loc[i] = -1;
2436 }
2437
2438 /* First, calculate the set of virtual GRFs that need to be punted
2439 * to scratch due to having any array access on them, and where in
2440 * scratch.
2441 */
2442 foreach_list(node, &this->instructions) {
2443 vec4_instruction *inst = (vec4_instruction *)node;
2444
2445 if (inst->dst.file == GRF && inst->dst.reladdr &&
2446 scratch_loc[inst->dst.reg] == -1) {
2447 scratch_loc[inst->dst.reg] = c->last_scratch;
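/* Each vec4 of the array takes 32 bytes of scratch, since the data is
 * stored interleaved for two vertices (2 * 4 floats * 4 bytes).
 */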
2448 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2449 }
2450
2451 for (int i = 0 ; i < 3; i++) {
2452 src_reg *src = &inst->src[i];
2453
2454 if (src->file == GRF && src->reladdr &&
2455 scratch_loc[src->reg] == -1) {
2456 scratch_loc[src->reg] = c->last_scratch;
2457 c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2458 }
2459 }
2460 }
2461
2462 /* Now, for anything that will be accessed through scratch, rewrite
2463 * it to load/store. Note that this is a _safe list walk, because
2464 * we may generate a new scratch_write instruction after the one
2465 * we're processing.
2466 */
2467 foreach_list_safe(node, &this->instructions) {
2468 vec4_instruction *inst = (vec4_instruction *)node;
2469
2470 /* Set up the annotation tracking for newly generated instructions. */
2471 base_ir = inst->ir;
2472 current_annotation = inst->annotation;
2473
2474 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2475 src_reg temp = src_reg(this, glsl_type::vec4_type);
2476
2477 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2478
2479 inst->dst.file = temp.file;
2480 inst->dst.reg = temp.reg;
2481 inst->dst.reg_offset = temp.reg_offset;
2482 inst->dst.reladdr = NULL;
2483 }
2484
2485 for (int i = 0 ; i < 3; i++) {
2486 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2487 continue;
2488
2489 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2490
2491 emit_scratch_read(inst, temp, inst->src[i],
2492 scratch_loc[inst->src[i].reg]);
2493
2494 inst->src[i].file = temp.file;
2495 inst->src[i].reg = temp.reg;
2496 inst->src[i].reg_offset = temp.reg_offset;
2497 inst->src[i].reladdr = NULL;
2498 }
2499 }
2500 }
2501
2502 /**
2503 * Emits an instruction before @inst to load the value named by @orig_src
2504 * from the pull constant buffer (surface) at @base_offset to @temp.
2505 */
2506 void
2507 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2508 dst_reg temp, src_reg orig_src,
2509 int base_offset)
2510 {
2511 int reg_offset = base_offset + orig_src.reg_offset;
2512 src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2513 vec4_instruction *load;
2514
2515 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2516 temp, index);
2517 load->base_mrf = 14;
2518 load->mlen = 1;
2519 emit_before(inst, load);
2520 }
2521
2522 /**
2523 * Implements array access of uniforms by inserting a
2524 * PULL_CONSTANT_LOAD instruction.
2525 *
2526 * Unlike temporary GRF array access (where we don't support it due to
2527 * the difficulty of doing relative addressing on instruction
2528 * destinations), we could potentially do array access of uniforms
2529 * that were loaded in GRF space as push constants. In real-world
2530 * usage we've seen, though, the arrays being used are always larger
2531 * than we could load as push constants, so just always move all
2532 * uniform array access out to a pull constant buffer.
2533 */
2534 void
2535 vec4_visitor::move_uniform_array_access_to_pull_constants()
2536 {
2537 int pull_constant_loc[this->uniforms];
2538
2539 for (int i = 0; i < this->uniforms; i++) {
2540 pull_constant_loc[i] = -1;
2541 }
2542
2543 /* Walk through and find array access of uniforms. Put a copy of that
2544 * uniform in the pull constant buffer.
2545 *
2546 * Note that we don't move constant-indexed accesses to arrays; the
2547 * performance impact of that choice has not been measured.
2548 */
2549 foreach_list_safe(node, &this->instructions) {
2550 vec4_instruction *inst = (vec4_instruction *)node;
2551
2552 for (int i = 0 ; i < 3; i++) {
2553 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2554 continue;
2555
2556 int uniform = inst->src[i].reg;
2557
2558 /* If this array isn't already present in the pull constant buffer,
2559 * add it.
2560 */
2561 if (pull_constant_loc[uniform] == -1) {
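/* pull_param is indexed in single floats while pull-constant locations
 * are in vec4 units, hence the divide and multiply by 4 below.
 */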
2562 const float **values = &prog_data->param[uniform * 4];
2563
2564 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2565
2566 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2567 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2568 }
2569 }
2570
2571 /* Set up the annotation tracking for newly generated instructions. */
2572 base_ir = inst->ir;
2573 current_annotation = inst->annotation;
2574
2575 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2576
2577 emit_pull_constant_load(inst, temp, inst->src[i],
2578 pull_constant_loc[uniform]);
2579
2580 inst->src[i].file = temp.file;
2581 inst->src[i].reg = temp.reg;
2582 inst->src[i].reg_offset = temp.reg_offset;
2583 inst->src[i].reladdr = NULL;
2584 }
2585 }
2586
2587 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2588 * no need to track them as larger-than-vec4 objects. This will be
2589 * relied on in cutting out unused uniform vectors from push
2590 * constants.
2591 */
2592 split_uniform_registers();
2593 }
2594
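/**
 * If an unsigned-dword source has its negate flag set, emit it through a
 * MOV into a temporary and use that instead, so the negate modifier is
 * never left on the UD operand itself.
 */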
2595 void
2596 vec4_visitor::resolve_ud_negate(src_reg *reg)
2597 {
2598 if (reg->type != BRW_REGISTER_TYPE_UD ||
2599 !reg->negate)
2600 return;
2601
2602 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2603 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2604 *reg = temp;
2605 }
2606
2607 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2608 struct gl_shader_program *prog,
2609 struct brw_shader *shader)
2610 {
2611 this->c = c;
2612 this->p = &c->func;
2613 this->brw = p->brw;
2614 this->intel = &brw->intel;
2615 this->ctx = &intel->ctx;
2616 this->prog = prog;
2617 this->shader = shader;
2618
2619 this->mem_ctx = ralloc_context(NULL);
2620 this->failed = false;
2621
2622 this->base_ir = NULL;
2623 this->current_annotation = NULL;
2624
2626 this->vp = (struct gl_vertex_program *)
2627 prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2628 this->prog_data = &c->prog_data;
2629
2630 this->variable_ht = hash_table_ctor(0,
2631 hash_table_pointer_hash,
2632 hash_table_pointer_compare);
2633
2634 this->virtual_grf_def = NULL;
2635 this->virtual_grf_use = NULL;
2636 this->virtual_grf_sizes = NULL;
2637 this->virtual_grf_count = 0;
2638 this->virtual_grf_reg_map = NULL;
2639 this->virtual_grf_reg_count = 0;
2640 this->virtual_grf_array_size = 0;
2641 this->live_intervals_valid = false;
2642
2643 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2644
2645 this->uniforms = 0;
2646 }
2647
2648 vec4_visitor::~vec4_visitor()
2649 {
2650 ralloc_free(this->mem_ctx);
2651 hash_table_dtor(this->variable_ht);
2652 }
2653
2654
2655 void
2656 vec4_visitor::fail(const char *format, ...)
2657 {
2658 va_list va;
2659 char *msg;
2660
2661 if (failed)
2662 return;
2663
2664 failed = true;
2665
2666 va_start(va, format);
2667 msg = ralloc_vasprintf(mem_ctx, format, va);
2668 va_end(va);
2669 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2670
2671 this->fail_msg = msg;
2672
2673 if (INTEL_DEBUG & DEBUG_VS) {
2674 fprintf(stderr, "%s", msg);
2675 }
2676 }
2677
2678 } /* namespace brw */