i965: Add support for ir_unop_f2u to i965 backend.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 src_reg::src_reg(dst_reg reg)
34 {
35 init();
36
37 this->file = reg.file;
38 this->reg = reg.reg;
39 this->reg_offset = reg.reg_offset;
40 this->type = reg.type;
41 this->reladdr = reg.reladdr;
42 this->fixed_hw_reg = reg.fixed_hw_reg;
43
44 int swizzles[4];
45 int next_chan = 0;
46 int last = 0;
47
48 for (int i = 0; i < 4; i++) {
49 if (!(reg.writemask & (1 << i)))
50 continue;
51
52 swizzles[next_chan++] = last = i;
53 }
54
55 for (; next_chan < 4; next_chan++) {
56 swizzles[next_chan] = last;
57 }
58
59 this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
60 swizzles[2], swizzles[3]);
61 }
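/* For illustration, tracing the loop above: a destination writemask of .xz
 * produces swizzles[] = {X, Z, Z, Z}, i.e. the enabled channels in order
 * with the last enabled channel replicated into the unused slots.
 */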
62
63 dst_reg::dst_reg(src_reg reg)
64 {
65 init();
66
67 this->file = reg.file;
68 this->reg = reg.reg;
69 this->reg_offset = reg.reg_offset;
70 this->type = reg.type;
71 this->writemask = WRITEMASK_XYZW;
72 this->reladdr = reg.reladdr;
73 this->fixed_hw_reg = reg.fixed_hw_reg;
74 }
75
76 vec4_instruction::vec4_instruction(vec4_visitor *v,
77 enum opcode opcode, dst_reg dst,
78 src_reg src0, src_reg src1, src_reg src2)
79 {
80 this->opcode = opcode;
81 this->dst = dst;
82 this->src[0] = src0;
83 this->src[1] = src1;
84 this->src[2] = src2;
85 this->ir = v->base_ir;
86 this->annotation = v->current_annotation;
87 }
88
89 vec4_instruction *
90 vec4_visitor::emit(vec4_instruction *inst)
91 {
92 this->instructions.push_tail(inst);
93
94 return inst;
95 }
96
97 vec4_instruction *
98 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
99 {
100 new_inst->ir = inst->ir;
101 new_inst->annotation = inst->annotation;
102
103 inst->insert_before(new_inst);
104
105 return inst;
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
110 src_reg src0, src_reg src1, src_reg src2)
111 {
112 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
113 src0, src1, src2));
114 }
115
116
117 vec4_instruction *
118 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
119 {
120 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
121 }
122
123 vec4_instruction *
124 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
125 {
126 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
127 }
128
129 vec4_instruction *
130 vec4_visitor::emit(enum opcode opcode)
131 {
132 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
133 }
134
135 #define ALU1(op) \
136 vec4_instruction * \
137 vec4_visitor::op(dst_reg dst, src_reg src0) \
138 { \
139 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
140 src0); \
141 }
142
143 #define ALU2(op) \
144 vec4_instruction * \
145 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
146 { \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU2(ADD)
158 ALU2(MUL)
159 ALU2(MACH)
160 ALU2(AND)
161 ALU2(OR)
162 ALU2(XOR)
163 ALU2(DP3)
164 ALU2(DP4)
165
166 /** Gen4 predicated IF. */
167 vec4_instruction *
168 vec4_visitor::IF(uint32_t predicate)
169 {
170 vec4_instruction *inst;
171
172 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
173 inst->predicate = predicate;
174
175 return inst;
176 }
177
178 /** Gen6+ IF with embedded comparison. */
179 vec4_instruction *
180 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
181 {
182 assert(intel->gen >= 6);
183
184 vec4_instruction *inst;
185
186 resolve_ud_negate(&src0);
187 resolve_ud_negate(&src1);
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
190 src0, src1);
191 inst->conditional_mod = condition;
192
193 return inst;
194 }
195
196 /**
197 * CMP: Sets the low bit of the destination channels with the result
198 * of the comparison, while the upper bits are undefined, and updates
199 * the flag register with the packed 16 bits of the result.
200 */
201 vec4_instruction *
202 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
203 {
204 vec4_instruction *inst;
205
206 /* original gen4 does type conversion to the destination type
207 * before comparison, producing garbage results for floating
208 * point comparisons.
209 */
210 if (intel->gen == 4) {
211 dst.type = src0.type;
212 if (dst.file == HW_REG)
213 dst.fixed_hw_reg.type = dst.type;
214 }
215
216 resolve_ud_negate(&src0);
217 resolve_ud_negate(&src1);
218
219 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
220 inst->conditional_mod = condition;
221
222 return inst;
223 }
224
225 vec4_instruction *
226 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
227 {
228 vec4_instruction *inst;
229
230 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
231 dst, index);
232 inst->base_mrf = 14;
233 inst->mlen = 1;
234
235 return inst;
236 }
237
238 vec4_instruction *
239 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
240 {
241 vec4_instruction *inst;
242
243 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
244 dst, src, index);
245 inst->base_mrf = 13;
246 inst->mlen = 2;
247
248 return inst;
249 }
250
251 void
252 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
253 {
254 static enum opcode dot_opcodes[] = {
255 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
256 };
257
258 emit(dot_opcodes[elements - 2], dst, src0, src1);
259 }
260
261 void
262 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
263 {
264 /* The gen6 math instruction ignores the source modifiers --
265 * swizzle, abs, negate, and at least some parts of the register
266 * region description.
267 *
268 * While it would seem that this MOV could be avoided at this point
269 * in the case that the swizzle is matched up with the destination
270 * writemask, note that uniform packing and register allocation
271 * could rearrange our swizzle, so let's leave this matter up to
272 * copy propagation later.
273 */
274 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
275 emit(MOV(dst_reg(temp_src), src));
276
277 if (dst.writemask != WRITEMASK_XYZW) {
278 /* The gen6 math instruction must be align1, so we can't do
279 * writemasks.
280 */
281 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
282
283 emit(opcode, temp_dst, temp_src);
284
285 emit(MOV(dst, src_reg(temp_dst)));
286 } else {
287 emit(opcode, dst, temp_src);
288 }
289 }
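/* For illustration: a gen6 RSQ whose destination writes only .x becomes
 * three instructions here -- MOV the source into a temporary, run the math
 * opcode on a full temporary destination, then MOV that result into dst.x --
 * because the align1 math instruction cannot honor the writemask itself.
 */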
290
291 void
292 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
293 {
294 vec4_instruction *inst = emit(opcode, dst, src);
295 inst->base_mrf = 1;
296 inst->mlen = 1;
297 }
298
299 void
300 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
301 {
302 switch (opcode) {
303 case SHADER_OPCODE_RCP:
304 case SHADER_OPCODE_RSQ:
305 case SHADER_OPCODE_SQRT:
306 case SHADER_OPCODE_EXP2:
307 case SHADER_OPCODE_LOG2:
308 case SHADER_OPCODE_SIN:
309 case SHADER_OPCODE_COS:
310 break;
311 default:
312 assert(!"not reached: bad math opcode");
313 return;
314 }
315
316 if (intel->gen >= 7) {
317 emit(opcode, dst, src);
318 } else if (intel->gen == 6) {
319 return emit_math1_gen6(opcode, dst, src);
320 } else {
321 return emit_math1_gen4(opcode, dst, src);
322 }
323 }
324
325 void
326 vec4_visitor::emit_math2_gen6(enum opcode opcode,
327 dst_reg dst, src_reg src0, src_reg src1)
328 {
329 src_reg expanded;
330
331 /* The gen6 math instruction ignores the source modifiers --
332 * swizzle, abs, negate, and at least some parts of the register
333 * region description. Move the sources to temporaries to make it
334 * generally work.
335 */
336
337 expanded = src_reg(this, glsl_type::vec4_type);
338 expanded.type = src0.type;
339 emit(MOV(dst_reg(expanded), src0));
340 src0 = expanded;
341
342 expanded = src_reg(this, glsl_type::vec4_type);
343 expanded.type = src1.type;
344 emit(MOV(dst_reg(expanded), src1));
345 src1 = expanded;
346
347 if (dst.writemask != WRITEMASK_XYZW) {
348 /* The gen6 math instruction must be align1, so we can't do
349 * writemasks.
350 */
351 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
352 temp_dst.type = dst.type;
353
354 emit(opcode, temp_dst, src0, src1);
355
356 emit(MOV(dst, src_reg(temp_dst)));
357 } else {
358 emit(opcode, dst, src0, src1);
359 }
360 }
361
362 void
363 vec4_visitor::emit_math2_gen4(enum opcode opcode,
364 dst_reg dst, src_reg src0, src_reg src1)
365 {
366 vec4_instruction *inst = emit(opcode, dst, src0, src1);
367 inst->base_mrf = 1;
368 inst->mlen = 2;
369 }
370
371 void
372 vec4_visitor::emit_math(enum opcode opcode,
373 dst_reg dst, src_reg src0, src_reg src1)
374 {
375 switch (opcode) {
376 case SHADER_OPCODE_POW:
377 case SHADER_OPCODE_INT_QUOTIENT:
378 case SHADER_OPCODE_INT_REMAINDER:
379 break;
380 default:
381 assert(!"not reached: unsupported binary math opcode");
382 return;
383 }
384
385 if (intel->gen >= 7) {
386 emit(opcode, dst, src0, src1);
387 } else if (intel->gen == 6) {
388 return emit_math2_gen6(opcode, dst, src0, src1);
389 } else {
390 return emit_math2_gen4(opcode, dst, src0, src1);
391 }
392 }
393
394 void
395 vec4_visitor::visit_instructions(const exec_list *list)
396 {
397 foreach_list(node, list) {
398 ir_instruction *ir = (ir_instruction *)node;
399
400 base_ir = ir;
401 ir->accept(this);
402 }
403 }
404
405
406 static int
407 type_size(const struct glsl_type *type)
408 {
409 unsigned int i;
410 int size;
411
412 switch (type->base_type) {
413 case GLSL_TYPE_UINT:
414 case GLSL_TYPE_INT:
415 case GLSL_TYPE_FLOAT:
416 case GLSL_TYPE_BOOL:
417 if (type->is_matrix()) {
418 return type->matrix_columns;
419 } else {
420 /* Regardless of size of vector, it gets a vec4. This is bad
421 * packing for things like floats, but otherwise arrays become a
422 * mess. Hopefully a later pass over the code can pack scalars
423 * down if appropriate.
424 */
425 return 1;
426 }
427 case GLSL_TYPE_ARRAY:
428 assert(type->length > 0);
429 return type_size(type->fields.array) * type->length;
430 case GLSL_TYPE_STRUCT:
431 size = 0;
432 for (i = 0; i < type->length; i++) {
433 size += type_size(type->fields.structure[i].type);
434 }
435 return size;
436 case GLSL_TYPE_SAMPLER:
437 /* Samplers take up one slot in UNIFORMS[], but they're baked in
438 * at link time.
439 */
440 return 1;
441 default:
442 assert(0);
443 return 0;
444 }
445 }
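/* For illustration, some sizes this returns: a float or a vec3 takes one
 * vec4 slot (1), a mat3 takes one slot per column (3), a float[4] array
 * takes 4, and a struct { vec3 a; float b; } takes 2.
 */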
446
447 int
448 vec4_visitor::virtual_grf_alloc(int size)
449 {
450 if (virtual_grf_array_size <= virtual_grf_count) {
451 if (virtual_grf_array_size == 0)
452 virtual_grf_array_size = 16;
453 else
454 virtual_grf_array_size *= 2;
455 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
456 virtual_grf_array_size);
457 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
458 virtual_grf_array_size);
459 }
460 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
461 virtual_grf_reg_count += size;
462 virtual_grf_sizes[virtual_grf_count] = size;
463 return virtual_grf_count++;
464 }
465
466 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
467 {
468 init();
469
470 this->file = GRF;
471 this->reg = v->virtual_grf_alloc(type_size(type));
472
473 if (type->is_array() || type->is_record()) {
474 this->swizzle = BRW_SWIZZLE_NOOP;
475 } else {
476 this->swizzle = swizzle_for_size(type->vector_elements);
477 }
478
479 this->type = brw_type_for_base_type(type);
480 }
481
482 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
483 {
484 init();
485
486 this->file = GRF;
487 this->reg = v->virtual_grf_alloc(type_size(type));
488
489 if (type->is_array() || type->is_record()) {
490 this->writemask = WRITEMASK_XYZW;
491 } else {
492 this->writemask = (1 << type->vector_elements) - 1;
493 }
494
495 this->type = brw_type_for_base_type(type);
496 }
497
498 /* Our support for uniforms is piggy-backed on the struct
499 * gl_fragment_program, because that's where the values actually
500 * get stored, rather than in some global gl_shader_program uniform
501 * store.
502 */
503 int
504 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
505 {
506 unsigned int offset = 0;
507 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
508
509 if (type->is_matrix()) {
510 const glsl_type *column = type->column_type();
511
512 for (unsigned int i = 0; i < type->matrix_columns; i++) {
513 offset += setup_uniform_values(loc + offset, column);
514 }
515
516 return offset;
517 }
518
519 switch (type->base_type) {
520 case GLSL_TYPE_FLOAT:
521 case GLSL_TYPE_UINT:
522 case GLSL_TYPE_INT:
523 case GLSL_TYPE_BOOL:
524 for (unsigned int i = 0; i < type->vector_elements; i++) {
525 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
526 }
527
528 /* Set up pad elements to get things aligned to a vec4 boundary. */
529 for (unsigned int i = type->vector_elements; i < 4; i++) {
530 static float zero = 0;
531
532 c->prog_data.param[this->uniforms * 4 + i] = &zero;
533 }
534
535 /* Track the size of this uniform vector, for future packing of
536 * uniforms.
537 */
538 this->uniform_vector_size[this->uniforms] = type->vector_elements;
539 this->uniforms++;
540
541 return 1;
542
543 case GLSL_TYPE_STRUCT:
544 for (unsigned int i = 0; i < type->length; i++) {
545 offset += setup_uniform_values(loc + offset,
546 type->fields.structure[i].type);
547 }
548 return offset;
549
550 case GLSL_TYPE_ARRAY:
551 for (unsigned int i = 0; i < type->length; i++) {
552 offset += setup_uniform_values(loc + offset, type->fields.array);
553 }
554 return offset;
555
556 case GLSL_TYPE_SAMPLER:
557 /* The sampler takes up a slot, but we don't use any values from it. */
558 return 1;
559
560 default:
561 assert(!"not reached");
562 return 0;
563 }
564 }
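/* For illustration: a mat4 uniform recurses over its four vec4 columns and
 * returns 4, consuming four consecutive uniform vectors; a lone float still
 * occupies a full vec4 slot, with its three unused channels pointing at the
 * static zero pad above.
 */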
565
566 void
567 vec4_visitor::setup_uniform_clipplane_values()
568 {
569 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
570
571 /* Pre-Gen6, we compact clip planes. For example, if the user
572 * enables just clip planes 0, 1, and 3, we will enable clip planes
573 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
574 * plane 2. This simplifies the implementation of the Gen6 clip
575 * thread.
576 *
577 * In Gen6 and later, we don't compact clip planes, because this
578 * simplifies the implementation of gl_ClipDistance.
579 */
580 int compacted_clipplane_index = 0;
581 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
582 if (intel->gen < 6 &&
583 !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
584 continue;
585 }
586 this->uniform_vector_size[this->uniforms] = 4;
587 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
588 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
589 for (int j = 0; j < 4; ++j) {
590 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
591 }
592 ++compacted_clipplane_index;
593 ++this->uniforms;
594 }
595 }
596
597 /* Our support for builtin uniforms is even scarier than non-builtin.
598 * It sits on top of the PROG_STATE_VAR parameters that are
599 * automatically updated from GL context state.
600 */
601 void
602 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
603 {
604 const ir_state_slot *const slots = ir->state_slots;
605 assert(ir->state_slots != NULL);
606
607 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
608 /* This state reference has already been setup by ir_to_mesa,
609 * but we'll get the same index back here. We can reference
610 * ParameterValues directly, since unlike brw_fs.cpp, we never
611 * add new state references during compile.
612 */
613 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
614 (gl_state_index *)slots[i].tokens);
615 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
616
617 this->uniform_vector_size[this->uniforms] = 0;
618 /* Add each of the unique swizzled channels of the element.
619 * This will end up matching the size of the glsl_type of this field.
620 */
621 int last_swiz = -1;
622 for (unsigned int j = 0; j < 4; j++) {
623 int swiz = GET_SWZ(slots[i].swizzle, j);
624 if (swiz == last_swiz)
625 break;
626 last_swiz = swiz;
627 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
628 this->uniform_vector_size[this->uniforms]++;
629 }
630 this->uniforms++;
631 }
632 }
633
634 dst_reg *
635 vec4_visitor::variable_storage(ir_variable *var)
636 {
637 return (dst_reg *)hash_table_find(this->variable_ht, var);
638 }
639
640 void
641 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
642 {
643 ir_expression *expr = ir->as_expression();
644
645 *predicate = BRW_PREDICATE_NORMAL;
646
647 if (expr) {
648 src_reg op[2];
649 vec4_instruction *inst;
650
651 assert(expr->get_num_operands() <= 2);
652 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
653 expr->operands[i]->accept(this);
654 op[i] = this->result;
655
656 resolve_ud_negate(&op[i]);
657 }
658
659 switch (expr->operation) {
660 case ir_unop_logic_not:
661 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
662 inst->conditional_mod = BRW_CONDITIONAL_Z;
663 break;
664
665 case ir_binop_logic_xor:
666 inst = emit(XOR(dst_null_d(), op[0], op[1]));
667 inst->conditional_mod = BRW_CONDITIONAL_NZ;
668 break;
669
670 case ir_binop_logic_or:
671 inst = emit(OR(dst_null_d(), op[0], op[1]));
672 inst->conditional_mod = BRW_CONDITIONAL_NZ;
673 break;
674
675 case ir_binop_logic_and:
676 inst = emit(AND(dst_null_d(), op[0], op[1]));
677 inst->conditional_mod = BRW_CONDITIONAL_NZ;
678 break;
679
680 case ir_unop_f2b:
681 if (intel->gen >= 6) {
682 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
683 } else {
684 inst = emit(MOV(dst_null_f(), op[0]));
685 inst->conditional_mod = BRW_CONDITIONAL_NZ;
686 }
687 break;
688
689 case ir_unop_i2b:
690 if (intel->gen >= 6) {
691 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
692 } else {
693 inst = emit(MOV(dst_null_d(), op[0]));
694 inst->conditional_mod = BRW_CONDITIONAL_NZ;
695 }
696 break;
697
698 case ir_binop_all_equal:
699 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
700 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
701 break;
702
703 case ir_binop_any_nequal:
704 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
705 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
706 break;
707
708 case ir_unop_any:
709 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
710 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
711 break;
712
713 case ir_binop_greater:
714 case ir_binop_gequal:
715 case ir_binop_less:
716 case ir_binop_lequal:
717 case ir_binop_equal:
718 case ir_binop_nequal:
719 emit(CMP(dst_null_d(), op[0], op[1],
720 brw_conditional_for_comparison(expr->operation)));
721 break;
722
723 default:
724 assert(!"not reached");
725 break;
726 }
727 return;
728 }
729
730 ir->accept(this);
731
732 resolve_ud_negate(&this->result);
733
734 if (intel->gen >= 6) {
735 vec4_instruction *inst = emit(AND(dst_null_d(),
736 this->result, src_reg(1)));
737 inst->conditional_mod = BRW_CONDITIONAL_NZ;
738 } else {
739 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
740 inst->conditional_mod = BRW_CONDITIONAL_NZ;
741 }
742 }
743
744 /**
745 * Emit a gen6 IF statement with the comparison folded into the IF
746 * instruction.
747 */
748 void
749 vec4_visitor::emit_if_gen6(ir_if *ir)
750 {
751 ir_expression *expr = ir->condition->as_expression();
752
753 if (expr) {
754 src_reg op[2];
755 dst_reg temp;
756
757 assert(expr->get_num_operands() <= 2);
758 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
759 expr->operands[i]->accept(this);
760 op[i] = this->result;
761 }
762
763 switch (expr->operation) {
764 case ir_unop_logic_not:
765 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
766 return;
767
768 case ir_binop_logic_xor:
769 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
770 return;
771
772 case ir_binop_logic_or:
773 temp = dst_reg(this, glsl_type::bool_type);
774 emit(OR(temp, op[0], op[1]));
775 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
776 return;
777
778 case ir_binop_logic_and:
779 temp = dst_reg(this, glsl_type::bool_type);
780 emit(AND(temp, op[0], op[1]));
781 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
782 return;
783
784 case ir_unop_f2b:
785 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
786 return;
787
788 case ir_unop_i2b:
789 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
790 return;
791
792 case ir_binop_greater:
793 case ir_binop_gequal:
794 case ir_binop_less:
795 case ir_binop_lequal:
796 case ir_binop_equal:
797 case ir_binop_nequal:
798 emit(IF(op[0], op[1],
799 brw_conditional_for_comparison(expr->operation)));
800 return;
801
802 case ir_binop_all_equal:
803 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
804 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
805 return;
806
807 case ir_binop_any_nequal:
808 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
809 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
810 return;
811
812 case ir_unop_any:
813 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
814 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
815 return;
816
817 default:
818 assert(!"not reached");
819 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
820 return;
821 }
822 return;
823 }
824
825 ir->condition->accept(this);
826
827 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
828 }
829
830 void
831 vec4_visitor::visit(ir_variable *ir)
832 {
833 dst_reg *reg = NULL;
834
835 if (variable_storage(ir))
836 return;
837
838 switch (ir->mode) {
839 case ir_var_in:
840 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
841
842 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
843 * come in as floating point conversions of the integer values.
844 */
845 for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
846 if (!c->key.gl_fixed_input_size[i])
847 continue;
848
849 dst_reg dst = *reg;
850 dst.type = brw_type_for_base_type(ir->type);
851 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
852 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
853 }
854 break;
855
856 case ir_var_out:
857 reg = new(mem_ctx) dst_reg(this, ir->type);
858
859 for (int i = 0; i < type_size(ir->type); i++) {
860 output_reg[ir->location + i] = *reg;
861 output_reg[ir->location + i].reg_offset = i;
862 output_reg[ir->location + i].type =
863 brw_type_for_base_type(ir->type->get_scalar_type());
864 output_reg_annotation[ir->location + i] = ir->name;
865 }
866 break;
867
868 case ir_var_auto:
869 case ir_var_temporary:
870 reg = new(mem_ctx) dst_reg(this, ir->type);
871 break;
872
873 case ir_var_uniform:
874 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
875
876 /* Track how big the whole uniform variable is, in case we need to put a
877 * copy of its data into pull constants for array access.
878 */
879 this->uniform_size[this->uniforms] = type_size(ir->type);
880
881 if (!strncmp(ir->name, "gl_", 3)) {
882 setup_builtin_uniform_values(ir);
883 } else {
884 setup_uniform_values(ir->location, ir->type);
885 }
886 break;
887
888 case ir_var_system_value:
889 /* VertexID is stored by the VF as the last vertex element, but
890 * we don't represent it with a flag in inputs_read, so we call
891 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
892 */
893 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
894 prog_data->uses_vertexid = true;
895
896 switch (ir->location) {
897 case SYSTEM_VALUE_VERTEX_ID:
898 reg->writemask = WRITEMASK_X;
899 break;
900 case SYSTEM_VALUE_INSTANCE_ID:
901 reg->writemask = WRITEMASK_Y;
902 break;
903 default:
904 assert(!"not reached");
905 break;
906 }
907 break;
908
909 default:
910 assert(!"not reached");
911 }
912
913 reg->type = brw_type_for_base_type(ir->type);
914 hash_table_insert(this->variable_ht, reg, ir);
915 }
916
917 void
918 vec4_visitor::visit(ir_loop *ir)
919 {
920 dst_reg counter;
921
922 /* We don't want debugging output to print the whole body of the
923 * loop as the annotation.
924 */
925 this->base_ir = NULL;
926
927 if (ir->counter != NULL) {
928 this->base_ir = ir->counter;
929 ir->counter->accept(this);
930 counter = *(variable_storage(ir->counter));
931
932 if (ir->from != NULL) {
933 this->base_ir = ir->from;
934 ir->from->accept(this);
935
936 emit(MOV(counter, this->result));
937 }
938 }
939
940 emit(BRW_OPCODE_DO);
941
942 if (ir->to) {
943 this->base_ir = ir->to;
944 ir->to->accept(this);
945
946 emit(CMP(dst_null_d(), src_reg(counter), this->result,
947 brw_conditional_for_comparison(ir->cmp)));
948
949 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
950 inst->predicate = BRW_PREDICATE_NORMAL;
951 }
952
953 visit_instructions(&ir->body_instructions);
954
955
956 if (ir->increment) {
957 this->base_ir = ir->increment;
958 ir->increment->accept(this);
959 emit(ADD(counter, src_reg(counter), this->result));
960 }
961
962 emit(BRW_OPCODE_WHILE);
963 }
964
965 void
966 vec4_visitor::visit(ir_loop_jump *ir)
967 {
968 switch (ir->mode) {
969 case ir_loop_jump::jump_break:
970 emit(BRW_OPCODE_BREAK);
971 break;
972 case ir_loop_jump::jump_continue:
973 emit(BRW_OPCODE_CONTINUE);
974 break;
975 }
976 }
977
978
979 void
980 vec4_visitor::visit(ir_function_signature *ir)
981 {
982 assert(0);
983 (void)ir;
984 }
985
986 void
987 vec4_visitor::visit(ir_function *ir)
988 {
989 /* Ignore function bodies other than main() -- we shouldn't see calls to
990 * them since they should all be inlined.
991 */
992 if (strcmp(ir->name, "main") == 0) {
993 const ir_function_signature *sig;
994 exec_list empty;
995
996 sig = ir->matching_signature(&empty);
997
998 assert(sig);
999
1000 visit_instructions(&sig->body);
1001 }
1002 }
1003
1004 bool
1005 vec4_visitor::try_emit_sat(ir_expression *ir)
1006 {
1007 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1008 if (!sat_src)
1009 return false;
1010
1011 sat_src->accept(this);
1012 src_reg src = this->result;
1013
1014 this->result = src_reg(this, ir->type);
1015 vec4_instruction *inst;
1016 inst = emit(MOV(dst_reg(this->result), src));
1017 inst->saturate = true;
1018
1019 return true;
1020 }
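/* For illustration: when the expression is a saturate that
 * as_rvalue_to_saturate() recognizes (a clamp of the value to [0, 1]), the
 * inner rvalue is evaluated once and copied through a single saturating MOV
 * instead of emitting separate clamp instructions.
 */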
1021
1022 void
1023 vec4_visitor::emit_bool_comparison(unsigned int op,
1024 dst_reg dst, src_reg src0, src_reg src1)
1025 {
1026 /* original gen4 does destination conversion before comparison. */
1027 if (intel->gen < 5)
1028 dst.type = src0.type;
1029
1030 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1031
1032 dst.type = BRW_REGISTER_TYPE_D;
1033 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1034 }
1035
1036 void
1037 vec4_visitor::visit(ir_expression *ir)
1038 {
1039 unsigned int operand;
1040 src_reg op[Elements(ir->operands)];
1041 src_reg result_src;
1042 dst_reg result_dst;
1043 vec4_instruction *inst;
1044
1045 if (try_emit_sat(ir))
1046 return;
1047
1048 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1049 this->result.file = BAD_FILE;
1050 ir->operands[operand]->accept(this);
1051 if (this->result.file == BAD_FILE) {
1052 printf("Failed to get tree for expression operand:\n");
1053 ir->operands[operand]->print();
1054 exit(1);
1055 }
1056 op[operand] = this->result;
1057
1058 /* Matrix expression operands should have been broken down to vector
1059 * operations already.
1060 */
1061 assert(!ir->operands[operand]->type->is_matrix());
1062 }
1063
1064 int vector_elements = ir->operands[0]->type->vector_elements;
1065 if (ir->operands[1]) {
1066 vector_elements = MAX2(vector_elements,
1067 ir->operands[1]->type->vector_elements);
1068 }
1069
1070 this->result.file = BAD_FILE;
1071
1072 /* Storage for our result. Ideally for an assignment we'd be using
1073 * the actual storage for the result here, instead.
1074 */
1075 result_src = src_reg(this, ir->type);
1076 /* convenience for the emit functions below. */
1077 result_dst = dst_reg(result_src);
1078 /* If nothing special happens, this is the result. */
1079 this->result = result_src;
1080 /* Limit writes to the channels that will be used by result_src later.
1081 * This does limit this temp's use as a temporary for multi-instruction
1082 * sequences.
1083 */
1084 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1085
1086 switch (ir->operation) {
1087 case ir_unop_logic_not:
1088 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1089 * ones complement of the whole register, not just bit 0.
1090 */
1091 emit(XOR(result_dst, op[0], src_reg(1)));
1092 break;
1093 case ir_unop_neg:
1094 op[0].negate = !op[0].negate;
1095 this->result = op[0];
1096 break;
1097 case ir_unop_abs:
1098 op[0].abs = true;
1099 op[0].negate = false;
1100 this->result = op[0];
1101 break;
1102
1103 case ir_unop_sign:
1104 emit(MOV(result_dst, src_reg(0.0f)));
1105
1106 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1107 inst = emit(MOV(result_dst, src_reg(1.0f)));
1108 inst->predicate = BRW_PREDICATE_NORMAL;
1109
1110 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1111 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1112 inst->predicate = BRW_PREDICATE_NORMAL;
1113
1114 break;
1115
1116 case ir_unop_rcp:
1117 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1118 break;
1119
1120 case ir_unop_exp2:
1121 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1122 break;
1123 case ir_unop_log2:
1124 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1125 break;
1126 case ir_unop_exp:
1127 case ir_unop_log:
1128 assert(!"not reached: should be handled by ir_explog_to_explog2");
1129 break;
1130 case ir_unop_sin:
1131 case ir_unop_sin_reduced:
1132 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1133 break;
1134 case ir_unop_cos:
1135 case ir_unop_cos_reduced:
1136 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1137 break;
1138
1139 case ir_unop_dFdx:
1140 case ir_unop_dFdy:
1141 assert(!"derivatives not valid in vertex shader");
1142 break;
1143
1144 case ir_unop_noise:
1145 assert(!"not reached: should be handled by lower_noise");
1146 break;
1147
1148 case ir_binop_add:
1149 emit(ADD(result_dst, op[0], op[1]));
1150 break;
1151 case ir_binop_sub:
1152 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1153 break;
1154
1155 case ir_binop_mul:
1156 if (ir->type->is_integer()) {
1157 /* For integer multiplication, the MUL uses the low 16 bits
1158 * of one of the operands (src0 on gen6, src1 on gen7). The
1159 * MACH accumulates in the contribution of the upper 16 bits
1160 * of that operand.
1161 *
1162 * FINISHME: Emit just the MUL if we know an operand is small
1163 * enough.
1164 */
1165 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1166
1167 emit(MUL(acc, op[0], op[1]));
1168 emit(MACH(dst_null_d(), op[0], op[1]));
1169 emit(MOV(result_dst, src_reg(acc)));
1170 } else {
1171 emit(MUL(result_dst, op[0], op[1]));
1172 }
1173 break;
1174 case ir_binop_div:
1175 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1176 assert(ir->type->is_integer());
1177 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1178 break;
1179 case ir_binop_mod:
1180 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1181 assert(ir->type->is_integer());
1182 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1183 break;
1184
1185 case ir_binop_less:
1186 case ir_binop_greater:
1187 case ir_binop_lequal:
1188 case ir_binop_gequal:
1189 case ir_binop_equal:
1190 case ir_binop_nequal: {
1191 emit(CMP(result_dst, op[0], op[1],
1192 brw_conditional_for_comparison(ir->operation)));
1193 emit(AND(result_dst, result_src, src_reg(0x1)));
1194 break;
1195 }
1196
1197 case ir_binop_all_equal:
1198 /* "==" operator producing a scalar boolean. */
1199 if (ir->operands[0]->type->is_vector() ||
1200 ir->operands[1]->type->is_vector()) {
1201 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1202 emit(MOV(result_dst, src_reg(0)));
1203 inst = emit(MOV(result_dst, src_reg(1)));
1204 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1205 } else {
1206 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1207 emit(AND(result_dst, result_src, src_reg(0x1)));
1208 }
1209 break;
1210 case ir_binop_any_nequal:
1211 /* "!=" operator producing a scalar boolean. */
1212 if (ir->operands[0]->type->is_vector() ||
1213 ir->operands[1]->type->is_vector()) {
1214 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1215
1216 emit(MOV(result_dst, src_reg(0)));
1217 inst = emit(MOV(result_dst, src_reg(1)));
1218 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1219 } else {
1220 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1221 emit(AND(result_dst, result_src, src_reg(0x1)));
1222 }
1223 break;
1224
1225 case ir_unop_any:
1226 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1227 emit(MOV(result_dst, src_reg(0)));
1228
1229 inst = emit(MOV(result_dst, src_reg(1)));
1230 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1231 break;
1232
1233 case ir_binop_logic_xor:
1234 emit(XOR(result_dst, op[0], op[1]));
1235 break;
1236
1237 case ir_binop_logic_or:
1238 emit(OR(result_dst, op[0], op[1]));
1239 break;
1240
1241 case ir_binop_logic_and:
1242 emit(AND(result_dst, op[0], op[1]));
1243 break;
1244
1245 case ir_binop_dot:
1246 assert(ir->operands[0]->type->is_vector());
1247 assert(ir->operands[0]->type == ir->operands[1]->type);
1248 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1249 break;
1250
1251 case ir_unop_sqrt:
1252 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1253 break;
1254 case ir_unop_rsq:
1255 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1256 break;
1257
1258 case ir_unop_bitcast_i2f:
1259 case ir_unop_bitcast_u2f:
1260 this->result = op[0];
1261 this->result.type = BRW_REGISTER_TYPE_F;
1262 break;
1263
1264 case ir_unop_bitcast_f2i:
1265 this->result = op[0];
1266 this->result.type = BRW_REGISTER_TYPE_D;
1267 break;
1268
1269 case ir_unop_bitcast_f2u:
1270 this->result = op[0];
1271 this->result.type = BRW_REGISTER_TYPE_UD;
1272 break;
1273
1274 case ir_unop_i2f:
1275 case ir_unop_i2u:
1276 case ir_unop_u2i:
1277 case ir_unop_u2f:
1278 case ir_unop_b2f:
1279 case ir_unop_b2i:
1280 case ir_unop_f2i:
1281 case ir_unop_f2u:
1282 emit(MOV(result_dst, op[0]));
1283 break;
1284 case ir_unop_f2b:
1285 case ir_unop_i2b: {
1286 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1287 emit(AND(result_dst, result_src, src_reg(1)));
1288 break;
1289 }
1290
1291 case ir_unop_trunc:
1292 emit(RNDZ(result_dst, op[0]));
1293 break;
1294 case ir_unop_ceil:
1295 op[0].negate = !op[0].negate;
1296 inst = emit(RNDD(result_dst, op[0]));
1297 this->result.negate = true;
1298 break;
1299 case ir_unop_floor:
1300 inst = emit(RNDD(result_dst, op[0]));
1301 break;
1302 case ir_unop_fract:
1303 inst = emit(FRC(result_dst, op[0]));
1304 break;
1305 case ir_unop_round_even:
1306 emit(RNDE(result_dst, op[0]));
1307 break;
1308
1309 case ir_binop_min:
1310 if (intel->gen >= 6) {
1311 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1312 inst->conditional_mod = BRW_CONDITIONAL_L;
1313 } else {
1314 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1315
1316 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1317 inst->predicate = BRW_PREDICATE_NORMAL;
1318 }
1319 break;
1320 case ir_binop_max:
1321 if (intel->gen >= 6) {
1322 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1323 inst->conditional_mod = BRW_CONDITIONAL_G;
1324 } else {
1325 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1326
1327 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1328 inst->predicate = BRW_PREDICATE_NORMAL;
1329 }
1330 break;
1331
1332 case ir_binop_pow:
1333 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1334 break;
1335
1336 case ir_unop_bit_not:
1337 inst = emit(NOT(result_dst, op[0]));
1338 break;
1339 case ir_binop_bit_and:
1340 inst = emit(AND(result_dst, op[0], op[1]));
1341 break;
1342 case ir_binop_bit_xor:
1343 inst = emit(XOR(result_dst, op[0], op[1]));
1344 break;
1345 case ir_binop_bit_or:
1346 inst = emit(OR(result_dst, op[0], op[1]));
1347 break;
1348
1349 case ir_binop_lshift:
1350 inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1351 break;
1352
1353 case ir_binop_rshift:
1354 if (ir->type->base_type == GLSL_TYPE_INT)
1355 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1356 else
1357 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1358 break;
1359
1360 case ir_quadop_vector:
1361 assert(!"not reached: should be handled by lower_quadop_vector");
1362 break;
1363 }
1364 }
1365
1366
1367 void
1368 vec4_visitor::visit(ir_swizzle *ir)
1369 {
1370 src_reg src;
1371 int i = 0;
1372 int swizzle[4];
1373
1374 /* Note that this is only swizzles in expressions, not those on the left
1375 * hand side of an assignment, which do write masking. See ir_assignment
1376 * for that.
1377 */
1378
1379 ir->val->accept(this);
1380 src = this->result;
1381 assert(src.file != BAD_FILE);
1382
1383 for (i = 0; i < ir->type->vector_elements; i++) {
1384 switch (i) {
1385 case 0:
1386 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1387 break;
1388 case 1:
1389 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1390 break;
1391 case 2:
1392 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1393 break;
1394 case 3:
1395 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1396 break;
1397 }
1398 }
1399 for (; i < 4; i++) {
1400 /* Replicate the last channel out. */
1401 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1402 }
1403
1404 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1405
1406 this->result = src;
1407 }
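/* For illustration, composing swizzles as above: taking .zy of a value whose
 * current swizzle is .wzyx yields .yzzz -- the two requested channels, with
 * the last one replicated into the remaining slots.
 */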
1408
1409 void
1410 vec4_visitor::visit(ir_dereference_variable *ir)
1411 {
1412 const struct glsl_type *type = ir->type;
1413 dst_reg *reg = variable_storage(ir->var);
1414
1415 if (!reg) {
1416 fail("Failed to find variable storage for %s\n", ir->var->name);
1417 this->result = src_reg(brw_null_reg());
1418 return;
1419 }
1420
1421 this->result = src_reg(*reg);
1422
1423 /* System values get their swizzle from the dst_reg writemask */
1424 if (ir->var->mode == ir_var_system_value)
1425 return;
1426
1427 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1428 this->result.swizzle = swizzle_for_size(type->vector_elements);
1429 }
1430
1431 void
1432 vec4_visitor::visit(ir_dereference_array *ir)
1433 {
1434 ir_constant *constant_index;
1435 src_reg src;
1436 int element_size = type_size(ir->type);
1437
1438 constant_index = ir->array_index->constant_expression_value();
1439
1440 ir->array->accept(this);
1441 src = this->result;
1442
1443 if (constant_index) {
1444 src.reg_offset += constant_index->value.i[0] * element_size;
1445 } else {
1446 /* Variable index array dereference. It eats the "vec4" of the
1447 * base of the array and an index that offsets the Mesa register
1448 * index.
1449 */
1450 ir->array_index->accept(this);
1451
1452 src_reg index_reg;
1453
1454 if (element_size == 1) {
1455 index_reg = this->result;
1456 } else {
1457 index_reg = src_reg(this, glsl_type::int_type);
1458
1459 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1460 }
1461
1462 if (src.reladdr) {
1463 src_reg temp = src_reg(this, glsl_type::int_type);
1464
1465 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1466
1467 index_reg = temp;
1468 }
1469
1470 src.reladdr = ralloc(mem_ctx, src_reg);
1471 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1472 }
1473
1474 /* If the type is smaller than a vec4, replicate the last channel out. */
1475 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1476 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1477 else
1478 src.swizzle = BRW_SWIZZLE_NOOP;
1479 src.type = brw_type_for_base_type(ir->type);
1480
1481 this->result = src;
1482 }
1483
1484 void
1485 vec4_visitor::visit(ir_dereference_record *ir)
1486 {
1487 unsigned int i;
1488 const glsl_type *struct_type = ir->record->type;
1489 int offset = 0;
1490
1491 ir->record->accept(this);
1492
1493 for (i = 0; i < struct_type->length; i++) {
1494 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1495 break;
1496 offset += type_size(struct_type->fields.structure[i].type);
1497 }
1498
1499 /* If the type is smaller than a vec4, replicate the last channel out. */
1500 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1501 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1502 else
1503 this->result.swizzle = BRW_SWIZZLE_NOOP;
1504 this->result.type = brw_type_for_base_type(ir->type);
1505
1506 this->result.reg_offset += offset;
1507 }
1508
1509 /**
1510 * We want to be careful in assignment setup to hit the actual storage
1511 * instead of potentially using a temporary like we might with the
1512 * ir_dereference handler.
1513 */
1514 static dst_reg
1515 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1516 {
1517 /* The LHS must be a dereference. If the LHS is a variable indexed array
1518 * access of a vector, it must be separated into a series of conditional moves
1519 * before reaching this point (see ir_vec_index_to_cond_assign).
1520 */
1521 assert(ir->as_dereference());
1522 ir_dereference_array *deref_array = ir->as_dereference_array();
1523 if (deref_array) {
1524 assert(!deref_array->array->type->is_vector());
1525 }
1526
1527 /* Use the rvalue deref handler for the most part. We'll ignore
1528 * swizzles in it and write swizzles using writemask, though.
1529 */
1530 ir->accept(v);
1531 return dst_reg(v->result);
1532 }
1533
1534 void
1535 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1536 const struct glsl_type *type, uint32_t predicate)
1537 {
1538 if (type->base_type == GLSL_TYPE_STRUCT) {
1539 for (unsigned int i = 0; i < type->length; i++) {
1540 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1541 }
1542 return;
1543 }
1544
1545 if (type->is_array()) {
1546 for (unsigned int i = 0; i < type->length; i++) {
1547 emit_block_move(dst, src, type->fields.array, predicate);
1548 }
1549 return;
1550 }
1551
1552 if (type->is_matrix()) {
1553 const struct glsl_type *vec_type;
1554
1555 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1556 type->vector_elements, 1);
1557
1558 for (int i = 0; i < type->matrix_columns; i++) {
1559 emit_block_move(dst, src, vec_type, predicate);
1560 }
1561 return;
1562 }
1563
1564 assert(type->is_scalar() || type->is_vector());
1565
1566 dst->type = brw_type_for_base_type(type);
1567 src->type = dst->type;
1568
1569 dst->writemask = (1 << type->vector_elements) - 1;
1570
1571 src->swizzle = swizzle_for_size(type->vector_elements);
1572
1573 vec4_instruction *inst = emit(MOV(*dst, *src));
1574 inst->predicate = predicate;
1575
1576 dst->reg_offset++;
1577 src->reg_offset++;
1578 }
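/* For illustration: block-moving a mat3 recurses into three column moves,
 * each a (possibly predicated) MOV with a .xyz writemask, advancing the
 * dst and src reg_offsets by one register per column.
 */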
1579
1580
1581 /* If the RHS processing resulted in an instruction generating a
1582 * temporary value, and it would be easy to rewrite the instruction to
1583 * generate its result right into the LHS instead, do so. This ends
1584 * up reliably removing instructions where it can be tricky to do so
1585 * later without real UD chain information.
1586 */
1587 bool
1588 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1589 dst_reg dst,
1590 src_reg src,
1591 vec4_instruction *pre_rhs_inst,
1592 vec4_instruction *last_rhs_inst)
1593 {
1594 /* This could be supported, but it would take more smarts. */
1595 if (ir->condition)
1596 return false;
1597
1598 if (pre_rhs_inst == last_rhs_inst)
1599 return false; /* No instructions generated to work with. */
1600
1601 /* Make sure the last instruction generated our source reg. */
1602 if (src.file != GRF ||
1603 src.file != last_rhs_inst->dst.file ||
1604 src.reg != last_rhs_inst->dst.reg ||
1605 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1606 src.reladdr ||
1607 src.abs ||
1608 src.negate ||
1609 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1610 return false;
1611
1612 /* Check that the last instruction fully initialized the channels
1613 * we want to use, in the order we want to use them. We could
1614 * potentially reswizzle the operands of many instructions so that
1615 * we could handle out of order channels, but don't yet.
1616 */
1617
1618 for (unsigned i = 0; i < 4; i++) {
1619 if (dst.writemask & (1 << i)) {
1620 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1621 return false;
1622
1623 if (BRW_GET_SWZ(src.swizzle, i) != i)
1624 return false;
1625 }
1626 }
1627
1628 /* Success! Rewrite the instruction. */
1629 last_rhs_inst->dst.file = dst.file;
1630 last_rhs_inst->dst.reg = dst.reg;
1631 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1632 last_rhs_inst->dst.reladdr = dst.reladdr;
1633 last_rhs_inst->dst.writemask &= dst.writemask;
1634
1635 return true;
1636 }
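/* For illustration: for an unconditional "v = a + b;" where the ADD just
 * wrote a fresh temporary GRF in channel order, the ADD's destination is
 * patched to point at v's storage and visit(ir_assignment) skips the MOV
 * it would otherwise emit.
 */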
1637
1638 void
1639 vec4_visitor::visit(ir_assignment *ir)
1640 {
1641 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1642 uint32_t predicate = BRW_PREDICATE_NONE;
1643
1644 if (!ir->lhs->type->is_scalar() &&
1645 !ir->lhs->type->is_vector()) {
1646 ir->rhs->accept(this);
1647 src_reg src = this->result;
1648
1649 if (ir->condition) {
1650 emit_bool_to_cond_code(ir->condition, &predicate);
1651 }
1652
1653 /* emit_block_move doesn't account for swizzles in the source register.
1654 * This should be ok, since the source register is a structure or an
1655 * array, and those can't be swizzled. But double-check to be sure.
1656 */
1657 assert(src.swizzle ==
1658 (ir->rhs->type->is_matrix()
1659 ? swizzle_for_size(ir->rhs->type->vector_elements)
1660 : BRW_SWIZZLE_NOOP));
1661
1662 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1663 return;
1664 }
1665
1666 /* Now we're down to just a scalar/vector with writemasks. */
1667 int i;
1668
1669 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1670 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1671
1672 ir->rhs->accept(this);
1673
1674 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1675
1676 src_reg src = this->result;
1677
1678 int swizzles[4];
1679 int first_enabled_chan = 0;
1680 int src_chan = 0;
1681
1682 assert(ir->lhs->type->is_vector() ||
1683 ir->lhs->type->is_scalar());
1684 dst.writemask = ir->write_mask;
1685
1686 for (int i = 0; i < 4; i++) {
1687 if (dst.writemask & (1 << i)) {
1688 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1689 break;
1690 }
1691 }
1692
1693 /* Swizzle a small RHS vector into the channels being written.
1694 *
1695 * glsl ir treats write_mask as dictating how many channels are
1696 * present on the RHS while in our instructions we need to make
1697 * those channels appear in the slots of the vec4 they're written to.
1698 */
1699 for (int i = 0; i < 4; i++) {
1700 if (dst.writemask & (1 << i))
1701 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1702 else
1703 swizzles[i] = first_enabled_chan;
1704 }
1705 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1706 swizzles[2], swizzles[3]);
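/* For illustration: writing a two-channel RHS into "v.yz" produces the
 * swizzle .yxyy here -- destination channel y reads the RHS x, channel z
 * reads the RHS y, and the unwritten channels just repeat the first
 * enabled channel's selection (a don't-care value).
 */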
1707
1708 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1709 return;
1710 }
1711
1712 if (ir->condition) {
1713 emit_bool_to_cond_code(ir->condition, &predicate);
1714 }
1715
1716 for (i = 0; i < type_size(ir->lhs->type); i++) {
1717 vec4_instruction *inst = emit(MOV(dst, src));
1718 inst->predicate = predicate;
1719
1720 dst.reg_offset++;
1721 src.reg_offset++;
1722 }
1723 }
1724
1725 void
1726 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1727 {
1728 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1729 foreach_list(node, &ir->components) {
1730 ir_constant *field_value = (ir_constant *)node;
1731
1732 emit_constant_values(dst, field_value);
1733 }
1734 return;
1735 }
1736
1737 if (ir->type->is_array()) {
1738 for (unsigned int i = 0; i < ir->type->length; i++) {
1739 emit_constant_values(dst, ir->array_elements[i]);
1740 }
1741 return;
1742 }
1743
1744 if (ir->type->is_matrix()) {
1745 for (int i = 0; i < ir->type->matrix_columns; i++) {
1746 float *vec = &ir->value.f[i * ir->type->vector_elements];
1747
1748 for (int j = 0; j < ir->type->vector_elements; j++) {
1749 dst->writemask = 1 << j;
1750 dst->type = BRW_REGISTER_TYPE_F;
1751
1752 emit(MOV(*dst, src_reg(vec[j])));
1753 }
1754 dst->reg_offset++;
1755 }
1756 return;
1757 }
1758
1759 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1760
1761 for (int i = 0; i < ir->type->vector_elements; i++) {
1762 if (!(remaining_writemask & (1 << i)))
1763 continue;
1764
1765 dst->writemask = 1 << i;
1766 dst->type = brw_type_for_base_type(ir->type);
1767
1768 /* Find other components that match the one we're about to
1769 * write. Emits fewer instructions for things like vec4(0.5,
1770 * 1.5, 1.5, 1.5).
1771 */
1772 for (int j = i + 1; j < ir->type->vector_elements; j++) {
1773 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1774 if (ir->value.b[i] == ir->value.b[j])
1775 dst->writemask |= (1 << j);
1776 } else {
1777 /* u, i, and f storage all line up, so no need for a
1778 * switch case for comparing each type.
1779 */
1780 if (ir->value.u[i] == ir->value.u[j])
1781 dst->writemask |= (1 << j);
1782 }
1783 }
1784
1785 switch (ir->type->base_type) {
1786 case GLSL_TYPE_FLOAT:
1787 emit(MOV(*dst, src_reg(ir->value.f[i])));
1788 break;
1789 case GLSL_TYPE_INT:
1790 emit(MOV(*dst, src_reg(ir->value.i[i])));
1791 break;
1792 case GLSL_TYPE_UINT:
1793 emit(MOV(*dst, src_reg(ir->value.u[i])));
1794 break;
1795 case GLSL_TYPE_BOOL:
1796 emit(MOV(*dst, src_reg(ir->value.b[i])));
1797 break;
1798 default:
1799 assert(!"Non-float/uint/int/bool constant");
1800 break;
1801 }
1802
1803 remaining_writemask &= ~dst->writemask;
1804 }
1805 dst->reg_offset++;
1806 }
1807
1808 void
1809 vec4_visitor::visit(ir_constant *ir)
1810 {
1811 dst_reg dst = dst_reg(this, ir->type);
1812 this->result = src_reg(dst);
1813
1814 emit_constant_values(&dst, ir);
1815 }
1816
1817 void
1818 vec4_visitor::visit(ir_call *ir)
1819 {
1820 assert(!"not reached");
1821 }
1822
1823 void
1824 vec4_visitor::visit(ir_texture *ir)
1825 {
1826 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1827 sampler = vp->Base.SamplerUnits[sampler];
1828
1829 /* Should be lowered by do_lower_texture_projection */
1830 assert(!ir->projector);
1831
1832 vec4_instruction *inst = NULL;
1833 switch (ir->op) {
1834 case ir_tex:
1835 case ir_txl:
1836 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1837 break;
1838 case ir_txd:
1839 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1840 break;
1841 case ir_txf:
1842 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1843 break;
1844 case ir_txs:
1845 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1846 break;
1847 case ir_txb:
1848 assert(!"TXB is not valid for vertex shaders.");
1849 }
1850
1851 /* Texel offsets go in the message header; Gen4 also requires headers. */
1852 inst->header_present = ir->offset || intel->gen < 5;
1853 inst->base_mrf = 2;
1854 inst->mlen = inst->header_present + 1; /* always at least one */
1855 inst->sampler = sampler;
1856 inst->dst = dst_reg(this, ir->type);
1857 inst->shadow_compare = ir->shadow_comparitor != NULL;
1858
1859 if (ir->offset != NULL && ir->op != ir_txf)
1860 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1861
1862 /* MRF for the first parameter */
1863 int param_base = inst->base_mrf + inst->header_present;
1864
1865 if (ir->op == ir_txs) {
1866 ir->lod_info.lod->accept(this);
1867 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1868 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
1869 this->result));
1870 } else {
1871 int i, coord_mask = 0, zero_mask = 0;
1872 /* Load the coordinate */
1873 /* FINISHME: gl_clamp_mask and saturate */
1874 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1875 coord_mask |= (1 << i);
1876 for (; i < 4; i++)
1877 zero_mask |= (1 << i);
1878
1879 ir->coordinate->accept(this);
1880 if (ir->offset && ir->op == ir_txf) {
1881 /* It appears that the ld instruction used for txf does its
1882 * address bounds check before adding in the offset. To work
1883 * around this, just add the integer offset to the integer
1884 * texel coordinate, and don't put the offset in the header.
1885 */
1886 ir_constant *offset = ir->offset->as_constant();
1887 assert(offset);
1888
1889 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
1890 src_reg src = this->result;
1891 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
1892 BRW_GET_SWZ(src.swizzle, j),
1893 BRW_GET_SWZ(src.swizzle, j),
1894 BRW_GET_SWZ(src.swizzle, j));
1895 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
1896 src, offset->value.i[j]));
1897 }
1898 } else {
1899 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1900 this->result));
1901 }
1902 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1903 src_reg(0)));
1904 /* Load the shadow comparitor */
1905 if (ir->shadow_comparitor) {
1906 ir->shadow_comparitor->accept(this);
1907 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1908 WRITEMASK_X),
1909 this->result));
1910 inst->mlen++;
1911 }
1912
1913 /* Load the LOD info */
1914 if (ir->op == ir_txl) {
1915 int mrf, writemask;
1916 if (intel->gen >= 5) {
1917 mrf = param_base + 1;
1918 if (ir->shadow_comparitor) {
1919 writemask = WRITEMASK_Y;
1920 /* mlen already incremented */
1921 } else {
1922 writemask = WRITEMASK_X;
1923 inst->mlen++;
1924 }
1925 } else /* intel->gen == 4 */ {
1926 mrf = param_base;
1927 writemask = WRITEMASK_Z;
1928 }
1929 ir->lod_info.lod->accept(this);
1930 emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask),
1931 this->result));
1932 } else if (ir->op == ir_txf) {
1933 ir->lod_info.lod->accept(this);
1934 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
1935 this->result));
1936 } else if (ir->op == ir_txd) {
1937 const glsl_type *type = ir->lod_info.grad.dPdx->type;
1938
1939 ir->lod_info.grad.dPdx->accept(this);
1940 src_reg dPdx = this->result;
1941 ir->lod_info.grad.dPdy->accept(this);
1942 src_reg dPdy = this->result;
1943
1944 if (intel->gen >= 5) {
1945 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1946 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1947 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1948 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1949 inst->mlen++;
1950
1951 if (ir->type->vector_elements == 3) {
1952 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
1953 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
1954 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
1955 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
1956 inst->mlen++;
1957 }
1958 } else /* intel->gen == 4 */ {
1959 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
1960 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
1961 inst->mlen += 2;
1962 }
1963 }
1964 }
1965
1966 emit(inst);
1967
1968 swizzle_result(ir, src_reg(inst->dst), sampler);
1969 }
1970
1971 void
1972 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
1973 {
1974 this->result = orig_val;
1975
1976 int s = c->key.tex.swizzles[sampler];
1977
1978 if (ir->op == ir_txs || ir->type == glsl_type::float_type
1979 || s == SWIZZLE_NOOP)
1980 return;
1981
1982 int zero_mask = 0, one_mask = 0, copy_mask = 0;
1983 int swizzle[4];
1984
1985 for (int i = 0; i < 4; i++) {
1986 switch (GET_SWZ(s, i)) {
1987 case SWIZZLE_ZERO:
1988 zero_mask |= (1 << i);
1989 break;
1990 case SWIZZLE_ONE:
1991 one_mask |= (1 << i);
1992 break;
1993 default:
1994 copy_mask |= (1 << i);
1995 swizzle[i] = GET_SWZ(s, i);
1996 break;
1997 }
1998 }
1999
2000 this->result = src_reg(this, ir->type);
2001 dst_reg swizzled_result(this->result);
2002
2003 if (copy_mask) {
2004 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2005 swizzled_result.writemask = copy_mask;
2006 emit(MOV(swizzled_result, orig_val));
2007 }
2008
2009 if (zero_mask) {
2010 swizzled_result.writemask = zero_mask;
2011 emit(MOV(swizzled_result, src_reg(0.0f)));
2012 }
2013
2014 if (one_mask) {
2015 swizzled_result.writemask = one_mask;
2016 emit(MOV(swizzled_result, src_reg(1.0f)));
2017 }
2018 }
2019
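/* Function calls are inlined away before the backend runs, so no
 * ir_return should survive to this point.
 */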
2020 void
2021 vec4_visitor::visit(ir_return *ir)
2022 {
2023 assert(!"not reached");
2024 }
2025
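/* discard is only valid in fragment shaders, so the vec4 (VS) backend
 * should never see it.
 */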
2026 void
2027 vec4_visitor::visit(ir_discard *ir)
2028 {
2029 assert(!"not reached");
2030 }
2031
2032 void
2033 vec4_visitor::visit(ir_if *ir)
2034 {
2035 /* Don't point the annotation at the if statement, because then it plus
2036 * the then and else blocks get printed.
2037 */
2038 this->base_ir = ir->condition;
2039
2040 if (intel->gen == 6) {
2041 emit_if_gen6(ir);
2042 } else {
2043 uint32_t predicate;
2044 emit_bool_to_cond_code(ir->condition, &predicate);
2045 emit(IF(predicate));
2046 }
2047
2048 visit_instructions(&ir->then_instructions);
2049
2050 if (!ir->else_instructions.is_empty()) {
2051 this->base_ir = ir->condition;
2052 emit(BRW_OPCODE_ELSE);
2053
2054 visit_instructions(&ir->else_instructions);
2055 }
2056
2057 this->base_ir = ir->condition;
2058 emit(BRW_OPCODE_ENDIF);
2059 }
2060
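/**
 * Compute the NDC output (x/w, y/w, z/w, 1/w) from gl_Position.
 *
 * Only called for gen < 6 (see emit_urb_writes()), where the VUE still
 * carries an NDC slot for the fixed-function pipeline.
 */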
2061 void
2062 vec4_visitor::emit_ndc_computation()
2063 {
2064 /* Get the position */
2065 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2066
2067 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2068 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2069 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2070
2071 current_annotation = "NDC";
2072 dst_reg ndc_w = ndc;
2073 ndc_w.writemask = WRITEMASK_W;
2074 src_reg pos_w = pos;
2075 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2076 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2077
2078 dst_reg ndc_xyz = ndc;
2079 ndc_xyz.writemask = WRITEMASK_XYZ;
2080
2081 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2082 }
2083
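/**
 * Fill the VUE header slot that carries the point size and clip flags.
 *
 * On gen4/5 the point width and per-plane user clip flags are packed
 * into a single UD dword; on gen6+ the point size simply goes in the
 * .w channel of the slot.
 */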
2084 void
2085 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2086 {
2087 if (intel->gen < 6 &&
2088 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2089 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2090 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2091 dst_reg header1_w = header1;
2092 header1_w.writemask = WRITEMASK_W;
2093 GLuint i;
2094
2095 emit(MOV(header1, 0u));
2096
2097 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2098 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2099
2100 current_annotation = "Point size";
2101 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2102 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2103 }
2104
2105 current_annotation = "Clipping flags";
2106 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2107 vec4_instruction *inst;
2108
2109 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2110 src_reg(this->userplane[i])));
2111 inst->conditional_mod = BRW_CONDITIONAL_L;
2112
2113 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2114 inst->predicate = BRW_PREDICATE_NORMAL;
2115 }
2116
2117 /* i965 clipping workaround:
2118 * 1) Test for -ve rhw
2119 * 2) If set,
2120 * set ndc = (0,0,0,0)
2121 * set ucp[6] = 1
2122 *
2123 * Later, clipping will detect ucp[6] and ensure the primitive is
2124 * clipped against all fixed planes.
2125 */
2126 if (brw->has_negative_rhw_bug) {
2127 #if 0
2128 /* FINISHME */
2129 brw_CMP(p,
2130 vec8(brw_null_reg()),
2131 BRW_CONDITIONAL_L,
2132 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2133 brw_imm_f(0));
2134
2135 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2136 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2137 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2138 #endif
2139 }
2140
2141 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2142 } else if (intel->gen < 6) {
2143 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2144 } else {
2145 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2146 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2147 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2148 src_reg(output_reg[VERT_RESULT_PSIZ])));
2149 }
2150 }
2151 }
2152
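/**
 * Write up to four user clip distances starting at plane @offset.
 *
 * Each distance is the DP4 of the clip vertex (gl_ClipVertex if the
 * shader wrote it, otherwise gl_Position) with the corresponding user
 * clip plane.  Does nothing on gen < 6.
 */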
2153 void
2154 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2155 {
2156 if (intel->gen < 6) {
2157 /* Clip distance slots are set aside in gen5, but they are not used. It
2158 * is not clear whether we actually need to set aside space for them,
2159 * but the performance cost is negligible.
2160 */
2161 return;
2162 }
2163
2164 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2165 *
2166 * "If a linked set of shaders forming the vertex stage contains no
2167 * static write to gl_ClipVertex or gl_ClipDistance, but the
2168 * application has requested clipping against user clip planes through
2169 * the API, then the coordinate written to gl_Position is used for
2170 * comparison against the user clip planes."
2171 *
2172 * This function is only called if the shader didn't write to
2173 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2174 * if the user wrote to it; otherwise we use gl_Position.
2175 */
2176 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2177 if (!(c->prog_data.outputs_written
2178 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2179 clip_vertex = VERT_RESULT_HPOS;
2180 }
2181
2182 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2183 ++i) {
2184 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2185 src_reg(output_reg[clip_vertex]),
2186 src_reg(this->userplane[i + offset])));
2187 }
2188 }
2189
2190 void
2191 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2192 {
2193 assert (vert_result < VERT_RESULT_MAX);
2194 reg.type = output_reg[vert_result].type;
2195 current_annotation = output_reg_annotation[vert_result];
2196 /* Copy the register, saturating if necessary */
2197 vec4_instruction *inst = emit(MOV(reg,
2198 src_reg(output_reg[vert_result])));
2199 if ((vert_result == VERT_RESULT_COL0 ||
2200 vert_result == VERT_RESULT_COL1 ||
2201 vert_result == VERT_RESULT_BFC0 ||
2202 vert_result == VERT_RESULT_BFC1) &&
2203 c->key.clamp_vertex_color) {
2204 inst->saturate = true;
2205 }
2206 }
2207
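/**
 * Write a single VUE slot's worth of data into MRF @mrf.
 *
 * PSIZ/flags, NDC, gl_Position, and the clip distance slots get special
 * handling; everything else is a plain copy of the staged output reg.
 */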
2208 void
2209 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2210 {
2211 struct brw_reg hw_reg = brw_message_reg(mrf);
2212 dst_reg reg = dst_reg(MRF, mrf);
2213 reg.type = BRW_REGISTER_TYPE_F;
2214
2215 switch (vert_result) {
2216 case VERT_RESULT_PSIZ:
2217 /* PSIZ is always in slot 0, and is coupled with other flags. */
2218 current_annotation = "indices, point width, clip flags";
2219 emit_psiz_and_flags(hw_reg);
2220 break;
2221 case BRW_VERT_RESULT_NDC:
2222 current_annotation = "NDC";
2223 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2224 break;
2225 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2226 case VERT_RESULT_HPOS:
2227 current_annotation = "gl_Position";
2228 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2229 break;
2230 case VERT_RESULT_CLIP_DIST0:
2231 case VERT_RESULT_CLIP_DIST1:
2232 if (this->c->key.uses_clip_distance) {
2233 emit_generic_urb_slot(reg, vert_result);
2234 } else {
2235 current_annotation = "user clip distances";
2236 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2237 }
2238 break;
2239 case BRW_VERT_RESULT_PAD:
2240 /* No need to write to this slot */
2241 break;
2242 default:
2243 emit_generic_urb_slot(reg, vert_result);
2244 break;
2245 }
2246 }
2247
2248 static int
2249 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2250 {
2251 struct intel_context *intel = &brw->intel;
2252
2253 if (intel->gen >= 6) {
2254 /* URB data written (does not include the message header reg) must
2255 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2256 * section 5.4.3.2.2: URB_INTERLEAVED.
2257 *
2258 * URB entries are allocated on a multiple of 1024 bits, so an
2259 * extra 128 bits written here to make the end align to 256 is
2260 * no problem.
2261 */
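      /* mlen includes the one message header register, so the data
       * payload is a multiple of two registers exactly when mlen is odd.
       */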
2262 if ((mlen % 2) != 1)
2263 mlen++;
2264 }
2265
2266 return mlen;
2267 }
2268
2269 /**
2270 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2271 * complete the VS thread.
2272 *
2273 * The VUE layout is documented in Volume 2a.
2274 */
2275 void
2276 vec4_visitor::emit_urb_writes()
2277 {
2278 /* MRF 0 is reserved for the debugger, so start with message header
2279 * in MRF 1.
2280 */
2281 int base_mrf = 1;
2282 int mrf = base_mrf;
2283 /* In the process of generating our URB write message contents, we
2284 * may need to unspill a register or load from an array. Those
2285 * reads would use MRFs 14-15.
2286 */
2287 int max_usable_mrf = 13;
2288
2289 /* The following assertion verifies that max_usable_mrf causes an
2290 * even-numbered amount of URB write data, which will meet gen6's
2291 * requirements for length alignment.
2292 */
2293 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2294
2295 /* FINISHME: edgeflag */
2296
2297 /* First mrf is the g0-based message header containing URB handles and such,
2298 * which is implied in VS_OPCODE_URB_WRITE.
2299 */
2300 mrf++;
2301
2302 if (intel->gen < 6) {
2303 emit_ndc_computation();
2304 }
2305
2306 /* Set up the VUE data for the first URB write */
2307 int slot;
2308 for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2309 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2310
2311 /* If this was max_usable_mrf, we can't fit anything more into this URB
2312 * WRITE.
2313 */
2314 if (mrf > max_usable_mrf) {
2315 slot++;
2316 break;
2317 }
2318 }
2319
2320 current_annotation = "URB write";
2321 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2322 inst->base_mrf = base_mrf;
2323 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2324 inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2325
2326 /* Optional second URB write */
2327 if (!inst->eot) {
2328 mrf = base_mrf + 1;
2329
2330 for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2331 assert(mrf < max_usable_mrf);
2332
2333 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2334 }
2335
2336 current_annotation = "URB write";
2337 inst = emit(VS_OPCODE_URB_WRITE);
2338 inst->base_mrf = base_mrf;
2339 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2340 inst->eot = true;
2341       /* URB destination offset.  In the previous write we used MRFs
2342        * 1-13; excluding the one header MRF, that leaves 12 data regs.
2343        * URB offset is in URB row increments, and each of our MRFs is
2344        * half of one of those, since we're doing interleaved writes.
2345        */
2346 inst->offset = (max_usable_mrf - base_mrf) / 2;
2347 }
2348 }
2349
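/**
 * Compute the offset source for a scratch read/write of @reg_offset,
 * either as an immediate or, when relative addressing is used, as a
 * freshly computed register based on *@reladdr.
 */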
2350 src_reg
2351 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2352 src_reg *reladdr, int reg_offset)
2353 {
2354 /* Because we store the values to scratch interleaved like our
2355 * vertex data, we need to scale the vec4 index by 2.
2356 */
2357 int message_header_scale = 2;
2358
2359 /* Pre-gen6, the message header uses byte offsets instead of vec4
2360 * (16-byte) offset units.
2361 */
2362 if (intel->gen < 6)
2363 message_header_scale *= 16;
2364
2365 if (reladdr) {
2366 src_reg index = src_reg(this, glsl_type::int_type);
2367
2368 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2369 emit_before(inst, MUL(dst_reg(index),
2370 index, src_reg(message_header_scale)));
2371
2372 return index;
2373 } else {
2374 return src_reg(reg_offset * message_header_scale);
2375 }
2376 }
2377
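/**
 * Compute the offset source for a pull constant load of @reg_offset.
 *
 * Like get_scratch_offset(), but without the 2x interleaving scale,
 * since the constant buffer holds one vec4 per slot.
 */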
2378 src_reg
2379 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2380 src_reg *reladdr, int reg_offset)
2381 {
2382 if (reladdr) {
2383 src_reg index = src_reg(this, glsl_type::int_type);
2384
2385 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2386
2387 /* Pre-gen6, the message header uses byte offsets instead of vec4
2388 * (16-byte) offset units.
2389 */
2390 if (intel->gen < 6) {
2391 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2392 }
2393
2394 return index;
2395 } else {
2396 int message_header_scale = intel->gen < 6 ? 16 : 1;
2397 return src_reg(reg_offset * message_header_scale);
2398 }
2399 }
2400
2401 /**
2402 * Emits an instruction before @inst to load the value named by @orig_src
2403 * from scratch space at @base_offset to @temp.
2404 */
2405 void
2406 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2407 dst_reg temp, src_reg orig_src,
2408 int base_offset)
2409 {
2410 int reg_offset = base_offset + orig_src.reg_offset;
2411 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2412
2413 emit_before(inst, SCRATCH_READ(temp, index));
2414 }
2415
2416 /**
2417 * Emits an instruction after @inst to store the value to be written
2418 * to @orig_dst to scratch space at @base_offset, from @temp.
2419 */
2420 void
2421 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2422 src_reg temp, dst_reg orig_dst,
2423 int base_offset)
2424 {
2425 int reg_offset = base_offset + orig_dst.reg_offset;
2426 src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2427
2428 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2429 orig_dst.writemask));
2430 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2431 write->predicate = inst->predicate;
2432 write->ir = inst->ir;
2433 write->annotation = inst->annotation;
2434 inst->insert_after(write);
2435 }
2436
2437 /**
2438 * We can't generally support array access in GRF space, because a
2439 * single instruction's destination can only span 2 contiguous
2440 * registers. So, we send all GRF arrays that get variable index
2441 * access to scratch space.
2442 */
2443 void
2444 vec4_visitor::move_grf_array_access_to_scratch()
2445 {
2446 int scratch_loc[this->virtual_grf_count];
2447
2448 for (int i = 0; i < this->virtual_grf_count; i++) {
2449 scratch_loc[i] = -1;
2450 }
2451
2452 /* First, calculate the set of virtual GRFs that need to be punted
2453 * to scratch due to having any array access on them, and where in
2454 * scratch.
2455 */
2456 foreach_list(node, &this->instructions) {
2457 vec4_instruction *inst = (vec4_instruction *)node;
2458
2459 if (inst->dst.file == GRF && inst->dst.reladdr &&
2460 scratch_loc[inst->dst.reg] == -1) {
2461 scratch_loc[inst->dst.reg] = c->last_scratch;
2462 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2463 }
2464
2465 for (int i = 0 ; i < 3; i++) {
2466 src_reg *src = &inst->src[i];
2467
2468 if (src->file == GRF && src->reladdr &&
2469 scratch_loc[src->reg] == -1) {
2470 scratch_loc[src->reg] = c->last_scratch;
2471 c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2472 }
2473 }
2474 }
2475
2476 /* Now, for anything that will be accessed through scratch, rewrite
2477 * it to load/store. Note that this is a _safe list walk, because
2478 * we may generate a new scratch_write instruction after the one
2479 * we're processing.
2480 */
2481 foreach_list_safe(node, &this->instructions) {
2482 vec4_instruction *inst = (vec4_instruction *)node;
2483
2484       /* Set up the annotation tracking for newly generated instructions. */
2485 base_ir = inst->ir;
2486 current_annotation = inst->annotation;
2487
2488 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2489 src_reg temp = src_reg(this, glsl_type::vec4_type);
2490
2491 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2492
2493 inst->dst.file = temp.file;
2494 inst->dst.reg = temp.reg;
2495 inst->dst.reg_offset = temp.reg_offset;
2496 inst->dst.reladdr = NULL;
2497 }
2498
2499 for (int i = 0 ; i < 3; i++) {
2500 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2501 continue;
2502
2503 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2504
2505 emit_scratch_read(inst, temp, inst->src[i],
2506 scratch_loc[inst->src[i].reg]);
2507
2508 inst->src[i].file = temp.file;
2509 inst->src[i].reg = temp.reg;
2510 inst->src[i].reg_offset = temp.reg_offset;
2511 inst->src[i].reladdr = NULL;
2512 }
2513 }
2514 }
2515
2516 /**
2517 * Emits an instruction before @inst to load the value named by @orig_src
2518 * from the pull constant buffer (surface) at @base_offset to @temp.
2519 */
2520 void
2521 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2522 dst_reg temp, src_reg orig_src,
2523 int base_offset)
2524 {
2525 int reg_offset = base_offset + orig_src.reg_offset;
2526 src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2527 vec4_instruction *load;
2528
2529 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2530 temp, index);
2531 load->base_mrf = 14;
2532 load->mlen = 1;
2533 emit_before(inst, load);
2534 }
2535
2536 /**
2537 * Implements array access of uniforms by inserting a
2538 * PULL_CONSTANT_LOAD instruction.
2539 *
2540  * Unlike temporary GRF array access (which we don't support due to
2541 * the difficulty of doing relative addressing on instruction
2542 * destinations), we could potentially do array access of uniforms
2543 * that were loaded in GRF space as push constants. In real-world
2544 * usage we've seen, though, the arrays being used are always larger
2545 * than we could load as push constants, so just always move all
2546 * uniform array access out to a pull constant buffer.
2547 */
2548 void
2549 vec4_visitor::move_uniform_array_access_to_pull_constants()
2550 {
2551 int pull_constant_loc[this->uniforms];
2552
2553 for (int i = 0; i < this->uniforms; i++) {
2554 pull_constant_loc[i] = -1;
2555 }
2556
2557 /* Walk through and find array access of uniforms. Put a copy of that
2558 * uniform in the pull constant buffer.
2559 *
2560 * Note that we don't move constant-indexed accesses to arrays. No
2561 * testing has been done of the performance impact of this choice.
2562 */
2563 foreach_list_safe(node, &this->instructions) {
2564 vec4_instruction *inst = (vec4_instruction *)node;
2565
2566 for (int i = 0 ; i < 3; i++) {
2567 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2568 continue;
2569
2570 int uniform = inst->src[i].reg;
2571
2572 /* If this array isn't already present in the pull constant buffer,
2573 * add it.
2574 */
2575 if (pull_constant_loc[uniform] == -1) {
2576 const float **values = &prog_data->param[uniform * 4];
2577
2578 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2579
2580 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2581 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2582 }
2583 }
2584
2585          /* Set up the annotation tracking for newly generated instructions. */
2586 base_ir = inst->ir;
2587 current_annotation = inst->annotation;
2588
2589 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2590
2591 emit_pull_constant_load(inst, temp, inst->src[i],
2592 pull_constant_loc[uniform]);
2593
2594 inst->src[i].file = temp.file;
2595 inst->src[i].reg = temp.reg;
2596 inst->src[i].reg_offset = temp.reg_offset;
2597 inst->src[i].reladdr = NULL;
2598 }
2599 }
2600
2601 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2602 * no need to track them as larger-than-vec4 objects. This will be
2603 * relied on in cutting out unused uniform vectors from push
2604 * constants.
2605 */
2606 split_uniform_registers();
2607 }
2608
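/**
 * If @reg is a UD-typed source with the negate modifier set, bake the
 * negation into a fresh temporary with a MOV and rewrite @reg to use
 * it, since leaving the modifier on an unsigned operand can give
 * surprising results in later operations.
 */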
2609 void
2610 vec4_visitor::resolve_ud_negate(src_reg *reg)
2611 {
2612 if (reg->type != BRW_REGISTER_TYPE_UD ||
2613 !reg->negate)
2614 return;
2615
2616 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2617 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2618 *reg = temp;
2619 }
2620
2621 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2622 struct gl_shader_program *prog,
2623 struct brw_shader *shader)
2624 {
2625 this->c = c;
2626 this->p = &c->func;
2627 this->brw = p->brw;
2628 this->intel = &brw->intel;
2629 this->ctx = &intel->ctx;
2630 this->prog = prog;
2631 this->shader = shader;
2632
2633 this->mem_ctx = ralloc_context(NULL);
2634 this->failed = false;
2635
2636 this->base_ir = NULL;
2637 this->current_annotation = NULL;
2638
2640 this->vp = (struct gl_vertex_program *)
2641 prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2642 this->prog_data = &c->prog_data;
2643
2644 this->variable_ht = hash_table_ctor(0,
2645 hash_table_pointer_hash,
2646 hash_table_pointer_compare);
2647
2648 this->virtual_grf_def = NULL;
2649 this->virtual_grf_use = NULL;
2650 this->virtual_grf_sizes = NULL;
2651 this->virtual_grf_count = 0;
2652 this->virtual_grf_reg_map = NULL;
2653 this->virtual_grf_reg_count = 0;
2654 this->virtual_grf_array_size = 0;
2655 this->live_intervals_valid = false;
2656
2657 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2658
2659 this->uniforms = 0;
2660 }
2661
2662 vec4_visitor::~vec4_visitor()
2663 {
2664 ralloc_free(this->mem_ctx);
2665 hash_table_dtor(this->variable_ht);
2666 }
2667
2668
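/**
 * Mark the VS compile as failed, keeping the first failure message for
 * the caller.  When DEBUG_VS is set, the message is also printed to
 * stderr.
 */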
2669 void
2670 vec4_visitor::fail(const char *format, ...)
2671 {
2672 va_list va;
2673 char *msg;
2674
2675 if (failed)
2676 return;
2677
2678 failed = true;
2679
2680 va_start(va, format);
2681 msg = ralloc_vasprintf(mem_ctx, format, va);
2682 va_end(va);
2683 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2684
2685 this->fail_msg = msg;
2686
2687 if (INTEL_DEBUG & DEBUG_VS) {
2688 fprintf(stderr, "%s", msg);
2689 }
2690 }
2691
2692 } /* namespace brw */