i965/vs: Implement EXT_texture_swizzle support for VS texturing.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 src_reg::src_reg(dst_reg reg)
34 {
35 init();
36
37 this->file = reg.file;
38 this->reg = reg.reg;
39 this->reg_offset = reg.reg_offset;
40 this->type = reg.type;
41 this->reladdr = reg.reladdr;
42 this->fixed_hw_reg = reg.fixed_hw_reg;
43
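/* Convert the dst writemask into a src swizzle: the enabled channels are
 * packed into the leading swizzle slots and the last enabled channel is
 * replicated into the remaining ones (e.g. a writemask of XZ yields .xzzz).
 */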
44 int swizzles[4];
45 int next_chan = 0;
46 int last = 0;
47
48 for (int i = 0; i < 4; i++) {
49 if (!(reg.writemask & (1 << i)))
50 continue;
51
52 swizzles[next_chan++] = last = i;
53 }
54
55 for (; next_chan < 4; next_chan++) {
56 swizzles[next_chan] = last;
57 }
58
59 this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
60 swizzles[2], swizzles[3]);
61 }
62
63 dst_reg::dst_reg(src_reg reg)
64 {
65 init();
66
67 this->file = reg.file;
68 this->reg = reg.reg;
69 this->reg_offset = reg.reg_offset;
70 this->type = reg.type;
71 this->writemask = WRITEMASK_XYZW;
72 this->reladdr = reg.reladdr;
73 this->fixed_hw_reg = reg.fixed_hw_reg;
74 }
75
76 vec4_instruction::vec4_instruction(vec4_visitor *v,
77 enum opcode opcode, dst_reg dst,
78 src_reg src0, src_reg src1, src_reg src2)
79 {
80 this->opcode = opcode;
81 this->dst = dst;
82 this->src[0] = src0;
83 this->src[1] = src1;
84 this->src[2] = src2;
85 this->ir = v->base_ir;
86 this->annotation = v->current_annotation;
87 }
88
89 vec4_instruction *
90 vec4_visitor::emit(vec4_instruction *inst)
91 {
92 this->instructions.push_tail(inst);
93
94 return inst;
95 }
96
97 vec4_instruction *
98 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
99 {
100 new_inst->ir = inst->ir;
101 new_inst->annotation = inst->annotation;
102
103 inst->insert_before(new_inst);
104
105 return inst;
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
110 src_reg src0, src_reg src1, src_reg src2)
111 {
112 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
113 src0, src1, src2));
114 }
115
116
117 vec4_instruction *
118 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
119 {
120 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
121 }
122
123 vec4_instruction *
124 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
125 {
126 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
127 }
128
129 vec4_instruction *
130 vec4_visitor::emit(enum opcode opcode)
131 {
132 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
133 }
134
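/* The ALU1/ALU2 macros below generate thin builder methods: each one
 * constructs a vec4_instruction for the matching BRW opcode without
 * emitting it, so callers can tweak the instruction before passing it
 * to emit().
 */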
135 #define ALU1(op) \
136 vec4_instruction * \
137 vec4_visitor::op(dst_reg dst, src_reg src0) \
138 { \
139 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
140 src0); \
141 }
142
143 #define ALU2(op) \
144 vec4_instruction * \
145 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
146 { \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU2(ADD)
158 ALU2(MUL)
159 ALU2(MACH)
160 ALU2(AND)
161 ALU2(OR)
162 ALU2(XOR)
163 ALU2(DP3)
164 ALU2(DP4)
165
166 /** Gen4 predicated IF. */
167 vec4_instruction *
168 vec4_visitor::IF(uint32_t predicate)
169 {
170 vec4_instruction *inst;
171
172 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
173 inst->predicate = predicate;
174
175 return inst;
176 }
177
178 /** Gen6+ IF with embedded comparison. */
179 vec4_instruction *
180 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
181 {
182 assert(intel->gen >= 6);
183
184 vec4_instruction *inst;
185
186 resolve_ud_negate(&src0);
187 resolve_ud_negate(&src1);
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
190 src0, src1);
191 inst->conditional_mod = condition;
192
193 return inst;
194 }
195
196 /**
197 * CMP: Sets the low bit of the destination channels with the result
198 * of the comparison, while the upper bits are undefined, and updates
199 * the flag register with the packed 16 bits of the result.
200 */
201 vec4_instruction *
202 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
203 {
204 vec4_instruction *inst;
205
206 /* original gen4 does type conversion to the destination type
207 * before comparison, producing garbage results for floating
208 * point comparisons.
209 */
210 if (intel->gen == 4) {
211 dst.type = src0.type;
212 if (dst.file == HW_REG)
213 dst.fixed_hw_reg.type = dst.type;
214 }
215
216 resolve_ud_negate(&src0);
217 resolve_ud_negate(&src1);
218
219 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
220 inst->conditional_mod = condition;
221
222 return inst;
223 }
224
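/* Scratch access uses send messages with fixed base MRFs: reads are a
 * one-register message (m14), writes a two-register message (m13, the
 * header plus the data to be stored).
 */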
225 vec4_instruction *
226 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
227 {
228 vec4_instruction *inst;
229
230 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
231 dst, index);
232 inst->base_mrf = 14;
233 inst->mlen = 1;
234
235 return inst;
236 }
237
238 vec4_instruction *
239 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
240 {
241 vec4_instruction *inst;
242
243 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
244 dst, src, index);
245 inst->base_mrf = 13;
246 inst->mlen = 2;
247
248 return inst;
249 }
250
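/* Emit a DP2/DP3/DP4 dot product, selected by the component count (2-4). */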
251 void
252 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
253 {
254 static enum opcode dot_opcodes[] = {
255 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
256 };
257
258 emit(dot_opcodes[elements - 2], dst, src0, src1);
259 }
260
261 void
262 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
263 {
264 /* The gen6 math instruction ignores the source modifiers --
265 * swizzle, abs, negate, and at least some parts of the register
266 * region description.
267 *
268 * While it would seem that this MOV could be avoided at this point
269 * in the case that the swizzle is matched up with the destination
270 * writemask, note that uniform packing and register allocation
271 * could rearrange our swizzle, so let's leave this matter up to
272 * copy propagation later.
273 */
274 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
275 emit(MOV(dst_reg(temp_src), src));
276
277 if (dst.writemask != WRITEMASK_XYZW) {
278 /* The gen6 math instruction must be align1, so we can't do
279 * writemasks.
280 */
281 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
282
283 emit(opcode, temp_dst, temp_src);
284
285 emit(MOV(dst, src_reg(temp_dst)));
286 } else {
287 emit(opcode, dst, temp_src);
288 }
289 }
290
291 void
292 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
293 {
294 vec4_instruction *inst = emit(opcode, dst, src);
295 inst->base_mrf = 1;
296 inst->mlen = 1;
297 }
298
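/* Emit a one-source math instruction, validating the opcode and
 * dispatching to the gen4 (message-based) or gen6+ implementation.
 */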
299 void
300 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
301 {
302 switch (opcode) {
303 case SHADER_OPCODE_RCP:
304 case SHADER_OPCODE_RSQ:
305 case SHADER_OPCODE_SQRT:
306 case SHADER_OPCODE_EXP2:
307 case SHADER_OPCODE_LOG2:
308 case SHADER_OPCODE_SIN:
309 case SHADER_OPCODE_COS:
310 break;
311 default:
312 assert(!"not reached: bad math opcode");
313 return;
314 }
315
316 if (intel->gen >= 6) {
317 return emit_math1_gen6(opcode, dst, src);
318 } else {
319 return emit_math1_gen4(opcode, dst, src);
320 }
321 }
322
323 void
324 vec4_visitor::emit_math2_gen6(enum opcode opcode,
325 dst_reg dst, src_reg src0, src_reg src1)
326 {
327 src_reg expanded;
328
329 /* The gen6 math instruction ignores the source modifiers --
330 * swizzle, abs, negate, and at least some parts of the register
331 * region description. Move the sources to temporaries to make it
332 * generally work.
333 */
334
335 expanded = src_reg(this, glsl_type::vec4_type);
336 expanded.type = src0.type;
337 emit(MOV(dst_reg(expanded), src0));
338 src0 = expanded;
339
340 expanded = src_reg(this, glsl_type::vec4_type);
341 expanded.type = src1.type;
342 emit(MOV(dst_reg(expanded), src1));
343 src1 = expanded;
344
345 if (dst.writemask != WRITEMASK_XYZW) {
346 /* The gen6 math instruction must be align1, so we can't do
347 * writemasks.
348 */
349 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
350 temp_dst.type = dst.type;
351
352 emit(opcode, temp_dst, src0, src1);
353
354 emit(MOV(dst, src_reg(temp_dst)));
355 } else {
356 emit(opcode, dst, src0, src1);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen4(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 vec4_instruction *inst = emit(opcode, dst, src0, src1);
365 inst->base_mrf = 1;
366 inst->mlen = 2;
367 }
368
369 void
370 vec4_visitor::emit_math(enum opcode opcode,
371 dst_reg dst, src_reg src0, src_reg src1)
372 {
373 switch (opcode) {
374 case SHADER_OPCODE_POW:
375 case SHADER_OPCODE_INT_QUOTIENT:
376 case SHADER_OPCODE_INT_REMAINDER:
377 break;
378 default:
379 assert(!"not reached: unsupported binary math opcode");
380 return;
381 }
382
383 if (intel->gen >= 6) {
384 return emit_math2_gen6(opcode, dst, src0, src1);
385 } else {
386 return emit_math2_gen4(opcode, dst, src0, src1);
387 }
388 }
389
390 void
391 vec4_visitor::visit_instructions(const exec_list *list)
392 {
393 foreach_list(node, list) {
394 ir_instruction *ir = (ir_instruction *)node;
395
396 base_ir = ir;
397 ir->accept(this);
398 }
399 }
400
401
402 static int
403 type_size(const struct glsl_type *type)
404 {
405 unsigned int i;
406 int size;
407
408 switch (type->base_type) {
409 case GLSL_TYPE_UINT:
410 case GLSL_TYPE_INT:
411 case GLSL_TYPE_FLOAT:
412 case GLSL_TYPE_BOOL:
413 if (type->is_matrix()) {
414 return type->matrix_columns;
415 } else {
416 /* Regardless of the size of the vector, it gets a vec4. This is bad
417 * packing for things like floats, but otherwise arrays become a
418 * mess. Hopefully a later pass over the code can pack scalars
419 * down if appropriate.
420 */
421 return 1;
422 }
423 case GLSL_TYPE_ARRAY:
424 assert(type->length > 0);
425 return type_size(type->fields.array) * type->length;
426 case GLSL_TYPE_STRUCT:
427 size = 0;
428 for (i = 0; i < type->length; i++) {
429 size += type_size(type->fields.structure[i].type);
430 }
431 return size;
432 case GLSL_TYPE_SAMPLER:
433 /* Samplers take up one slot in UNIFORMS[], but they're baked in
434 * at link time.
435 */
436 return 1;
437 default:
438 assert(0);
439 return 0;
440 }
441 }
442
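/* Allocate a new virtual GRF of the given size in vec4 registers,
 * growing the size and register-offset tracking arrays as needed, and
 * return its virtual GRF index.
 */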
443 int
444 vec4_visitor::virtual_grf_alloc(int size)
445 {
446 if (virtual_grf_array_size <= virtual_grf_count) {
447 if (virtual_grf_array_size == 0)
448 virtual_grf_array_size = 16;
449 else
450 virtual_grf_array_size *= 2;
451 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
452 virtual_grf_array_size);
453 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
454 virtual_grf_array_size);
455 }
456 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
457 virtual_grf_reg_count += size;
458 virtual_grf_sizes[virtual_grf_count] = size;
459 return virtual_grf_count++;
460 }
461
462 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
463 {
464 init();
465
466 this->file = GRF;
467 this->reg = v->virtual_grf_alloc(type_size(type));
468
469 if (type->is_array() || type->is_record()) {
470 this->swizzle = BRW_SWIZZLE_NOOP;
471 } else {
472 this->swizzle = swizzle_for_size(type->vector_elements);
473 }
474
475 this->type = brw_type_for_base_type(type);
476 }
477
478 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
479 {
480 init();
481
482 this->file = GRF;
483 this->reg = v->virtual_grf_alloc(type_size(type));
484
485 if (type->is_array() || type->is_record()) {
486 this->writemask = WRITEMASK_XYZW;
487 } else {
488 this->writemask = (1 << type->vector_elements) - 1;
489 }
490
491 this->type = brw_type_for_base_type(type);
492 }
493
494 /* Our support for uniforms is piggy-backed on the struct
495 * gl_vertex_program, because that's where the values actually
496 * get stored, rather than in some global gl_shader_program uniform
497 * store.
498 */
499 int
500 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
501 {
502 unsigned int offset = 0;
503 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
504
505 if (type->is_matrix()) {
506 const glsl_type *column = type->column_type();
507
508 for (unsigned int i = 0; i < type->matrix_columns; i++) {
509 offset += setup_uniform_values(loc + offset, column);
510 }
511
512 return offset;
513 }
514
515 switch (type->base_type) {
516 case GLSL_TYPE_FLOAT:
517 case GLSL_TYPE_UINT:
518 case GLSL_TYPE_INT:
519 case GLSL_TYPE_BOOL:
520 for (unsigned int i = 0; i < type->vector_elements; i++) {
521 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
522 }
523
524 /* Set up pad elements to get things aligned to a vec4 boundary. */
525 for (unsigned int i = type->vector_elements; i < 4; i++) {
526 static float zero = 0;
527
528 c->prog_data.param[this->uniforms * 4 + i] = &zero;
529 }
530
531 /* Track the size of this uniform vector, for future packing of
532 * uniforms.
533 */
534 this->uniform_vector_size[this->uniforms] = type->vector_elements;
535 this->uniforms++;
536
537 return 1;
538
539 case GLSL_TYPE_STRUCT:
540 for (unsigned int i = 0; i < type->length; i++) {
541 offset += setup_uniform_values(loc + offset,
542 type->fields.structure[i].type);
543 }
544 return offset;
545
546 case GLSL_TYPE_ARRAY:
547 for (unsigned int i = 0; i < type->length; i++) {
548 offset += setup_uniform_values(loc + offset, type->fields.array);
549 }
550 return offset;
551
552 case GLSL_TYPE_SAMPLER:
553 /* The sampler takes up a slot, but we don't use any values from it. */
554 return 1;
555
556 default:
557 assert(!"not reached");
558 return 0;
559 }
560 }
561
562 void
563 vec4_visitor::setup_uniform_clipplane_values()
564 {
565 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
566
567 /* Pre-Gen6, we compact clip planes. For example, if the user
568 * enables just clip planes 0, 1, and 3, we will enable clip planes
569 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
570 * plane 2. This simplifies the implementation of the Gen6 clip
571 * thread.
572 *
573 * In Gen6 and later, we don't compact clip planes, because this
574 * simplifies the implementation of gl_ClipDistance.
575 */
576 int compacted_clipplane_index = 0;
577 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
578 if (intel->gen < 6 &&
579 !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
580 continue;
581 }
582 this->uniform_vector_size[this->uniforms] = 4;
583 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
584 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
585 for (int j = 0; j < 4; ++j) {
586 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
587 }
588 ++compacted_clipplane_index;
589 ++this->uniforms;
590 }
591 }
592
593 /* Our support for builtin uniforms is even scarier than non-builtin.
594 * It sits on top of the PROG_STATE_VAR parameters that are
595 * automatically updated from GL context state.
596 */
597 void
598 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
599 {
600 const ir_state_slot *const slots = ir->state_slots;
601 assert(ir->state_slots != NULL);
602
603 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
604 /* This state reference has already been setup by ir_to_mesa,
605 * but we'll get the same index back here. We can reference
606 * ParameterValues directly, since unlike brw_fs.cpp, we never
607 * add new state references during compile.
608 */
609 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
610 (gl_state_index *)slots[i].tokens);
611 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
612
613 this->uniform_vector_size[this->uniforms] = 0;
614 /* Add each of the unique swizzled channels of the element.
615 * This will end up matching the size of the glsl_type of this field.
616 */
617 int last_swiz = -1;
618 for (unsigned int j = 0; j < 4; j++) {
619 int swiz = GET_SWZ(slots[i].swizzle, j);
620 last_swiz = swiz;
621
622 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
623 if (swiz <= last_swiz)
624 this->uniform_vector_size[this->uniforms]++;
625 }
626 this->uniforms++;
627 }
628 }
629
630 dst_reg *
631 vec4_visitor::variable_storage(ir_variable *var)
632 {
633 return (dst_reg *)hash_table_find(this->variable_ht, var);
634 }
635
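/* Evaluate a boolean rvalue into the flag register via a conditional mod
 * and report the predicate the caller should use: ALL4H/ANY4H for the
 * vector all/any comparisons, BRW_PREDICATE_NORMAL otherwise.
 */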
636 void
637 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
638 {
639 ir_expression *expr = ir->as_expression();
640
641 *predicate = BRW_PREDICATE_NORMAL;
642
643 if (expr) {
644 src_reg op[2];
645 vec4_instruction *inst;
646
647 assert(expr->get_num_operands() <= 2);
648 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
649 expr->operands[i]->accept(this);
650 op[i] = this->result;
651
652 resolve_ud_negate(&op[i]);
653 }
654
655 switch (expr->operation) {
656 case ir_unop_logic_not:
657 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
658 inst->conditional_mod = BRW_CONDITIONAL_Z;
659 break;
660
661 case ir_binop_logic_xor:
662 inst = emit(XOR(dst_null_d(), op[0], op[1]));
663 inst->conditional_mod = BRW_CONDITIONAL_NZ;
664 break;
665
666 case ir_binop_logic_or:
667 inst = emit(OR(dst_null_d(), op[0], op[1]));
668 inst->conditional_mod = BRW_CONDITIONAL_NZ;
669 break;
670
671 case ir_binop_logic_and:
672 inst = emit(AND(dst_null_d(), op[0], op[1]));
673 inst->conditional_mod = BRW_CONDITIONAL_NZ;
674 break;
675
676 case ir_unop_f2b:
677 if (intel->gen >= 6) {
678 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
679 } else {
680 inst = emit(MOV(dst_null_f(), op[0]));
681 inst->conditional_mod = BRW_CONDITIONAL_NZ;
682 }
683 break;
684
685 case ir_unop_i2b:
686 if (intel->gen >= 6) {
687 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
688 } else {
689 inst = emit(MOV(dst_null_d(), op[0]));
690 inst->conditional_mod = BRW_CONDITIONAL_NZ;
691 }
692 break;
693
694 case ir_binop_all_equal:
695 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
696 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
697 break;
698
699 case ir_binop_any_nequal:
700 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
701 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
702 break;
703
704 case ir_unop_any:
705 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
706 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
707 break;
708
709 case ir_binop_greater:
710 case ir_binop_gequal:
711 case ir_binop_less:
712 case ir_binop_lequal:
713 case ir_binop_equal:
714 case ir_binop_nequal:
715 emit(CMP(dst_null_d(), op[0], op[1],
716 brw_conditional_for_comparison(expr->operation)));
717 break;
718
719 default:
720 assert(!"not reached");
721 break;
722 }
723 return;
724 }
725
726 ir->accept(this);
727
728 resolve_ud_negate(&this->result);
729
730 if (intel->gen >= 6) {
731 vec4_instruction *inst = emit(AND(dst_null_d(),
732 this->result, src_reg(1)));
733 inst->conditional_mod = BRW_CONDITIONAL_NZ;
734 } else {
735 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
736 inst->conditional_mod = BRW_CONDITIONAL_NZ;
737 }
738 }
739
740 /**
741 * Emit a gen6 IF statement with the comparison folded into the IF
742 * instruction.
743 */
744 void
745 vec4_visitor::emit_if_gen6(ir_if *ir)
746 {
747 ir_expression *expr = ir->condition->as_expression();
748
749 if (expr) {
750 src_reg op[2];
751 dst_reg temp;
752
753 assert(expr->get_num_operands() <= 2);
754 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
755 expr->operands[i]->accept(this);
756 op[i] = this->result;
757 }
758
759 switch (expr->operation) {
760 case ir_unop_logic_not:
761 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
762 return;
763
764 case ir_binop_logic_xor:
765 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
766 return;
767
768 case ir_binop_logic_or:
769 temp = dst_reg(this, glsl_type::bool_type);
770 emit(OR(temp, op[0], op[1]));
771 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
772 return;
773
774 case ir_binop_logic_and:
775 temp = dst_reg(this, glsl_type::bool_type);
776 emit(AND(temp, op[0], op[1]));
777 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
778 return;
779
780 case ir_unop_f2b:
781 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
782 return;
783
784 case ir_unop_i2b:
785 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
786 return;
787
788 case ir_binop_greater:
789 case ir_binop_gequal:
790 case ir_binop_less:
791 case ir_binop_lequal:
792 case ir_binop_equal:
793 case ir_binop_nequal:
794 emit(IF(op[0], op[1],
795 brw_conditional_for_comparison(expr->operation)));
796 return;
797
798 case ir_binop_all_equal:
799 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
800 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
801 return;
802
803 case ir_binop_any_nequal:
804 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
805 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
806 return;
807
808 case ir_unop_any:
809 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
810 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
811 return;
812
813 default:
814 assert(!"not reached");
815 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
816 return;
817 }
818 return;
819 }
820
821 ir->condition->accept(this);
822
823 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
824 }
825
826 void
827 vec4_visitor::visit(ir_variable *ir)
828 {
829 dst_reg *reg = NULL;
830
831 if (variable_storage(ir))
832 return;
833
834 switch (ir->mode) {
835 case ir_var_in:
836 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
837
838 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
839 * come in as floating point conversions of the integer values.
840 */
841 for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
842 if (!c->key.gl_fixed_input_size[i])
843 continue;
844
845 dst_reg dst = *reg;
846 dst.type = brw_type_for_base_type(ir->type);
847 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
848 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
849 }
850 break;
851
852 case ir_var_out:
853 reg = new(mem_ctx) dst_reg(this, ir->type);
854
855 for (int i = 0; i < type_size(ir->type); i++) {
856 output_reg[ir->location + i] = *reg;
857 output_reg[ir->location + i].reg_offset = i;
858 output_reg[ir->location + i].type =
859 brw_type_for_base_type(ir->type->get_scalar_type());
860 output_reg_annotation[ir->location + i] = ir->name;
861 }
862 break;
863
864 case ir_var_auto:
865 case ir_var_temporary:
866 reg = new(mem_ctx) dst_reg(this, ir->type);
867 break;
868
869 case ir_var_uniform:
870 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
871
872 /* Track how big the whole uniform variable is, in case we need to put a
873 * copy of its data into pull constants for array access.
874 */
875 this->uniform_size[this->uniforms] = type_size(ir->type);
876
877 if (!strncmp(ir->name, "gl_", 3)) {
878 setup_builtin_uniform_values(ir);
879 } else {
880 setup_uniform_values(ir->location, ir->type);
881 }
882 break;
883
884 case ir_var_system_value:
885 /* VertexID is stored by the VF as the last vertex element, but
886 * we don't represent it with a flag in inputs_read, so we call
887 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
888 */
889 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
890 prog_data->uses_vertexid = true;
891
892 switch (ir->location) {
893 case SYSTEM_VALUE_VERTEX_ID:
894 reg->writemask = WRITEMASK_X;
895 break;
896 case SYSTEM_VALUE_INSTANCE_ID:
897 reg->writemask = WRITEMASK_Y;
898 break;
899 default:
900 assert(!"not reached");
901 break;
902 }
903 break;
904
905 default:
906 assert(!"not reached");
907 }
908
909 reg->type = brw_type_for_base_type(ir->type);
910 hash_table_insert(this->variable_ht, reg, ir);
911 }
912
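/* Lower ir_loop to the hardware DO/WHILE construct: initialize the loop
 * counter if one exists, test it against the "to" bound with a predicated
 * BREAK at the top of the body, and apply the increment before the WHILE.
 */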
913 void
914 vec4_visitor::visit(ir_loop *ir)
915 {
916 dst_reg counter;
917
918 /* We don't want debugging output to print the whole body of the
919 * loop as the annotation.
920 */
921 this->base_ir = NULL;
922
923 if (ir->counter != NULL) {
924 this->base_ir = ir->counter;
925 ir->counter->accept(this);
926 counter = *(variable_storage(ir->counter));
927
928 if (ir->from != NULL) {
929 this->base_ir = ir->from;
930 ir->from->accept(this);
931
932 emit(MOV(counter, this->result));
933 }
934 }
935
936 emit(BRW_OPCODE_DO);
937
938 if (ir->to) {
939 this->base_ir = ir->to;
940 ir->to->accept(this);
941
942 emit(CMP(dst_null_d(), src_reg(counter), this->result,
943 brw_conditional_for_comparison(ir->cmp)));
944
945 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
946 inst->predicate = BRW_PREDICATE_NORMAL;
947 }
948
949 visit_instructions(&ir->body_instructions);
950
951
952 if (ir->increment) {
953 this->base_ir = ir->increment;
954 ir->increment->accept(this);
955 emit(ADD(counter, src_reg(counter), this->result));
956 }
957
958 emit(BRW_OPCODE_WHILE);
959 }
960
961 void
962 vec4_visitor::visit(ir_loop_jump *ir)
963 {
964 switch (ir->mode) {
965 case ir_loop_jump::jump_break:
966 emit(BRW_OPCODE_BREAK);
967 break;
968 case ir_loop_jump::jump_continue:
969 emit(BRW_OPCODE_CONTINUE);
970 break;
971 }
972 }
973
974
975 void
976 vec4_visitor::visit(ir_function_signature *ir)
977 {
978 assert(0);
979 (void)ir;
980 }
981
982 void
983 vec4_visitor::visit(ir_function *ir)
984 {
985 /* Ignore function bodies other than main() -- we shouldn't see calls to
986 * them since they should all be inlined.
987 */
988 if (strcmp(ir->name, "main") == 0) {
989 const ir_function_signature *sig;
990 exec_list empty;
991
992 sig = ir->matching_signature(&empty);
993
994 assert(sig);
995
996 visit_instructions(&sig->body);
997 }
998 }
999
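/* If the expression is just a saturate of another rvalue, emit it as a
 * saturating MOV into a fresh temporary and return true so the caller can
 * skip the normal expression handling.
 */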
1000 bool
1001 vec4_visitor::try_emit_sat(ir_expression *ir)
1002 {
1003 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1004 if (!sat_src)
1005 return false;
1006
1007 sat_src->accept(this);
1008 src_reg src = this->result;
1009
1010 this->result = src_reg(this, ir->type);
1011 vec4_instruction *inst;
1012 inst = emit(MOV(dst_reg(this->result), src));
1013 inst->saturate = true;
1014
1015 return true;
1016 }
1017
1018 void
1019 vec4_visitor::emit_bool_comparison(unsigned int op,
1020 dst_reg dst, src_reg src0, src_reg src1)
1021 {
1022 /* original gen4 does destination conversion before comparison. */
1023 if (intel->gen < 5)
1024 dst.type = src0.type;
1025
1026 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1027
1028 dst.type = BRW_REGISTER_TYPE_D;
1029 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1030 }
1031
1032 void
1033 vec4_visitor::visit(ir_expression *ir)
1034 {
1035 unsigned int operand;
1036 src_reg op[Elements(ir->operands)];
1037 src_reg result_src;
1038 dst_reg result_dst;
1039 vec4_instruction *inst;
1040
1041 if (try_emit_sat(ir))
1042 return;
1043
1044 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1045 this->result.file = BAD_FILE;
1046 ir->operands[operand]->accept(this);
1047 if (this->result.file == BAD_FILE) {
1048 printf("Failed to get tree for expression operand:\n");
1049 ir->operands[operand]->print();
1050 exit(1);
1051 }
1052 op[operand] = this->result;
1053
1054 /* Matrix expression operands should have been broken down to vector
1055 * operations already.
1056 */
1057 assert(!ir->operands[operand]->type->is_matrix());
1058 }
1059
1060 int vector_elements = ir->operands[0]->type->vector_elements;
1061 if (ir->operands[1]) {
1062 vector_elements = MAX2(vector_elements,
1063 ir->operands[1]->type->vector_elements);
1064 }
1065
1066 this->result.file = BAD_FILE;
1067
1068 /* Storage for our result. Ideally for an assignment we'd be using
1069 * the actual storage for the result here, instead.
1070 */
1071 result_src = src_reg(this, ir->type);
1072 /* convenience for the emit functions below. */
1073 result_dst = dst_reg(result_src);
1074 /* If nothing special happens, this is the result. */
1075 this->result = result_src;
1076 /* Limit writes to the channels that will be used by result_src later.
1077 * This does limit this temp's use as a temporary for multi-instruction
1078 * sequences.
1079 */
1080 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1081
1082 switch (ir->operation) {
1083 case ir_unop_logic_not:
1084 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1085 * the one's complement of the whole register, not just bit 0.
1086 */
1087 emit(XOR(result_dst, op[0], src_reg(1)));
1088 break;
1089 case ir_unop_neg:
1090 op[0].negate = !op[0].negate;
1091 this->result = op[0];
1092 break;
1093 case ir_unop_abs:
1094 op[0].abs = true;
1095 op[0].negate = false;
1096 this->result = op[0];
1097 break;
1098
1099 case ir_unop_sign:
1100 emit(MOV(result_dst, src_reg(0.0f)));
1101
1102 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1103 inst = emit(MOV(result_dst, src_reg(1.0f)));
1104 inst->predicate = BRW_PREDICATE_NORMAL;
1105
1106 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1107 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1108 inst->predicate = BRW_PREDICATE_NORMAL;
1109
1110 break;
1111
1112 case ir_unop_rcp:
1113 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1114 break;
1115
1116 case ir_unop_exp2:
1117 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1118 break;
1119 case ir_unop_log2:
1120 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1121 break;
1122 case ir_unop_exp:
1123 case ir_unop_log:
1124 assert(!"not reached: should be handled by ir_explog_to_explog2");
1125 break;
1126 case ir_unop_sin:
1127 case ir_unop_sin_reduced:
1128 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1129 break;
1130 case ir_unop_cos:
1131 case ir_unop_cos_reduced:
1132 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1133 break;
1134
1135 case ir_unop_dFdx:
1136 case ir_unop_dFdy:
1137 assert(!"derivatives not valid in vertex shader");
1138 break;
1139
1140 case ir_unop_noise:
1141 assert(!"not reached: should be handled by lower_noise");
1142 break;
1143
1144 case ir_binop_add:
1145 emit(ADD(result_dst, op[0], op[1]));
1146 break;
1147 case ir_binop_sub:
1148 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1149 break;
1150
1151 case ir_binop_mul:
1152 if (ir->type->is_integer()) {
1153 /* For integer multiplication, the MUL uses the low 16 bits
1154 * of one of the operands (src0 on gen6, src1 on gen7). The
1155 * MACH accumulates in the contribution of the upper 16 bits
1156 * of that operand.
1157 *
1158 * FINISHME: Emit just the MUL if we know an operand is small
1159 * enough.
1160 */
1161 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1162
1163 emit(MUL(acc, op[0], op[1]));
1164 emit(MACH(dst_null_d(), op[0], op[1]));
1165 emit(MOV(result_dst, src_reg(acc)));
1166 } else {
1167 emit(MUL(result_dst, op[0], op[1]));
1168 }
1169 break;
1170 case ir_binop_div:
1171 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1172 assert(ir->type->is_integer());
1173 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1174 break;
1175 case ir_binop_mod:
1176 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1177 assert(ir->type->is_integer());
1178 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1179 break;
1180
1181 case ir_binop_less:
1182 case ir_binop_greater:
1183 case ir_binop_lequal:
1184 case ir_binop_gequal:
1185 case ir_binop_equal:
1186 case ir_binop_nequal: {
1187 emit(CMP(result_dst, op[0], op[1],
1188 brw_conditional_for_comparison(ir->operation)));
1189 emit(AND(result_dst, result_src, src_reg(0x1)));
1190 break;
1191 }
1192
1193 case ir_binop_all_equal:
1194 /* "==" operator producing a scalar boolean. */
1195 if (ir->operands[0]->type->is_vector() ||
1196 ir->operands[1]->type->is_vector()) {
1197 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1198 emit(MOV(result_dst, src_reg(0)));
1199 inst = emit(MOV(result_dst, src_reg(1)));
1200 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1201 } else {
1202 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1203 emit(AND(result_dst, result_src, src_reg(0x1)));
1204 }
1205 break;
1206 case ir_binop_any_nequal:
1207 /* "!=" operator producing a scalar boolean. */
1208 if (ir->operands[0]->type->is_vector() ||
1209 ir->operands[1]->type->is_vector()) {
1210 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1211
1212 emit(MOV(result_dst, src_reg(0)));
1213 inst = emit(MOV(result_dst, src_reg(1)));
1214 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1215 } else {
1216 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1217 emit(AND(result_dst, result_src, src_reg(0x1)));
1218 }
1219 break;
1220
1221 case ir_unop_any:
1222 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1223 emit(MOV(result_dst, src_reg(0)));
1224
1225 inst = emit(MOV(result_dst, src_reg(1)));
1226 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1227 break;
1228
1229 case ir_binop_logic_xor:
1230 emit(XOR(result_dst, op[0], op[1]));
1231 break;
1232
1233 case ir_binop_logic_or:
1234 emit(OR(result_dst, op[0], op[1]));
1235 break;
1236
1237 case ir_binop_logic_and:
1238 emit(AND(result_dst, op[0], op[1]));
1239 break;
1240
1241 case ir_binop_dot:
1242 assert(ir->operands[0]->type->is_vector());
1243 assert(ir->operands[0]->type == ir->operands[1]->type);
1244 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1245 break;
1246
1247 case ir_unop_sqrt:
1248 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1249 break;
1250 case ir_unop_rsq:
1251 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1252 break;
1253 case ir_unop_i2f:
1254 case ir_unop_i2u:
1255 case ir_unop_u2i:
1256 case ir_unop_u2f:
1257 case ir_unop_b2f:
1258 case ir_unop_b2i:
1259 case ir_unop_f2i:
1260 emit(MOV(result_dst, op[0]));
1261 break;
1262 case ir_unop_f2b:
1263 case ir_unop_i2b: {
1264 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1265 emit(AND(result_dst, result_src, src_reg(1)));
1266 break;
1267 }
1268
1269 case ir_unop_trunc:
1270 emit(RNDZ(result_dst, op[0]));
1271 break;
1272 case ir_unop_ceil:
1273 op[0].negate = !op[0].negate;
1274 inst = emit(RNDD(result_dst, op[0]));
1275 this->result.negate = true;
1276 break;
1277 case ir_unop_floor:
1278 inst = emit(RNDD(result_dst, op[0]));
1279 break;
1280 case ir_unop_fract:
1281 inst = emit(FRC(result_dst, op[0]));
1282 break;
1283 case ir_unop_round_even:
1284 emit(RNDE(result_dst, op[0]));
1285 break;
1286
1287 case ir_binop_min:
1288 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1289
1290 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1291 inst->predicate = BRW_PREDICATE_NORMAL;
1292 break;
1293 case ir_binop_max:
1294 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1295
1296 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1297 inst->predicate = BRW_PREDICATE_NORMAL;
1298 break;
1299
1300 case ir_binop_pow:
1301 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1302 break;
1303
1304 case ir_unop_bit_not:
1305 inst = emit(NOT(result_dst, op[0]));
1306 break;
1307 case ir_binop_bit_and:
1308 inst = emit(AND(result_dst, op[0], op[1]));
1309 break;
1310 case ir_binop_bit_xor:
1311 inst = emit(XOR(result_dst, op[0], op[1]));
1312 break;
1313 case ir_binop_bit_or:
1314 inst = emit(OR(result_dst, op[0], op[1]));
1315 break;
1316
1317 case ir_binop_lshift:
1318 inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1319 break;
1320
1321 case ir_binop_rshift:
1322 if (ir->type->base_type == GLSL_TYPE_INT)
1323 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1324 else
1325 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1326 break;
1327
1328 case ir_quadop_vector:
1329 assert(!"not reached: should be handled by lower_quadop_vector");
1330 break;
1331 }
1332 }
1333
1334
1335 void
1336 vec4_visitor::visit(ir_swizzle *ir)
1337 {
1338 src_reg src;
1339 int i = 0;
1340 int swizzle[4];
1341
1342 /* Note that this only handles swizzles in expressions, not those on the left
1343 * hand side of an assignment, which do write masking. See ir_assignment
1344 * for that.
1345 */
1346
1347 ir->val->accept(this);
1348 src = this->result;
1349 assert(src.file != BAD_FILE);
1350
1351 for (i = 0; i < ir->type->vector_elements; i++) {
1352 switch (i) {
1353 case 0:
1354 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1355 break;
1356 case 1:
1357 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1358 break;
1359 case 2:
1360 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1361 break;
1362 case 3:
1363 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1364 break;
1365 }
1366 }
1367 for (; i < 4; i++) {
1368 /* Replicate the last channel out. */
1369 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1370 }
1371
1372 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1373
1374 this->result = src;
1375 }
1376
1377 void
1378 vec4_visitor::visit(ir_dereference_variable *ir)
1379 {
1380 const struct glsl_type *type = ir->type;
1381 dst_reg *reg = variable_storage(ir->var);
1382
1383 if (!reg) {
1384 fail("Failed to find variable storage for %s\n", ir->var->name);
1385 this->result = src_reg(brw_null_reg());
1386 return;
1387 }
1388
1389 this->result = src_reg(*reg);
1390
1391 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1392 this->result.swizzle = swizzle_for_size(type->vector_elements);
1393 }
1394
1395 void
1396 vec4_visitor::visit(ir_dereference_array *ir)
1397 {
1398 ir_constant *constant_index;
1399 src_reg src;
1400 int element_size = type_size(ir->type);
1401
1402 constant_index = ir->array_index->constant_expression_value();
1403
1404 ir->array->accept(this);
1405 src = this->result;
1406
1407 if (constant_index) {
1408 src.reg_offset += constant_index->value.i[0] * element_size;
1409 } else {
1410 /* Variable index array dereference. It eats the "vec4" of the
1411 * base of the array and an index that offsets the Mesa register
1412 * index.
1413 */
1414 ir->array_index->accept(this);
1415
1416 src_reg index_reg;
1417
1418 if (element_size == 1) {
1419 index_reg = this->result;
1420 } else {
1421 index_reg = src_reg(this, glsl_type::int_type);
1422
1423 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1424 }
1425
1426 if (src.reladdr) {
1427 src_reg temp = src_reg(this, glsl_type::int_type);
1428
1429 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1430
1431 index_reg = temp;
1432 }
1433
1434 src.reladdr = ralloc(mem_ctx, src_reg);
1435 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1436 }
1437
1438 /* If the type is smaller than a vec4, replicate the last channel out. */
1439 if (ir->type->is_scalar() || ir->type->is_vector())
1440 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1441 else
1442 src.swizzle = BRW_SWIZZLE_NOOP;
1443 src.type = brw_type_for_base_type(ir->type);
1444
1445 this->result = src;
1446 }
1447
1448 void
1449 vec4_visitor::visit(ir_dereference_record *ir)
1450 {
1451 unsigned int i;
1452 const glsl_type *struct_type = ir->record->type;
1453 int offset = 0;
1454
1455 ir->record->accept(this);
1456
1457 for (i = 0; i < struct_type->length; i++) {
1458 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1459 break;
1460 offset += type_size(struct_type->fields.structure[i].type);
1461 }
1462
1463 /* If the type is smaller than a vec4, replicate the last channel out. */
1464 if (ir->type->is_scalar() || ir->type->is_vector())
1465 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1466 else
1467 this->result.swizzle = BRW_SWIZZLE_NOOP;
1468 this->result.type = brw_type_for_base_type(ir->type);
1469
1470 this->result.reg_offset += offset;
1471 }
1472
1473 /**
1474 * We want to be careful in assignment setup to hit the actual storage
1475 * instead of potentially using a temporary like we might with the
1476 * ir_dereference handler.
1477 */
1478 static dst_reg
1479 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1480 {
1481 /* The LHS must be a dereference. If the LHS is a variable indexed array
1482 * access of a vector, it must be separated into a series conditional moves
1483 * before reaching this point (see ir_vec_index_to_cond_assign).
1484 */
1485 assert(ir->as_dereference());
1486 ir_dereference_array *deref_array = ir->as_dereference_array();
1487 if (deref_array) {
1488 assert(!deref_array->array->type->is_vector());
1489 }
1490
1491 /* Use the rvalue deref handler for the most part. We'll ignore
1492 * swizzles in it and write swizzles using writemask, though.
1493 */
1494 ir->accept(v);
1495 return dst_reg(v->result);
1496 }
1497
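/* Copy a value of arbitrary (possibly aggregate) type one vec4 at a time,
 * recursing through structs, arrays and matrix columns and advancing the
 * dst/src reg_offsets as it goes; the predicate makes each MOV conditional.
 */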
1498 void
1499 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1500 const struct glsl_type *type, uint32_t predicate)
1501 {
1502 if (type->base_type == GLSL_TYPE_STRUCT) {
1503 for (unsigned int i = 0; i < type->length; i++) {
1504 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1505 }
1506 return;
1507 }
1508
1509 if (type->is_array()) {
1510 for (unsigned int i = 0; i < type->length; i++) {
1511 emit_block_move(dst, src, type->fields.array, predicate);
1512 }
1513 return;
1514 }
1515
1516 if (type->is_matrix()) {
1517 const struct glsl_type *vec_type;
1518
1519 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1520 type->vector_elements, 1);
1521
1522 for (int i = 0; i < type->matrix_columns; i++) {
1523 emit_block_move(dst, src, vec_type, predicate);
1524 }
1525 return;
1526 }
1527
1528 assert(type->is_scalar() || type->is_vector());
1529
1530 dst->type = brw_type_for_base_type(type);
1531 src->type = dst->type;
1532
1533 dst->writemask = (1 << type->vector_elements) - 1;
1534
1535 /* Do we need to worry about swizzling a swizzle? */
1536 assert(src->swizzle == BRW_SWIZZLE_NOOP
1537 || src->swizzle == swizzle_for_size(type->vector_elements));
1538 src->swizzle = swizzle_for_size(type->vector_elements);
1539
1540 vec4_instruction *inst = emit(MOV(*dst, *src));
1541 inst->predicate = predicate;
1542
1543 dst->reg_offset++;
1544 src->reg_offset++;
1545 }
1546
1547
1548 /* If the RHS processing resulted in an instruction generating a
1549 * temporary value, and it would be easy to rewrite the instruction to
1550 * generate its result right into the LHS instead, do so. This ends
1551 * up reliably removing instructions where it can be tricky to do so
1552 * later without real UD chain information.
1553 */
1554 bool
1555 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1556 dst_reg dst,
1557 src_reg src,
1558 vec4_instruction *pre_rhs_inst,
1559 vec4_instruction *last_rhs_inst)
1560 {
1561 /* This could be supported, but it would take more smarts. */
1562 if (ir->condition)
1563 return false;
1564
1565 if (pre_rhs_inst == last_rhs_inst)
1566 return false; /* No instructions generated to work with. */
1567
1568 /* Make sure the last instruction generated our source reg. */
1569 if (src.file != GRF ||
1570 src.file != last_rhs_inst->dst.file ||
1571 src.reg != last_rhs_inst->dst.reg ||
1572 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1573 src.reladdr ||
1574 src.abs ||
1575 src.negate ||
1576 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1577 return false;
1578
1579 /* Check that the last instruction fully initialized the channels
1580 * we want to use, in the order we want to use them. We could
1581 * potentially reswizzle the operands of many instructions so that
1582 * we could handle out of order channels, but don't yet.
1583 */
1584
1585 for (unsigned i = 0; i < 4; i++) {
1586 if (dst.writemask & (1 << i)) {
1587 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1588 return false;
1589
1590 if (BRW_GET_SWZ(src.swizzle, i) != i)
1591 return false;
1592 }
1593 }
1594
1595 /* Success! Rewrite the instruction. */
1596 last_rhs_inst->dst.file = dst.file;
1597 last_rhs_inst->dst.reg = dst.reg;
1598 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1599 last_rhs_inst->dst.reladdr = dst.reladdr;
1600 last_rhs_inst->dst.writemask &= dst.writemask;
1601
1602 return true;
1603 }
1604
1605 void
1606 vec4_visitor::visit(ir_assignment *ir)
1607 {
1608 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1609 uint32_t predicate = BRW_PREDICATE_NONE;
1610
1611 if (!ir->lhs->type->is_scalar() &&
1612 !ir->lhs->type->is_vector()) {
1613 ir->rhs->accept(this);
1614 src_reg src = this->result;
1615
1616 if (ir->condition) {
1617 emit_bool_to_cond_code(ir->condition, &predicate);
1618 }
1619
1620 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1621 return;
1622 }
1623
1624 /* Now we're down to just a scalar/vector with writemasks. */
1625 int i;
1626
1627 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1628 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1629
1630 ir->rhs->accept(this);
1631
1632 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1633
1634 src_reg src = this->result;
1635
1636 int swizzles[4];
1637 int first_enabled_chan = 0;
1638 int src_chan = 0;
1639
1640 assert(ir->lhs->type->is_vector() ||
1641 ir->lhs->type->is_scalar());
1642 dst.writemask = ir->write_mask;
1643
1644 for (int i = 0; i < 4; i++) {
1645 if (dst.writemask & (1 << i)) {
1646 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1647 break;
1648 }
1649 }
1650
1651 /* Swizzle a small RHS vector into the channels being written.
1652 *
1653 * glsl ir treats write_mask as dictating how many channels are
1654 * present on the RHS, while in our instructions we need to make
1655 * those channels appear in the slots of the vec4 they're written to.
1656 */
1657 for (int i = 0; i < 4; i++) {
1658 if (dst.writemask & (1 << i))
1659 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1660 else
1661 swizzles[i] = first_enabled_chan;
1662 }
1663 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1664 swizzles[2], swizzles[3]);
1665
1666 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1667 return;
1668 }
1669
1670 if (ir->condition) {
1671 emit_bool_to_cond_code(ir->condition, &predicate);
1672 }
1673
1674 for (i = 0; i < type_size(ir->lhs->type); i++) {
1675 vec4_instruction *inst = emit(MOV(dst, src));
1676 inst->predicate = predicate;
1677
1678 dst.reg_offset++;
1679 src.reg_offset++;
1680 }
1681 }
1682
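/* Write an ir_constant into *dst one component at a time, recursing
 * through structs, arrays and matrix columns; dst->reg_offset is advanced
 * past each vec4-sized element.
 */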
1683 void
1684 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1685 {
1686 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1687 foreach_list(node, &ir->components) {
1688 ir_constant *field_value = (ir_constant *)node;
1689
1690 emit_constant_values(dst, field_value);
1691 }
1692 return;
1693 }
1694
1695 if (ir->type->is_array()) {
1696 for (unsigned int i = 0; i < ir->type->length; i++) {
1697 emit_constant_values(dst, ir->array_elements[i]);
1698 }
1699 return;
1700 }
1701
1702 if (ir->type->is_matrix()) {
1703 for (int i = 0; i < ir->type->matrix_columns; i++) {
1704 for (int j = 0; j < ir->type->vector_elements; j++) {
1705 dst->writemask = 1 << j;
1706 dst->type = BRW_REGISTER_TYPE_F;
1707
1708 emit(MOV(*dst,
1709 src_reg(ir->value.f[i * ir->type->vector_elements + j])));
1710 }
1711 dst->reg_offset++;
1712 }
1713 return;
1714 }
1715
1716 for (int i = 0; i < ir->type->vector_elements; i++) {
1717 dst->writemask = 1 << i;
1718 dst->type = brw_type_for_base_type(ir->type);
1719
1720 switch (ir->type->base_type) {
1721 case GLSL_TYPE_FLOAT:
1722 emit(MOV(*dst, src_reg(ir->value.f[i])));
1723 break;
1724 case GLSL_TYPE_INT:
1725 emit(MOV(*dst, src_reg(ir->value.i[i])));
1726 break;
1727 case GLSL_TYPE_UINT:
1728 emit(MOV(*dst, src_reg(ir->value.u[i])));
1729 break;
1730 case GLSL_TYPE_BOOL:
1731 emit(MOV(*dst, src_reg(ir->value.b[i])));
1732 break;
1733 default:
1734 assert(!"Non-float/uint/int/bool constant");
1735 break;
1736 }
1737 }
1738 dst->reg_offset++;
1739 }
1740
1741 void
1742 vec4_visitor::visit(ir_constant *ir)
1743 {
1744 dst_reg dst = dst_reg(this, ir->type);
1745 this->result = src_reg(dst);
1746
1747 emit_constant_values(&dst, ir);
1748 }
1749
1750 void
1751 vec4_visitor::visit(ir_call *ir)
1752 {
1753 assert(!"not reached");
1754 }
1755
1756 void
1757 vec4_visitor::visit(ir_texture *ir)
1758 {
1759 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1760 sampler = vp->Base.SamplerUnits[sampler];
1761
1762 /* Should be lowered by do_lower_texture_projection */
1763 assert(!ir->projector);
1764
1765 vec4_instruction *inst;
1766 switch (ir->op) {
1767 case ir_tex:
1768 case ir_txl:
1769 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1770 break;
1771 case ir_txd:
1772 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1773 break;
1774 case ir_txf:
1775 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1776 break;
1777 case ir_txs:
1778 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1779 break;
1780 case ir_txb:
1781 assert(!"TXB is not valid for vertex shaders.");
1782 }
1783
1784 /* Texel offsets go in the message header; Gen4 also requires headers. */
1785 inst->header_present = ir->offset || intel->gen < 5;
1786 inst->base_mrf = 2;
1787 inst->mlen = inst->header_present + 1; /* always at least one */
1788 inst->sampler = sampler;
1789 inst->dst = dst_reg(this, glsl_type::get_instance(ir->type->base_type,4,1));
1790 inst->shadow_compare = ir->shadow_comparitor != NULL;
1791
1792 if (ir->offset != NULL)
1793 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1794
1795 /* MRF for the first parameter */
1796 int param_base = inst->base_mrf + inst->header_present;
1797
1798 if (ir->op == ir_txs) {
1799 ir->lod_info.lod->accept(this);
1800 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1801 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
1802 this->result));
1803 } else {
1804 int i, coord_mask = 0, zero_mask = 0;
1805 /* Load the coordinate */
1806 /* FINISHME: gl_clamp_mask and saturate */
1807 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1808 coord_mask |= (1 << i);
1809 for (; i < 4; i++)
1810 zero_mask |= (1 << i);
1811
1812 ir->coordinate->accept(this);
1813 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1814 this->result));
1815 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1816 src_reg(0)));
1817 /* Load the shadow comparitor */
1818 if (ir->shadow_comparitor) {
1819 ir->shadow_comparitor->accept(this);
1820 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1821 WRITEMASK_X),
1822 this->result));
1823 inst->mlen++;
1824 }
1825
1826 /* Load the LOD info */
1827 if (ir->op == ir_txl) {
1828 int mrf, writemask;
1829 if (intel->gen >= 5) {
1830 mrf = param_base + 1;
1831 if (ir->shadow_comparitor) {
1832 writemask = WRITEMASK_Y;
1833 /* mlen already incremented */
1834 } else {
1835 writemask = WRITEMASK_X;
1836 inst->mlen++;
1837 }
1838 } else /* intel->gen == 4 */ {
1839 mrf = param_base;
1840 writemask = WRITEMASK_Z;
1841 }
1842 ir->lod_info.lod->accept(this);
1843 emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask),
1844 this->result));
1845 } else if (ir->op == ir_txf) {
1846 ir->lod_info.lod->accept(this);
1847 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
1848 this->result));
1849 } else if (ir->op == ir_txd) {
1850 const glsl_type *type = ir->lod_info.grad.dPdx->type;
1851
1852 ir->lod_info.grad.dPdx->accept(this);
1853 src_reg dPdx = this->result;
1854 ir->lod_info.grad.dPdy->accept(this);
1855 src_reg dPdy = this->result;
1856
1857 if (intel->gen >= 5) {
1858 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1859 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1860 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1861 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1862 inst->mlen++;
1863
1864 if (ir->type->vector_elements == 3) {
1865 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
1866 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
1867 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
1868 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
1869 inst->mlen++;
1870 }
1871 } else /* intel->gen == 4 */ {
1872 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
1873 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
1874 inst->mlen += 2;
1875 }
1876 }
1877 }
1878
1879 emit(inst);
1880
1881 swizzle_result(ir, src_reg(inst->dst), sampler);
1882 }
1883
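/* Apply the EXT_texture_swizzle swizzle from the sampler key to a texture
 * result: channels sourced from the original value are handled with one
 * swizzled MOV, while SWIZZLE_ZERO and SWIZZLE_ONE channels are written as
 * the immediates 0.0f and 1.0f.
 */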
1884 void
1885 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
1886 {
1887 this->result = orig_val;
1888
1889 int s = c->key.tex.swizzles[sampler];
1890
1891 if (ir->op == ir_txs || ir->type == glsl_type::float_type
1892 || s == SWIZZLE_NOOP)
1893 return;
1894
1895 int zero_mask = 0, one_mask = 0, copy_mask = 0;
1896 int swizzle[4];
1897
1898 for (int i = 0; i < 4; i++) {
1899 switch (GET_SWZ(s, i)) {
1900 case SWIZZLE_ZERO:
1901 zero_mask |= (1 << i);
1902 break;
1903 case SWIZZLE_ONE:
1904 one_mask |= (1 << i);
1905 break;
1906 default:
1907 copy_mask |= (1 << i);
1908 swizzle[i] = GET_SWZ(s, i);
1909 break;
1910 }
1911 }
1912
1913 this->result = src_reg(this, ir->type);
1914 dst_reg swizzled_result(this->result);
1915
1916 if (copy_mask) {
1917 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1918 swizzled_result.writemask = copy_mask;
1919 emit(MOV(swizzled_result, orig_val));
1920 }
1921
1922 if (zero_mask) {
1923 swizzled_result.writemask = zero_mask;
1924 emit(MOV(swizzled_result, src_reg(0.0f)));
1925 }
1926
1927 if (one_mask) {
1928 swizzled_result.writemask = one_mask;
1929 emit(MOV(swizzled_result, src_reg(1.0f)));
1930 }
1931 }
1932
1933 void
1934 vec4_visitor::visit(ir_return *ir)
1935 {
1936 assert(!"not reached");
1937 }
1938
1939 void
1940 vec4_visitor::visit(ir_discard *ir)
1941 {
1942 assert(!"not reached");
1943 }
1944
1945 void
1946 vec4_visitor::visit(ir_if *ir)
1947 {
1948 /* Don't point the annotation at the if statement, because then it plus
1949 * the then and else blocks get printed.
1950 */
1951 this->base_ir = ir->condition;
1952
1953 if (intel->gen == 6) {
1954 emit_if_gen6(ir);
1955 } else {
1956 uint32_t predicate;
1957 emit_bool_to_cond_code(ir->condition, &predicate);
1958 emit(IF(predicate));
1959 }
1960
1961 visit_instructions(&ir->then_instructions);
1962
1963 if (!ir->else_instructions.is_empty()) {
1964 this->base_ir = ir->condition;
1965 emit(BRW_OPCODE_ELSE);
1966
1967 visit_instructions(&ir->else_instructions);
1968 }
1969
1970 this->base_ir = ir->condition;
1971 emit(BRW_OPCODE_ENDIF);
1972 }
1973
1974 void
1975 vec4_visitor::emit_ndc_computation()
1976 {
1977 /* Get the position */
1978 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
1979
1980 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1981 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1982 output_reg[BRW_VERT_RESULT_NDC] = ndc;
1983
1984 current_annotation = "NDC";
1985 dst_reg ndc_w = ndc;
1986 ndc_w.writemask = WRITEMASK_W;
1987 src_reg pos_w = pos;
1988 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1989 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1990
1991 dst_reg ndc_xyz = ndc;
1992 ndc_xyz.writemask = WRITEMASK_XYZ;
1993
1994 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1995 }
1996
1997 void
1998 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
1999 {
2000 if (intel->gen < 6 &&
2001 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2002 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2003 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2004 dst_reg header1_w = header1;
2005 header1_w.writemask = WRITEMASK_W;
2006 GLuint i;
2007
2008 emit(MOV(header1, 0u));
2009
2010 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2011 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2012
2013 current_annotation = "Point size";
2014 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2015 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2016 }
2017
2018 current_annotation = "Clipping flags";
2019 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2020 vec4_instruction *inst;
2021
2022 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2023 src_reg(this->userplane[i])));
2024 inst->conditional_mod = BRW_CONDITIONAL_L;
2025
2026 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2027 inst->predicate = BRW_PREDICATE_NORMAL;
2028 }
2029
2030 /* i965 clipping workaround:
2031 * 1) Test for -ve rhw
2032 * 2) If set,
2033 * set ndc = (0,0,0,0)
2034 * set ucp[6] = 1
2035 *
2036 * Later, clipping will detect ucp[6] and ensure the primitive is
2037 * clipped against all fixed planes.
2038 */
2039 if (brw->has_negative_rhw_bug) {
2040 #if 0
2041 /* FINISHME */
2042 brw_CMP(p,
2043 vec8(brw_null_reg()),
2044 BRW_CONDITIONAL_L,
2045 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2046 brw_imm_f(0));
2047
2048 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2049 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2050 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2051 #endif
2052 }
2053
2054 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2055 } else if (intel->gen < 6) {
2056 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2057 } else {
2058 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2059 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2060 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2061 src_reg(output_reg[VERT_RESULT_PSIZ])));
2062 }
2063 }
2064 }
2065
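/**
 * Emits up to four user clip distances into @reg starting at plane @offset,
 * as DP4s of the clip vertex (or gl_Position) against each enabled user
 * clip plane.  Only used when the shader does not write gl_ClipDistance
 * itself.
 */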
2066 void
2067 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2068 {
2069 if (intel->gen < 6) {
2070 /* Clip distance slots are set aside in gen5, but they are not used. It
2071 * is not clear whether we actually need to set aside space for them,
2072 * but the performance cost is negligible.
2073 */
2074 return;
2075 }
2076
2077 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2078 *
2079 * "If a linked set of shaders forming the vertex stage contains no
2080 * static write to gl_ClipVertex or gl_ClipDistance, but the
2081 * application has requested clipping against user clip planes through
2082 * the API, then the coordinate written to gl_Position is used for
2083 * comparison against the user clip planes."
2084 *
2085 * This function is only called if the shader didn't write to
2086 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2087 * if the user wrote to it; otherwise we use gl_Position.
2088 */
2089 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2090 if (!(c->prog_data.outputs_written
2091 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2092 clip_vertex = VERT_RESULT_HPOS;
2093 }
2094
2095 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2096 ++i) {
2097 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2098 src_reg(output_reg[clip_vertex]),
2099 src_reg(this->userplane[i + offset])));
2100 }
2101 }
2102
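/**
 * Copies a shader output into its URB slot MRF, saturating front/back
 * colors when the key requests clamped vertex colors.
 */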
2103 void
2104 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2105 {
2106 assert (vert_result < VERT_RESULT_MAX);
2107 reg.type = output_reg[vert_result].type;
2108 current_annotation = output_reg_annotation[vert_result];
2109 /* Copy the register, saturating if necessary */
2110 vec4_instruction *inst = emit(MOV(reg,
2111 src_reg(output_reg[vert_result])));
2112 if ((vert_result == VERT_RESULT_COL0 ||
2113 vert_result == VERT_RESULT_COL1 ||
2114 vert_result == VERT_RESULT_BFC0 ||
2115 vert_result == VERT_RESULT_BFC1) &&
2116 c->key.clamp_vertex_color) {
2117 inst->saturate = true;
2118 }
2119 }
2120
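/**
 * Fills one VUE slot: the special header slots (point size and flags, NDC,
 * the duplicated gl_Position, user clip distances, padding) are handled
 * explicitly; everything else goes through emit_generic_urb_slot().
 */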
2121 void
2122 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2123 {
2124 struct brw_reg hw_reg = brw_message_reg(mrf);
2125 dst_reg reg = dst_reg(MRF, mrf);
2126 reg.type = BRW_REGISTER_TYPE_F;
2127
2128 switch (vert_result) {
2129 case VERT_RESULT_PSIZ:
2130 /* PSIZ is always in slot 0, and is coupled with other flags. */
2131 current_annotation = "indices, point width, clip flags";
2132 emit_psiz_and_flags(hw_reg);
2133 break;
2134 case BRW_VERT_RESULT_NDC:
2135 current_annotation = "NDC";
2136 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2137 break;
2138 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2139 case VERT_RESULT_HPOS:
2140 current_annotation = "gl_Position";
2141 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2142 break;
2143 case VERT_RESULT_CLIP_DIST0:
2144 case VERT_RESULT_CLIP_DIST1:
2145 if (this->c->key.uses_clip_distance) {
2146 emit_generic_urb_slot(reg, vert_result);
2147 } else {
2148 current_annotation = "user clip distances";
2149 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2150 }
2151 break;
2152 case BRW_VERT_RESULT_PAD:
2153 /* No need to write to this slot */
2154 break;
2155 default:
2156 emit_generic_urb_slot(reg, vert_result);
2157 break;
2158 }
2159 }
2160
2161 static int
2162 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2163 {
2164 struct intel_context *intel = &brw->intel;
2165
2166 if (intel->gen >= 6) {
2167 /* URB data written (does not include the message header reg) must
2168 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2169 * section 5.4.3.2.2: URB_INTERLEAVED.
2170 *
2171 * URB entries are allocated on a multiple of 1024 bits, so an
2172 * extra 128 bits written here to make the end align to 256 is
2173 * no problem.
2174 */
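      /* mlen here includes the single message header register, so the data
       * payload (mlen - 1) is even exactly when mlen is odd; round even
       * lengths up to the next odd value.
       */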
2175 if ((mlen % 2) != 1)
2176 mlen++;
2177 }
2178
2179 return mlen;
2180 }
2181
2182 /**
2183 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2184 * complete the VS thread.
2185 *
2186 * The VUE layout is documented in Volume 2a.
2187 */
2188 void
2189 vec4_visitor::emit_urb_writes()
2190 {
2191 /* MRF 0 is reserved for the debugger, so start with message header
2192 * in MRF 1.
2193 */
2194 int base_mrf = 1;
2195 int mrf = base_mrf;
2196 /* In the process of generating our URB write message contents, we
2197 * may need to unspill a register or load from an array. Those
2198 * reads would use MRFs 14-15.
2199 */
2200 int max_usable_mrf = 13;
2201
2202 /* The following assertion verifies that max_usable_mrf causes an
2203 * even-numbered amount of URB write data, which will meet gen6's
2204 * requirements for length alignment.
2205 */
2206 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2207
2208 /* FINISHME: edgeflag */
2209
2210 brw_compute_vue_map(&c->vue_map, intel, c->key.userclip_active,
2211 c->prog_data.outputs_written);
2212
2213 /* First mrf is the g0-based message header containing URB handles and such,
2214 * which is implied in VS_OPCODE_URB_WRITE.
2215 */
2216 mrf++;
2217
2218 if (intel->gen < 6) {
2219 emit_ndc_computation();
2220 }
2221
2222 /* Set up the VUE data for the first URB write */
2223 int slot;
2224 for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
2225 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2226
2227 /* If this was max_usable_mrf, we can't fit anything more into this URB
2228 * WRITE.
2229 */
2230 if (mrf > max_usable_mrf) {
2231 slot++;
2232 break;
2233 }
2234 }
2235
2236 current_annotation = "URB write";
2237 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2238 inst->base_mrf = base_mrf;
2239 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2240 inst->eot = (slot >= c->vue_map.num_slots);
2241
2242 /* Optional second URB write */
2243 if (!inst->eot) {
2244 mrf = base_mrf + 1;
2245
2246 for (; slot < c->vue_map.num_slots; ++slot) {
2247 assert(mrf < max_usable_mrf);
2248
2249 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2250 }
2251
2252 current_annotation = "URB write";
2253 inst = emit(VS_OPCODE_URB_WRITE);
2254 inst->base_mrf = base_mrf;
2255 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2256 inst->eot = true;
2257 /* URB destination offset. In the previous write, we got MRFs
2258 * 2-13 minus the one header MRF, so 12 regs. URB offset is in
2259 * URB row increments, and each of our MRFs is half of one of
2260 * those, since we're doing interleaved writes.
2261 */
2262 inst->offset = (max_usable_mrf - base_mrf) / 2;
2263 }
2264
2265 if (intel->gen == 6)
2266 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 8) / 8;
2267 else
2268 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 4) / 4;
2269 }
2270
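/**
 * Builds the message offset operand for a scratch read/write of vec4 slot
 * @reg_offset (plus *@reladdr, if present).
 *
 * Scratch data is stored interleaved like the vertex data, so the vec4
 * index is scaled by 2; pre-gen6 message headers take byte offsets, adding
 * another factor of 16.  For example, with no reladdr on gen5, slot 3
 * becomes the immediate 3 * 2 * 16 = 96.
 */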
2271 src_reg
2272 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2273 src_reg *reladdr, int reg_offset)
2274 {
2275 /* Because we store the values to scratch interleaved like our
2276 * vertex data, we need to scale the vec4 index by 2.
2277 */
2278 int message_header_scale = 2;
2279
2280 /* Pre-gen6, the message header uses byte offsets instead of vec4
2281 * (16-byte) offset units.
2282 */
2283 if (intel->gen < 6)
2284 message_header_scale *= 16;
2285
2286 if (reladdr) {
2287 src_reg index = src_reg(this, glsl_type::int_type);
2288
2289 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2290 emit_before(inst, MUL(dst_reg(index),
2291 index, src_reg(message_header_scale)));
2292
2293 return index;
2294 } else {
2295 return src_reg(reg_offset * message_header_scale);
2296 }
2297 }
2298
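/**
 * Builds the offset operand for a pull constant load: a vec4 index on
 * gen6+, or a byte offset (index * 16) on earlier generations, computed
 * with an ADD (and MUL, pre-gen6) inserted before @inst when a relative
 * address is involved.
 */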
2299 src_reg
2300 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2301 src_reg *reladdr, int reg_offset)
2302 {
2303 if (reladdr) {
2304 src_reg index = src_reg(this, glsl_type::int_type);
2305
2306 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2307
2308 /* Pre-gen6, the message header uses byte offsets instead of vec4
2309 * (16-byte) offset units.
2310 */
2311 if (intel->gen < 6) {
2312 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2313 }
2314
2315 return index;
2316 } else {
2317 int message_header_scale = intel->gen < 6 ? 16 : 1;
2318 return src_reg(reg_offset * message_header_scale);
2319 }
2320 }
2321
2322 /**
2323 * Emits an instruction before @inst to load the value named by @orig_src
2324 * from scratch space at @base_offset to @temp.
2325 */
2326 void
2327 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2328 dst_reg temp, src_reg orig_src,
2329 int base_offset)
2330 {
2331 int reg_offset = base_offset + orig_src.reg_offset;
2332 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2333
2334 emit_before(inst, SCRATCH_READ(temp, index));
2335 }
2336
2337 /**
2338 * Emits an instruction after @inst to store the value to be written
2339 * to @orig_dst to scratch space at @base_offset, from @temp.
2340 */
2341 void
2342 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2343 src_reg temp, dst_reg orig_dst,
2344 int base_offset)
2345 {
2346 int reg_offset = base_offset + orig_dst.reg_offset;
2347 src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2348
2349 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2350 orig_dst.writemask));
2351 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2352 write->predicate = inst->predicate;
2353 write->ir = inst->ir;
2354 write->annotation = inst->annotation;
2355 inst->insert_after(write);
2356 }
2357
2358 /**
2359 * We can't generally support array access in GRF space, because a
2360 * single instruction's destination can only span 2 contiguous
2361 * registers. So, we send all GRF arrays that get variable index
2362 * access to scratch space.
2363 */
2364 void
2365 vec4_visitor::move_grf_array_access_to_scratch()
2366 {
2367 int scratch_loc[this->virtual_grf_count];
2368
2369 for (int i = 0; i < this->virtual_grf_count; i++) {
2370 scratch_loc[i] = -1;
2371 }
2372
2373 /* First, calculate the set of virtual GRFs that need to be punted
2374 * to scratch due to having any array access on them, and where in
2375 * scratch.
2376 */
2377 foreach_list(node, &this->instructions) {
2378 vec4_instruction *inst = (vec4_instruction *)node;
2379
2380 if (inst->dst.file == GRF && inst->dst.reladdr &&
2381 scratch_loc[inst->dst.reg] == -1) {
2382 scratch_loc[inst->dst.reg] = c->last_scratch;
2383 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2384 }
2385
2386 for (int i = 0 ; i < 3; i++) {
2387 src_reg *src = &inst->src[i];
2388
2389 if (src->file == GRF && src->reladdr &&
2390 scratch_loc[src->reg] == -1) {
2391 scratch_loc[src->reg] = c->last_scratch;
2392 c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2393 }
2394 }
2395 }
2396
2397 /* Now, for anything that will be accessed through scratch, rewrite
2398 * it to load/store. Note that this is a _safe list walk, because
2399 * we may generate a new scratch_write instruction after the one
2400 * we're processing.
2401 */
2402 foreach_list_safe(node, &this->instructions) {
2403 vec4_instruction *inst = (vec4_instruction *)node;
2404
2405 /* Set up the annotation tracking for new generated instructions. */
2406 base_ir = inst->ir;
2407 current_annotation = inst->annotation;
2408
2409 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2410 src_reg temp = src_reg(this, glsl_type::vec4_type);
2411
2412 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2413
2414 inst->dst.file = temp.file;
2415 inst->dst.reg = temp.reg;
2416 inst->dst.reg_offset = temp.reg_offset;
2417 inst->dst.reladdr = NULL;
2418 }
2419
2420 for (int i = 0 ; i < 3; i++) {
2421 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2422 continue;
2423
2424 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2425
2426 emit_scratch_read(inst, temp, inst->src[i],
2427 scratch_loc[inst->src[i].reg]);
2428
2429 inst->src[i].file = temp.file;
2430 inst->src[i].reg = temp.reg;
2431 inst->src[i].reg_offset = temp.reg_offset;
2432 inst->src[i].reladdr = NULL;
2433 }
2434 }
2435 }
2436
2437 /**
2438 * Emits an instruction before @inst to load the value named by @orig_src
2439 * from the pull constant buffer (surface) at @base_offset to @temp.
2440 */
2441 void
2442 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2443 dst_reg temp, src_reg orig_src,
2444 int base_offset)
2445 {
2446 int reg_offset = base_offset + orig_src.reg_offset;
2447 src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2448 vec4_instruction *load;
2449
2450 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2451 temp, index);
2452 load->base_mrf = 14;
2453 load->mlen = 1;
2454 emit_before(inst, load);
2455 }
2456
2457 /**
2458 * Implements array access of uniforms by inserting a
2459 * PULL_CONSTANT_LOAD instruction.
2460 *
2461 * Unlike temporary GRF array access (where we don't support it due to
2462 * the difficulty of doing relative addressing on instruction
2463 * destinations), we could potentially do array access of uniforms
2464 * that were loaded in GRF space as push constants. In real-world
2465 * usage we've seen, though, the arrays being used are always larger
2466 * than we could load as push constants, so just always move all
2467 * uniform array access out to a pull constant buffer.
2468 */
2469 void
2470 vec4_visitor::move_uniform_array_access_to_pull_constants()
2471 {
2472 int pull_constant_loc[this->uniforms];
2473
2474 for (int i = 0; i < this->uniforms; i++) {
2475 pull_constant_loc[i] = -1;
2476 }
2477
2478 /* Walk through and find array access of uniforms. Put a copy of that
2479 * uniform in the pull constant buffer.
2480 *
2481 * Note that we don't move constant-indexed accesses to arrays. No
2482 * testing has been done of the performance impact of this choice.
2483 */
2484 foreach_list_safe(node, &this->instructions) {
2485 vec4_instruction *inst = (vec4_instruction *)node;
2486
2487 for (int i = 0 ; i < 3; i++) {
2488 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2489 continue;
2490
2491 int uniform = inst->src[i].reg;
2492
2493 /* If this array isn't already present in the pull constant buffer,
2494 * add it.
2495 */
2496 if (pull_constant_loc[uniform] == -1) {
2497 const float **values = &prog_data->param[uniform * 4];
2498
2499 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2500
2501 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2502 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2503 }
2504 }
2505
2506 /* Set up the annotation tracking for new generated instructions. */
2507 base_ir = inst->ir;
2508 current_annotation = inst->annotation;
2509
2510 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2511
2512 emit_pull_constant_load(inst, temp, inst->src[i],
2513 pull_constant_loc[uniform]);
2514
2515 inst->src[i].file = temp.file;
2516 inst->src[i].reg = temp.reg;
2517 inst->src[i].reg_offset = temp.reg_offset;
2518 inst->src[i].reladdr = NULL;
2519 }
2520 }
2521
2522 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2523 * no need to track them as larger-than-vec4 objects. This will be
2524 * relied on in cutting out unused uniform vectors from push
2525 * constants.
2526 */
2527 split_uniform_registers();
2528 }
2529
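/**
 * Resolves a negate modifier on an unsigned (UD) source by applying it once
 * through a MOV into a fresh uvec4 temporary and rewriting *reg to point at
 * that temporary, since leaving the negate on a UD operand does not behave
 * as intended for the instructions that consume it.
 */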
2530 void
2531 vec4_visitor::resolve_ud_negate(src_reg *reg)
2532 {
2533 if (reg->type != BRW_REGISTER_TYPE_UD ||
2534 !reg->negate)
2535 return;
2536
2537 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2538 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2539 *reg = temp;
2540 }
2541
2542 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2543 struct gl_shader_program *prog,
2544 struct brw_shader *shader)
2545 {
2546 this->c = c;
2547 this->p = &c->func;
2548 this->brw = p->brw;
2549 this->intel = &brw->intel;
2550 this->ctx = &intel->ctx;
2551 this->prog = prog;
2552 this->shader = shader;
2553
2554 this->mem_ctx = ralloc_context(NULL);
2555 this->failed = false;
2556
2557 this->base_ir = NULL;
2558 this->current_annotation = NULL;
2559
2561 this->vp = (struct gl_vertex_program *)
2562 prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2563 this->prog_data = &c->prog_data;
2564
2565 this->variable_ht = hash_table_ctor(0,
2566 hash_table_pointer_hash,
2567 hash_table_pointer_compare);
2568
2569 this->virtual_grf_def = NULL;
2570 this->virtual_grf_use = NULL;
2571 this->virtual_grf_sizes = NULL;
2572 this->virtual_grf_count = 0;
2573 this->virtual_grf_reg_map = NULL;
2574 this->virtual_grf_reg_count = 0;
2575 this->virtual_grf_array_size = 0;
2576 this->live_intervals_valid = false;
2577
2578 this->uniforms = 0;
2583 }
2584
2585 vec4_visitor::~vec4_visitor()
2586 {
2587 ralloc_free(this->mem_ctx);
2588 hash_table_dtor(this->variable_ht);
2589 }
2590
2591
2592 void
2593 vec4_visitor::fail(const char *format, ...)
2594 {
2595 va_list va;
2596 char *msg;
2597
2598 if (failed)
2599 return;
2600
2601 failed = true;
2602
2603 va_start(va, format);
2604 msg = ralloc_vasprintf(mem_ctx, format, va);
2605 va_end(va);
2606 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2607
2608 this->fail_msg = msg;
2609
2610 if (INTEL_DEBUG & DEBUG_VS) {
2611 fprintf(stderr, "%s", msg);
2612 }
2613 }
2614
2615 } /* namespace brw */