[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
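/* For illustration: ALU2(ADD) below expands to a small helper that only
 * constructs the instruction; callers still have to pass the result to
 * emit(), as in emit(ADD(dst, src0, src1)):
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(dst_reg dst, src_reg src0, src_reg src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 */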
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU2(ADD)
117 ALU2(MUL)
118 ALU2(MACH)
119 ALU2(AND)
120 ALU2(OR)
121 ALU2(XOR)
122 ALU2(DP3)
123 ALU2(DP4)
124 ALU2(DPH)
125 ALU2(SHL)
126 ALU2(SHR)
127 ALU2(ASR)
128
129 /** Gen4 predicated IF. */
130 vec4_instruction *
131 vec4_visitor::IF(uint32_t predicate)
132 {
133 vec4_instruction *inst;
134
135 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
136 inst->predicate = predicate;
137
138 return inst;
139 }
140
141 /** Gen6+ IF with embedded comparison. */
142 vec4_instruction *
143 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
144 {
145 assert(intel->gen >= 6);
146
147 vec4_instruction *inst;
148
149 resolve_ud_negate(&src0);
150 resolve_ud_negate(&src1);
151
152 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
153 src0, src1);
154 inst->conditional_mod = condition;
155
156 return inst;
157 }
158
159 /**
160 * CMP: Sets the low bit of the destination channels with the result
161 * of the comparison, while the upper bits are undefined, and updates
162 * the flag register with the packed 16 bits of the result.
163 */
164 vec4_instruction *
165 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
166 {
167 vec4_instruction *inst;
168
169 /* original gen4 does type conversion to the destination type
170 * before comparison, producing garbage results for floating
171 * point comparisons.
172 */
173 if (intel->gen == 4) {
174 dst.type = src0.type;
175 if (dst.file == HW_REG)
176 dst.fixed_hw_reg.type = dst.type;
177 }
178
179 resolve_ud_negate(&src0);
180 resolve_ud_negate(&src1);
181
182 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
183 inst->conditional_mod = condition;
184
185 return inst;
186 }
187
188 vec4_instruction *
189 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
194 dst, index);
195 inst->base_mrf = 14;
196 inst->mlen = 2;
197
198 return inst;
199 }
200
201 vec4_instruction *
202 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
203 {
204 vec4_instruction *inst;
205
206 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
207 dst, src, index);
208 inst->base_mrf = 13;
209 inst->mlen = 3;
210
211 return inst;
212 }
213
214 void
215 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
216 {
217 static enum opcode dot_opcodes[] = {
218 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
219 };
220
221 emit(dot_opcodes[elements - 2], dst, src0, src1);
222 }
223
224 src_reg
225 vec4_visitor::fix_math_operand(src_reg src)
226 {
227 /* The gen6 math instruction ignores the source modifiers --
228 * swizzle, abs, negate, and at least some parts of the register
229 * region description.
230 *
231 * Rather than trying to enumerate all these cases, *always* expand the
232 * operand to a temp GRF for gen6.
233 *
234 * For gen7, keep the operand as-is, except if immediate, which gen7 still
235 * can't use.
236 */
237
238 if (intel->gen == 7 && src.file != IMM)
239 return src;
240
241 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
242 expanded.type = src.type;
243 emit(MOV(expanded, src));
244 return src_reg(expanded);
245 }
246
247 void
248 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
249 {
250 src = fix_math_operand(src);
251
252 if (dst.writemask != WRITEMASK_XYZW) {
253 /* The gen6 math instruction must be align1, so we can't do
254 * writemasks.
255 */
256 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
257
258 emit(opcode, temp_dst, src);
259
260 emit(MOV(dst, src_reg(temp_dst)));
261 } else {
262 emit(opcode, dst, src);
263 }
264 }
265
266 void
267 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
268 {
269 vec4_instruction *inst = emit(opcode, dst, src);
270 inst->base_mrf = 1;
271 inst->mlen = 1;
272 }
273
274 void
275 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
276 {
277 switch (opcode) {
278 case SHADER_OPCODE_RCP:
279 case SHADER_OPCODE_RSQ:
280 case SHADER_OPCODE_SQRT:
281 case SHADER_OPCODE_EXP2:
282 case SHADER_OPCODE_LOG2:
283 case SHADER_OPCODE_SIN:
284 case SHADER_OPCODE_COS:
285 break;
286 default:
287 assert(!"not reached: bad math opcode");
288 return;
289 }
290
291 if (intel->gen >= 6) {
292 return emit_math1_gen6(opcode, dst, src);
293 } else {
294 return emit_math1_gen4(opcode, dst, src);
295 }
296 }
297
298 void
299 vec4_visitor::emit_math2_gen6(enum opcode opcode,
300 dst_reg dst, src_reg src0, src_reg src1)
301 {
302 src0 = fix_math_operand(src0);
303 src1 = fix_math_operand(src1);
304
305 if (dst.writemask != WRITEMASK_XYZW) {
306 /* The gen6 math instruction must be align1, so we can't do
307 * writemasks.
308 */
309 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
310 temp_dst.type = dst.type;
311
312 emit(opcode, temp_dst, src0, src1);
313
314 emit(MOV(dst, src_reg(temp_dst)));
315 } else {
316 emit(opcode, dst, src0, src1);
317 }
318 }
319
320 void
321 vec4_visitor::emit_math2_gen4(enum opcode opcode,
322 dst_reg dst, src_reg src0, src_reg src1)
323 {
324 vec4_instruction *inst = emit(opcode, dst, src0, src1);
325 inst->base_mrf = 1;
326 inst->mlen = 2;
327 }
328
329 void
330 vec4_visitor::emit_math(enum opcode opcode,
331 dst_reg dst, src_reg src0, src_reg src1)
332 {
333 switch (opcode) {
334 case SHADER_OPCODE_POW:
335 case SHADER_OPCODE_INT_QUOTIENT:
336 case SHADER_OPCODE_INT_REMAINDER:
337 break;
338 default:
339 assert(!"not reached: unsupported binary math opcode");
340 return;
341 }
342
343 if (intel->gen >= 6) {
344 return emit_math2_gen6(opcode, dst, src0, src1);
345 } else {
346 return emit_math2_gen4(opcode, dst, src0, src1);
347 }
348 }
349
350 void
351 vec4_visitor::visit_instructions(const exec_list *list)
352 {
353 foreach_list(node, list) {
354 ir_instruction *ir = (ir_instruction *)node;
355
356 base_ir = ir;
357 ir->accept(this);
358 }
359 }
360
361
362 static int
363 type_size(const struct glsl_type *type)
364 {
365 unsigned int i;
366 int size;
367
368 switch (type->base_type) {
369 case GLSL_TYPE_UINT:
370 case GLSL_TYPE_INT:
371 case GLSL_TYPE_FLOAT:
372 case GLSL_TYPE_BOOL:
373 if (type->is_matrix()) {
374 return type->matrix_columns;
375 } else {
376 /* Regardless of the size of the vector, it gets a vec4. This is bad
377 * packing for things like floats, but otherwise arrays become a
378 * mess. Hopefully a later pass over the code can pack scalars
379 * down if appropriate.
380 */
381 return 1;
382 }
383 case GLSL_TYPE_ARRAY:
384 assert(type->length > 0);
385 return type_size(type->fields.array) * type->length;
386 case GLSL_TYPE_STRUCT:
387 size = 0;
388 for (i = 0; i < type->length; i++) {
389 size += type_size(type->fields.structure[i].type);
390 }
391 return size;
392 case GLSL_TYPE_SAMPLER:
393 /* Samplers take up one slot in UNIFORMS[], but they're baked in
394 * at link time.
395 */
396 return 1;
397 default:
398 assert(0);
399 return 0;
400 }
401 }
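/* Roughly, type_size() above counts vec4-sized slots: a float or vec3 is 1,
 * a mat4 is 4 (one per column), a vec4[8] array is 8, and a
 * struct { vec3 a; float b; } is 2.
 */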
402
403 int
404 vec4_visitor::virtual_grf_alloc(int size)
405 {
406 if (virtual_grf_array_size <= virtual_grf_count) {
407 if (virtual_grf_array_size == 0)
408 virtual_grf_array_size = 16;
409 else
410 virtual_grf_array_size *= 2;
411 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
412 virtual_grf_array_size);
413 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
414 virtual_grf_array_size);
415 }
416 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
417 virtual_grf_reg_count += size;
418 virtual_grf_sizes[virtual_grf_count] = size;
419 return virtual_grf_count++;
420 }
421
422 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
423 {
424 init();
425
426 this->file = GRF;
427 this->reg = v->virtual_grf_alloc(type_size(type));
428
429 if (type->is_array() || type->is_record()) {
430 this->swizzle = BRW_SWIZZLE_NOOP;
431 } else {
432 this->swizzle = swizzle_for_size(type->vector_elements);
433 }
434
435 this->type = brw_type_for_base_type(type);
436 }
437
438 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
439 {
440 init();
441
442 this->file = GRF;
443 this->reg = v->virtual_grf_alloc(type_size(type));
444
445 if (type->is_array() || type->is_record()) {
446 this->writemask = WRITEMASK_XYZW;
447 } else {
448 this->writemask = (1 << type->vector_elements) - 1;
449 }
450
451 this->type = brw_type_for_base_type(type);
452 }
453
454 /* Our support for uniforms is piggy-backed on the struct
455 * gl_fragment_program, because that's where the values actually
456 * get stored, rather than in some global gl_shader_program uniform
457 * store.
458 */
459 void
460 vec4_visitor::setup_uniform_values(ir_variable *ir)
461 {
462 int namelen = strlen(ir->name);
463
464 /* The data for our (non-builtin) uniforms is stored in a series of
465 * gl_uniform_driver_storage structs for each subcomponent that
466 * glGetUniformLocation() could name. We know it's been set up in the same
467 * order we'd walk the type, so walk the list of storage and find anything
468 * with our name, or the prefix of a component that starts with our name.
469 */
470 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
471 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
472
473 if (strncmp(ir->name, storage->name, namelen) != 0 ||
474 (storage->name[namelen] != 0 &&
475 storage->name[namelen] != '.' &&
476 storage->name[namelen] != '[')) {
477 continue;
478 }
479
480 gl_constant_value *components = storage->storage;
481 unsigned vector_count = (MAX2(storage->array_elements, 1) *
482 storage->type->matrix_columns);
483
484 for (unsigned s = 0; s < vector_count; s++) {
485 uniform_vector_size[uniforms] = storage->type->vector_elements;
486
487 int i;
488 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
489 c->prog_data.param[uniforms * 4 + i] = &components->f;
490 components++;
491 }
492 for (; i < 4; i++) {
493 static float zero = 0;
494 c->prog_data.param[uniforms * 4 + i] = &zero;
495 }
496
497 uniforms++;
498 }
499 }
500 }
501
502 void
503 vec4_visitor::setup_uniform_clipplane_values()
504 {
505 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
506
507 if (intel->gen < 6) {
508 /* Pre-Gen6, we compact clip planes. For example, if the user
509 * enables just clip planes 0, 1, and 3, we will enable clip planes
510 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
511 * plane 2. This simplifies the implementation of the Gen6 clip
512 * thread.
513 */
514 int compacted_clipplane_index = 0;
515 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
516 if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
517 continue;
518
519 this->uniform_vector_size[this->uniforms] = 4;
520 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
521 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
522 for (int j = 0; j < 4; ++j) {
523 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
524 }
525 ++compacted_clipplane_index;
526 ++this->uniforms;
527 }
528 } else {
529 /* In Gen6 and later, we don't compact clip planes, because this
530 * simplifies the implementation of gl_ClipDistance.
531 */
532 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
533 this->uniform_vector_size[this->uniforms] = 4;
534 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
535 this->userplane[i].type = BRW_REGISTER_TYPE_F;
536 for (int j = 0; j < 4; ++j) {
537 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
538 }
539 ++this->uniforms;
540 }
541 }
542 }
543
544 /* Our support for builtin uniforms is even scarier than non-builtin.
545 * It sits on top of the PROG_STATE_VAR parameters that are
546 * automatically updated from GL context state.
547 */
548 void
549 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
550 {
551 const ir_state_slot *const slots = ir->state_slots;
552 assert(ir->state_slots != NULL);
553
554 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
555 /* This state reference has already been setup by ir_to_mesa,
556 * but we'll get the same index back here. We can reference
557 * ParameterValues directly, since unlike brw_fs.cpp, we never
558 * add new state references during compile.
559 */
560 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
561 (gl_state_index *)slots[i].tokens);
562 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
563
564 this->uniform_vector_size[this->uniforms] = 0;
565 /* Add each of the unique swizzled channels of the element.
566 * This will end up matching the size of the glsl_type of this field.
567 */
568 int last_swiz = -1;
569 for (unsigned int j = 0; j < 4; j++) {
570 int swiz = GET_SWZ(slots[i].swizzle, j);
571 last_swiz = swiz;
572
573 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
574 if (swiz <= last_swiz)
575 this->uniform_vector_size[this->uniforms]++;
576 }
577 this->uniforms++;
578 }
579 }
580
581 dst_reg *
582 vec4_visitor::variable_storage(ir_variable *var)
583 {
584 return (dst_reg *)hash_table_find(this->variable_ht, var);
585 }
586
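/* Evaluate a boolean condition and leave its result in the hardware flag
 * register via an instruction with a conditional mod, so that a following
 * predicated instruction (or IF) can use it. *predicate is set to the
 * predication mode the caller should apply (normal, or an ALIGN16
 * any4h/all4h mode for vector comparisons).
 */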
587 void
588 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
589 {
590 ir_expression *expr = ir->as_expression();
591
592 *predicate = BRW_PREDICATE_NORMAL;
593
594 if (expr) {
595 src_reg op[2];
596 vec4_instruction *inst;
597
598 assert(expr->get_num_operands() <= 2);
599 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
600 expr->operands[i]->accept(this);
601 op[i] = this->result;
602
603 resolve_ud_negate(&op[i]);
604 }
605
606 switch (expr->operation) {
607 case ir_unop_logic_not:
608 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
609 inst->conditional_mod = BRW_CONDITIONAL_Z;
610 break;
611
612 case ir_binop_logic_xor:
613 inst = emit(XOR(dst_null_d(), op[0], op[1]));
614 inst->conditional_mod = BRW_CONDITIONAL_NZ;
615 break;
616
617 case ir_binop_logic_or:
618 inst = emit(OR(dst_null_d(), op[0], op[1]));
619 inst->conditional_mod = BRW_CONDITIONAL_NZ;
620 break;
621
622 case ir_binop_logic_and:
623 inst = emit(AND(dst_null_d(), op[0], op[1]));
624 inst->conditional_mod = BRW_CONDITIONAL_NZ;
625 break;
626
627 case ir_unop_f2b:
628 if (intel->gen >= 6) {
629 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
630 } else {
631 inst = emit(MOV(dst_null_f(), op[0]));
632 inst->conditional_mod = BRW_CONDITIONAL_NZ;
633 }
634 break;
635
636 case ir_unop_i2b:
637 if (intel->gen >= 6) {
638 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
639 } else {
640 inst = emit(MOV(dst_null_d(), op[0]));
641 inst->conditional_mod = BRW_CONDITIONAL_NZ;
642 }
643 break;
644
645 case ir_binop_all_equal:
646 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
647 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
648 break;
649
650 case ir_binop_any_nequal:
651 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
652 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
653 break;
654
655 case ir_unop_any:
656 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
657 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
658 break;
659
660 case ir_binop_greater:
661 case ir_binop_gequal:
662 case ir_binop_less:
663 case ir_binop_lequal:
664 case ir_binop_equal:
665 case ir_binop_nequal:
666 emit(CMP(dst_null_d(), op[0], op[1],
667 brw_conditional_for_comparison(expr->operation)));
668 break;
669
670 default:
671 assert(!"not reached");
672 break;
673 }
674 return;
675 }
676
677 ir->accept(this);
678
679 resolve_ud_negate(&this->result);
680
681 if (intel->gen >= 6) {
682 vec4_instruction *inst = emit(AND(dst_null_d(),
683 this->result, src_reg(1)));
684 inst->conditional_mod = BRW_CONDITIONAL_NZ;
685 } else {
686 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
687 inst->conditional_mod = BRW_CONDITIONAL_NZ;
688 }
689 }
690
691 /**
692 * Emit a gen6 IF statement with the comparison folded into the IF
693 * instruction.
694 */
695 void
696 vec4_visitor::emit_if_gen6(ir_if *ir)
697 {
698 ir_expression *expr = ir->condition->as_expression();
699
700 if (expr) {
701 src_reg op[2];
702 dst_reg temp;
703
704 assert(expr->get_num_operands() <= 2);
705 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
706 expr->operands[i]->accept(this);
707 op[i] = this->result;
708 }
709
710 switch (expr->operation) {
711 case ir_unop_logic_not:
712 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
713 return;
714
715 case ir_binop_logic_xor:
716 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
717 return;
718
719 case ir_binop_logic_or:
720 temp = dst_reg(this, glsl_type::bool_type);
721 emit(OR(temp, op[0], op[1]));
722 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
723 return;
724
725 case ir_binop_logic_and:
726 temp = dst_reg(this, glsl_type::bool_type);
727 emit(AND(temp, op[0], op[1]));
728 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
729 return;
730
731 case ir_unop_f2b:
732 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
733 return;
734
735 case ir_unop_i2b:
736 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
737 return;
738
739 case ir_binop_greater:
740 case ir_binop_gequal:
741 case ir_binop_less:
742 case ir_binop_lequal:
743 case ir_binop_equal:
744 case ir_binop_nequal:
745 emit(IF(op[0], op[1],
746 brw_conditional_for_comparison(expr->operation)));
747 return;
748
749 case ir_binop_all_equal:
750 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
751 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
752 return;
753
754 case ir_binop_any_nequal:
755 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
756 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
757 return;
758
759 case ir_unop_any:
760 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
761 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
762 return;
763
764 default:
765 assert(!"not reached");
766 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
767 return;
768 }
769 return;
770 }
771
772 ir->condition->accept(this);
773
774 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
775 }
776
777 static dst_reg
778 with_writemask(dst_reg const & r, int mask)
779 {
780 dst_reg result = r;
781 result.writemask = mask;
782 return result;
783 }
784
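/* Emit per-attribute workaround code for vertex formats the hardware can't
 * fetch directly: GL_FIXED rescaling, plus sign recovery, normalization,
 * scaling and BGRA swizzling for packed 2_10_10_10-style formats, as
 * selected by the gl_attrib_wa_flags in the program key.
 */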
785 void
786 vec4_visitor::emit_attribute_fixups()
787 {
788 dst_reg sign_recovery_shift;
789 dst_reg normalize_factor;
790 dst_reg es3_normalize_factor;
791
792 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
793 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
794 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
795 dst_reg reg(ATTR, i);
796 dst_reg reg_d = reg;
797 reg_d.type = BRW_REGISTER_TYPE_D;
798 dst_reg reg_ud = reg;
799 reg_ud.type = BRW_REGISTER_TYPE_UD;
800
801 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
802 * come in as floating point conversions of the integer values.
803 */
804 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
805 dst_reg dst = reg;
806 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
807 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
808 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
809 }
810
811 /* Do sign recovery for 2101010 formats if required. */
812 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
813 if (sign_recovery_shift.file == BAD_FILE) {
814 /* shift constant: <22,22,22,30> */
815 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
816 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
817 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
818 }
819
820 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
821 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
822 }
823
824 /* Apply BGRA swizzle if required. */
825 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
826 src_reg temp = src_reg(reg);
827 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
828 emit(MOV(reg, temp));
829 }
830
831 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
832 /* ES 3.0 has different rules for converting signed normalized
833 * fixed-point numbers than desktop GL.
834 */
835 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
836 /* According to equation 2.2 of the ES 3.0 specification,
837 * signed normalization conversion is done by:
838 *
839 * f = c / (2^(b-1)-1)
840 */
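/* For the packed 2_10_10_10 formats handled here, b is 10 for the XYZ
 * components and 2 for W, so 2^(b-1)-1 works out to 511 and 1
 * respectively -- matching the constants loaded just below.
 */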
841 if (es3_normalize_factor.file == BAD_FILE) {
842 /* mul constant: 1 / (2^(b-1) - 1) */
843 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
844 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
845 src_reg(1.0f / ((1<<9) - 1))));
846 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
847 src_reg(1.0f / ((1<<1) - 1))));
848 }
849
850 dst_reg dst = reg;
851 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
852 emit(MOV(dst, src_reg(reg_d)));
853 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
854 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
855 } else {
856 /* The following equations are from the OpenGL 3.2 specification:
857 *
858 * 2.1 unsigned normalization
859 * f = c/(2^n-1)
860 *
861 * 2.2 signed normalization
862 * f = (2c+1)/(2^n-1)
863 *
864 * Both of these share a common divisor, which is represented by
865 * "normalize_factor" in the code below.
866 */
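/* Again for 2_10_10_10 data: n is 10 for XYZ and 2 for W, so the shared
 * divisor 2^n-1 is 1023 and 3 respectively, which is what the factors
 * below are set to.
 */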
867 if (normalize_factor.file == BAD_FILE) {
868 /* 1 / (2^b - 1) for b=<10,10,10,2> */
869 normalize_factor = dst_reg(this, glsl_type::vec4_type);
870 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
871 src_reg(1.0f / ((1<<10) - 1))));
872 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
873 src_reg(1.0f / ((1<<2) - 1))));
874 }
875
876 dst_reg dst = reg;
877 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
878 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
879
880 /* For signed normalization, we want the numerator to be 2c+1. */
881 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
882 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
883 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
884 }
885
886 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
887 }
888 }
889
890 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
891 dst_reg dst = reg;
892 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
893 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
894 }
895 }
896 }
897 }
898
899 void
900 vec4_visitor::visit(ir_variable *ir)
901 {
902 dst_reg *reg = NULL;
903
904 if (variable_storage(ir))
905 return;
906
907 switch (ir->mode) {
908 case ir_var_shader_in:
909 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
910 break;
911
912 case ir_var_shader_out:
913 reg = new(mem_ctx) dst_reg(this, ir->type);
914
915 for (int i = 0; i < type_size(ir->type); i++) {
916 output_reg[ir->location + i] = *reg;
917 output_reg[ir->location + i].reg_offset = i;
918 output_reg[ir->location + i].type =
919 brw_type_for_base_type(ir->type->get_scalar_type());
920 output_reg_annotation[ir->location + i] = ir->name;
921 }
922 break;
923
924 case ir_var_auto:
925 case ir_var_temporary:
926 reg = new(mem_ctx) dst_reg(this, ir->type);
927 break;
928
929 case ir_var_uniform:
930 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
931
932 /* Thanks to the lower_ubo_reference pass, we will see only
933 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
934 * variables, so no need for them to be in variable_ht.
935 */
936 if (ir->uniform_block != -1)
937 return;
938
939 /* Track how big the whole uniform variable is, in case we need to put a
940 * copy of its data into pull constants for array access.
941 */
942 this->uniform_size[this->uniforms] = type_size(ir->type);
943
944 if (!strncmp(ir->name, "gl_", 3)) {
945 setup_builtin_uniform_values(ir);
946 } else {
947 setup_uniform_values(ir);
948 }
949 break;
950
951 case ir_var_system_value:
952 /* VertexID is stored by the VF as the last vertex element, but
953 * we don't represent it with a flag in inputs_read, so we call
954 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
955 */
956 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
957 prog_data->uses_vertexid = true;
958
959 switch (ir->location) {
960 case SYSTEM_VALUE_VERTEX_ID:
961 reg->writemask = WRITEMASK_X;
962 break;
963 case SYSTEM_VALUE_INSTANCE_ID:
964 reg->writemask = WRITEMASK_Y;
965 break;
966 default:
967 assert(!"not reached");
968 break;
969 }
970 break;
971
972 default:
973 assert(!"not reached");
974 }
975
976 reg->type = brw_type_for_base_type(ir->type);
977 hash_table_insert(this->variable_ht, reg, ir);
978 }
979
980 void
981 vec4_visitor::visit(ir_loop *ir)
982 {
983 dst_reg counter;
984
985 /* We don't want debugging output to print the whole body of the
986 * loop as the annotation.
987 */
988 this->base_ir = NULL;
989
990 if (ir->counter != NULL) {
991 this->base_ir = ir->counter;
992 ir->counter->accept(this);
993 counter = *(variable_storage(ir->counter));
994
995 if (ir->from != NULL) {
996 this->base_ir = ir->from;
997 ir->from->accept(this);
998
999 emit(MOV(counter, this->result));
1000 }
1001 }
1002
1003 emit(BRW_OPCODE_DO);
1004
1005 if (ir->to) {
1006 this->base_ir = ir->to;
1007 ir->to->accept(this);
1008
1009 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1010 brw_conditional_for_comparison(ir->cmp)));
1011
1012 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1013 inst->predicate = BRW_PREDICATE_NORMAL;
1014 }
1015
1016 visit_instructions(&ir->body_instructions);
1017
1018
1019 if (ir->increment) {
1020 this->base_ir = ir->increment;
1021 ir->increment->accept(this);
1022 emit(ADD(counter, src_reg(counter), this->result));
1023 }
1024
1025 emit(BRW_OPCODE_WHILE);
1026 }
1027
1028 void
1029 vec4_visitor::visit(ir_loop_jump *ir)
1030 {
1031 switch (ir->mode) {
1032 case ir_loop_jump::jump_break:
1033 emit(BRW_OPCODE_BREAK);
1034 break;
1035 case ir_loop_jump::jump_continue:
1036 emit(BRW_OPCODE_CONTINUE);
1037 break;
1038 }
1039 }
1040
1041
1042 void
1043 vec4_visitor::visit(ir_function_signature *ir)
1044 {
1045 assert(0);
1046 (void)ir;
1047 }
1048
1049 void
1050 vec4_visitor::visit(ir_function *ir)
1051 {
1052 /* Ignore function bodies other than main() -- we shouldn't see calls to
1053 * them since they should all be inlined.
1054 */
1055 if (strcmp(ir->name, "main") == 0) {
1056 const ir_function_signature *sig;
1057 exec_list empty;
1058
1059 sig = ir->matching_signature(&empty);
1060
1061 assert(sig);
1062
1063 visit_instructions(&sig->body);
1064 }
1065 }
1066
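/* If this expression is just a saturate (a clamp to [0, 1]) of some value,
 * emit the inner value with the MOV's saturate modifier set instead of
 * generating explicit clamping code. Returns true if that was possible.
 */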
1067 bool
1068 vec4_visitor::try_emit_sat(ir_expression *ir)
1069 {
1070 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1071 if (!sat_src)
1072 return false;
1073
1074 sat_src->accept(this);
1075 src_reg src = this->result;
1076
1077 this->result = src_reg(this, ir->type);
1078 vec4_instruction *inst;
1079 inst = emit(MOV(dst_reg(this->result), src));
1080 inst->saturate = true;
1081
1082 return true;
1083 }
1084
1085 void
1086 vec4_visitor::emit_bool_comparison(unsigned int op,
1087 dst_reg dst, src_reg src0, src_reg src1)
1088 {
1089 /* original gen4 does destination conversion before comparison. */
1090 if (intel->gen < 5)
1091 dst.type = src0.type;
1092
1093 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1094
1095 dst.type = BRW_REGISTER_TYPE_D;
1096 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1097 }
1098
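/* Emit a min/max-style select: on Gen6+ a single SEL with the requested
 * conditional mod is enough, while Gen4/5 need an explicit CMP followed by
 * a predicated SEL.
 */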
1099 void
1100 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1101 src_reg src0, src_reg src1)
1102 {
1103 vec4_instruction *inst;
1104
1105 if (intel->gen >= 6) {
1106 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1107 inst->conditional_mod = conditionalmod;
1108 } else {
1109 emit(CMP(dst, src0, src1, conditionalmod));
1110
1111 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1112 inst->predicate = BRW_PREDICATE_NORMAL;
1113 }
1114 }
1115
1116 void
1117 vec4_visitor::visit(ir_expression *ir)
1118 {
1119 unsigned int operand;
1120 src_reg op[Elements(ir->operands)];
1121 src_reg result_src;
1122 dst_reg result_dst;
1123 vec4_instruction *inst;
1124
1125 if (try_emit_sat(ir))
1126 return;
1127
1128 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1129 this->result.file = BAD_FILE;
1130 ir->operands[operand]->accept(this);
1131 if (this->result.file == BAD_FILE) {
1132 printf("Failed to get tree for expression operand:\n");
1133 ir->operands[operand]->print();
1134 exit(1);
1135 }
1136 op[operand] = this->result;
1137
1138 /* Matrix expression operands should have been broken down to vector
1139 * operations already.
1140 */
1141 assert(!ir->operands[operand]->type->is_matrix());
1142 }
1143
1144 int vector_elements = ir->operands[0]->type->vector_elements;
1145 if (ir->operands[1]) {
1146 vector_elements = MAX2(vector_elements,
1147 ir->operands[1]->type->vector_elements);
1148 }
1149
1150 this->result.file = BAD_FILE;
1151
1152 /* Storage for our result. Ideally for an assignment we'd be using
1153 * the actual storage for the result here, instead.
1154 */
1155 result_src = src_reg(this, ir->type);
1156 /* convenience for the emit functions below. */
1157 result_dst = dst_reg(result_src);
1158 /* If nothing special happens, this is the result. */
1159 this->result = result_src;
1160 /* Limit writes to the channels that will be used by result_src later.
1161 * This does limit this temp's use as a temporary for multi-instruction
1162 * sequences.
1163 */
1164 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1165
1166 switch (ir->operation) {
1167 case ir_unop_logic_not:
1168 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1169 * ones complement of the whole register, not just bit 0.
1170 */
1171 emit(XOR(result_dst, op[0], src_reg(1)));
1172 break;
1173 case ir_unop_neg:
1174 op[0].negate = !op[0].negate;
1175 this->result = op[0];
1176 break;
1177 case ir_unop_abs:
1178 op[0].abs = true;
1179 op[0].negate = false;
1180 this->result = op[0];
1181 break;
1182
1183 case ir_unop_sign:
1184 emit(MOV(result_dst, src_reg(0.0f)));
1185
1186 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1187 inst = emit(MOV(result_dst, src_reg(1.0f)));
1188 inst->predicate = BRW_PREDICATE_NORMAL;
1189
1190 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1191 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1192 inst->predicate = BRW_PREDICATE_NORMAL;
1193
1194 break;
1195
1196 case ir_unop_rcp:
1197 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1198 break;
1199
1200 case ir_unop_exp2:
1201 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1202 break;
1203 case ir_unop_log2:
1204 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1205 break;
1206 case ir_unop_exp:
1207 case ir_unop_log:
1208 assert(!"not reached: should be handled by ir_explog_to_explog2");
1209 break;
1210 case ir_unop_sin:
1211 case ir_unop_sin_reduced:
1212 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1213 break;
1214 case ir_unop_cos:
1215 case ir_unop_cos_reduced:
1216 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1217 break;
1218
1219 case ir_unop_dFdx:
1220 case ir_unop_dFdy:
1221 assert(!"derivatives not valid in vertex shader");
1222 break;
1223
1224 case ir_unop_noise:
1225 assert(!"not reached: should be handled by lower_noise");
1226 break;
1227
1228 case ir_binop_add:
1229 emit(ADD(result_dst, op[0], op[1]));
1230 break;
1231 case ir_binop_sub:
1232 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1233 break;
1234
1235 case ir_binop_mul:
1236 if (ir->type->is_integer()) {
1237 /* For integer multiplication, the MUL uses the low 16 bits
1238 * of one of the operands (src0 on gen6, src1 on gen7). The
1239 * MACH accumulates in the contribution of the upper 16 bits
1240 * of that operand.
1241 *
1242 * FINISHME: Emit just the MUL if we know an operand is small
1243 * enough.
1244 */
1245 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1246
1247 emit(MUL(acc, op[0], op[1]));
1248 emit(MACH(dst_null_d(), op[0], op[1]));
1249 emit(MOV(result_dst, src_reg(acc)));
1250 } else {
1251 emit(MUL(result_dst, op[0], op[1]));
1252 }
1253 break;
1254 case ir_binop_div:
1255 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1256 assert(ir->type->is_integer());
1257 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1258 break;
1259 case ir_binop_mod:
1260 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1261 assert(ir->type->is_integer());
1262 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1263 break;
1264
1265 case ir_binop_less:
1266 case ir_binop_greater:
1267 case ir_binop_lequal:
1268 case ir_binop_gequal:
1269 case ir_binop_equal:
1270 case ir_binop_nequal: {
1271 emit(CMP(result_dst, op[0], op[1],
1272 brw_conditional_for_comparison(ir->operation)));
1273 emit(AND(result_dst, result_src, src_reg(0x1)));
1274 break;
1275 }
1276
1277 case ir_binop_all_equal:
1278 /* "==" operator producing a scalar boolean. */
1279 if (ir->operands[0]->type->is_vector() ||
1280 ir->operands[1]->type->is_vector()) {
1281 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1282 emit(MOV(result_dst, src_reg(0)));
1283 inst = emit(MOV(result_dst, src_reg(1)));
1284 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1285 } else {
1286 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1287 emit(AND(result_dst, result_src, src_reg(0x1)));
1288 }
1289 break;
1290 case ir_binop_any_nequal:
1291 /* "!=" operator producing a scalar boolean. */
1292 if (ir->operands[0]->type->is_vector() ||
1293 ir->operands[1]->type->is_vector()) {
1294 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1295
1296 emit(MOV(result_dst, src_reg(0)));
1297 inst = emit(MOV(result_dst, src_reg(1)));
1298 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1299 } else {
1300 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1301 emit(AND(result_dst, result_src, src_reg(0x1)));
1302 }
1303 break;
1304
1305 case ir_unop_any:
1306 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1307 emit(MOV(result_dst, src_reg(0)));
1308
1309 inst = emit(MOV(result_dst, src_reg(1)));
1310 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1311 break;
1312
1313 case ir_binop_logic_xor:
1314 emit(XOR(result_dst, op[0], op[1]));
1315 break;
1316
1317 case ir_binop_logic_or:
1318 emit(OR(result_dst, op[0], op[1]));
1319 break;
1320
1321 case ir_binop_logic_and:
1322 emit(AND(result_dst, op[0], op[1]));
1323 break;
1324
1325 case ir_binop_dot:
1326 assert(ir->operands[0]->type->is_vector());
1327 assert(ir->operands[0]->type == ir->operands[1]->type);
1328 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1329 break;
1330
1331 case ir_unop_sqrt:
1332 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1333 break;
1334 case ir_unop_rsq:
1335 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1336 break;
1337
1338 case ir_unop_bitcast_i2f:
1339 case ir_unop_bitcast_u2f:
1340 this->result = op[0];
1341 this->result.type = BRW_REGISTER_TYPE_F;
1342 break;
1343
1344 case ir_unop_bitcast_f2i:
1345 this->result = op[0];
1346 this->result.type = BRW_REGISTER_TYPE_D;
1347 break;
1348
1349 case ir_unop_bitcast_f2u:
1350 this->result = op[0];
1351 this->result.type = BRW_REGISTER_TYPE_UD;
1352 break;
1353
1354 case ir_unop_i2f:
1355 case ir_unop_i2u:
1356 case ir_unop_u2i:
1357 case ir_unop_u2f:
1358 case ir_unop_b2f:
1359 case ir_unop_b2i:
1360 case ir_unop_f2i:
1361 case ir_unop_f2u:
1362 emit(MOV(result_dst, op[0]));
1363 break;
1364 case ir_unop_f2b:
1365 case ir_unop_i2b: {
1366 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1367 emit(AND(result_dst, result_src, src_reg(1)));
1368 break;
1369 }
1370
1371 case ir_unop_trunc:
1372 emit(RNDZ(result_dst, op[0]));
1373 break;
1374 case ir_unop_ceil:
1375 op[0].negate = !op[0].negate;
1376 inst = emit(RNDD(result_dst, op[0]));
1377 this->result.negate = true;
1378 break;
1379 case ir_unop_floor:
1380 inst = emit(RNDD(result_dst, op[0]));
1381 break;
1382 case ir_unop_fract:
1383 inst = emit(FRC(result_dst, op[0]));
1384 break;
1385 case ir_unop_round_even:
1386 emit(RNDE(result_dst, op[0]));
1387 break;
1388
1389 case ir_binop_min:
1390 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1391 break;
1392 case ir_binop_max:
1393 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1394 break;
1395
1396 case ir_binop_pow:
1397 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1398 break;
1399
1400 case ir_unop_bit_not:
1401 inst = emit(NOT(result_dst, op[0]));
1402 break;
1403 case ir_binop_bit_and:
1404 inst = emit(AND(result_dst, op[0], op[1]));
1405 break;
1406 case ir_binop_bit_xor:
1407 inst = emit(XOR(result_dst, op[0], op[1]));
1408 break;
1409 case ir_binop_bit_or:
1410 inst = emit(OR(result_dst, op[0], op[1]));
1411 break;
1412
1413 case ir_binop_lshift:
1414 inst = emit(SHL(result_dst, op[0], op[1]));
1415 break;
1416
1417 case ir_binop_rshift:
1418 if (ir->type->base_type == GLSL_TYPE_INT)
1419 inst = emit(ASR(result_dst, op[0], op[1]));
1420 else
1421 inst = emit(SHR(result_dst, op[0], op[1]));
1422 break;
1423
1424 case ir_binop_ubo_load: {
1425 ir_constant *uniform_block = ir->operands[0]->as_constant();
1426 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1427 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1428 src_reg offset = op[1];
1429
1430 /* Now, load the vector from that offset. */
1431 assert(ir->type->is_vector() || ir->type->is_scalar());
1432
1433 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1434 packed_consts.type = result.type;
1435 src_reg surf_index =
1436 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1437 if (const_offset_ir) {
1438 offset = src_reg(const_offset / 16);
1439 } else {
1440 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1441 }
1442
1443 vec4_instruction *pull =
1444 emit(new(mem_ctx) vec4_instruction(this,
1445 VS_OPCODE_PULL_CONSTANT_LOAD,
1446 dst_reg(packed_consts),
1447 surf_index,
1448 offset));
1449 pull->base_mrf = 14;
1450 pull->mlen = 1;
1451
1452 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1453 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1454 const_offset % 16 / 4,
1455 const_offset % 16 / 4,
1456 const_offset % 16 / 4);
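/* Illustrative example: a float loaded from a constant byte offset of 20
 * fetches the vec4 at offset 20/16 = 1 and broadcasts channel
 * (20 % 16) / 4 = 1, i.e. the .y component of the fetched register.
 */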
1457
1458 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1459 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1460 emit(CMP(result_dst, packed_consts, src_reg(0u),
1461 BRW_CONDITIONAL_NZ));
1462 emit(AND(result_dst, result, src_reg(0x1)));
1463 } else {
1464 emit(MOV(result_dst, packed_consts));
1465 }
1466 break;
1467 }
1468
1469 case ir_quadop_vector:
1470 assert(!"not reached: should be handled by lower_quadop_vector");
1471 break;
1472 }
1473 }
1474
1475
1476 void
1477 vec4_visitor::visit(ir_swizzle *ir)
1478 {
1479 src_reg src;
1480 int i = 0;
1481 int swizzle[4];
1482
1483 /* Note that this is only swizzles in expressions, not those on the left
1484 * hand side of an assignment, which do write masking. See ir_assignment
1485 * for that.
1486 */
1487
1488 ir->val->accept(this);
1489 src = this->result;
1490 assert(src.file != BAD_FILE);
1491
1492 for (i = 0; i < ir->type->vector_elements; i++) {
1493 switch (i) {
1494 case 0:
1495 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1496 break;
1497 case 1:
1498 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1499 break;
1500 case 2:
1501 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1502 break;
1503 case 3:
1504 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1505 break;
1506 }
1507 }
1508 for (; i < 4; i++) {
1509 /* Replicate the last channel out. */
1510 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1511 }
1512
1513 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1514
1515 this->result = src;
1516 }
1517
1518 void
1519 vec4_visitor::visit(ir_dereference_variable *ir)
1520 {
1521 const struct glsl_type *type = ir->type;
1522 dst_reg *reg = variable_storage(ir->var);
1523
1524 if (!reg) {
1525 fail("Failed to find variable storage for %s\n", ir->var->name);
1526 this->result = src_reg(brw_null_reg());
1527 return;
1528 }
1529
1530 this->result = src_reg(*reg);
1531
1532 /* System values get their swizzle from the dst_reg writemask */
1533 if (ir->var->mode == ir_var_system_value)
1534 return;
1535
1536 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1537 this->result.swizzle = swizzle_for_size(type->vector_elements);
1538 }
1539
1540 void
1541 vec4_visitor::visit(ir_dereference_array *ir)
1542 {
1543 ir_constant *constant_index;
1544 src_reg src;
1545 int element_size = type_size(ir->type);
1546
1547 constant_index = ir->array_index->constant_expression_value();
1548
1549 ir->array->accept(this);
1550 src = this->result;
1551
1552 if (constant_index) {
1553 src.reg_offset += constant_index->value.i[0] * element_size;
1554 } else {
1555 /* Variable index array dereference. It eats the "vec4" of the
1556 * base of the array and an index that offsets the Mesa register
1557 * index.
1558 */
1559 ir->array_index->accept(this);
1560
1561 src_reg index_reg;
1562
1563 if (element_size == 1) {
1564 index_reg = this->result;
1565 } else {
1566 index_reg = src_reg(this, glsl_type::int_type);
1567
1568 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1569 }
1570
1571 if (src.reladdr) {
1572 src_reg temp = src_reg(this, glsl_type::int_type);
1573
1574 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1575
1576 index_reg = temp;
1577 }
1578
1579 src.reladdr = ralloc(mem_ctx, src_reg);
1580 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1581 }
1582
1583 /* If the type is smaller than a vec4, replicate the last channel out. */
1584 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1585 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1586 else
1587 src.swizzle = BRW_SWIZZLE_NOOP;
1588 src.type = brw_type_for_base_type(ir->type);
1589
1590 this->result = src;
1591 }
1592
1593 void
1594 vec4_visitor::visit(ir_dereference_record *ir)
1595 {
1596 unsigned int i;
1597 const glsl_type *struct_type = ir->record->type;
1598 int offset = 0;
1599
1600 ir->record->accept(this);
1601
1602 for (i = 0; i < struct_type->length; i++) {
1603 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1604 break;
1605 offset += type_size(struct_type->fields.structure[i].type);
1606 }
1607
1608 /* If the type is smaller than a vec4, replicate the last channel out. */
1609 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1610 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1611 else
1612 this->result.swizzle = BRW_SWIZZLE_NOOP;
1613 this->result.type = brw_type_for_base_type(ir->type);
1614
1615 this->result.reg_offset += offset;
1616 }
1617
1618 /**
1619 * We want to be careful in assignment setup to hit the actual storage
1620 * instead of potentially using a temporary like we might with the
1621 * ir_dereference handler.
1622 */
1623 static dst_reg
1624 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1625 {
1626 /* The LHS must be a dereference. If the LHS is a variable indexed array
1627 * access of a vector, it must be separated into a series conditional moves
1628 * before reaching this point (see ir_vec_index_to_cond_assign).
1629 */
1630 assert(ir->as_dereference());
1631 ir_dereference_array *deref_array = ir->as_dereference_array();
1632 if (deref_array) {
1633 assert(!deref_array->array->type->is_vector());
1634 }
1635
1636 /* Use the rvalue deref handler for the most part. We'll ignore
1637 * swizzles in it and write swizzles using writemask, though.
1638 */
1639 ir->accept(v);
1640 return dst_reg(v->result);
1641 }
1642
1643 void
1644 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1645 const struct glsl_type *type, uint32_t predicate)
1646 {
1647 if (type->base_type == GLSL_TYPE_STRUCT) {
1648 for (unsigned int i = 0; i < type->length; i++) {
1649 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1650 }
1651 return;
1652 }
1653
1654 if (type->is_array()) {
1655 for (unsigned int i = 0; i < type->length; i++) {
1656 emit_block_move(dst, src, type->fields.array, predicate);
1657 }
1658 return;
1659 }
1660
1661 if (type->is_matrix()) {
1662 const struct glsl_type *vec_type;
1663
1664 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1665 type->vector_elements, 1);
1666
1667 for (int i = 0; i < type->matrix_columns; i++) {
1668 emit_block_move(dst, src, vec_type, predicate);
1669 }
1670 return;
1671 }
1672
1673 assert(type->is_scalar() || type->is_vector());
1674
1675 dst->type = brw_type_for_base_type(type);
1676 src->type = dst->type;
1677
1678 dst->writemask = (1 << type->vector_elements) - 1;
1679
1680 src->swizzle = swizzle_for_size(type->vector_elements);
1681
1682 vec4_instruction *inst = emit(MOV(*dst, *src));
1683 inst->predicate = predicate;
1684
1685 dst->reg_offset++;
1686 src->reg_offset++;
1687 }
1688
1689
1690 /* If the RHS processing resulted in an instruction generating a
1691 * temporary value, and it would be easy to rewrite the instruction to
1692 * generate its result right into the LHS instead, do so. This ends
1693 * up reliably removing instructions where it can be tricky to do so
1694 * later without real UD chain information.
1695 */
1696 bool
1697 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1698 dst_reg dst,
1699 src_reg src,
1700 vec4_instruction *pre_rhs_inst,
1701 vec4_instruction *last_rhs_inst)
1702 {
1703 /* This could be supported, but it would take more smarts. */
1704 if (ir->condition)
1705 return false;
1706
1707 if (pre_rhs_inst == last_rhs_inst)
1708 return false; /* No instructions generated to work with. */
1709
1710 /* Make sure the last instruction generated our source reg. */
1711 if (src.file != GRF ||
1712 src.file != last_rhs_inst->dst.file ||
1713 src.reg != last_rhs_inst->dst.reg ||
1714 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1715 src.reladdr ||
1716 src.abs ||
1717 src.negate ||
1718 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1719 return false;
1720
1721 /* Check that the last instruction fully initialized the channels
1722 * we want to use, in the order we want to use them. We could
1723 * potentially reswizzle the operands of many instructions so that
1724 * we could handle out of order channels, but don't yet.
1725 */
1726
1727 for (unsigned i = 0; i < 4; i++) {
1728 if (dst.writemask & (1 << i)) {
1729 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1730 return false;
1731
1732 if (BRW_GET_SWZ(src.swizzle, i) != i)
1733 return false;
1734 }
1735 }
1736
1737 /* Success! Rewrite the instruction. */
1738 last_rhs_inst->dst.file = dst.file;
1739 last_rhs_inst->dst.reg = dst.reg;
1740 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1741 last_rhs_inst->dst.reladdr = dst.reladdr;
1742 last_rhs_inst->dst.writemask &= dst.writemask;
1743
1744 return true;
1745 }
1746
1747 void
1748 vec4_visitor::visit(ir_assignment *ir)
1749 {
1750 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1751 uint32_t predicate = BRW_PREDICATE_NONE;
1752
1753 if (!ir->lhs->type->is_scalar() &&
1754 !ir->lhs->type->is_vector()) {
1755 ir->rhs->accept(this);
1756 src_reg src = this->result;
1757
1758 if (ir->condition) {
1759 emit_bool_to_cond_code(ir->condition, &predicate);
1760 }
1761
1762 /* emit_block_move doesn't account for swizzles in the source register.
1763 * This should be ok, since the source register is a structure or an
1764 * array, and those can't be swizzled. But double-check to be sure.
1765 */
1766 assert(src.swizzle ==
1767 (ir->rhs->type->is_matrix()
1768 ? swizzle_for_size(ir->rhs->type->vector_elements)
1769 : BRW_SWIZZLE_NOOP));
1770
1771 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1772 return;
1773 }
1774
1775 /* Now we're down to just a scalar/vector with writemasks. */
1776 int i;
1777
1778 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1779 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1780
1781 ir->rhs->accept(this);
1782
1783 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1784
1785 src_reg src = this->result;
1786
1787 int swizzles[4];
1788 int first_enabled_chan = 0;
1789 int src_chan = 0;
1790
1791 assert(ir->lhs->type->is_vector() ||
1792 ir->lhs->type->is_scalar());
1793 dst.writemask = ir->write_mask;
1794
1795 for (int i = 0; i < 4; i++) {
1796 if (dst.writemask & (1 << i)) {
1797 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1798 break;
1799 }
1800 }
1801
1802 /* Swizzle a small RHS vector into the channels being written.
1803 *
1804 * glsl ir treats write_mask as dictating how many channels are
1805 * present on the RHS while in our instructions we need to make
1806 * those channels appear in the slots of the vec4 they're written to.
1807 */
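/* As a sketch of that remapping: for something like "v.zw = u.xy" the
 * write mask covers z and w and the RHS arrives swizzled as (x,y,y,y), so
 * the loop below produces (y,y,x,y) -- channels z and w of the MOV read
 * u.x and u.y, and the unwritten channels just replicate the first
 * enabled one.
 */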
1808 for (int i = 0; i < 4; i++) {
1809 if (dst.writemask & (1 << i))
1810 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1811 else
1812 swizzles[i] = first_enabled_chan;
1813 }
1814 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1815 swizzles[2], swizzles[3]);
1816
1817 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1818 return;
1819 }
1820
1821 if (ir->condition) {
1822 emit_bool_to_cond_code(ir->condition, &predicate);
1823 }
1824
1825 for (i = 0; i < type_size(ir->lhs->type); i++) {
1826 vec4_instruction *inst = emit(MOV(dst, src));
1827 inst->predicate = predicate;
1828
1829 dst.reg_offset++;
1830 src.reg_offset++;
1831 }
1832 }
1833
1834 void
1835 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1836 {
1837 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1838 foreach_list(node, &ir->components) {
1839 ir_constant *field_value = (ir_constant *)node;
1840
1841 emit_constant_values(dst, field_value);
1842 }
1843 return;
1844 }
1845
1846 if (ir->type->is_array()) {
1847 for (unsigned int i = 0; i < ir->type->length; i++) {
1848 emit_constant_values(dst, ir->array_elements[i]);
1849 }
1850 return;
1851 }
1852
1853 if (ir->type->is_matrix()) {
1854 for (int i = 0; i < ir->type->matrix_columns; i++) {
1855 float *vec = &ir->value.f[i * ir->type->vector_elements];
1856
1857 for (int j = 0; j < ir->type->vector_elements; j++) {
1858 dst->writemask = 1 << j;
1859 dst->type = BRW_REGISTER_TYPE_F;
1860
1861 emit(MOV(*dst, src_reg(vec[j])));
1862 }
1863 dst->reg_offset++;
1864 }
1865 return;
1866 }
1867
1868 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1869
1870 for (int i = 0; i < ir->type->vector_elements; i++) {
1871 if (!(remaining_writemask & (1 << i)))
1872 continue;
1873
1874 dst->writemask = 1 << i;
1875 dst->type = brw_type_for_base_type(ir->type);
1876
1877 /* Find other components that match the one we're about to
1878 * write. Emits fewer instructions for things like vec4(0.5,
1879 * 1.5, 1.5, 1.5).
1880 */
1881 for (int j = i + 1; j < ir->type->vector_elements; j++) {
1882 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1883 if (ir->value.b[i] == ir->value.b[j])
1884 dst->writemask |= (1 << j);
1885 } else {
1886 /* u, i, and f storage all line up, so no need for a
1887 * switch case for comparing each type.
1888 */
1889 if (ir->value.u[i] == ir->value.u[j])
1890 dst->writemask |= (1 << j);
1891 }
1892 }
1893
1894 switch (ir->type->base_type) {
1895 case GLSL_TYPE_FLOAT:
1896 emit(MOV(*dst, src_reg(ir->value.f[i])));
1897 break;
1898 case GLSL_TYPE_INT:
1899 emit(MOV(*dst, src_reg(ir->value.i[i])));
1900 break;
1901 case GLSL_TYPE_UINT:
1902 emit(MOV(*dst, src_reg(ir->value.u[i])));
1903 break;
1904 case GLSL_TYPE_BOOL:
1905 emit(MOV(*dst, src_reg(ir->value.b[i])));
1906 break;
1907 default:
1908 assert(!"Non-float/uint/int/bool constant");
1909 break;
1910 }
1911
1912 remaining_writemask &= ~dst->writemask;
1913 }
1914 dst->reg_offset++;
1915 }
1916
1917 void
1918 vec4_visitor::visit(ir_constant *ir)
1919 {
1920 dst_reg dst = dst_reg(this, ir->type);
1921 this->result = src_reg(dst);
1922
1923 emit_constant_values(&dst, ir);
1924 }
1925
1926 void
1927 vec4_visitor::visit(ir_call *ir)
1928 {
1929 assert(!"not reached");
1930 }
1931
1932 void
1933 vec4_visitor::visit(ir_texture *ir)
1934 {
1935 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1936
1937 /* Should be lowered by do_lower_texture_projection */
1938 assert(!ir->projector);
1939
1940 /* Generate code to compute all the subexpression trees. This has to be
1941 * done before loading any values into MRFs for the sampler message since
1942 * generating these values may involve SEND messages that need the MRFs.
1943 */
1944 src_reg coordinate;
1945 if (ir->coordinate) {
1946 ir->coordinate->accept(this);
1947 coordinate = this->result;
1948 }
1949
1950 src_reg shadow_comparitor;
1951 if (ir->shadow_comparitor) {
1952 ir->shadow_comparitor->accept(this);
1953 shadow_comparitor = this->result;
1954 }
1955
1956 const glsl_type *lod_type;
1957 src_reg lod, dPdx, dPdy;
1958 switch (ir->op) {
1959 case ir_tex:
1960 lod = src_reg(0.0f);
1961 lod_type = glsl_type::float_type;
1962 break;
1963 case ir_txf:
1964 case ir_txl:
1965 case ir_txs:
1966 ir->lod_info.lod->accept(this);
1967 lod = this->result;
1968 lod_type = ir->lod_info.lod->type;
1969 break;
1970 case ir_txd:
1971 ir->lod_info.grad.dPdx->accept(this);
1972 dPdx = this->result;
1973
1974 ir->lod_info.grad.dPdy->accept(this);
1975 dPdy = this->result;
1976
1977 lod_type = ir->lod_info.grad.dPdx->type;
1978 break;
1979 case ir_txb:
1980 break;
1981 }
1982
1983 vec4_instruction *inst = NULL;
1984 switch (ir->op) {
1985 case ir_tex:
1986 case ir_txl:
1987 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1988 break;
1989 case ir_txd:
1990 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1991 break;
1992 case ir_txf:
1993 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1994 break;
1995 case ir_txs:
1996 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1997 break;
1998 case ir_txb:
1999 assert(!"TXB is not valid for vertex shaders.");
2000 }
2001
2002 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2003
2004 /* Texel offsets go in the message header; Gen4 also requires headers. */
2005 inst->header_present = use_texture_offset || intel->gen < 5;
2006 inst->base_mrf = 2;
2007 inst->mlen = inst->header_present + 1; /* always at least one */
2008 inst->sampler = sampler;
2009 inst->dst = dst_reg(this, ir->type);
2010 inst->dst.writemask = WRITEMASK_XYZW;
2011 inst->shadow_compare = ir->shadow_comparitor != NULL;
2012
2013 if (use_texture_offset)
2014 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2015
2016 /* MRF for the first parameter */
2017 int param_base = inst->base_mrf + inst->header_present;
2018
2019 if (ir->op == ir_txs) {
2020 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2021 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2022 } else {
2023 int i, coord_mask = 0, zero_mask = 0;
2024 /* Load the coordinate */
2025 /* FINISHME: gl_clamp_mask and saturate */
2026 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2027 coord_mask |= (1 << i);
2028 for (; i < 4; i++)
2029 zero_mask |= (1 << i);
2030
2031 if (ir->offset && ir->op == ir_txf) {
2032 /* It appears that the ld instruction used for txf does its
2033 * address bounds check before adding in the offset. To work
2034 * around this, just add the integer offset to the integer
2035 * texel coordinate, and don't put the offset in the header.
2036 */
2037 ir_constant *offset = ir->offset->as_constant();
2038 assert(offset);
2039
2040 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2041 src_reg src = coordinate;
2042 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2043 BRW_GET_SWZ(src.swizzle, j),
2044 BRW_GET_SWZ(src.swizzle, j),
2045 BRW_GET_SWZ(src.swizzle, j));
2046 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2047 src, offset->value.i[j]));
2048 }
2049 } else {
2050 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2051 coordinate));
2052 }
2053 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2054 src_reg(0)));
2055 /* Load the shadow comparitor */
2056 if (ir->shadow_comparitor) {
2057 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2058 WRITEMASK_X),
2059 shadow_comparitor));
2060 inst->mlen++;
2061 }
2062
2063 /* Load the LOD info */
2064 if (ir->op == ir_tex || ir->op == ir_txl) {
2065 int mrf, writemask;
2066 if (intel->gen >= 5) {
2067 mrf = param_base + 1;
2068 if (ir->shadow_comparitor) {
2069 writemask = WRITEMASK_Y;
2070 /* mlen already incremented */
2071 } else {
2072 writemask = WRITEMASK_X;
2073 inst->mlen++;
2074 }
2075 } else /* intel->gen == 4 */ {
2076 mrf = param_base;
2077 writemask = WRITEMASK_Z;
2078 }
2079 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2080 } else if (ir->op == ir_txf) {
2081 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
2082 lod));
2083 } else if (ir->op == ir_txd) {
2084 const glsl_type *type = lod_type;
2085
2086 if (intel->gen >= 5) {
2087 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2088 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2089 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2090 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2091 inst->mlen++;
2092
2093 if (ir->type->vector_elements == 3) {
2094 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2095 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2096 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2097 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2098 inst->mlen++;
2099 }
2100 } else /* intel->gen == 4 */ {
2101 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2102 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2103 inst->mlen += 2;
2104 }
2105 }
2106 }
2107
2108 emit(inst);
2109
2110    /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2111     * faces * layers, but the spec requires just layers.
2112     */
2113 if (ir->op == ir_txs) {
2114 glsl_type const *type = ir->sampler->type;
2115 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2116 type->sampler_array) {
2117 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2118 with_writemask(inst->dst, WRITEMASK_Z),
2119 src_reg(inst->dst), src_reg(6));
2120 }
2121 }
2122
2123 swizzle_result(ir, src_reg(inst->dst), sampler);
2124 }
2125
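/**
 * Applies the texture swizzle from the sampler key (e.g. depth texture
 * modes and EXT_texture_swizzle) to the raw sampler result, writing the
 * swizzled value to this->result.
 */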
2126 void
2127 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2128 {
2129 int s = c->key.tex.swizzles[sampler];
2130
2131 this->result = src_reg(this, ir->type);
2132 dst_reg swizzled_result(this->result);
2133
2134 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2135 || s == SWIZZLE_NOOP) {
2136 emit(MOV(swizzled_result, orig_val));
2137 return;
2138 }
2139
2140 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2141 int swizzle[4];
2142
2143 for (int i = 0; i < 4; i++) {
2144 switch (GET_SWZ(s, i)) {
2145 case SWIZZLE_ZERO:
2146 zero_mask |= (1 << i);
2147 break;
2148 case SWIZZLE_ONE:
2149 one_mask |= (1 << i);
2150 break;
2151 default:
2152 copy_mask |= (1 << i);
2153 swizzle[i] = GET_SWZ(s, i);
2154 break;
2155 }
2156 }
2157
2158 if (copy_mask) {
2159 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2160 swizzled_result.writemask = copy_mask;
2161 emit(MOV(swizzled_result, orig_val));
2162 }
2163
2164 if (zero_mask) {
2165 swizzled_result.writemask = zero_mask;
2166 emit(MOV(swizzled_result, src_reg(0.0f)));
2167 }
2168
2169 if (one_mask) {
2170 swizzled_result.writemask = one_mask;
2171 emit(MOV(swizzled_result, src_reg(1.0f)));
2172 }
2173 }
2174
2175 void
2176 vec4_visitor::visit(ir_return *ir)
2177 {
2178 assert(!"not reached");
2179 }
2180
2181 void
2182 vec4_visitor::visit(ir_discard *ir)
2183 {
2184 assert(!"not reached");
2185 }
2186
2187 void
2188 vec4_visitor::visit(ir_if *ir)
2189 {
2190    /* Don't point the annotation at the if statement itself; otherwise the
2191     * annotation would be printed for it plus the whole then and else blocks.
2192     */
2193 this->base_ir = ir->condition;
2194
2195 if (intel->gen == 6) {
2196 emit_if_gen6(ir);
2197 } else {
2198 uint32_t predicate;
2199 emit_bool_to_cond_code(ir->condition, &predicate);
2200 emit(IF(predicate));
2201 }
2202
2203 visit_instructions(&ir->then_instructions);
2204
2205 if (!ir->else_instructions.is_empty()) {
2206 this->base_ir = ir->condition;
2207 emit(BRW_OPCODE_ELSE);
2208
2209 visit_instructions(&ir->else_instructions);
2210 }
2211
2212 this->base_ir = ir->condition;
2213 emit(BRW_OPCODE_ENDIF);
2214 }
2215
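/**
 * Computes the NDC coordinate (x/w, y/w, z/w, 1/w) from gl_Position for
 * the pre-gen6 VUE header.
 */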
2216 void
2217 vec4_visitor::emit_ndc_computation()
2218 {
2219 /* Get the position */
2220 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2221
2222 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2223 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2224 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2225
2226 current_annotation = "NDC";
2227 dst_reg ndc_w = ndc;
2228 ndc_w.writemask = WRITEMASK_W;
2229 src_reg pos_w = pos;
2230 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2231 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2232
2233 dst_reg ndc_xyz = ndc;
2234 ndc_xyz.writemask = WRITEMASK_XYZ;
2235
2236 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2237 }
2238
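/**
 * Fills the first VUE header register: on pre-gen6 this packs the point
 * size and the user clip flags into one UD register; on gen6+ it writes
 * zero and, if the shader wrote gl_PointSize, the point size in .w.
 */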
2239 void
2240 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2241 {
2242 if (intel->gen < 6 &&
2243 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2244 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2245 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2246 dst_reg header1_w = header1;
2247 header1_w.writemask = WRITEMASK_W;
2248 GLuint i;
2249
2250 emit(MOV(header1, 0u));
2251
2252 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2253 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2254
2255 current_annotation = "Point size";
2256 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2257 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2258 }
2259
2260 current_annotation = "Clipping flags";
2261 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2262 vec4_instruction *inst;
2263
2264 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2265 src_reg(this->userplane[i])));
2266 inst->conditional_mod = BRW_CONDITIONAL_L;
2267
2268 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2269 inst->predicate = BRW_PREDICATE_NORMAL;
2270 }
2271
2272       /* i965 clipping workaround:
2273        * 1) Test for a negative RHW.
2274        * 2) If it is negative:
2275        *       set ndc = (0,0,0,0)
2276        *       set ucp[6] = 1
2277        *
2278        * Later, clipping will detect ucp[6] and ensure the primitive is
2279        * clipped against all fixed planes.
2280        */
2281 if (brw->has_negative_rhw_bug) {
2282 #if 0
2283 /* FINISHME */
2284 brw_CMP(p,
2285 vec8(brw_null_reg()),
2286 BRW_CONDITIONAL_L,
2287 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2288 brw_imm_f(0));
2289
2290 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2291 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2292 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2293 #endif
2294 }
2295
2296 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2297 } else if (intel->gen < 6) {
2298 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2299 } else {
2300 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2301 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2302 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2303 src_reg(output_reg[VERT_RESULT_PSIZ])));
2304 }
2305 }
2306 }
2307
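/**
 * Writes up to four user clip distances, starting at user plane index
 * 'offset', by taking the DP4 of the clip vertex (gl_ClipVertex if
 * written, otherwise gl_Position) with each user clip plane.
 */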
2308 void
2309 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2310 {
2311 if (intel->gen < 6) {
2312 /* Clip distance slots are set aside in gen5, but they are not used. It
2313 * is not clear whether we actually need to set aside space for them,
2314 * but the performance cost is negligible.
2315 */
2316 return;
2317 }
2318
2319 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2320 *
2321 * "If a linked set of shaders forming the vertex stage contains no
2322 * static write to gl_ClipVertex or gl_ClipDistance, but the
2323 * application has requested clipping against user clip planes through
2324 * the API, then the coordinate written to gl_Position is used for
2325 * comparison against the user clip planes."
2326 *
2327 * This function is only called if the shader didn't write to
2328 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2329 * if the user wrote to it; otherwise we use gl_Position.
2330 */
2331 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2332 if (!(c->prog_data.outputs_written
2333 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2334 clip_vertex = VERT_RESULT_HPOS;
2335 }
2336
2337 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2338 ++i) {
2339 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2340 src_reg(output_reg[clip_vertex]),
2341 src_reg(this->userplane[i + offset])));
2342 }
2343 }
2344
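/**
 * Copies an ordinary varying from its output register into the URB slot,
 * saturating color outputs when vertex color clamping is enabled.
 */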
2345 void
2346 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2347 {
2348    assert(vert_result < VERT_RESULT_MAX);
2349 reg.type = output_reg[vert_result].type;
2350 current_annotation = output_reg_annotation[vert_result];
2351 /* Copy the register, saturating if necessary */
2352 vec4_instruction *inst = emit(MOV(reg,
2353 src_reg(output_reg[vert_result])));
2354 if ((vert_result == VERT_RESULT_COL0 ||
2355 vert_result == VERT_RESULT_COL1 ||
2356 vert_result == VERT_RESULT_BFC0 ||
2357 vert_result == VERT_RESULT_BFC1) &&
2358 c->key.clamp_vertex_color) {
2359 inst->saturate = true;
2360 }
2361 }
2362
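/**
 * Writes a single VUE slot into the given MRF, special-casing the slots
 * that need it (header/PSIZ, NDC, position, clip distances, edge flag,
 * padding) and using the generic copy otherwise.
 */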
2363 void
2364 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2365 {
2366 struct brw_reg hw_reg = brw_message_reg(mrf);
2367 dst_reg reg = dst_reg(MRF, mrf);
2368 reg.type = BRW_REGISTER_TYPE_F;
2369
2370 switch (vert_result) {
2371 case VERT_RESULT_PSIZ:
2372 /* PSIZ is always in slot 0, and is coupled with other flags. */
2373 current_annotation = "indices, point width, clip flags";
2374 emit_psiz_and_flags(hw_reg);
2375 break;
2376 case BRW_VERT_RESULT_NDC:
2377 current_annotation = "NDC";
2378 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2379 break;
2380 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2381 case VERT_RESULT_HPOS:
2382 current_annotation = "gl_Position";
2383 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2384 break;
2385 case VERT_RESULT_CLIP_DIST0:
2386 case VERT_RESULT_CLIP_DIST1:
2387 if (this->c->key.uses_clip_distance) {
2388 emit_generic_urb_slot(reg, vert_result);
2389 } else {
2390 current_annotation = "user clip distances";
2391 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2392 }
2393 break;
2394 case VERT_RESULT_EDGE:
2395       /* This slot is present when drawing unfilled polygons. We copy the edge
2396        * flag from the user-provided vertex array (glEdgeFlagPointer) if one is
2397        * supplied, and otherwise from the current value of that attribute
2398        * (which starts as 1.0f). The clipper then uses it to determine which
2399        * edges should be drawn as wireframe.
2400        */
2401 current_annotation = "edge flag";
2402 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2403 glsl_type::float_type, WRITEMASK_XYZW))));
2404 break;
2405 case BRW_VERT_RESULT_PAD:
2406 /* No need to write to this slot */
2407 break;
2408 default:
2409 emit_generic_urb_slot(reg, vert_result);
2410 break;
2411 }
2412 }
2413
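/**
 * Rounds the URB write message length up so that the data payload
 * (everything after the single header register) is an even number of
 * registers, as gen6+ interleaved URB writes require.
 */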
2414 static int
2415 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2416 {
2417 struct intel_context *intel = &brw->intel;
2418
2419 if (intel->gen >= 6) {
2420 /* URB data written (does not include the message header reg) must
2421 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2422 * section 5.4.3.2.2: URB_INTERLEAVED.
2423 *
2424 * URB entries are allocated on a multiple of 1024 bits, so an
2425 * extra 128 bits written here to make the end align to 256 is
2426 * no problem.
2427 */
2428       if ((mlen % 2) == 0)
2429          mlen++;
2430 }
2431
2432 return mlen;
2433 }
2434
2435 /**
2436 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2437 * complete the VS thread.
2438 *
2439 * The VUE layout is documented in Volume 2a.
2440 */
2441 void
2442 vec4_visitor::emit_urb_writes()
2443 {
2444    /* MRF 0 is reserved for the debugger, so start with the message header
2445     * in MRF 1.
2446     */
2447 int base_mrf = 1;
2448 int mrf = base_mrf;
2449 /* In the process of generating our URB write message contents, we
2450 * may need to unspill a register or load from an array. Those
2451 * reads would use MRFs 14-15.
2452 */
2453 int max_usable_mrf = 13;
2454
2455    /* The following assertion verifies that max_usable_mrf results in an
2456     * even number of URB write data registers, which meets gen6's
2457     * requirement for length alignment.
2458     */
2459    assert((max_usable_mrf - base_mrf) % 2 == 0);
2460
2461 /* First mrf is the g0-based message header containing URB handles and such,
2462 * which is implied in VS_OPCODE_URB_WRITE.
2463 */
2464 mrf++;
2465
2466 if (intel->gen < 6) {
2467 emit_ndc_computation();
2468 }
2469
2470 /* Set up the VUE data for the first URB write */
2471 int slot;
2472 for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2473 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2474
2475 /* If this was max_usable_mrf, we can't fit anything more into this URB
2476 * WRITE.
2477 */
2478 if (mrf > max_usable_mrf) {
2479 slot++;
2480 break;
2481 }
2482 }
2483
2484 current_annotation = "URB write";
2485 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2486 inst->base_mrf = base_mrf;
2487 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2488 inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2489
2490 /* Optional second URB write */
2491 if (!inst->eot) {
2492 mrf = base_mrf + 1;
2493
2494 for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2495 assert(mrf < max_usable_mrf);
2496
2497 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2498 }
2499
2500 current_annotation = "URB write";
2501 inst = emit(VS_OPCODE_URB_WRITE);
2502 inst->base_mrf = base_mrf;
2503 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2504 inst->eot = true;
2505       /* URB destination offset. In the previous write we used MRFs 1-13;
2506        * excluding the one header MRF, that is 12 data registers. The URB
2507        * offset is in URB row increments, and each of our MRFs is half of a
2508        * row, since we're doing interleaved writes.
2509        */
2510 inst->offset = (max_usable_mrf - base_mrf) / 2;
2511 }
2512 }
2513
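/**
 * Returns the offset operand for a scratch read/write of reg_offset
 * (plus an optional relative address), scaled into the units the
 * scratch message header expects.
 */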
2514 src_reg
2515 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2516 src_reg *reladdr, int reg_offset)
2517 {
2518 /* Because we store the values to scratch interleaved like our
2519 * vertex data, we need to scale the vec4 index by 2.
2520 */
2521 int message_header_scale = 2;
2522
2523 /* Pre-gen6, the message header uses byte offsets instead of vec4
2524 * (16-byte) offset units.
2525 */
2526 if (intel->gen < 6)
2527 message_header_scale *= 16;
2528
2529 if (reladdr) {
2530 src_reg index = src_reg(this, glsl_type::int_type);
2531
2532 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2533 emit_before(inst, MUL(dst_reg(index),
2534 index, src_reg(message_header_scale)));
2535
2536 return index;
2537 } else {
2538 return src_reg(reg_offset * message_header_scale);
2539 }
2540 }
2541
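/**
 * Returns the offset operand for a pull constant load of reg_offset
 * (plus an optional relative address): byte units pre-gen6, vec4 units
 * on gen6+.
 */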
2542 src_reg
2543 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2544 src_reg *reladdr, int reg_offset)
2545 {
2546 if (reladdr) {
2547 src_reg index = src_reg(this, glsl_type::int_type);
2548
2549 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2550
2551 /* Pre-gen6, the message header uses byte offsets instead of vec4
2552 * (16-byte) offset units.
2553 */
2554 if (intel->gen < 6) {
2555 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2556 }
2557
2558 return index;
2559 } else {
2560 int message_header_scale = intel->gen < 6 ? 16 : 1;
2561 return src_reg(reg_offset * message_header_scale);
2562 }
2563 }
2564
2565 /**
2566 * Emits an instruction before @inst to load the value named by @orig_src
2567 * from scratch space at @base_offset to @temp.
2568 *
2569 * @base_offset is measured in 32-byte units (the size of a register).
2570 */
2571 void
2572 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2573 dst_reg temp, src_reg orig_src,
2574 int base_offset)
2575 {
2576 int reg_offset = base_offset + orig_src.reg_offset;
2577 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2578
2579 emit_before(inst, SCRATCH_READ(temp, index));
2580 }
2581
2582 /**
2583 * Emits an instruction after @inst to store the value to be written
2584 * to @orig_dst to scratch space at @base_offset, from @temp.
2585 *
2586 * @base_offset is measured in 32-byte units (the size of a register).
2587 */
2588 void
2589 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2590 {
2591 int reg_offset = base_offset + inst->dst.reg_offset;
2592 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2593
2594 /* Create a temporary register to store *inst's result in.
2595 *
2596 * We have to be careful in MOVing from our temporary result register in
2597 * the scratch write. If we swizzle from channels of the temporary that
2598 * weren't initialized, it will confuse live interval analysis, which will
2599 * make spilling fail to make progress.
2600 */
2601 src_reg temp = src_reg(this, glsl_type::vec4_type);
2602 temp.type = inst->dst.type;
2603 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2604 int swizzles[4];
2605 for (int i = 0; i < 4; i++)
2606 if (inst->dst.writemask & (1 << i))
2607 swizzles[i] = i;
2608 else
2609 swizzles[i] = first_writemask_chan;
2610 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2611 swizzles[2], swizzles[3]);
2612
2613 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2614 inst->dst.writemask));
2615 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2616 write->predicate = inst->predicate;
2617 write->ir = inst->ir;
2618 write->annotation = inst->annotation;
2619 inst->insert_after(write);
2620
2621 inst->dst.file = temp.file;
2622 inst->dst.reg = temp.reg;
2623 inst->dst.reg_offset = temp.reg_offset;
2624 inst->dst.reladdr = NULL;
2625 }
2626
2627 /**
2628 * We can't generally support array access in GRF space, because a
2629 * single instruction's destination can only span 2 contiguous
2630 * registers. So, we send all GRF arrays that get variable index
2631 * access to scratch space.
2632 */
2633 void
2634 vec4_visitor::move_grf_array_access_to_scratch()
2635 {
2636 int scratch_loc[this->virtual_grf_count];
2637
2638 for (int i = 0; i < this->virtual_grf_count; i++) {
2639 scratch_loc[i] = -1;
2640 }
2641
2642    /* First, calculate the set of virtual GRFs that need to be punted to
2643     * scratch due to having any array access on them, and assign each one
2644     * its location in scratch space.
2645     */
2646 foreach_list(node, &this->instructions) {
2647 vec4_instruction *inst = (vec4_instruction *)node;
2648
2649 if (inst->dst.file == GRF && inst->dst.reladdr &&
2650 scratch_loc[inst->dst.reg] == -1) {
2651 scratch_loc[inst->dst.reg] = c->last_scratch;
2652 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2653 }
2654
2655 for (int i = 0 ; i < 3; i++) {
2656 src_reg *src = &inst->src[i];
2657
2658 if (src->file == GRF && src->reladdr &&
2659 scratch_loc[src->reg] == -1) {
2660 scratch_loc[src->reg] = c->last_scratch;
2661 c->last_scratch += this->virtual_grf_sizes[src->reg];
2662 }
2663 }
2664 }
2665
2666 /* Now, for anything that will be accessed through scratch, rewrite
2667 * it to load/store. Note that this is a _safe list walk, because
2668 * we may generate a new scratch_write instruction after the one
2669 * we're processing.
2670 */
2671 foreach_list_safe(node, &this->instructions) {
2672 vec4_instruction *inst = (vec4_instruction *)node;
2673
2674       /* Set up the annotation tracking for newly generated instructions. */
2675 base_ir = inst->ir;
2676 current_annotation = inst->annotation;
2677
2678 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2679 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2680 }
2681
2682 for (int i = 0 ; i < 3; i++) {
2683 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2684 continue;
2685
2686 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2687
2688 emit_scratch_read(inst, temp, inst->src[i],
2689 scratch_loc[inst->src[i].reg]);
2690
2691 inst->src[i].file = temp.file;
2692 inst->src[i].reg = temp.reg;
2693 inst->src[i].reg_offset = temp.reg_offset;
2694 inst->src[i].reladdr = NULL;
2695 }
2696 }
2697 }
2698
2699 /**
2700 * Emits an instruction before @inst to load the value named by @orig_src
2701 * from the pull constant buffer (surface) at @base_offset to @temp.
2702 */
2703 void
2704 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2705 dst_reg temp, src_reg orig_src,
2706 int base_offset)
2707 {
2708 int reg_offset = base_offset + orig_src.reg_offset;
2709 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2710 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2711 vec4_instruction *load;
2712
2713 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2714 temp, index, offset);
2715 load->base_mrf = 14;
2716 load->mlen = 1;
2717 emit_before(inst, load);
2718 }
2719
2720 /**
2721 * Implements array access of uniforms by inserting a
2722 * PULL_CONSTANT_LOAD instruction.
2723 *
2724  * Unlike temporary GRF array access, which we don't support because of
2725  * the difficulty of doing relative addressing on instruction
2726  * destinations, we could potentially support array access of uniforms
2727  * that were loaded in GRF space as push constants. In the real-world
2728  * usage we've seen, though, the arrays are always larger than we could
2729  * load as push constants, so we just always move all uniform array
2730  * access out to a pull constant buffer.
2731 */
2732 void
2733 vec4_visitor::move_uniform_array_access_to_pull_constants()
2734 {
2735 int pull_constant_loc[this->uniforms];
2736
2737 for (int i = 0; i < this->uniforms; i++) {
2738 pull_constant_loc[i] = -1;
2739 }
2740
2741 /* Walk through and find array access of uniforms. Put a copy of that
2742 * uniform in the pull constant buffer.
2743 *
2744 * Note that we don't move constant-indexed accesses to arrays. No
2745 * testing has been done of the performance impact of this choice.
2746 */
2747 foreach_list_safe(node, &this->instructions) {
2748 vec4_instruction *inst = (vec4_instruction *)node;
2749
2750 for (int i = 0 ; i < 3; i++) {
2751 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2752 continue;
2753
2754 int uniform = inst->src[i].reg;
2755
2756 /* If this array isn't already present in the pull constant buffer,
2757 * add it.
2758 */
2759 if (pull_constant_loc[uniform] == -1) {
2760 const float **values = &prog_data->param[uniform * 4];
2761
2762 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2763
2764 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2765 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2766 }
2767 }
2768
2769       /* Set up the annotation tracking for newly generated instructions. */
2770 base_ir = inst->ir;
2771 current_annotation = inst->annotation;
2772
2773 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2774
2775 emit_pull_constant_load(inst, temp, inst->src[i],
2776 pull_constant_loc[uniform]);
2777
2778 inst->src[i].file = temp.file;
2779 inst->src[i].reg = temp.reg;
2780 inst->src[i].reg_offset = temp.reg_offset;
2781 inst->src[i].reladdr = NULL;
2782 }
2783 }
2784
2785 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2786 * no need to track them as larger-than-vec4 objects. This will be
2787 * relied on in cutting out unused uniform vectors from push
2788 * constants.
2789 */
2790 split_uniform_registers();
2791 }
2792
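/**
 * Works around source negation on UD-typed operands: if the source is
 * unsigned with the negate flag set, materialize the negated value into
 * a temporary with a MOV and use that temporary instead.
 */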
2793 void
2794 vec4_visitor::resolve_ud_negate(src_reg *reg)
2795 {
2796 if (reg->type != BRW_REGISTER_TYPE_UD ||
2797 !reg->negate)
2798 return;
2799
2800 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2801 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2802 *reg = temp;
2803 }
2804
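/**
 * Sets up per-compile state: context pointers, the compile key and
 * prog_data, the variable hash table, and empty virtual GRF tracking.
 */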
2805 vec4_visitor::vec4_visitor(struct brw_context *brw,
2806 struct brw_vs_compile *c,
2807 struct gl_shader_program *prog,
2808 struct brw_shader *shader,
2809 void *mem_ctx)
2810 {
2811 this->c = c;
2812 this->brw = brw;
2813 this->intel = &brw->intel;
2814 this->ctx = &intel->ctx;
2815 this->prog = prog;
2816 this->shader = shader;
2817
2818 this->mem_ctx = mem_ctx;
2819 this->failed = false;
2820
2821 this->base_ir = NULL;
2822 this->current_annotation = NULL;
2823 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
2824
2826 this->vp = &c->vp->program;
2827 this->prog_data = &c->prog_data;
2828
2829 this->variable_ht = hash_table_ctor(0,
2830 hash_table_pointer_hash,
2831 hash_table_pointer_compare);
2832
2833 this->virtual_grf_def = NULL;
2834 this->virtual_grf_use = NULL;
2835 this->virtual_grf_sizes = NULL;
2836 this->virtual_grf_count = 0;
2837 this->virtual_grf_reg_map = NULL;
2838 this->virtual_grf_reg_count = 0;
2839 this->virtual_grf_array_size = 0;
2840 this->live_intervals_valid = false;
2841
2842 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2843
2844 this->uniforms = 0;
2845 }
2846
2847 vec4_visitor::~vec4_visitor()
2848 {
2849 hash_table_dtor(this->variable_ht);
2850 }
2851
2852
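/**
 * Records the first compile failure message (ralloc'ed off mem_ctx) and
 * prints it to stderr when VS debugging is enabled; subsequent failures
 * are ignored.
 */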
2853 void
2854 vec4_visitor::fail(const char *format, ...)
2855 {
2856 va_list va;
2857 char *msg;
2858
2859 if (failed)
2860 return;
2861
2862 failed = true;
2863
2864 va_start(va, format);
2865 msg = ralloc_vasprintf(mem_ctx, format, va);
2866 va_end(va);
2867 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2868
2869 this->fail_msg = msg;
2870
2871 if (INTEL_DEBUG & DEBUG_VS) {
2872 fprintf(stderr, "%s", msg);
2873 }
2874 }
2875
2876 } /* namespace brw */