i965/fs: print non-1 strides when dumping instructions
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_fs.h"
26 #include "brw_cfg.h"
27 #include "brw_vs.h"
28 #include "brw_nir.h"
29 #include "brw_vec4_live_variables.h"
30 #include "brw_dead_control_flow.h"
31
32 extern "C" {
33 #include "main/macros.h"
34 #include "main/shaderobj.h"
35 #include "program/prog_print.h"
36 #include "program/prog_parameter.h"
37 }
38 #include "main/context.h"
39
40 #define MAX_INSTRUCTION (1 << 30)
41
42 using namespace brw;
43
44 namespace brw {
45
46 void
47 src_reg::init()
48 {
49 memset(this, 0, sizeof(*this));
50
51 this->file = BAD_FILE;
52 }
53
54 src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type)
55 {
56 init();
57
58 this->file = file;
59 this->nr = nr;
60 if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
61 this->swizzle = brw_swizzle_for_size(type->vector_elements);
62 else
63 this->swizzle = BRW_SWIZZLE_XYZW;
64 if (type)
65 this->type = brw_type_for_base_type(type);
66 }
67
68 /** Generic unset register constructor. */
69 src_reg::src_reg()
70 {
71 init();
72 }
73
74 src_reg::src_reg(struct brw_reg reg) :
75 backend_reg(reg)
76 {
77 this->reg_offset = 0;
78 this->reladdr = NULL;
79 }
80
81 src_reg::src_reg(const dst_reg &reg) :
82 backend_reg(static_cast<struct brw_reg>(reg))
83 {
84 this->reg_offset = reg.reg_offset;
85 this->reladdr = reg.reladdr;
86 this->swizzle = brw_swizzle_for_mask(reg.writemask);
87 }
88
89 void
90 dst_reg::init()
91 {
92 memset(this, 0, sizeof(*this));
93 this->file = BAD_FILE;
94 this->writemask = WRITEMASK_XYZW;
95 }
96
97 dst_reg::dst_reg()
98 {
99 init();
100 }
101
102 dst_reg::dst_reg(enum brw_reg_file file, int nr)
103 {
104 init();
105
106 this->file = file;
107 this->nr = nr;
108 }
109
110 dst_reg::dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
111 unsigned writemask)
112 {
113 init();
114
115 this->file = file;
116 this->nr = nr;
117 this->type = brw_type_for_base_type(type);
118 this->writemask = writemask;
119 }
120
121 dst_reg::dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
122 unsigned writemask)
123 {
124 init();
125
126 this->file = file;
127 this->nr = nr;
128 this->type = type;
129 this->writemask = writemask;
130 }
131
132 dst_reg::dst_reg(struct brw_reg reg) :
133 backend_reg(reg)
134 {
135 this->reg_offset = 0;
136 this->reladdr = NULL;
137 }
138
139 dst_reg::dst_reg(const src_reg &reg) :
140 backend_reg(static_cast<struct brw_reg>(reg))
141 {
142 this->reg_offset = reg.reg_offset;
143 this->writemask = brw_mask_for_swizzle(reg.swizzle);
144 this->reladdr = reg.reladdr;
145 }
146
147 bool
148 dst_reg::equals(const dst_reg &r) const
149 {
150 return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 &&
151 reg_offset == r.reg_offset &&
152 (reladdr == r.reladdr ||
153 (reladdr && r.reladdr && reladdr->equals(*r.reladdr))));
154 }
155
156 bool
157 vec4_instruction::is_send_from_grf()
158 {
159 switch (opcode) {
160 case SHADER_OPCODE_SHADER_TIME_ADD:
161 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
162 case SHADER_OPCODE_UNTYPED_ATOMIC:
163 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
164 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
165 case SHADER_OPCODE_TYPED_ATOMIC:
166 case SHADER_OPCODE_TYPED_SURFACE_READ:
167 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
168 return true;
169 default:
170 return false;
171 }
172 }
173
174 unsigned
175 vec4_instruction::regs_read(unsigned arg) const
176 {
177 if (src[arg].file == BAD_FILE)
178 return 0;
179
180 switch (opcode) {
181 case SHADER_OPCODE_SHADER_TIME_ADD:
182 case SHADER_OPCODE_UNTYPED_ATOMIC:
183 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
184 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
185 case SHADER_OPCODE_TYPED_ATOMIC:
186 case SHADER_OPCODE_TYPED_SURFACE_READ:
187 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
188 return arg == 0 ? mlen : 1;
189
190 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
191 return arg == 1 ? mlen : 1;
192
193 default:
194 return 1;
195 }
196 }
197
198 bool
199 vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo)
200 {
201 if (devinfo->gen == 6 && is_math())
202 return false;
203
204 if (is_send_from_grf())
205 return false;
206
207 if (!backend_instruction::can_do_source_mods())
208 return false;
209
210 return true;
211 }
212
213 bool
214 vec4_instruction::can_change_types() const
215 {
216 return dst.type == src[0].type &&
217 !src[0].abs && !src[0].negate && !saturate &&
218 (opcode == BRW_OPCODE_MOV ||
219 (opcode == BRW_OPCODE_SEL &&
220 dst.type == src[1].type &&
221 predicate != BRW_PREDICATE_NONE &&
222 !src[1].abs && !src[1].negate));
223 }
224
225 /**
226 * Returns how many MRFs an opcode will write over.
227 *
228 * Note that this is not the 0 or 1 implied writes in an actual gen
229 * instruction -- the generate_* functions generate additional MOVs
230 * for setup.
231 */
232 int
233 vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
234 {
235 if (inst->mlen == 0 || inst->is_send_from_grf())
236 return 0;
237
238 switch (inst->opcode) {
239 case SHADER_OPCODE_RCP:
240 case SHADER_OPCODE_RSQ:
241 case SHADER_OPCODE_SQRT:
242 case SHADER_OPCODE_EXP2:
243 case SHADER_OPCODE_LOG2:
244 case SHADER_OPCODE_SIN:
245 case SHADER_OPCODE_COS:
246 return 1;
247 case SHADER_OPCODE_INT_QUOTIENT:
248 case SHADER_OPCODE_INT_REMAINDER:
249 case SHADER_OPCODE_POW:
250 return 2;
251 case VS_OPCODE_URB_WRITE:
252 return 1;
253 case VS_OPCODE_PULL_CONSTANT_LOAD:
254 return 2;
255 case SHADER_OPCODE_GEN4_SCRATCH_READ:
256 return 2;
257 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
258 return 3;
259 case GS_OPCODE_URB_WRITE:
260 case GS_OPCODE_URB_WRITE_ALLOCATE:
261 case GS_OPCODE_THREAD_END:
262 return 0;
263 case GS_OPCODE_FF_SYNC:
264 return 1;
265 case SHADER_OPCODE_SHADER_TIME_ADD:
266 return 0;
267 case SHADER_OPCODE_TEX:
268 case SHADER_OPCODE_TXL:
269 case SHADER_OPCODE_TXD:
270 case SHADER_OPCODE_TXF:
271 case SHADER_OPCODE_TXF_CMS:
272 case SHADER_OPCODE_TXF_CMS_W:
273 case SHADER_OPCODE_TXF_MCS:
274 case SHADER_OPCODE_TXS:
275 case SHADER_OPCODE_TG4:
276 case SHADER_OPCODE_TG4_OFFSET:
277 case SHADER_OPCODE_SAMPLEINFO:
278 case VS_OPCODE_GET_BUFFER_SIZE:
279 return inst->header_size;
280 default:
281 unreachable("not reached");
282 }
283 }
284
285 bool
286 src_reg::equals(const src_reg &r) const
287 {
288 return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 &&
289 reg_offset == r.reg_offset &&
290 !reladdr && !r.reladdr);
291 }
292
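/* Combines sequences of scalar float-immediate MOVs into the same register
 * into a single MOV of a packed vector-float (VF) immediate. An illustrative
 * sketch of the transformation:
 *
 *    mov vgrf3.x:F, 1.0F
 *    mov vgrf3.yz:F, 0.0F
 *    mov vgrf3.w:F, 2.0F
 * ->
 *    mov vgrf3.xyzw:F, [1F, 0F, 0F, 2F]
 *
 * Only values representable in the restricted 8-bit VF encoding (as reported
 * by brw_float_to_vf()) can be packed this way.
 */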
293 bool
294 vec4_visitor::opt_vector_float()
295 {
296 bool progress = false;
297
298 int last_reg = -1, last_reg_offset = -1;
299 enum brw_reg_file last_reg_file = BAD_FILE;
300
301 int remaining_channels = 0;
302 uint8_t imm[4];
303 int inst_count = 0;
304 vec4_instruction *imm_inst[4];
305
306 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
307 if (last_reg != inst->dst.nr ||
308 last_reg_offset != inst->dst.reg_offset ||
309 last_reg_file != inst->dst.file) {
310 last_reg = inst->dst.nr;
311 last_reg_offset = inst->dst.reg_offset;
312 last_reg_file = inst->dst.file;
313 remaining_channels = WRITEMASK_XYZW;
314
315 inst_count = 0;
316 }
317
318 if (inst->opcode != BRW_OPCODE_MOV ||
319 inst->dst.writemask == WRITEMASK_XYZW ||
320 inst->src[0].file != IMM)
321 continue;
322
323 int vf = brw_float_to_vf(inst->src[0].f);
324 if (vf == -1)
325 continue;
326
327 if ((inst->dst.writemask & WRITEMASK_X) != 0)
328 imm[0] = vf;
329 if ((inst->dst.writemask & WRITEMASK_Y) != 0)
330 imm[1] = vf;
331 if ((inst->dst.writemask & WRITEMASK_Z) != 0)
332 imm[2] = vf;
333 if ((inst->dst.writemask & WRITEMASK_W) != 0)
334 imm[3] = vf;
335
336 imm_inst[inst_count++] = inst;
337
338 remaining_channels &= ~inst->dst.writemask;
339 if (remaining_channels == 0) {
340 unsigned vf;
341 memcpy(&vf, imm, sizeof(vf));
342 vec4_instruction *mov = MOV(inst->dst, brw_imm_vf(vf));
343 mov->dst.type = BRW_REGISTER_TYPE_F;
344 mov->dst.writemask = WRITEMASK_XYZW;
345 inst->insert_after(block, mov);
346 last_reg = -1;
347
348 for (int i = 0; i < inst_count; i++) {
349 imm_inst[i]->remove(block);
350 }
351 progress = true;
352 }
353 }
354
355 if (progress)
356 invalidate_live_intervals();
357
358 return progress;
359 }
360
361 /* Replaces unused channels of a swizzle with channels that are used.
362 *
363 * For instance, this pass transforms
364 *
365 * mov vgrf4.yz, vgrf5.wxzy
366 *
367 * into
368 *
369 * mov vgrf4.yz, vgrf5.xxzx
370 *
371 * This eliminates false uses of some channels, letting dead code elimination
372 * remove the instructions that wrote them.
373 */
374 bool
375 vec4_visitor::opt_reduce_swizzle()
376 {
377 bool progress = false;
378
379 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
380 if (inst->dst.file == BAD_FILE ||
381 inst->dst.file == ARF ||
382 inst->dst.file == FIXED_GRF ||
383 inst->is_send_from_grf())
384 continue;
385
386 unsigned swizzle;
387
388 /* Determine which channels of the sources are read. */
389 switch (inst->opcode) {
390 case VEC4_OPCODE_PACK_BYTES:
391 case BRW_OPCODE_DP4:
392 case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
393 * but all four of src1.
394 */
395 swizzle = brw_swizzle_for_size(4);
396 break;
397 case BRW_OPCODE_DP3:
398 swizzle = brw_swizzle_for_size(3);
399 break;
400 case BRW_OPCODE_DP2:
401 swizzle = brw_swizzle_for_size(2);
402 break;
403 default:
404 swizzle = brw_swizzle_for_mask(inst->dst.writemask);
405 break;
406 }
407
408 /* Update sources' swizzles. */
409 for (int i = 0; i < 3; i++) {
410 if (inst->src[i].file != VGRF &&
411 inst->src[i].file != ATTR &&
412 inst->src[i].file != UNIFORM)
413 continue;
414
415 const unsigned new_swizzle =
416 brw_compose_swizzle(swizzle, inst->src[i].swizzle);
417 if (inst->src[i].swizzle != new_swizzle) {
418 inst->src[i].swizzle = new_swizzle;
419 progress = true;
420 }
421 }
422 }
423
424 if (progress)
425 invalidate_live_intervals();
426
427 return progress;
428 }
429
430 void
431 vec4_visitor::split_uniform_registers()
432 {
433 /* Prior to this, uniforms have been in an array sized according to
434 * the number of vector uniforms present, sparsely filled (so an
435 * aggregate results in reg indices being skipped over). Now we're
436 * going to cut those aggregates up so each .nr index is one
437 * vector. The goal is to make elimination of unused uniform
438 * components easier later.
439 */
440 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
441 for (int i = 0 ; i < 3; i++) {
442 if (inst->src[i].file != UNIFORM)
443 continue;
444
445 assert(!inst->src[i].reladdr);
446
447 inst->src[i].nr += inst->src[i].reg_offset;
448 inst->src[i].reg_offset = 0;
449 }
450 }
451
452 /* Update that everything is now vector-sized. */
453 for (int i = 0; i < this->uniforms; i++) {
454 this->uniform_size[i] = 1;
455 }
456 }
457
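/* Packs the live uniform vec4s tightly by dropping unused channels and empty
 * slots, then rewrites every UNIFORM source to its new location. Sketch of
 * the idea: if u0 only has .xy read and u1 only has .x read, u1's data can be
 * moved into u0.z, freeing an entire push-constant slot.
 */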
458 void
459 vec4_visitor::pack_uniform_registers()
460 {
461 uint8_t chans_used[this->uniforms];
462 int new_loc[this->uniforms];
463 int new_chan[this->uniforms];
464
465 memset(chans_used, 0, sizeof(chans_used));
466 memset(new_loc, 0, sizeof(new_loc));
467 memset(new_chan, 0, sizeof(new_chan));
468
469 /* Find which uniform vectors are actually used by the program. We
470 * expect unused vector elements when we've moved array access out
471 * to pull constants, and from some GLSL code generators like wine.
472 */
473 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
474 unsigned readmask;
475 switch (inst->opcode) {
476 case VEC4_OPCODE_PACK_BYTES:
477 case BRW_OPCODE_DP4:
478 case BRW_OPCODE_DPH:
479 readmask = 0xf;
480 break;
481 case BRW_OPCODE_DP3:
482 readmask = 0x7;
483 break;
484 case BRW_OPCODE_DP2:
485 readmask = 0x3;
486 break;
487 default:
488 readmask = inst->dst.writemask;
489 break;
490 }
491
492 for (int i = 0 ; i < 3; i++) {
493 if (inst->src[i].file != UNIFORM)
494 continue;
495
496 int reg = inst->src[i].nr;
497 for (int c = 0; c < 4; c++) {
498 if (!(readmask & (1 << c)))
499 continue;
500
501 chans_used[reg] = MAX2(chans_used[reg],
502 BRW_GET_SWZ(inst->src[i].swizzle, c) + 1);
503 }
504 }
505 }
506
507 int new_uniform_count = 0;
508
509 /* Now, figure out a packing of the live uniform vectors into our
510 * push constants.
511 */
512 for (int src = 0; src < uniforms; src++) {
513 assert(src < uniform_array_size);
514 int size = chans_used[src];
515
516 if (size == 0)
517 continue;
518
519 int dst;
520 /* Find the lowest place we can slot this uniform in. */
521 for (dst = 0; dst < src; dst++) {
522 if (chans_used[dst] + size <= 4)
523 break;
524 }
525
526 if (src == dst) {
527 new_loc[src] = dst;
528 new_chan[src] = 0;
529 } else {
530 new_loc[src] = dst;
531 new_chan[src] = chans_used[dst];
532
533 /* Move the references to the data */
534 for (int j = 0; j < size; j++) {
535 stage_prog_data->param[dst * 4 + new_chan[src] + j] =
536 stage_prog_data->param[src * 4 + j];
537 }
538
539 chans_used[dst] += size;
540 chans_used[src] = 0;
541 }
542
543 new_uniform_count = MAX2(new_uniform_count, dst + 1);
544 }
545
546 this->uniforms = new_uniform_count;
547
548 /* Now, update the instructions for our repacked uniforms. */
549 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
550 for (int i = 0 ; i < 3; i++) {
551 int src = inst->src[i].nr;
552
553 if (inst->src[i].file != UNIFORM)
554 continue;
555
556 inst->src[i].nr = new_loc[src];
557 inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
558 new_chan[src], new_chan[src]);
559 }
560 }
561 }
562
563 /**
564 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
565 *
566 * While GLSL IR also performs this optimization, we end up with it in
567 * our instruction stream for a couple of reasons. One is that we
568 * sometimes generate silly instructions, for example in array access
569 * where we'll generate "ADD offset, index, base" even if base is 0.
570 * The other is that GLSL IR's constant propagation doesn't track the
571 * components of aggregates, so some VS patterns (initialize matrix to
572 * 0, accumulate in vertex blending factors) end up breaking down to
573 * instructions involving 0.
574 */
575 bool
576 vec4_visitor::opt_algebraic()
577 {
578 bool progress = false;
579
580 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
581 switch (inst->opcode) {
582 case BRW_OPCODE_MOV:
583 if (inst->src[0].file != IMM)
584 break;
585
586 if (inst->saturate) {
587 if (inst->dst.type != inst->src[0].type)
588 assert(!"unimplemented: saturate mixed types");
589
590 if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) {
591 inst->saturate = false;
592 progress = true;
593 }
594 }
595 break;
596
597 case VEC4_OPCODE_UNPACK_UNIFORM:
598 if (inst->src[0].file != UNIFORM) {
599 inst->opcode = BRW_OPCODE_MOV;
600 progress = true;
601 }
602 break;
603
604 case BRW_OPCODE_ADD:
605 if (inst->src[1].is_zero()) {
606 inst->opcode = BRW_OPCODE_MOV;
607 inst->src[1] = src_reg();
608 progress = true;
609 }
610 break;
611
612 case BRW_OPCODE_MUL:
613 if (inst->src[1].is_zero()) {
614 inst->opcode = BRW_OPCODE_MOV;
615 switch (inst->src[0].type) {
616 case BRW_REGISTER_TYPE_F:
617 inst->src[0] = brw_imm_f(0.0f);
618 break;
619 case BRW_REGISTER_TYPE_D:
620 inst->src[0] = brw_imm_d(0);
621 break;
622 case BRW_REGISTER_TYPE_UD:
623 inst->src[0] = brw_imm_ud(0u);
624 break;
625 default:
626 unreachable("not reached");
627 }
628 inst->src[1] = src_reg();
629 progress = true;
630 } else if (inst->src[1].is_one()) {
631 inst->opcode = BRW_OPCODE_MOV;
632 inst->src[1] = src_reg();
633 progress = true;
634 } else if (inst->src[1].is_negative_one()) {
635 inst->opcode = BRW_OPCODE_MOV;
636 inst->src[0].negate = !inst->src[0].negate;
637 inst->src[1] = src_reg();
638 progress = true;
639 }
640 break;
641 case BRW_OPCODE_CMP:
642 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
643 inst->src[0].abs &&
644 inst->src[0].negate &&
645 inst->src[1].is_zero()) {
646 inst->src[0].abs = false;
647 inst->src[0].negate = false;
648 inst->conditional_mod = BRW_CONDITIONAL_Z;
649 progress = true;
650 break;
651 }
652 break;
653 case SHADER_OPCODE_RCP: {
654 vec4_instruction *prev = (vec4_instruction *)inst->prev;
655 if (prev->opcode == SHADER_OPCODE_SQRT) {
656 if (inst->src[0].equals(src_reg(prev->dst))) {
657 inst->opcode = SHADER_OPCODE_RSQ;
658 inst->src[0] = prev->src[0];
659 progress = true;
660 }
661 }
662 break;
663 }
664 case SHADER_OPCODE_BROADCAST:
665 if (is_uniform(inst->src[0]) ||
666 inst->src[1].is_zero()) {
667 inst->opcode = BRW_OPCODE_MOV;
668 inst->src[1] = src_reg();
669 inst->force_writemask_all = true;
670 progress = true;
671 }
672 break;
673
674 default:
675 break;
676 }
677 }
678
679 if (progress)
680 invalidate_live_intervals();
681
682 return progress;
683 }
684
685 /**
686 * Only a limited number of hardware registers may be used for push
687 * constants, so this turns access to the overflowed constants into
688 * pull constants.
689 */
690 void
691 vec4_visitor::move_push_constants_to_pull_constants()
692 {
693 int pull_constant_loc[this->uniforms];
694
695 /* Only allow 32 registers (256 uniform components) as push constants,
696 * which is the limit on gen6.
697 *
698 * If changing this value, note the limitation about total_regs in
699 * brw_curbe.c.
700 */
701 int max_uniform_components = 32 * 8;
702 if (this->uniforms * 4 <= max_uniform_components)
703 return;
704
705 /* Make some sort of choice as to which uniforms get sent to pull
706 * constants. We could potentially do something clever here like
707 * look for the most infrequently used uniform vec4s, but leave
708 * that for later.
709 */
710 for (int i = 0; i < this->uniforms * 4; i += 4) {
711 pull_constant_loc[i / 4] = -1;
712
713 if (i >= max_uniform_components) {
714 const gl_constant_value **values = &stage_prog_data->param[i];
715
716 /* Try to find an existing copy of this uniform in the pull
717 * constants if it was part of an array access already.
718 */
719 for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
720 int matches;
721
722 for (matches = 0; matches < 4; matches++) {
723 if (stage_prog_data->pull_param[j + matches] != values[matches])
724 break;
725 }
726
727 if (matches == 4) {
728 pull_constant_loc[i / 4] = j / 4;
729 break;
730 }
731 }
732
733 if (pull_constant_loc[i / 4] == -1) {
734 assert(stage_prog_data->nr_pull_params % 4 == 0);
735 pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;
736
737 for (int j = 0; j < 4; j++) {
738 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
739 values[j];
740 }
741 }
742 }
743 }
744
745 /* Now actually rewrite usage of the things we've moved to pull
746 * constants.
747 */
748 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
749 for (int i = 0 ; i < 3; i++) {
750 if (inst->src[i].file != UNIFORM ||
751 pull_constant_loc[inst->src[i].nr] == -1)
752 continue;
753
754 int uniform = inst->src[i].nr;
755
756 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
757
758 emit_pull_constant_load(block, inst, temp, inst->src[i],
759 pull_constant_loc[uniform]);
760
761 inst->src[i].file = temp.file;
762 inst->src[i].nr = temp.nr;
763 inst->src[i].reg_offset = temp.reg_offset;
764 inst->src[i].reladdr = NULL;
765 }
766 }
767
768 /* Repack push constants to remove the now-unused ones. */
769 pack_uniform_registers();
770 }
771
772 /* Conditions for which we want to avoid setting the dependency control bits */
773 bool
774 vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
775 {
776 #define IS_DWORD(reg) \
777 (reg.type == BRW_REGISTER_TYPE_UD || \
778 reg.type == BRW_REGISTER_TYPE_D)
779
780 /* "When source or destination datatype is 64b or operation is integer DWord
781 * multiply, DepCtrl must not be used."
782 * May apply to future SoCs as well.
783 */
784 if (devinfo->is_cherryview) {
785 if (inst->opcode == BRW_OPCODE_MUL &&
786 IS_DWORD(inst->src[0]) &&
787 IS_DWORD(inst->src[1]))
788 return true;
789 }
790 #undef IS_DWORD
791
792 if (devinfo->gen >= 8) {
793 if (inst->opcode == BRW_OPCODE_F32TO16)
794 return true;
795 }
796
797 /*
798 * mlen:
799 * In the presence of send messages, totally interrupt dependency
800 * control. They're long enough that the chance of dependency
801 * control around them just doesn't matter.
802 *
803 * predicate:
804 * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
805 * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
806 * completes the scoreboard clear must have a non-zero execution mask. This
807 * means, if any kind of predication can change the execution mask or channel
808 * enable of the last instruction, the optimization must be avoided. This is
809 * to avoid instructions being shot down the pipeline when no writes are
810 * required.
811 *
812 * math:
813 * Dependency control does not work well over math instructions.
814 * NB: Discovered empirically
815 */
816 return (inst->mlen || inst->predicate || inst->is_math());
817 }
818
819 /**
820 * Sets the dependency control fields on instructions after register
821 * allocation and before the generator is run.
822 *
823 * When you have a sequence of instructions like:
824 *
825 * DP4 temp.x vertex uniform[0]
826 * DP4 temp.y vertex uniform[0]
827 * DP4 temp.z vertex uniform[0]
828 * DP4 temp.w vertex uniform[0]
829 *
830 * The hardware doesn't know that it can actually run the later instructions
831 * while the previous ones are in flight, producing stalls. However, we have
832 * manual fields we can set in the instructions that let it do so.
833 */
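/* Sketch of the mechanism: when two instructions write disjoint channels of
 * the same register, the earlier write gets NoDDClr (no_dd_clear) and the
 * later one NoDDChk (no_dd_check), telling the hardware not to stall on the
 * scoreboard dependency between them.
 */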
834 void
835 vec4_visitor::opt_set_dependency_control()
836 {
837 vec4_instruction *last_grf_write[BRW_MAX_GRF];
838 uint8_t grf_channels_written[BRW_MAX_GRF];
839 vec4_instruction *last_mrf_write[BRW_MAX_GRF];
840 uint8_t mrf_channels_written[BRW_MAX_GRF];
841
842 assert(prog_data->total_grf ||
843 !"Must be called after register allocation");
844
845 foreach_block (block, cfg) {
846 memset(last_grf_write, 0, sizeof(last_grf_write));
847 memset(last_mrf_write, 0, sizeof(last_mrf_write));
848
849 foreach_inst_in_block (vec4_instruction, inst, block) {
850 /* If we read from a register that we were doing dependency control
851 * on, don't do dependency control across the read.
852 */
853 for (int i = 0; i < 3; i++) {
854 int reg = inst->src[i].nr + inst->src[i].reg_offset;
855 if (inst->src[i].file == VGRF) {
856 last_grf_write[reg] = NULL;
857 } else if (inst->src[i].file == FIXED_GRF) {
858 memset(last_grf_write, 0, sizeof(last_grf_write));
859 break;
860 }
861 assert(inst->src[i].file != MRF);
862 }
863
864 if (is_dep_ctrl_unsafe(inst)) {
865 memset(last_grf_write, 0, sizeof(last_grf_write));
866 memset(last_mrf_write, 0, sizeof(last_mrf_write));
867 continue;
868 }
869
870 /* Now, see if we can do dependency control for this instruction
871 * against a previous one writing to its destination.
872 */
873 int reg = inst->dst.nr + inst->dst.reg_offset;
874 if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
875 if (last_grf_write[reg] &&
876 !(inst->dst.writemask & grf_channels_written[reg])) {
877 last_grf_write[reg]->no_dd_clear = true;
878 inst->no_dd_check = true;
879 } else {
880 grf_channels_written[reg] = 0;
881 }
882
883 last_grf_write[reg] = inst;
884 grf_channels_written[reg] |= inst->dst.writemask;
885 } else if (inst->dst.file == MRF) {
886 if (last_mrf_write[reg] &&
887 !(inst->dst.writemask & mrf_channels_written[reg])) {
888 last_mrf_write[reg]->no_dd_clear = true;
889 inst->no_dd_check = true;
890 } else {
891 mrf_channels_written[reg] = 0;
892 }
893
894 last_mrf_write[reg] = inst;
895 mrf_channels_written[reg] |= inst->dst.writemask;
896 }
897 }
898 }
899 }
900
901 bool
902 vec4_instruction::can_reswizzle(const struct brw_device_info *devinfo,
903 int dst_writemask,
904 int swizzle,
905 int swizzle_mask)
906 {
907 /* Gen6 MATH instructions cannot execute in align16 mode, so swizzles
908 * or writemasking are not allowed.
909 */
910 if (devinfo->gen == 6 && is_math() &&
911 (swizzle != BRW_SWIZZLE_XYZW || dst_writemask != WRITEMASK_XYZW))
912 return false;
913
914 /* If this instruction sets anything not referenced by swizzle, then we'd
915 * totally break it when we reswizzle.
916 */
917 if (dst.writemask & ~swizzle_mask)
918 return false;
919
920 if (mlen > 0)
921 return false;
922
923 for (int i = 0; i < 3; i++) {
924 if (src[i].is_accumulator())
925 return false;
926 }
927
928 return true;
929 }
930
931 /**
932 * For any channels in the swizzle's source that were populated by this
933 * instruction, rewrite the instruction to put the appropriate result directly
934 * in those channels.
935 *
936 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy c.yy_x
937 */
938 void
939 vec4_instruction::reswizzle(int dst_writemask, int swizzle)
940 {
941 /* Destination write mask doesn't correspond to source swizzle for the dot
942 * product and pack_bytes instructions.
943 */
944 if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
945 opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
946 opcode != VEC4_OPCODE_PACK_BYTES) {
947 for (int i = 0; i < 3; i++) {
948 if (src[i].file == BAD_FILE || src[i].file == IMM)
949 continue;
950
951 src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
952 }
953 }
954
955 /* Apply the specified swizzle and writemask to the original mask of
956 * written components.
957 */
958 dst.writemask = dst_writemask &
959 brw_apply_swizzle_to_mask(swizzle, dst.writemask);
960 }
961
962 /*
963 * Tries to eliminate extra MOV instructions: when a temporary GRF is
964 * written and then immediately MOVed into another register, rewrite the
965 * original instruction to write directly to the final destination instead.
966 */
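/* A typical case (sketch): the pass turns
 *
 *    mul vgrf5.xyzw:F, vgrf3.xyzw:F, vgrf4.xyzw:F
 *    mov m3.xyzw:F, vgrf5.xyzw:F
 *
 * into a single "mul m3.xyzw:F, vgrf3.xyzw:F, vgrf4.xyzw:F", provided vgrf5
 * is not read again afterwards.
 */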
967 bool
968 vec4_visitor::opt_register_coalesce()
969 {
970 bool progress = false;
971 int next_ip = 0;
972
973 calculate_live_intervals();
974
975 foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
976 int ip = next_ip;
977 next_ip++;
978
979 if (inst->opcode != BRW_OPCODE_MOV ||
980 (inst->dst.file != VGRF && inst->dst.file != MRF) ||
981 inst->predicate ||
982 inst->src[0].file != VGRF ||
983 inst->dst.type != inst->src[0].type ||
984 inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
985 continue;
986
987 /* Remove no-op MOVs */
988 if (inst->dst.file == inst->src[0].file &&
989 inst->dst.nr == inst->src[0].nr &&
990 inst->dst.reg_offset == inst->src[0].reg_offset) {
991 bool is_nop_mov = true;
992
993 for (unsigned c = 0; c < 4; c++) {
994 if ((inst->dst.writemask & (1 << c)) == 0)
995 continue;
996
997 if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) {
998 is_nop_mov = false;
999 break;
1000 }
1001 }
1002
1003 if (is_nop_mov) {
1004 inst->remove(block);
1005 continue;
1006 }
1007 }
1008
1009 bool to_mrf = (inst->dst.file == MRF);
1010
1011 /* Can't coalesce this GRF if someone else was going to
1012 * read it later.
1013 */
1014 if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
1015 continue;
1016
1017 /* We need to check interference with the final destination between this
1018 * instruction and the earliest instruction involved in writing the GRF
1019 * we're eliminating. To do that, keep track of which of our source
1020 * channels we've seen initialized.
1021 */
1022 const unsigned chans_needed =
1023 brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
1024 inst->dst.writemask);
1025 unsigned chans_remaining = chans_needed;
1026
1027 /* Now walk up the instruction stream trying to see if we can rewrite
1028 * everything writing to the temporary to write into the destination
1029 * instead.
1030 */
1031 vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
1032 foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
1033 inst) {
1034 _scan_inst = scan_inst;
1035
1036 if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {
1037 /* Found something writing to the reg we want to coalesce away. */
1038 if (to_mrf) {
1039 /* SEND instructions can't have MRF as a destination. */
1040 if (scan_inst->mlen)
1041 break;
1042
1043 if (devinfo->gen == 6) {
1044 /* gen6 math instructions must have the destination be
1045 * VGRF, so no compute-to-MRF for them.
1046 */
1047 if (scan_inst->is_math()) {
1048 break;
1049 }
1050 }
1051 }
1052
1053 /* This doesn't handle saturation on the instruction we
1054 * want to coalesce away if the register types do not match.
1055 * But if scan_inst is a non type-converting 'mov', we can fix
1056 * the types later.
1057 */
1058 if (inst->saturate &&
1059 inst->dst.type != scan_inst->dst.type &&
1060 !(scan_inst->opcode == BRW_OPCODE_MOV &&
1061 scan_inst->dst.type == scan_inst->src[0].type))
1062 break;
1063
1064 /* If we can't handle the swizzle, bail. */
1065 if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
1066 inst->src[0].swizzle,
1067 chans_needed)) {
1068 break;
1069 }
1070
1071 /* This doesn't handle coalescing of multiple registers. */
1072 if (scan_inst->regs_written > 1)
1073 break;
1074
1075 /* Mark which channels we found unconditional writes for. */
1076 if (!scan_inst->predicate)
1077 chans_remaining &= ~scan_inst->dst.writemask;
1078
1079 if (chans_remaining == 0)
1080 break;
1081 }
1082
1083 /* You can't read from an MRF, so if someone else reads our MRF's
1084 * source GRF that we wanted to rewrite, that stops us. If it's a
1085 * GRF we're trying to coalesce to, we don't actually handle
1086 * rewriting sources so bail in that case as well.
1087 */
1088 bool interfered = false;
1089 for (int i = 0; i < 3; i++) {
1090 if (inst->src[0].in_range(scan_inst->src[i],
1091 scan_inst->regs_read(i)))
1092 interfered = true;
1093 }
1094 if (interfered)
1095 break;
1096
1097 /* If somebody else writes the same channels of our destination here,
1098 * we can't coalesce before that.
1099 */
1100 if (inst->dst.in_range(scan_inst->dst, scan_inst->regs_written) &&
1101 (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
1102 break;
1103 }
1104
1105 /* Check for reads of the register we're trying to coalesce into. We
1106 * can't go rewriting instructions above that to put some other value
1107 * in the register instead.
1108 */
1109 if (to_mrf && scan_inst->mlen > 0) {
1110 if (inst->dst.nr >= scan_inst->base_mrf &&
1111 inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) {
1112 break;
1113 }
1114 } else {
1115 for (int i = 0; i < 3; i++) {
1116 if (inst->dst.in_range(scan_inst->src[i],
1117 scan_inst->regs_read(i)))
1118 interfered = true;
1119 }
1120 if (interfered)
1121 break;
1122 }
1123 }
1124
1125 if (chans_remaining == 0) {
1126 /* If we've made it here, we have an MOV we want to coalesce out, and
1127 * a scan_inst pointing to the earliest instruction involved in
1128 * computing the value. Now go rewrite the instruction stream
1129 * between the two.
1130 */
1131 vec4_instruction *scan_inst = _scan_inst;
1132 while (scan_inst != inst) {
1133 if (scan_inst->dst.file == VGRF &&
1134 scan_inst->dst.nr == inst->src[0].nr &&
1135 scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1136 scan_inst->reswizzle(inst->dst.writemask,
1137 inst->src[0].swizzle);
1138 scan_inst->dst.file = inst->dst.file;
1139 scan_inst->dst.nr = inst->dst.nr;
1140 scan_inst->dst.reg_offset = inst->dst.reg_offset;
1141 if (inst->saturate &&
1142 inst->dst.type != scan_inst->dst.type) {
1143 /* If we have reached this point, scan_inst is a non
1144 * type-converting 'mov' and we can modify its register types
1145 * to match the ones in inst. Otherwise, we could have an
1146 * incorrect saturation result.
1147 */
1148 scan_inst->dst.type = inst->dst.type;
1149 scan_inst->src[0].type = inst->src[0].type;
1150 }
1151 scan_inst->saturate |= inst->saturate;
1152 }
1153 scan_inst = (vec4_instruction *)scan_inst->next;
1154 }
1155 inst->remove(block);
1156 progress = true;
1157 }
1158 }
1159
1160 if (progress)
1161 invalidate_live_intervals();
1162
1163 return progress;
1164 }
1165
1166 /**
1167 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
1168 * flow. We could probably do better here with some form of divergence
1169 * analysis.
1170 */
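/* Outside of control flow every channel is enabled, so channel 0 is always
 * live and the instruction can simply be rewritten to the "mov dst, 0d" with
 * force_writemask_all seen below.
 */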
1171 bool
1172 vec4_visitor::eliminate_find_live_channel()
1173 {
1174 bool progress = false;
1175 unsigned depth = 0;
1176
1177 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1178 switch (inst->opcode) {
1179 case BRW_OPCODE_IF:
1180 case BRW_OPCODE_DO:
1181 depth++;
1182 break;
1183
1184 case BRW_OPCODE_ENDIF:
1185 case BRW_OPCODE_WHILE:
1186 depth--;
1187 break;
1188
1189 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
1190 if (depth == 0) {
1191 inst->opcode = BRW_OPCODE_MOV;
1192 inst->src[0] = brw_imm_d(0);
1193 inst->force_writemask_all = true;
1194 progress = true;
1195 }
1196 break;
1197
1198 default:
1199 break;
1200 }
1201 }
1202
1203 return progress;
1204 }
1205
1206 /**
1207 * Splits virtual GRFs requesting more than one contiguous physical register.
1208 *
1209 * We initially create large virtual GRFs for temporary structures, arrays,
1210 * and matrices, so that the dereference visitor functions can add reg_offsets
1211 * to work their way down to the actual member being accessed. But when it
1212 * comes to optimization, we'd like to treat each register as individual
1213 * storage if possible.
1214 *
1215 * So far, the only thing that might prevent splitting is a send message from
1216 * a GRF on IVB.
1217 */
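/* For example (sketch): a mat3 temporary held in one size-3 VGRF is split so
 * that the pieces at reg_offset 1 and 2 each get their own newly allocated
 * size-1 VGRF, while reg_offset 0 stays in the original (now size-1) register.
 */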
1218 void
1219 vec4_visitor::split_virtual_grfs()
1220 {
1221 int num_vars = this->alloc.count;
1222 int new_virtual_grf[num_vars];
1223 bool split_grf[num_vars];
1224
1225 memset(new_virtual_grf, 0, sizeof(new_virtual_grf));
1226
1227 /* Try to split anything larger than one register. */
1228 for (int i = 0; i < num_vars; i++) {
1229 split_grf[i] = this->alloc.sizes[i] != 1;
1230 }
1231
1232 /* Check that the instructions are compatible with the registers we're trying
1233 * to split.
1234 */
1235 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1236 if (inst->dst.file == VGRF && inst->regs_written > 1)
1237 split_grf[inst->dst.nr] = false;
1238
1239 for (int i = 0; i < 3; i++) {
1240 if (inst->src[i].file == VGRF && inst->regs_read(i) > 1)
1241 split_grf[inst->src[i].nr] = false;
1242 }
1243 }
1244
1245 /* Allocate new space for split regs. Note that the virtual
1246 * numbers will be contiguous.
1247 */
1248 for (int i = 0; i < num_vars; i++) {
1249 if (!split_grf[i])
1250 continue;
1251
1252 new_virtual_grf[i] = alloc.allocate(1);
1253 for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
1254 unsigned reg = alloc.allocate(1);
1255 assert(reg == new_virtual_grf[i] + j - 1);
1256 (void) reg;
1257 }
1258 this->alloc.sizes[i] = 1;
1259 }
1260
1261 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1262 if (inst->dst.file == VGRF && split_grf[inst->dst.nr] &&
1263 inst->dst.reg_offset != 0) {
1264 inst->dst.nr = (new_virtual_grf[inst->dst.nr] +
1265 inst->dst.reg_offset - 1);
1266 inst->dst.reg_offset = 0;
1267 }
1268 for (int i = 0; i < 3; i++) {
1269 if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] &&
1270 inst->src[i].reg_offset != 0) {
1271 inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] +
1272 inst->src[i].reg_offset - 1);
1273 inst->src[i].reg_offset = 0;
1274 }
1275 }
1276 }
1277 invalidate_live_intervals();
1278 }
1279
1280 void
1281 vec4_visitor::dump_instruction(backend_instruction *be_inst)
1282 {
1283 dump_instruction(be_inst, stderr);
1284 }
1285
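/* Prints a single IR instruction in a compact textual form, e.g. (sketch):
 *
 *    (+f0.0) add.sat vgrf7.0.xy:F, vgrf2.xyzw:F, u3.wwww:F
 */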
1286 void
1287 vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
1288 {
1289 vec4_instruction *inst = (vec4_instruction *)be_inst;
1290
1291 if (inst->predicate) {
1292 fprintf(file, "(%cf0.%d%s) ",
1293 inst->predicate_inverse ? '-' : '+',
1294 inst->flag_subreg,
1295 pred_ctrl_align16[inst->predicate]);
1296 }
1297
1298 fprintf(file, "%s", brw_instruction_name(inst->opcode));
1299 if (inst->saturate)
1300 fprintf(file, ".sat");
1301 if (inst->conditional_mod) {
1302 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
1303 if (!inst->predicate &&
1304 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
1305 inst->opcode != BRW_OPCODE_IF &&
1306 inst->opcode != BRW_OPCODE_WHILE))) {
1307 fprintf(file, ".f0.%d", inst->flag_subreg);
1308 }
1309 }
1310 fprintf(file, " ");
1311
1312 switch (inst->dst.file) {
1313 case VGRF:
1314 fprintf(file, "vgrf%d.%d", inst->dst.nr, inst->dst.reg_offset);
1315 break;
1316 case FIXED_GRF:
1317 fprintf(file, "g%d", inst->dst.nr);
1318 break;
1319 case MRF:
1320 fprintf(file, "m%d", inst->dst.nr);
1321 break;
1322 case ARF:
1323 switch (inst->dst.nr) {
1324 case BRW_ARF_NULL:
1325 fprintf(file, "null");
1326 break;
1327 case BRW_ARF_ADDRESS:
1328 fprintf(file, "a0.%d", inst->dst.subnr);
1329 break;
1330 case BRW_ARF_ACCUMULATOR:
1331 fprintf(file, "acc%d", inst->dst.subnr);
1332 break;
1333 case BRW_ARF_FLAG:
1334 fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1335 break;
1336 default:
1337 fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1338 break;
1339 }
1340 if (inst->dst.subnr)
1341 fprintf(file, "+%d", inst->dst.subnr);
1342 break;
1343 case BAD_FILE:
1344 fprintf(file, "(null)");
1345 break;
1346 case IMM:
1347 case ATTR:
1348 case UNIFORM:
1349 unreachable("not reached");
1350 }
1351 if (inst->dst.writemask != WRITEMASK_XYZW) {
1352 fprintf(file, ".");
1353 if (inst->dst.writemask & 1)
1354 fprintf(file, "x");
1355 if (inst->dst.writemask & 2)
1356 fprintf(file, "y");
1357 if (inst->dst.writemask & 4)
1358 fprintf(file, "z");
1359 if (inst->dst.writemask & 8)
1360 fprintf(file, "w");
1361 }
1362 fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));
1363
1364 if (inst->src[0].file != BAD_FILE)
1365 fprintf(file, ", ");
1366
1367 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
1368 if (inst->src[i].negate)
1369 fprintf(file, "-");
1370 if (inst->src[i].abs)
1371 fprintf(file, "|");
1372 switch (inst->src[i].file) {
1373 case VGRF:
1374 fprintf(file, "vgrf%d", inst->src[i].nr);
1375 break;
1376 case FIXED_GRF:
1377 fprintf(file, "g%d", inst->src[i].nr);
1378 break;
1379 case ATTR:
1380 fprintf(file, "attr%d", inst->src[i].nr);
1381 break;
1382 case UNIFORM:
1383 fprintf(file, "u%d", inst->src[i].nr);
1384 break;
1385 case IMM:
1386 switch (inst->src[i].type) {
1387 case BRW_REGISTER_TYPE_F:
1388 fprintf(file, "%fF", inst->src[i].f);
1389 break;
1390 case BRW_REGISTER_TYPE_D:
1391 fprintf(file, "%dD", inst->src[i].d);
1392 break;
1393 case BRW_REGISTER_TYPE_UD:
1394 fprintf(file, "%uU", inst->src[i].ud);
1395 break;
1396 case BRW_REGISTER_TYPE_VF:
1397 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
1398 brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
1399 brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
1400 brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
1401 brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
1402 break;
1403 default:
1404 fprintf(file, "???");
1405 break;
1406 }
1407 break;
1408 case ARF:
1409 switch (inst->src[i].nr) {
1410 case BRW_ARF_NULL:
1411 fprintf(file, "null");
1412 break;
1413 case BRW_ARF_ADDRESS:
1414 fprintf(file, "a0.%d", inst->src[i].subnr);
1415 break;
1416 case BRW_ARF_ACCUMULATOR:
1417 fprintf(file, "acc%d", inst->src[i].subnr);
1418 break;
1419 case BRW_ARF_FLAG:
1420 fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1421 break;
1422 default:
1423 fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1424 break;
1425 }
1426 if (inst->src[i].subnr)
1427 fprintf(file, "+%d", inst->src[i].subnr);
1428 break;
1429 case BAD_FILE:
1430 fprintf(file, "(null)");
1431 break;
1432 case MRF:
1433 unreachable("not reached");
1434 }
1435
1436 /* Don't print a .0 reg_offset; only VGRFs have reg_offsets and sizes */
1437 if (inst->src[i].reg_offset != 0 &&
1438 inst->src[i].file == VGRF &&
1439 alloc.sizes[inst->src[i].nr] != 1)
1440 fprintf(file, ".%d", inst->src[i].reg_offset);
1441
1442 if (inst->src[i].file != IMM) {
1443 static const char *chans[4] = {"x", "y", "z", "w"};
1444 fprintf(file, ".");
1445 for (int c = 0; c < 4; c++) {
1446 fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
1447 }
1448 }
1449
1450 if (inst->src[i].abs)
1451 fprintf(file, "|");
1452
1453 if (inst->src[i].file != IMM) {
1454 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
1455 }
1456
1457 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
1458 fprintf(file, ", ");
1459 }
1460
1461 if (inst->force_writemask_all)
1462 fprintf(file, " NoMask");
1463
1464 fprintf(file, "\n");
1465 }
1466
1467
1468 static inline struct brw_reg
1469 attribute_to_hw_reg(int attr, bool interleaved)
1470 {
1471 if (interleaved)
1472 return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
1473 else
1474 return brw_vec8_grf(attr, 0);
1475 }
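/* In the interleaved case, an attr value of N selects half of GRF N/2: e.g.
 * (sketch) attr 4 covers the first four floats of g2 and attr 5 the last
 * four floats of g2.
 */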
1476
1477
1478 /**
1479 * Replace each register of type ATTR in this->instructions with a reference
1480 * to a fixed HW register.
1481 *
1482 * If interleaved is true, then each attribute takes up half a register, with
1483 * register N containing attribute 2*N in its first half and attribute 2*N+1
1484 * in its second half (this corresponds to the payload setup used by geometry
1485 * shaders in "single" or "dual instanced" dispatch mode). If interleaved is
1486 * false, then each attribute takes up a whole register, with register N
1487 * containing attribute N (this corresponds to the payload setup used by
1488 * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
1489 */
1490 void
1491 vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
1492 bool interleaved)
1493 {
1494 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1495 /* We have to support ATTR as a destination for GL_FIXED fixup. */
1496 if (inst->dst.file == ATTR) {
1497 int grf = attribute_map[inst->dst.nr + inst->dst.reg_offset];
1498
1499 /* All attributes used in the shader need to have been assigned a
1500 * hardware register by the caller
1501 */
1502 assert(grf != 0);
1503
1504 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
1505 reg.type = inst->dst.type;
1506 reg.writemask = inst->dst.writemask;
1507
1508 inst->dst = reg;
1509 }
1510
1511 for (int i = 0; i < 3; i++) {
1512 if (inst->src[i].file != ATTR)
1513 continue;
1514
1515 int grf = attribute_map[inst->src[i].nr + inst->src[i].reg_offset];
1516
1517 /* All attributes used in the shader need to have been assigned a
1518 * hardware register by the caller
1519 */
1520 assert(grf != 0);
1521
1522 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
1523 reg.swizzle = inst->src[i].swizzle;
1524 reg.type = inst->src[i].type;
1525 if (inst->src[i].abs)
1526 reg = brw_abs(reg);
1527 if (inst->src[i].negate)
1528 reg = negate(reg);
1529
1530 inst->src[i] = reg;
1531 }
1532 }
1533 }
1534
1535 int
1536 vec4_vs_visitor::setup_attributes(int payload_reg)
1537 {
1538 int nr_attributes;
1539 int attribute_map[VERT_ATTRIB_MAX + 1];
1540 memset(attribute_map, 0, sizeof(attribute_map));
1541
1542 nr_attributes = 0;
1543 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
1544 if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
1545 attribute_map[i] = payload_reg + nr_attributes;
1546 nr_attributes++;
1547 }
1548 }
1549
1550 /* VertexID is stored by the VF as the last vertex element, but we
1551 * don't represent it with a flag in inputs_read, so we call it
1552 * VERT_ATTRIB_MAX.
1553 */
1554 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
1555 attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
1556 }
1557
1558 lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
1559
1560 return payload_reg + vs_prog_data->nr_attributes;
1561 }
1562
1563 int
1564 vec4_visitor::setup_uniforms(int reg)
1565 {
1566 prog_data->base.dispatch_grf_start_reg = reg;
1567
1568 /* The pre-gen6 VS requires that some push constants get loaded no
1569 * matter what, or the GPU would hang.
1570 */
1571 if (devinfo->gen < 6 && this->uniforms == 0) {
1572 assert(this->uniforms < this->uniform_array_size);
1573
1574 stage_prog_data->param =
1575 reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
1576 for (unsigned int i = 0; i < 4; i++) {
1577 unsigned int slot = this->uniforms * 4 + i;
1578 static gl_constant_value zero = { 0.0 };
1579 stage_prog_data->param[slot] = &zero;
1580 }
1581
1582 this->uniforms++;
1583 reg++;
1584 } else {
1585 reg += ALIGN(uniforms, 2) / 2;
1586 }
1587
1588 stage_prog_data->nr_params = this->uniforms * 4;
1589
1590 prog_data->base.curb_read_length =
1591 reg - prog_data->base.dispatch_grf_start_reg;
1592
1593 return reg;
1594 }
1595
1596 void
1597 vec4_vs_visitor::setup_payload(void)
1598 {
1599 int reg = 0;
1600
1601 /* The payload always contains important data in g0, which contains
1602 * the URB handles that are passed on to the URB write at the end
1603 * of the thread. So, we always start push constants at g1.
1604 */
1605 reg++;
1606
1607 reg = setup_uniforms(reg);
1608
1609 reg = setup_attributes(reg);
1610
1611 this->first_non_payload_grf = reg;
1612 }
1613
1614 src_reg
1615 vec4_visitor::get_timestamp()
1616 {
1617 assert(devinfo->gen >= 7);
1618
1619 src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1620 BRW_ARF_TIMESTAMP,
1621 0,
1622 0,
1623 0,
1624 BRW_REGISTER_TYPE_UD,
1625 BRW_VERTICAL_STRIDE_0,
1626 BRW_WIDTH_4,
1627 BRW_HORIZONTAL_STRIDE_4,
1628 BRW_SWIZZLE_XYZW,
1629 WRITEMASK_XYZW));
1630
1631 dst_reg dst = dst_reg(this, glsl_type::uvec4_type);
1632
1633 vec4_instruction *mov = emit(MOV(dst, ts));
1634 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
1635 * even if it's not enabled in the dispatch.
1636 */
1637 mov->force_writemask_all = true;
1638
1639 return src_reg(dst);
1640 }
1641
1642 void
1643 vec4_visitor::emit_shader_time_begin()
1644 {
1645 current_annotation = "shader time start";
1646 shader_start_time = get_timestamp();
1647 }
1648
1649 void
1650 vec4_visitor::emit_shader_time_end()
1651 {
1652 current_annotation = "shader time end";
1653 src_reg shader_end_time = get_timestamp();
1654
1655
1656 /* Check that there weren't any timestamp reset events (assuming these
1657 * were the only two timestamp reads that happened).
1658 */
1659 src_reg reset_end = shader_end_time;
1660 reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
1661 vec4_instruction *test = emit(AND(dst_null_ud(), reset_end, brw_imm_ud(1u)));
1662 test->conditional_mod = BRW_CONDITIONAL_Z;
1663
1664 emit(IF(BRW_PREDICATE_NORMAL));
1665
1666 /* Take the current timestamp and get the delta. */
1667 shader_start_time.negate = true;
1668 dst_reg diff = dst_reg(this, glsl_type::uint_type);
1669 emit(ADD(diff, shader_start_time, shader_end_time));
1670
1671 /* If there were no instructions between the two timestamp gets, the diff
1672 * is 2 cycles. Remove that overhead, so I can forget about that when
1673 * trying to determine the time taken for single instructions.
1674 */
1675 emit(ADD(diff, src_reg(diff), brw_imm_ud(-2u)));
1676
1677 emit_shader_time_write(0, src_reg(diff));
1678 emit_shader_time_write(1, brw_imm_ud(1u));
1679 emit(BRW_OPCODE_ELSE);
1680 emit_shader_time_write(2, brw_imm_ud(1u));
1681 emit(BRW_OPCODE_ENDIF);
1682 }
1683
1684 void
1685 vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
1686 {
1687 dst_reg dst =
1688 dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
1689
1690 dst_reg offset = dst;
1691 dst_reg time = dst;
1692 time.reg_offset++;
1693
1694 offset.type = BRW_REGISTER_TYPE_UD;
1695 int index = shader_time_index * 3 + shader_time_subindex;
1696 emit(MOV(offset, brw_imm_d(index * SHADER_TIME_STRIDE)));
1697
1698 time.type = BRW_REGISTER_TYPE_UD;
1699 emit(MOV(time, value));
1700
1701 vec4_instruction *inst =
1702 emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
1703 inst->mlen = 2;
1704 }
1705
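/* Lowers the remaining virtual files to fixed brw_reg hardware registers once
 * register allocation is done: VGRFs become GRF regions at their allocated
 * locations, UNIFORM sources become <0,4,1> regions of the push-constant GRFs
 * (two vec4s per register) starting at dispatch_grf_start_reg, MRF
 * destinations become message registers, and ARF/FIXED_GRF/IMM operands are
 * already in hardware form.
 */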
1706 void
1707 vec4_visitor::convert_to_hw_regs()
1708 {
1709 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1710 for (int i = 0; i < 3; i++) {
1711 struct src_reg &src = inst->src[i];
1712 struct brw_reg reg;
1713 switch (src.file) {
1714 case VGRF:
1715 reg = brw_vec8_grf(src.nr + src.reg_offset, 0);
1716 reg.type = src.type;
1717 reg.swizzle = src.swizzle;
1718 reg.abs = src.abs;
1719 reg.negate = src.negate;
1720 break;
1721
1722 case UNIFORM:
1723 reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg +
1724 (src.nr + src.reg_offset) / 2,
1725 ((src.nr + src.reg_offset) % 2) * 4),
1726 0, 4, 1);
1727 reg.type = src.type;
1728 reg.swizzle = src.swizzle;
1729 reg.abs = src.abs;
1730 reg.negate = src.negate;
1731
1732 /* This should have been moved to pull constants. */
1733 assert(!src.reladdr);
1734 break;
1735
1736 case ARF:
1737 case FIXED_GRF:
1738 case IMM:
1739 continue;
1740
1741 case BAD_FILE:
1742 /* Probably unused. */
1743 reg = brw_null_reg();
1744 break;
1745
1746 case MRF:
1747 case ATTR:
1748 unreachable("not reached");
1749 }
1750 src = reg;
1751 }
1752
1753 dst_reg &dst = inst->dst;
1754 struct brw_reg reg;
1755
1756 switch (inst->dst.file) {
1757 case VGRF:
1758 reg = brw_vec8_grf(dst.nr + dst.reg_offset, 0);
1759 reg.type = dst.type;
1760 reg.writemask = dst.writemask;
1761 break;
1762
1763 case MRF:
1764 assert(((dst.nr + dst.reg_offset) & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
1765 reg = brw_message_reg(dst.nr + dst.reg_offset);
1766 reg.type = dst.type;
1767 reg.writemask = dst.writemask;
1768 break;
1769
1770 case ARF:
1771 case FIXED_GRF:
1772 reg = dst;
1773 break;
1774
1775 case BAD_FILE:
1776 reg = brw_null_reg();
1777 break;
1778
1779 case IMM:
1780 case ATTR:
1781 case UNIFORM:
1782 unreachable("not reached");
1783 }
1784
1785 dst = reg;
1786 }
1787 }
1788
1789 bool
1790 vec4_visitor::run()
1791 {
1792 if (shader_time_index >= 0)
1793 emit_shader_time_begin();
1794
1795 emit_prolog();
1796
1797 emit_nir_code();
1798 if (failed)
1799 return false;
1800 base_ir = NULL;
1801
1802 emit_thread_end();
1803
1804 calculate_cfg();
1805
1806 /* Before any optimization, push array accesses out to scratch
1807 * space where we need them to be. This pass may allocate new
1808 * virtual GRFs, so we want to do it early. It also makes sure
1809 * that we have reladdr computations available for CSE, since we'll
1810 * often do repeated subexpressions for those.
1811 */
1812 move_grf_array_access_to_scratch();
1813 move_uniform_array_access_to_pull_constants();
1814
1815 pack_uniform_registers();
1816 move_push_constants_to_pull_constants();
1817 split_virtual_grfs();
1818
1819 #define OPT(pass, args...) ({ \
1820 pass_num++; \
1821 bool this_progress = pass(args); \
1822 \
1823 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
1824 char filename[64]; \
1825 snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass, \
1826 stage_abbrev, nir->info.name, iteration, pass_num); \
1827 \
1828 backend_shader::dump_instructions(filename); \
1829 } \
1830 \
1831 progress = progress || this_progress; \
1832 this_progress; \
1833 })
1834
1835
1836 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
1837 char filename[64];
1838 snprintf(filename, 64, "%s-%s-00-start",
1839 stage_abbrev, nir->info.name);
1840
1841 backend_shader::dump_instructions(filename);
1842 }
1843
1844 bool progress;
1845 int iteration = 0;
1846 int pass_num = 0;
1847 do {
1848 progress = false;
1849 pass_num = 0;
1850 iteration++;
1851
1852 OPT(opt_predicated_break, this);
1853 OPT(opt_reduce_swizzle);
1854 OPT(dead_code_eliminate);
1855 OPT(dead_control_flow_eliminate, this);
1856 OPT(opt_copy_propagation);
1857 OPT(opt_cmod_propagation);
1858 OPT(opt_cse);
1859 OPT(opt_algebraic);
1860 OPT(opt_register_coalesce);
1861 OPT(eliminate_find_live_channel);
1862 } while (progress);
1863
1864 pass_num = 0;
1865
1866 if (OPT(opt_vector_float)) {
1867 OPT(opt_cse);
1868 OPT(opt_copy_propagation, false);
1869 OPT(opt_copy_propagation, true);
1870 OPT(dead_code_eliminate);
1871 }
1872
1873 if (failed)
1874 return false;
1875
1876 setup_payload();
1877
1878 if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
1879 /* Debug of register spilling: Go spill everything. */
1880 const int grf_count = alloc.count;
1881 float spill_costs[alloc.count];
1882 bool no_spill[alloc.count];
1883 evaluate_spill_costs(spill_costs, no_spill);
1884 for (int i = 0; i < grf_count; i++) {
1885 if (no_spill[i])
1886 continue;
1887 spill_reg(i);
1888 }
1889 }
1890
1891 bool allocated_without_spills = reg_allocate();
1892
1893 if (!allocated_without_spills) {
1894 compiler->shader_perf_log(log_data,
1895 "%s shader triggered register spilling. "
1896 "Try reducing the number of live vec4 values "
1897 "to improve performance.\n",
1898 stage_name);
1899
1900 while (!reg_allocate()) {
1901 if (failed)
1902 return false;
1903 }
1904 }
1905
1906 opt_schedule_instructions();
1907
1908 opt_set_dependency_control();
1909
1910 convert_to_hw_regs();
1911
1912 if (last_scratch > 0) {
1913 prog_data->base.total_scratch =
1914 brw_get_scratch_size(last_scratch * REG_SIZE);
1915 }
1916
1917 return !failed;
1918 }
1919
1920 } /* namespace brw */
1921
1922 extern "C" {
1923
1924 /**
1925 * Compile a vertex shader.
1926 *
1927 * Returns the final assembly and the program's size.
1928 */
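/* When the scalar backend is enabled for vertex shaders, the SIMD8
 * (fs_visitor) path below is tried first; the vec4 DUAL_OBJECT path only runs
 * when no SIMD8 assembly was produced.
 */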
1929 const unsigned *
1930 brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
1931 void *mem_ctx,
1932 const struct brw_vs_prog_key *key,
1933 struct brw_vs_prog_data *prog_data,
1934 const nir_shader *shader,
1935 gl_clip_plane *clip_planes,
1936 bool use_legacy_snorm_formula,
1937 int shader_time_index,
1938 unsigned *final_assembly_size,
1939 char **error_str)
1940 {
1941 const unsigned *assembly = NULL;
1942
1943 unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);
1944
1945 /* gl_VertexID and gl_InstanceID are system values, but arrive via an
1946 * incoming vertex attribute. So, add an extra slot.
1947 */
1948 if (shader->info.system_values_read &
1949 (BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
1950 BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
1951 nr_attributes++;
1952 }
1953
1954 /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
1955 * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in
1956 * vec4 mode, the hardware appears to wedge unless we read something.
1957 */
1958 if (compiler->scalar_stage[MESA_SHADER_VERTEX])
1959 prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2);
1960 else
1961 prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2);
1962
1963 prog_data->nr_attributes = nr_attributes;
1964
1965 /* Since vertex shaders reuse the same VUE entry for inputs and outputs
1966 * (overwriting the original contents), we need to make sure the size is
1967 * the larger of the two.
1968 */
1969 const unsigned vue_entries =
1970 MAX2(nr_attributes, (unsigned)prog_data->base.vue_map.num_slots);
1971
1972 if (compiler->devinfo->gen == 6)
1973 prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
1974 else
1975 prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
1976
1977 if (compiler->scalar_stage[MESA_SHADER_VERTEX]) {
1978 prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
1979
1980 fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,
1981 NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */
1982 shader, 8, shader_time_index);
1983 if (!v.run_vs(clip_planes)) {
1984 if (error_str)
1985 *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
1986
1987 return NULL;
1988 }
1989
1990 fs_generator g(compiler, log_data, mem_ctx, (void *) key,
1991 &prog_data->base.base, v.promoted_constants,
1992 v.runtime_check_aads_emit, "VS");
1993 if (INTEL_DEBUG & DEBUG_VS) {
1994 const char *debug_name =
1995 ralloc_asprintf(mem_ctx, "%s vertex shader %s",
1996 shader->info.label ? shader->info.label : "unnamed",
1997 shader->info.name);
1998
1999 g.enable_debug(debug_name);
2000 }
2001 g.generate_code(v.cfg, 8);
2002 assembly = g.get_assembly(final_assembly_size);
2003 }
2004
2005 if (!assembly) {
2006 prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
2007
2008 vec4_vs_visitor v(compiler, log_data, key, prog_data,
2009 shader, clip_planes, mem_ctx,
2010 shader_time_index, use_legacy_snorm_formula);
2011 if (!v.run()) {
2012 if (error_str)
2013 *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
2014
2015 return NULL;
2016 }
2017
2018 assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
2019 shader, &prog_data->base, v.cfg,
2020 final_assembly_size);
2021 }
2022
2023 return assembly;
2024 }
2025
2026 } /* extern "C" */