/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_vs.h"
#include "brw_nir.h"
#include "brw_vec4_live_variables.h"
#include "brw_dead_control_flow.h"

extern "C" {
#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
}
#include "main/context.h"

#define MAX_INSTRUCTION (1 << 30)

using namespace brw;

namespace brw {

void
src_reg::init()
{
   memset(this, 0, sizeof(*this));

   this->file = BAD_FILE;
}

src_reg::src_reg(register_file file, int reg, const glsl_type *type)
{
   init();

   this->file = file;
   this->reg = reg;
   if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   else
      this->swizzle = BRW_SWIZZLE_XYZW;
   if (type)
      this->type = brw_type_for_base_type(type);
}

/** Generic unset register constructor. */
src_reg::src_reg()
{
   init();
}

src_reg::src_reg(float f)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->f = f;
}

src_reg::src_reg(uint32_t u)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->ud = u;
}

src_reg::src_reg(int32_t i)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->d = i;
}

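/* A note on the two VF constructors below: the VF immediate type packs four
 * 8-bit "restricted floats" into one dword.  Each byte holds a sign bit, a
 * 3-bit exponent (bias 3) and a 4-bit mantissa, so, for example, 0x00
 * encodes 0.0f and 0x30 encodes 1.0f.  (A sketch of the encoding from the
 * hardware docs; see brw_vf_to_float() and brw_float_to_vf() for the
 * authoritative conversions.)
 */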
src_reg::src_reg(uint8_t vf[4])
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->ud, vf, sizeof(unsigned));
}

src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   this->ud = (vf0 << 0) |
              (vf1 << 8) |
              (vf2 << 16) |
              (vf3 << 24);
}

src_reg::src_reg(struct brw_reg reg) :
   backend_reg(reg)
{
   this->file = HW_REG;
   this->reg = 0;
   this->reg_offset = 0;
   this->reladdr = NULL;
}

src_reg::src_reg(const dst_reg &reg) :
   backend_reg(static_cast<struct brw_reg>(reg))
{
   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->reladdr = reg.reladdr;
   this->swizzle = brw_swizzle_for_mask(reg.writemask);
}

void
dst_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->file = BAD_FILE;
   this->writemask = WRITEMASK_XYZW;
}

dst_reg::dst_reg()
{
   init();
}

dst_reg::dst_reg(register_file file, int reg)
{
   init();

   this->file = file;
   this->reg = reg;
}

dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = brw_type_for_base_type(type);
   this->writemask = writemask;
}

dst_reg::dst_reg(register_file file, int reg, brw_reg_type type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = type;
   this->writemask = writemask;
}

dst_reg::dst_reg(struct brw_reg reg) :
   backend_reg(reg)
{
   this->file = HW_REG;
   this->reg = 0;
   this->reg_offset = 0;
   this->reladdr = NULL;
}

dst_reg::dst_reg(const src_reg &reg) :
   backend_reg(static_cast<struct brw_reg>(reg))
{
   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->writemask = brw_mask_for_swizzle(reg.swizzle);
   this->reladdr = reg.reladdr;
}

bool
dst_reg::equals(const dst_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           writemask == r.writemask &&
           (reladdr == r.reladdr ||
            (reladdr && r.reladdr && reladdr->equals(*r.reladdr))) &&
           (file != HW_REG ||
            memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0));
}

bool
vec4_instruction::is_send_from_grf()
{
   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
      return true;
   default:
      return false;
   }
}

unsigned
vec4_instruction::regs_read(unsigned arg) const
{
   if (src[arg].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
      return arg == 0 ? mlen : 1;

   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      return arg == 1 ? mlen : 1;

   default:
      return 1;
   }
}

bool
vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo)
{
   if (devinfo->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

bool
vec4_instruction::can_change_types() const
{
   return dst.type == src[0].type &&
          !src[0].abs && !src[0].negate && !saturate &&
          (opcode == BRW_OPCODE_MOV ||
           (opcode == BRW_OPCODE_SEL &&
            dst.type == src[1].type &&
            predicate != BRW_PREDICATE_NONE &&
            !src[1].abs && !src[1].negate));
}

/**
 * Returns how many MRFs an opcode will write over.
 *
 * Note that this does not count the 0 or 1 implicit MRF writes of the
 * actual gen instruction itself -- it counts the additional setup MOVs
 * that the generate_* functions emit.
 */
int
vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
{
   if (inst->mlen == 0 || inst->is_send_from_grf())
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1;
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_POW:
      return 2;
   case VS_OPCODE_URB_WRITE:
      return 1;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 3;
   case GS_OPCODE_URB_WRITE:
   case GS_OPCODE_URB_WRITE_ALLOCATE:
   case GS_OPCODE_THREAD_END:
      return 0;
   case GS_OPCODE_FF_SYNC:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_SAMPLEINFO:
   case VS_OPCODE_GET_BUFFER_SIZE:
      return inst->header_size;
   default:
      unreachable("not reached");
   }
}

bool
src_reg::equals(const src_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           swizzle == r.swizzle &&
           !reladdr && !r.reladdr &&
           (file != HW_REG ||
            memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0) &&
           (file != IMM || d == r.d));
}

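/**
 * Combines runs of scalar immediate MOVs writing different channels of the
 * same register into a single MOV of a packed VF immediate, provided every
 * component is exactly representable as a VF (brw_float_to_vf() returns -1
 * otherwise).
 */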
bool
vec4_visitor::opt_vector_float()
{
   bool progress = false;

   int last_reg = -1, last_reg_offset = -1;
   enum register_file last_reg_file = BAD_FILE;

   int remaining_channels = 0;
   uint8_t imm[4];
   int inst_count = 0;
   vec4_instruction *imm_inst[4];

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      if (last_reg != inst->dst.reg ||
          last_reg_offset != inst->dst.reg_offset ||
          last_reg_file != inst->dst.file) {
         last_reg = inst->dst.reg;
         last_reg_offset = inst->dst.reg_offset;
         last_reg_file = inst->dst.file;
         remaining_channels = WRITEMASK_XYZW;

         inst_count = 0;
      }

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->dst.writemask == WRITEMASK_XYZW ||
          inst->src[0].file != IMM)
         continue;

      int vf = brw_float_to_vf(inst->src[0].f);
      if (vf == -1)
         continue;

      if ((inst->dst.writemask & WRITEMASK_X) != 0)
         imm[0] = vf;
      if ((inst->dst.writemask & WRITEMASK_Y) != 0)
         imm[1] = vf;
      if ((inst->dst.writemask & WRITEMASK_Z) != 0)
         imm[2] = vf;
      if ((inst->dst.writemask & WRITEMASK_W) != 0)
         imm[3] = vf;

      imm_inst[inst_count++] = inst;

      remaining_channels &= ~inst->dst.writemask;
      if (remaining_channels == 0) {
         vec4_instruction *mov = MOV(inst->dst, imm);
         mov->dst.type = BRW_REGISTER_TYPE_F;
         mov->dst.writemask = WRITEMASK_XYZW;
         inst->insert_after(block, mov);
         last_reg = -1;

         for (int i = 0; i < inst_count; i++) {
            imm_inst[i]->remove(block);
         }
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/* Replaces unused channels of a swizzle with channels that are used.
 *
 * For instance, this pass transforms
 *
 *    mov vgrf4.yz, vgrf5.wxzy
 *
 * into
 *
 *    mov vgrf4.yz, vgrf5.xxzx
 *
 * This eliminates false uses of some channels, letting dead code elimination
 * remove the instructions that wrote them.
 */
bool
vec4_visitor::opt_reduce_swizzle()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == BAD_FILE || inst->dst.file == HW_REG ||
          inst->is_send_from_grf())
         continue;

      unsigned swizzle;

      /* Determine which channels of the sources are read. */
      switch (inst->opcode) {
      case VEC4_OPCODE_PACK_BYTES:
      case BRW_OPCODE_DP4:
      case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
                            *           but all four of src1.
                            */
         swizzle = brw_swizzle_for_size(4);
         break;
      case BRW_OPCODE_DP3:
         swizzle = brw_swizzle_for_size(3);
         break;
      case BRW_OPCODE_DP2:
         swizzle = brw_swizzle_for_size(2);
         break;
      default:
         swizzle = brw_swizzle_for_mask(inst->dst.writemask);
         break;
      }

      /* Update sources' swizzles. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != GRF &&
             inst->src[i].file != ATTR &&
             inst->src[i].file != UNIFORM)
            continue;

         const unsigned new_swizzle =
            brw_compose_swizzle(swizzle, inst->src[i].swizzle);
         if (inst->src[i].swizzle != new_swizzle) {
            inst->src[i].swizzle = new_swizzle;
            progress = true;
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

void
vec4_visitor::split_uniform_registers()
{
   /* Prior to this, uniforms have been in an array sized according to
    * the number of vector uniforms present, sparsely filled (so an
    * aggregate results in reg indices being skipped over). Now we're
    * going to cut those aggregates up so each .reg index is one
    * vector. The goal is to make elimination of unused uniform
    * components easier later.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         assert(!inst->src[i].reladdr);

         inst->src[i].reg += inst->src[i].reg_offset;
         inst->src[i].reg_offset = 0;
      }
   }

   /* Update that everything is now vector-sized. */
   for (int i = 0; i < this->uniforms; i++) {
      this->uniform_size[i] = 1;
   }
}

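/**
 * Packs the live channels of the uniform vectors tightly together and
 * rewrites uniform sources to match.  For example, two uniforms that each
 * use only .xy can end up sharing a single vec4 slot, one in .xy and the
 * other in .zw, shrinking the push constant payload.
 */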
void
vec4_visitor::pack_uniform_registers()
{
   uint8_t chans_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(chans_used, 0, sizeof(chans_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program. We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like wine.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      unsigned readmask;
      switch (inst->opcode) {
      case VEC4_OPCODE_PACK_BYTES:
      case BRW_OPCODE_DP4:
      case BRW_OPCODE_DPH:
         readmask = 0xf;
         break;
      case BRW_OPCODE_DP3:
         readmask = 0x7;
         break;
      case BRW_OPCODE_DP2:
         readmask = 0x3;
         break;
      default:
         readmask = inst->dst.writemask;
         break;
      }

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int reg = inst->src[i].reg;
         for (int c = 0; c < 4; c++) {
            if (!(readmask & (1 << c)))
               continue;

            chans_used[reg] = MAX2(chans_used[reg],
                                   BRW_GET_SWZ(inst->src[i].swizzle, c) + 1);
         }
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
   for (int src = 0; src < uniforms; src++) {
      assert(src < uniform_array_size);
      int size = chans_used[src];

      if (size == 0)
         continue;

      int dst;
      /* Find the lowest place we can slot this uniform in. */
      for (dst = 0; dst < src; dst++) {
         if (chans_used[dst] + size <= 4)
            break;
      }

      if (src == dst) {
         new_loc[src] = dst;
         new_chan[src] = 0;
      } else {
         new_loc[src] = dst;
         new_chan[src] = chans_used[dst];

         /* Move the references to the data */
         for (int j = 0; j < size; j++) {
            stage_prog_data->param[dst * 4 + new_chan[src] + j] =
               stage_prog_data->param[src * 4 + j];
         }

         chans_used[dst] += size;
         chans_used[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         int src = inst->src[i].reg;

         if (inst->src[i].file != UNIFORM)
            continue;

         inst->src[i].reg = new_loc[src];
         inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
                                              new_chan[src], new_chan[src]);
      }
   }
}

/**
 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
 *
 * While GLSL IR also performs this optimization, we end up with it in
 * our instruction stream for a couple of reasons. One is that we
 * sometimes generate silly instructions, for example in array access
 * where we'll generate "ADD offset, index, base" even if base is 0.
 * The other is that GLSL IR's constant propagation doesn't track the
 * components of aggregates, so some VS patterns (initialize matrix to
 * 0, accumulate in vertex blending factors) end up breaking down to
 * instructions involving 0.
 */
bool
vec4_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (inst->src[0].file != IMM)
            break;

         if (inst->saturate) {
            if (inst->dst.type != inst->src[0].type)
               assert(!"unimplemented: saturate mixed types");

            if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case VEC4_OPCODE_UNPACK_UNIFORM:
         if (inst->src[0].file != UNIFORM) {
            inst->opcode = BRW_OPCODE_MOV;
            progress = true;
         }
         break;

      case BRW_OPCODE_ADD:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            switch (inst->src[0].type) {
            case BRW_REGISTER_TYPE_F:
               inst->src[0] = src_reg(0.0f);
               break;
            case BRW_REGISTER_TYPE_D:
               inst->src[0] = src_reg(0);
               break;
            case BRW_REGISTER_TYPE_UD:
               inst->src[0] = src_reg(0u);
               break;
            default:
               unreachable("not reached");
            }
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_negative_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;
      case BRW_OPCODE_CMP:
         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
             inst->src[0].abs &&
             inst->src[0].negate &&
             inst->src[1].is_zero()) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            inst->conditional_mod = BRW_CONDITIONAL_Z;
            progress = true;
            break;
         }
         break;
      case SHADER_OPCODE_RCP: {
         vec4_instruction *prev = (vec4_instruction *)inst->prev;
         if (prev->opcode == SHADER_OPCODE_SQRT) {
            if (inst->src[0].equals(src_reg(prev->dst))) {
               inst->opcode = SHADER_OPCODE_RSQ;
               inst->src[0] = prev->src[0];
               progress = true;
            }
         }
         break;
      }
      case SHADER_OPCODE_BROADCAST:
         if (is_uniform(inst->src[0]) ||
             inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Only a limited number of hardware registers may be used for push
 * constants, so this turns access to the overflowed constants into
 * pull constants.
 */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants.  We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      pull_constant_loc[i / 4] = -1;

      if (i >= max_uniform_components) {
         const gl_constant_value **values = &stage_prog_data->param[i];

         /* Try to find an existing copy of this uniform in the pull
          * constants if it was part of an array access already.
          */
         for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
            int matches;

            for (matches = 0; matches < 4; matches++) {
               if (stage_prog_data->pull_param[j + matches] != values[matches])
                  break;
            }

            if (matches == 4) {
               pull_constant_loc[i / 4] = j / 4;
               break;
            }
         }

         if (pull_constant_loc[i / 4] == -1) {
            assert(stage_prog_data->nr_pull_params % 4 == 0);
            pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;

            for (int j = 0; j < 4; j++) {
               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM ||
             pull_constant_loc[inst->src[i].reg] == -1)
            continue;

         int uniform = inst->src[i].reg;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(block, inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}

/* Conditions for which we want to avoid setting the dependency control bits */
bool
vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
{
#define IS_DWORD(reg) \
   (reg.type == BRW_REGISTER_TYPE_UD || \
    reg.type == BRW_REGISTER_TYPE_D)

   /* "When source or destination datatype is 64b or operation is integer DWord
    * multiply, DepCtrl must not be used."
    * May apply to future SoCs as well.
    */
   if (devinfo->is_cherryview) {
      if (inst->opcode == BRW_OPCODE_MUL &&
          IS_DWORD(inst->src[0]) &&
          IS_DWORD(inst->src[1]))
         return true;
   }
#undef IS_DWORD

   if (devinfo->gen >= 8) {
      if (inst->opcode == BRW_OPCODE_F32TO16)
         return true;
   }

   /*
    * mlen:
    * In the presence of send messages, totally interrupt dependency
    * control.  They're long enough that the chance of dependency
    * control around them just doesn't matter.
    *
    * predicate:
    * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
    * When a sequence of NoDDChk and NoDDClr are used, the last instruction
    * that completes the scoreboard clear must have a non-zero execution mask.
    * This means, if any kind of predication can change the execution mask or
    * channel enable of the last instruction, the optimization must be
    * avoided.  This is to avoid instructions being shot down the pipeline
    * when no writes are required.
    *
    * math:
    * Dependency control does not work well over math instructions.
    * NB: Discovered empirically
    */
   return (inst->mlen || inst->predicate || inst->is_math());
}

/**
 * Sets the dependency control fields on instructions after register
 * allocation and before the generator is run.
 *
 * When you have a sequence of instructions like:
 *
 *   DP4 temp.x vertex uniform[0]
 *   DP4 temp.y vertex uniform[0]
 *   DP4 temp.z vertex uniform[0]
 *   DP4 temp.w vertex uniform[0]
 *
 * The hardware doesn't know that it can actually run the later instructions
 * while the previous ones are in flight, producing stalls.  However, we have
 * manual fields we can set in the instructions that let it do so.
 */
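/* Concretely, for a run like the DP4 sequence above we set NoDDClr on every
 * instruction but the last and NoDDChk on every instruction but the first,
 * so the scoreboard neither clears nor checks dependencies for temp inside
 * the run (a sketch of the mechanism; the corresponding fields below are
 * no_dd_clear and no_dd_check).
 */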
void
vec4_visitor::opt_set_dependency_control()
{
   vec4_instruction *last_grf_write[BRW_MAX_GRF];
   uint8_t grf_channels_written[BRW_MAX_GRF];
   vec4_instruction *last_mrf_write[BRW_MAX_GRF];
   uint8_t mrf_channels_written[BRW_MAX_GRF];

   assert(prog_data->total_grf ||
          !"Must be called after register allocation");

   foreach_block (block, cfg) {
      memset(last_grf_write, 0, sizeof(last_grf_write));
      memset(last_mrf_write, 0, sizeof(last_mrf_write));

      foreach_inst_in_block (vec4_instruction, inst, block) {
         /* If we read from a register that we were doing dependency control
          * on, don't do dependency control across the read.
          */
         for (int i = 0; i < 3; i++) {
            int reg = inst->src[i].reg + inst->src[i].reg_offset;
            if (inst->src[i].file == GRF) {
               last_grf_write[reg] = NULL;
            } else if (inst->src[i].file == HW_REG) {
               memset(last_grf_write, 0, sizeof(last_grf_write));
               break;
            }
            assert(inst->src[i].file != MRF);
         }

         if (is_dep_ctrl_unsafe(inst)) {
            memset(last_grf_write, 0, sizeof(last_grf_write));
            memset(last_mrf_write, 0, sizeof(last_mrf_write));
            continue;
         }

         /* Now, see if we can do dependency control for this instruction
          * against a previous one writing to its destination.
          */
         int reg = inst->dst.reg + inst->dst.reg_offset;
         if (inst->dst.file == GRF) {
            if (last_grf_write[reg] &&
                !(inst->dst.writemask & grf_channels_written[reg])) {
               last_grf_write[reg]->no_dd_clear = true;
               inst->no_dd_check = true;
            } else {
               grf_channels_written[reg] = 0;
            }

            last_grf_write[reg] = inst;
            grf_channels_written[reg] |= inst->dst.writemask;
         } else if (inst->dst.file == MRF) {
            if (last_mrf_write[reg] &&
                !(inst->dst.writemask & mrf_channels_written[reg])) {
               last_mrf_write[reg]->no_dd_clear = true;
               inst->no_dd_check = true;
            } else {
               mrf_channels_written[reg] = 0;
            }

            last_mrf_write[reg] = inst;
            mrf_channels_written[reg] |= inst->dst.writemask;
         } else if (inst->dst.file == HW_REG) {
            if (inst->dst.brw_reg::file == BRW_GENERAL_REGISTER_FILE)
               memset(last_grf_write, 0, sizeof(last_grf_write));
            if (inst->dst.brw_reg::file == BRW_MESSAGE_REGISTER_FILE)
               memset(last_mrf_write, 0, sizeof(last_mrf_write));
         }
      }
   }
}

bool
vec4_instruction::can_reswizzle(const struct brw_device_info *devinfo,
                                int dst_writemask,
                                int swizzle,
                                int swizzle_mask)
{
   /* Gen6 MATH instructions can not execute in align16 mode, so swizzles
    * or writemasking are not allowed.
    */
   if (devinfo->gen == 6 && is_math() &&
       (swizzle != BRW_SWIZZLE_XYZW || dst_writemask != WRITEMASK_XYZW))
      return false;

   /* If this instruction sets anything not referenced by swizzle, then we'd
    * totally break it when we reswizzle.
    */
   if (dst.writemask & ~swizzle_mask)
      return false;

   if (mlen > 0)
      return false;

   /* We can't use swizzles on the accumulator and that's really the only
    * HW_REG we would care to reswizzle so just disallow them all.
    */
   for (int i = 0; i < 3; i++) {
      if (src[i].file == HW_REG)
         return false;
   }

   return true;
}

/**
 * For any channels in the swizzle's source that were populated by this
 * instruction, rewrite the instruction to put the appropriate result directly
 * in those channels.
 *
 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy_x c.yy_x
 */
void
vec4_instruction::reswizzle(int dst_writemask, int swizzle)
{
   /* Destination write mask doesn't correspond to source swizzle for the dot
    * product and pack_bytes instructions.
    */
   if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
       opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
       opcode != VEC4_OPCODE_PACK_BYTES) {
      for (int i = 0; i < 3; i++) {
         if (src[i].file == BAD_FILE || src[i].file == IMM)
            continue;

         src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
      }
   }

   /* Apply the specified swizzle and writemask to the original mask of
    * written components.
    */
   dst.writemask = dst_writemask &
                   brw_apply_swizzle_to_mask(swizzle, dst.writemask);
}

/*
 * Tries to eliminate extra MOV instructions: when a temporary GRF is written
 * and then immediately MOVed into another register, rewrite the original
 * instruction so the GRF write lands directly in the final destination
 * instead.
 */
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != GRF && inst->dst.file != MRF) ||
          inst->predicate ||
          inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      /* Remove no-op MOVs */
      if (inst->dst.file == inst->src[0].file &&
          inst->dst.reg == inst->src[0].reg &&
          inst->dst.reg_offset == inst->src[0].reg_offset) {
         bool is_nop_mov = true;

         for (unsigned c = 0; c < 4; c++) {
            if ((inst->dst.writemask & (1 << c)) == 0)
               continue;

            if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) {
               is_nop_mov = false;
               break;
            }
         }

         if (is_nop_mov) {
            inst->remove(block);
            continue;
         }
      }

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
         continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating.  To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      const unsigned chans_needed =
         brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
                                       inst->dst.writemask);
      unsigned chans_remaining = chans_needed;

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       */
      vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
                                                  inst) {
         _scan_inst = scan_inst;

         if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (devinfo->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * GRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* This doesn't handle saturation on the instruction we
             * want to coalesce away if the register types do not match.
             * But if scan_inst is a non type-converting 'mov', we can fix
             * the types later.
             */
            if (inst->saturate &&
                inst->dst.type != scan_inst->dst.type &&
                !(scan_inst->opcode == BRW_OPCODE_MOV &&
                  scan_inst->dst.type == scan_inst->src[0].type))
               break;

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
                                          inst->src[0].swizzle,
                                          chans_needed)) {
               break;
            }

            /* This doesn't handle coalescing of multiple registers. */
            if (scan_inst->regs_written > 1)
               break;

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate)
               chans_remaining &= ~scan_inst->dst.writemask;

            if (chans_remaining == 0)
               break;
         }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us.  If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (inst->src[0].in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
               interfered = true;
         }
         if (interfered)
            break;

         /* If somebody else writes the same channels of our destination here,
          * we can't coalesce before that.
          */
         if (inst->dst.in_range(scan_inst->dst, scan_inst->regs_written) &&
             (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
            break;
         }

         /* Check for reads of the register we're trying to coalesce into.  We
          * can't go rewriting instructions above that to put some other value
          * in the register instead.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            if (inst->dst.reg >= scan_inst->base_mrf &&
                inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (inst->dst.in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
                  interfered = true;
            }
            if (interfered)
               break;
         }
      }

      if (chans_remaining == 0) {
         /* If we've made it here, we have an MOV we want to coalesce out, and
          * a scan_inst pointing to the earliest instruction involved in
          * computing the value.  Now go rewrite the instruction stream
          * between the two.
          */
         vec4_instruction *scan_inst = _scan_inst;
         while (scan_inst != inst) {
            if (scan_inst->dst.file == GRF &&
                scan_inst->dst.reg == inst->src[0].reg &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               scan_inst->reswizzle(inst->dst.writemask,
                                    inst->src[0].swizzle);
               scan_inst->dst.file = inst->dst.file;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->dst.reg_offset = inst->dst.reg_offset;
               if (inst->saturate &&
                   inst->dst.type != scan_inst->dst.type) {
                  /* If we have reached this point, scan_inst is a non
                   * type-converting 'mov' and we can modify its register
                   * types to match the ones in inst. Otherwise, we could
                   * have an incorrect saturation result.
                   */
                  scan_inst->dst.type = inst->dst.type;
                  scan_inst->src[0].type = inst->src[0].type;
               }
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
bool
vec4_visitor::eliminate_find_live_channel()
{
   bool progress = false;
   unsigned depth = 0;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = src_reg(0);
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   return progress;
}

/**
 * Splits virtual GRFs requesting more than one contiguous physical register.
 *
 * We initially create large virtual GRFs for temporary structures, arrays,
 * and matrices, so that the dereference visitor functions can add reg_offsets
 * to work their way down to the actual member being accessed.  But when it
 * comes to optimization, we'd like to treat each register as individual
 * storage if possible.
 *
 * So far, the only thing that might prevent splitting is a send message from
 * a GRF on IVB.
 */
void
vec4_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;
   int new_virtual_grf[num_vars];
   bool split_grf[num_vars];

   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      split_grf[i] = this->alloc.sizes[i] != 1;
   }

   /* Check that the instructions are compatible with the registers we're
    * trying to split.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == GRF && inst->regs_written > 1)
         split_grf[inst->dst.reg] = false;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && inst->regs_read(i) > 1)
            split_grf[inst->src[i].reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (!split_grf[i])
         continue;

      new_virtual_grf[i] = alloc.allocate(1);
      for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
         unsigned reg = alloc.allocate(1);
         assert(reg == new_virtual_grf[i] + j - 1);
         (void) reg;
      }
      this->alloc.sizes[i] = 1;
   }

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == GRF && split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}

void
vec4_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}

void
vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   vec4_instruction *inst = (vec4_instruction *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf0.%d%s) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg,
              pred_ctrl_align16[inst->predicate]);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, " ");

   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case HW_REG:
      if (inst->dst.brw_reg::file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.nr & 0xf,
                    inst->dst.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.nr & 0xf,
                    inst->dst.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.nr);
      }
      if (inst->dst.subnr)
         fprintf(file, "+%d", inst->dst.subnr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case IMM:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
   if (inst->dst.writemask != WRITEMASK_XYZW) {
      fprintf(file, ".");
      if (inst->dst.writemask & 1)
         fprintf(file, "x");
      if (inst->dst.writemask & 2)
         fprintf(file, "y");
      if (inst->dst.writemask & 4)
         fprintf(file, "z");
      if (inst->dst.writemask & 8)
         fprintf(file, "w");
   }
   fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));

   if (inst->src[0].file != BAD_FILE)
      fprintf(file, ", ");

   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].reg);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg);
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%fF", inst->src[i].f);
            break;
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dD", inst->src[i].d);
            break;
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uU", inst->src[i].ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].brw_reg::file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file, "f%d.%d", inst->src[i].nr & 0xf,
                       inst->src[i].subnr);
               break;
            default:
               fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf,
                       inst->src[i].subnr);
               break;
            }
         } else {
            fprintf(file, "hw_reg%d", inst->src[i].nr);
         }
         if (inst->src[i].subnr)
            fprintf(file, "+%d", inst->src[i].subnr);
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case MRF:
         unreachable("not reached");
      }

      /* Don't print .0; and only VGRFs have reg_offsets and sizes */
      if (inst->src[i].reg_offset != 0 &&
          inst->src[i].file == GRF &&
          alloc.sizes[inst->src[i].reg] != 1)
         fprintf(file, ".%d", inst->src[i].reg_offset);

      if (inst->src[i].file != IMM) {
         static const char *chans[4] = {"x", "y", "z", "w"};
         fprintf(file, ".");
         for (int c = 0; c < 4; c++) {
            fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
         }
      }

      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   if (inst->force_writemask_all)
      fprintf(file, " NoMask");

   fprintf(file, "\n");
}


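/* For example, with interleaved == true attribute 5 maps to the second half
 * of g2: attr / 2 == 2 selects the GRF and (attr % 2) * 4 == 4 selects the
 * upper four dwords.
 */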
static inline struct brw_reg
attribute_to_hw_reg(int attr, bool interleaved)
{
   if (interleaved)
      return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
   else
      return brw_vec8_grf(attr, 0);
}


/**
 * Replace each register of type ATTR in this->instructions with a reference
 * to a fixed HW register.
 *
 * If interleaved is true, then each attribute takes up half a register, with
 * register N containing attribute 2*N in its first half and attribute 2*N+1
 * in its second half (this corresponds to the payload setup used by geometry
 * shaders in "single" or "dual instanced" dispatch mode).  If interleaved is
 * false, then each attribute takes up a whole register, with register N
 * containing attribute N (this corresponds to the payload setup used by
 * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
 */
void
vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
                                          bool interleaved)
{
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* We have to support ATTR as a destination for GL_FIXED fixup. */
      if (inst->dst.file == ATTR) {
         int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         reg.type = inst->dst.type;
         reg.writemask = inst->dst.writemask;

         inst->dst = reg;
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         reg.swizzle = inst->src[i].swizzle;
         reg.type = inst->src[i].type;
         if (inst->src[i].abs)
            reg = brw_abs(reg);
         if (inst->src[i].negate)
            reg = negate(reg);

         inst->src[i] = reg;
      }
   }
}

int
vec4_vs_visitor::setup_attributes(int payload_reg)
{
   int nr_attributes;
   int attribute_map[VERT_ATTRIB_MAX + 1];
   memset(attribute_map, 0, sizeof(attribute_map));

   nr_attributes = 0;
   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
         attribute_map[i] = payload_reg + nr_attributes;
         nr_attributes++;
      }
   }

   /* VertexID is stored by the VF as the last vertex element, but we
    * don't represent it with a flag in inputs_read, so we call it
    * VERT_ATTRIB_MAX.
    */
   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
   }

   lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);

   return payload_reg + vs_prog_data->nr_attributes;
}

int
vec4_visitor::setup_uniforms(int reg)
{
   prog_data->base.dispatch_grf_start_reg = reg;

   /* The pre-gen6 VS requires that some push constants get loaded no
    * matter what, or the GPU would hang.
    */
   if (devinfo->gen < 6 && this->uniforms == 0) {
      assert(this->uniforms < this->uniform_array_size);

      stage_prog_data->param =
         reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
      for (unsigned int i = 0; i < 4; i++) {
         unsigned int slot = this->uniforms * 4 + i;
         static gl_constant_value zero = { 0.0 };
         stage_prog_data->param[slot] = &zero;
      }

      this->uniforms++;
      reg++;
   } else {
      reg += ALIGN(uniforms, 2) / 2;
   }

   stage_prog_data->nr_params = this->uniforms * 4;

   prog_data->base.curb_read_length =
      reg - prog_data->base.dispatch_grf_start_reg;

   return reg;
}

void
vec4_vs_visitor::setup_payload(void)
{
   int reg = 0;

   /* The payload always contains important data in g0, which contains
    * the URB handles that are passed on to the URB write at the end
    * of the thread.  So, we always start push constants at g1.
    */
   reg++;

   reg = setup_uniforms(reg);

   reg = setup_attributes(reg);

   this->first_non_payload_grf = reg;
}

src_reg
vec4_visitor::get_timestamp()
{
   assert(devinfo->gen >= 7);

   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                BRW_ARF_TIMESTAMP,
                                0,
                                0,
                                0,
                                BRW_REGISTER_TYPE_UD,
                                BRW_VERTICAL_STRIDE_0,
                                BRW_WIDTH_4,
                                BRW_HORIZONTAL_STRIDE_4,
                                BRW_SWIZZLE_XYZW,
                                WRITEMASK_XYZW));

   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);

   vec4_instruction *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;

   return src_reg(dst);
}

void
vec4_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
vec4_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";
   src_reg shader_end_time = get_timestamp();


   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   src_reg reset_end = shader_end_time;
   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
   vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;

   emit(IF(BRW_PREDICATE_NORMAL));

   /* Take the current timestamp and get the delta. */
   shader_start_time.negate = true;
   dst_reg diff = dst_reg(this, glsl_type::uint_type);
   emit(ADD(diff, shader_start_time, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, src_reg(diff), src_reg(-2u)));

   emit_shader_time_write(0, src_reg(diff));
   emit_shader_time_write(1, src_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(2, src_reg(1u));
   emit(BRW_OPCODE_ENDIF);
}

void
vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
{
   dst_reg dst =
      dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));

   dst_reg offset = dst;
   dst_reg time = dst;
   time.reg_offset++;

   offset.type = BRW_REGISTER_TYPE_UD;
   int index = shader_time_index * 3 + shader_time_subindex;
   emit(MOV(offset, src_reg(index * SHADER_TIME_STRIDE)));

   time.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time, value));

   vec4_instruction *inst =
      emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
   inst->mlen = 2;
}

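/**
 * Lowers the remaining abstract register files (GRF, UNIFORM, IMM, ...) on
 * every operand to the fixed brw_reg encoding that the generator expects.
 */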
void
vec4_visitor::convert_to_hw_regs()
{
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         struct src_reg &src = inst->src[i];
         struct brw_reg reg;
         switch (src.file) {
         case GRF:
            reg = brw_vec8_grf(src.reg + src.reg_offset, 0);
            reg.type = src.type;
            reg.swizzle = src.swizzle;
            reg.abs = src.abs;
            reg.negate = src.negate;
            break;

         case IMM:
            reg = brw_imm_reg(src.type);
            reg.ud = src.ud;
            break;

         case UNIFORM:
            reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg +
                                      (src.reg + src.reg_offset) / 2,
                                      ((src.reg + src.reg_offset) % 2) * 4),
                         0, 4, 1);
            reg.type = src.type;
            reg.swizzle = src.swizzle;
            reg.abs = src.abs;
            reg.negate = src.negate;

            /* This should have been moved to pull constants. */
            assert(!src.reladdr);
            break;

         case HW_REG:
            continue;

         case BAD_FILE:
            /* Probably unused. */
            reg = brw_null_reg();
            break;

         case MRF:
         case ATTR:
            unreachable("not reached");
         }
         src = reg;
      }

      dst_reg &dst = inst->dst;
      struct brw_reg reg;

      switch (inst->dst.file) {
      case GRF:
         reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
         reg.type = dst.type;
         reg.writemask = dst.writemask;
         break;

      case MRF:
         assert(((dst.reg + dst.reg_offset) & ~(1 << 7)) < BRW_MAX_MRF(devinfo->gen));
         reg = brw_message_reg(dst.reg + dst.reg_offset);
         reg.type = dst.type;
         reg.writemask = dst.writemask;
         break;

      case HW_REG:
         reg = dst;
         break;

      case BAD_FILE:
         reg = brw_null_reg();
         break;

      case IMM:
      case ATTR:
      case UNIFORM:
         unreachable("not reached");
      }

      dst = reg;
   }
}

bool
vec4_visitor::run()
{
   if (shader_time_index >= 0)
      emit_shader_time_begin();

   emit_prolog();

   emit_nir_code();
   if (failed)
      return false;
   base_ir = NULL;

   emit_thread_end();

   calculate_cfg();

   /* Before any optimization, push array accesses out to scratch
    * space where we need them to be.  This pass may allocate new
    * virtual GRFs, so we want to do it early.  It also makes sure
    * that we have reladdr computations available for CSE, since we'll
    * often do repeated subexpressions for those.
    */
   move_grf_array_access_to_scratch();
   move_uniform_array_access_to_pull_constants();

   pack_uniform_registers();
   move_push_constants_to_pull_constants();
   split_virtual_grfs();

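   /* Convenience wrapper for running an optimization pass: accumulates
    * progress, and under INTEL_DEBUG=optimizer dumps the instruction list
    * after every pass that reported progress.
    */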
#define OPT(pass, args...) ({                                          \
      pass_num++;                                                      \
      bool this_progress = pass(args);                                 \
                                                                       \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
         char filename[64];                                            \
         snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass,              \
                  stage_abbrev, nir->info.name, iteration, pass_num);  \
                                                                       \
         backend_shader::dump_instructions(filename);                  \
      }                                                                \
                                                                       \
      progress = progress || this_progress;                            \
      this_progress;                                                   \
   })


   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s-%s-00-start",
               stage_abbrev, nir->info.name);

      backend_shader::dump_instructions(filename);
   }

   bool progress;
   int iteration = 0;
   int pass_num = 0;
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(opt_predicated_break, this);
      OPT(opt_reduce_swizzle);
      OPT(dead_code_eliminate);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_copy_propagation);
      OPT(opt_cmod_propagation);
      OPT(opt_cse);
      OPT(opt_algebraic);
      OPT(opt_register_coalesce);
      OPT(eliminate_find_live_channel);
   } while (progress);

   pass_num = 0;

   if (OPT(opt_vector_float)) {
      OPT(opt_cse);
      OPT(opt_copy_propagation, false);
      OPT(opt_copy_propagation, true);
      OPT(dead_code_eliminate);
   }

   if (failed)
      return false;

   setup_payload();

   if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
      /* Debug of register spilling: Go spill everything. */
      const int grf_count = alloc.count;
      float spill_costs[alloc.count];
      bool no_spill[alloc.count];
      evaluate_spill_costs(spill_costs, no_spill);
      for (int i = 0; i < grf_count; i++) {
         if (no_spill[i])
            continue;
         spill_reg(i);
      }
   }

   bool allocated_without_spills = reg_allocate();

   if (!allocated_without_spills) {
      compiler->shader_perf_log(log_data,
                                "%s shader triggered register spilling.  "
                                "Try reducing the number of live vec4 values "
                                "to improve performance.\n",
                                stage_name);

      while (!reg_allocate()) {
         if (failed)
            return false;
      }
   }

   opt_schedule_instructions();

   opt_set_dependency_control();

   convert_to_hw_regs();

   if (last_scratch > 0) {
      prog_data->base.total_scratch =
         brw_get_scratch_size(last_scratch * REG_SIZE);
   }

   return !failed;
}

} /* namespace brw */

extern "C" {

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_vs_prog_key *key,
               struct brw_vs_prog_data *prog_data,
               const nir_shader *shader,
               gl_clip_plane *clip_planes,
               bool use_legacy_snorm_formula,
               int shader_time_index,
               unsigned *final_assembly_size,
               char **error_str)
{
   const unsigned *assembly = NULL;

   unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);

   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
    * incoming vertex attribute.  So, add an extra slot.
    */
   if (shader->info.system_values_read &
       (BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
        BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
      nr_attributes++;
   }

   /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
    * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
    * vec4 mode, the hardware appears to wedge unless we read something.
    */
   if (compiler->scalar_vs)
      prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2);
   else
      prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2);

   prog_data->nr_attributes = nr_attributes;

   /* Since vertex shaders reuse the same VUE entry for inputs and outputs
    * (overwriting the original contents), we need to make sure the size is
    * the larger of the two.
    */
   const unsigned vue_entries =
      MAX2(nr_attributes, (unsigned)prog_data->base.vue_map.num_slots);

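   /* The URB entry size below is counted in units of 128 bytes on gen6 and
    * 64 bytes on gen7+, while vue_entries counts 16-byte vec4 slots; hence
    * the divide by 8 or 4.  (Unit sizes as described in the 3DSTATE_URB
    * documentation.)
    */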
   if (compiler->devinfo->gen == 6)
      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
   else
      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);

   if (compiler->scalar_vs) {
      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;

      fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,
                   NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */
                   shader, 8, shader_time_index);
      if (!v.run_vs(clip_planes)) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);

         return NULL;
      }

      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
                     &prog_data->base.base, v.promoted_constants,
                     v.runtime_check_aads_emit, "VS");
      if (INTEL_DEBUG & DEBUG_VS) {
         const char *debug_name =
            ralloc_asprintf(mem_ctx, "%s vertex shader %s",
                            shader->info.label ? shader->info.label : "unnamed",
                            shader->info.name);

         g.enable_debug(debug_name);
      }
      g.generate_code(v.cfg, 8);
      assembly = g.get_assembly(final_assembly_size);
   }

   if (!assembly) {
      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;

      vec4_vs_visitor v(compiler, log_data, key, prog_data,
                        shader, clip_planes, mem_ctx,
                        shader_time_index, use_legacy_snorm_formula);
      if (!v.run()) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);

         return NULL;
      }

      assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
                                            shader, &prog_data->base, v.cfg,
                                            final_assembly_size);
   }

   return assembly;
}

} /* extern "C" */