i965: Unwrap some lines.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4.cpp
/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_vs.h"
#include "brw_nir.h"
#include "brw_vec4_live_variables.h"
#include "brw_dead_control_flow.h"

extern "C" {
#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
}
#include "main/context.h"

#define MAX_INSTRUCTION (1 << 30)

using namespace brw;

namespace brw {

void
src_reg::init()
{
   memset(this, 0, sizeof(*this));

   this->file = BAD_FILE;
}

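/* Construct a src_reg in the given file at the given index.  For scalar,
 * vector, and matrix GLSL types the swizzle is sized to the type's
 * vector_elements; other types read all four channels.
 */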
src_reg::src_reg(register_file file, int reg, const glsl_type *type)
{
   init();

   this->file = file;
   this->reg = reg;
   if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   else
      this->swizzle = BRW_SWIZZLE_XYZW;
   if (type)
      this->type = brw_type_for_base_type(type);
}

/** Generic unset register constructor. */
src_reg::src_reg()
{
   init();
}

src_reg::src_reg(float f)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->f = f;
}

src_reg::src_reg(uint32_t u)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->ud = u;
}

src_reg::src_reg(int32_t i)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->d = i;
}

src_reg::src_reg(uint8_t vf[4])
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->ud, vf, sizeof(unsigned));
}

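/* A VF immediate packs four 8-bit restricted floats into a single dword,
 * with component 0 in the least significant byte.
 */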
src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   this->ud = (vf0 << 0) | (vf1 << 8) | (vf2 << 16) | (vf3 << 24);
}

src_reg::src_reg(struct brw_reg reg) :
   backend_reg(reg)
{
   this->file = HW_REG;
   this->reg = 0;
   this->reg_offset = 0;
   this->reladdr = NULL;
}

src_reg::src_reg(const dst_reg &reg) :
   backend_reg(static_cast<struct brw_reg>(reg))
{
   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->reladdr = reg.reladdr;
   this->swizzle = brw_swizzle_for_mask(reg.writemask);
}

void
dst_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->file = BAD_FILE;
   this->writemask = WRITEMASK_XYZW;
}

dst_reg::dst_reg()
{
   init();
}

dst_reg::dst_reg(register_file file, int reg)
{
   init();

   this->file = file;
   this->reg = reg;
}

dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = brw_type_for_base_type(type);
   this->writemask = writemask;
}

dst_reg::dst_reg(register_file file, int reg, brw_reg_type type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = type;
   this->writemask = writemask;
}

dst_reg::dst_reg(struct brw_reg reg) :
   backend_reg(reg)
{
   this->file = HW_REG;
   this->reg = 0;
   this->reg_offset = 0;
   this->reladdr = NULL;
}

dst_reg::dst_reg(const src_reg &reg) :
   backend_reg(static_cast<struct brw_reg>(reg))
{
   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->writemask = brw_mask_for_swizzle(reg.swizzle);
   this->reladdr = reg.reladdr;
}

bool
dst_reg::equals(const dst_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           writemask == r.writemask &&
           (reladdr == r.reladdr ||
            (reladdr && r.reladdr && reladdr->equals(*r.reladdr))) &&
           (file != HW_REG ||
            memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0));
}

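/* Send-like opcodes whose message payload lives in GRFs rather than in MRFs.
 */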
bool
vec4_instruction::is_send_from_grf()
{
   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
      return true;
   default:
      return false;
   }
}

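/* Number of GRFs read by source @arg.  For the send-from-GRF opcodes the
 * payload source spans mlen registers; every other source reads a single
 * register.
 */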
unsigned
vec4_instruction::regs_read(unsigned arg) const
{
   if (src[arg].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
      return arg == 0 ? mlen : 1;

   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      return arg == 1 ? mlen : 1;

   default:
      return 1;
   }
}

bool
vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo)
{
   if (devinfo->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

bool
vec4_instruction::can_change_types() const
{
   return dst.type == src[0].type &&
          !src[0].abs && !src[0].negate && !saturate &&
          (opcode == BRW_OPCODE_MOV ||
           (opcode == BRW_OPCODE_SEL &&
            dst.type == src[1].type &&
            predicate != BRW_PREDICATE_NONE &&
            !src[1].abs && !src[1].negate));
}

/**
 * Returns how many MRFs an opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the generate_* functions generate additional MOVs
 * for setup.
 */
int
vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
{
   if (inst->mlen == 0 || inst->is_send_from_grf())
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1;
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_POW:
      return 2;
   case VS_OPCODE_URB_WRITE:
      return 1;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 3;
   case GS_OPCODE_URB_WRITE:
   case GS_OPCODE_URB_WRITE_ALLOCATE:
   case GS_OPCODE_THREAD_END:
      return 0;
   case GS_OPCODE_FF_SYNC:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_SAMPLEINFO:
   case VS_OPCODE_GET_BUFFER_SIZE:
      return inst->header_size;
   default:
      unreachable("not reached");
   }
}

bool
src_reg::equals(const src_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           swizzle == r.swizzle &&
           !reladdr && !r.reladdr &&
           (file != HW_REG ||
            memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0) &&
           (file != IMM || d == r.d));
}

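/**
 * Combine scalar float-immediate MOVs to adjacent channels of the same
 * destination register into one writemask.xyzw MOV of a packed VF immediate,
 * when every component value is representable as an 8-bit restricted float.
 *
 * For example (a sketch):
 *
 *    mov vgrf4.x:F, 0.0F
 *    mov vgrf4.y:F, 1.0F
 *    mov vgrf4.z:F, 0.0F
 *    mov vgrf4.w:F, 1.0F
 *
 * becomes a single MOV of the [0F, 1F, 0F, 1F] vector-float immediate.
 */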
bool
vec4_visitor::opt_vector_float()
{
   bool progress = false;

   int last_reg = -1, last_reg_offset = -1;
   enum register_file last_reg_file = BAD_FILE;

   int remaining_channels = 0;
   uint8_t imm[4];
   int inst_count = 0;
   vec4_instruction *imm_inst[4];

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      if (last_reg != inst->dst.reg ||
          last_reg_offset != inst->dst.reg_offset ||
          last_reg_file != inst->dst.file) {
         last_reg = inst->dst.reg;
         last_reg_offset = inst->dst.reg_offset;
         last_reg_file = inst->dst.file;
         remaining_channels = WRITEMASK_XYZW;

         inst_count = 0;
      }

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->dst.writemask == WRITEMASK_XYZW ||
          inst->src[0].file != IMM)
         continue;

      int vf = brw_float_to_vf(inst->src[0].f);
      if (vf == -1)
         continue;

      if ((inst->dst.writemask & WRITEMASK_X) != 0)
         imm[0] = vf;
      if ((inst->dst.writemask & WRITEMASK_Y) != 0)
         imm[1] = vf;
      if ((inst->dst.writemask & WRITEMASK_Z) != 0)
         imm[2] = vf;
      if ((inst->dst.writemask & WRITEMASK_W) != 0)
         imm[3] = vf;

      imm_inst[inst_count++] = inst;

      remaining_channels &= ~inst->dst.writemask;
      if (remaining_channels == 0) {
         vec4_instruction *mov = MOV(inst->dst, imm);
         mov->dst.type = BRW_REGISTER_TYPE_F;
         mov->dst.writemask = WRITEMASK_XYZW;
         inst->insert_after(block, mov);
         last_reg = -1;

         for (int i = 0; i < inst_count; i++) {
            imm_inst[i]->remove(block);
         }
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/* Replaces unused channels of a swizzle with channels that are used.
 *
 * For instance, this pass transforms
 *
 *    mov vgrf4.yz, vgrf5.wxzy
 *
 * into
 *
 *    mov vgrf4.yz, vgrf5.xxzx
 *
 * This eliminates false uses of some channels, letting dead code elimination
 * remove the instructions that wrote them.
 */
bool
vec4_visitor::opt_reduce_swizzle()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == BAD_FILE || inst->dst.file == HW_REG ||
          inst->is_send_from_grf())
         continue;

      unsigned swizzle;

      /* Determine which channels of the sources are read. */
      switch (inst->opcode) {
      case VEC4_OPCODE_PACK_BYTES:
      case BRW_OPCODE_DP4:
      case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
                            *           but all four of src1.
                            */
         swizzle = brw_swizzle_for_size(4);
         break;
      case BRW_OPCODE_DP3:
         swizzle = brw_swizzle_for_size(3);
         break;
      case BRW_OPCODE_DP2:
         swizzle = brw_swizzle_for_size(2);
         break;
      default:
         swizzle = brw_swizzle_for_mask(inst->dst.writemask);
         break;
      }

      /* Update sources' swizzles. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != GRF &&
             inst->src[i].file != ATTR &&
             inst->src[i].file != UNIFORM)
            continue;

         const unsigned new_swizzle =
            brw_compose_swizzle(swizzle, inst->src[i].swizzle);
         if (inst->src[i].swizzle != new_swizzle) {
            inst->src[i].swizzle = new_swizzle;
            progress = true;
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

void
vec4_visitor::split_uniform_registers()
{
   /* Prior to this, uniforms have been in an array sized according to
    * the number of vector uniforms present, sparsely filled (so an
    * aggregate results in reg indices being skipped over).  Now we're
    * going to cut those aggregates up so each .reg index is one
    * vector.  The goal is to make elimination of unused uniform
    * components easier later.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         assert(!inst->src[i].reladdr);

         inst->src[i].reg += inst->src[i].reg_offset;
         inst->src[i].reg_offset = 0;
      }
   }

   /* Update that everything is now vector-sized. */
   for (int i = 0; i < this->uniforms; i++) {
      this->uniform_size[i] = 1;
   }
}

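/**
 * Pack the live uniform vectors tightly together, dropping vectors (and
 * trailing channels) that no instruction reads, then rewrite every UNIFORM
 * source to point at its packed location.
 */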
void
vec4_visitor::pack_uniform_registers()
{
   uint8_t chans_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(chans_used, 0, sizeof(chans_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program.  We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like Wine.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      unsigned readmask;
      switch (inst->opcode) {
      case VEC4_OPCODE_PACK_BYTES:
      case BRW_OPCODE_DP4:
      case BRW_OPCODE_DPH:
         readmask = 0xf;
         break;
      case BRW_OPCODE_DP3:
         readmask = 0x7;
         break;
      case BRW_OPCODE_DP2:
         readmask = 0x3;
         break;
      default:
         readmask = inst->dst.writemask;
         break;
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int reg = inst->src[i].reg;
         for (int c = 0; c < 4; c++) {
            if (!(readmask & (1 << c)))
               continue;

            chans_used[reg] = MAX2(chans_used[reg],
                                   BRW_GET_SWZ(inst->src[i].swizzle, c) + 1);
         }
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
   for (int src = 0; src < uniforms; src++) {
      assert(src < uniform_array_size);
      int size = chans_used[src];

      if (size == 0)
         continue;

      int dst;
      /* Find the lowest place we can slot this uniform in. */
      for (dst = 0; dst < src; dst++) {
         if (chans_used[dst] + size <= 4)
            break;
      }

      if (src == dst) {
         new_loc[src] = dst;
         new_chan[src] = 0;
      } else {
         new_loc[src] = dst;
         new_chan[src] = chans_used[dst];

         /* Move the references to the data */
         for (int j = 0; j < size; j++) {
            stage_prog_data->param[dst * 4 + new_chan[src] + j] =
               stage_prog_data->param[src * 4 + j];
         }

         chans_used[dst] += size;
         chans_used[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         int src = inst->src[i].reg;

         if (inst->src[i].file != UNIFORM)
            continue;

         inst->src[i].reg = new_loc[src];
         inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
                                              new_chan[src], new_chan[src]);
      }
   }
}

/**
 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
 *
 * While GLSL IR also performs this optimization, we end up with it in
 * our instruction stream for a couple of reasons.  One is that we
 * sometimes generate silly instructions, for example in array access
 * where we'll generate "ADD offset, index, base" even if base is 0.
 * The other is that GLSL IR's constant propagation doesn't track the
 * components of aggregates, so some VS patterns (initialize matrix to
 * 0, accumulate in vertex blending factors) end up breaking down to
 * instructions involving 0.
 */
bool
vec4_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (inst->src[0].file != IMM)
            break;

         if (inst->saturate) {
            if (inst->dst.type != inst->src[0].type)
               assert(!"unimplemented: saturate mixed types");

            if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case VEC4_OPCODE_UNPACK_UNIFORM:
         if (inst->src[0].file != UNIFORM) {
            inst->opcode = BRW_OPCODE_MOV;
            progress = true;
         }
         break;

      case BRW_OPCODE_ADD:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            switch (inst->src[0].type) {
            case BRW_REGISTER_TYPE_F:
               inst->src[0] = src_reg(0.0f);
               break;
            case BRW_REGISTER_TYPE_D:
               inst->src[0] = src_reg(0);
               break;
            case BRW_REGISTER_TYPE_UD:
               inst->src[0] = src_reg(0u);
               break;
            default:
               unreachable("not reached");
            }
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_negative_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;
      case BRW_OPCODE_CMP:
         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
             inst->src[0].abs &&
             inst->src[0].negate &&
             inst->src[1].is_zero()) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            inst->conditional_mod = BRW_CONDITIONAL_Z;
            progress = true;
            break;
         }
         break;
      case SHADER_OPCODE_RCP: {
         vec4_instruction *prev = (vec4_instruction *)inst->prev;
         if (prev->opcode == SHADER_OPCODE_SQRT) {
            if (inst->src[0].equals(src_reg(prev->dst))) {
               inst->opcode = SHADER_OPCODE_RSQ;
               inst->src[0] = prev->src[0];
               progress = true;
            }
         }
         break;
      }
      case SHADER_OPCODE_BROADCAST:
         if (is_uniform(inst->src[0]) ||
             inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Only a limited number of hardware registers may be used for push
 * constants, so this turns access to the overflowed constants into
 * pull constants.
 */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants.  We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      pull_constant_loc[i / 4] = -1;

      if (i >= max_uniform_components) {
         const gl_constant_value **values = &stage_prog_data->param[i];

         /* Try to find an existing copy of this uniform in the pull
          * constants if it was part of an array access already.
          */
         for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
            int matches;

            for (matches = 0; matches < 4; matches++) {
               if (stage_prog_data->pull_param[j + matches] != values[matches])
                  break;
            }

            if (matches == 4) {
               pull_constant_loc[i / 4] = j / 4;
               break;
            }
         }

         if (pull_constant_loc[i / 4] == -1) {
            assert(stage_prog_data->nr_pull_params % 4 == 0);
            pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;

            for (int j = 0; j < 4; j++) {
               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM ||
             pull_constant_loc[inst->src[i].reg] == -1)
            continue;

         int uniform = inst->src[i].reg;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(block, inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}

/* Conditions for which we want to avoid setting the dependency control bits */
bool
vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
{
#define IS_DWORD(reg) \
   (reg.type == BRW_REGISTER_TYPE_UD || \
    reg.type == BRW_REGISTER_TYPE_D)

   /* "When source or destination datatype is 64b or operation is integer
    * DWord multiply, DepCtrl must not be used."
    * May apply to future SoCs as well.
    */
   if (devinfo->is_cherryview) {
      if (inst->opcode == BRW_OPCODE_MUL &&
          IS_DWORD(inst->src[0]) &&
          IS_DWORD(inst->src[1]))
         return true;
   }
#undef IS_DWORD

   if (devinfo->gen >= 8) {
      if (inst->opcode == BRW_OPCODE_F32TO16)
         return true;
   }

   /*
    * mlen:
    * In the presence of send messages, totally interrupt dependency
    * control.  They're long enough that the chance of dependency
    * control around them just doesn't matter.
    *
    * predicate:
    * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
    * When a sequence of NoDDChk and NoDDClr are used, the last instruction
    * that completes the scoreboard clear must have a non-zero execution
    * mask.  This means, if any kind of predication can change the execution
    * mask or channel enable of the last instruction, the optimization must
    * be avoided.  This is to avoid instructions being shot down the pipeline
    * when no writes are required.
    *
    * math:
    * Dependency control does not work well over math instructions.
    * NB: Discovered empirically
    */
   return (inst->mlen || inst->predicate || inst->is_math());
}

/**
 * Sets the dependency control fields on instructions after register
 * allocation and before the generator is run.
 *
 * When you have a sequence of instructions like:
 *
 *   DP4 temp.x vertex uniform[0]
 *   DP4 temp.y vertex uniform[0]
 *   DP4 temp.z vertex uniform[0]
 *   DP4 temp.w vertex uniform[0]
 *
 * The hardware doesn't know that it can actually run the later instructions
 * while the previous ones are in flight, producing stalls.  However, we have
 * manual fields we can set in the instructions that let it do so.
 */
void
vec4_visitor::opt_set_dependency_control()
{
   vec4_instruction *last_grf_write[BRW_MAX_GRF];
   uint8_t grf_channels_written[BRW_MAX_GRF];
   vec4_instruction *last_mrf_write[BRW_MAX_GRF];
   uint8_t mrf_channels_written[BRW_MAX_GRF];

   assert(prog_data->total_grf ||
          !"Must be called after register allocation");

   foreach_block (block, cfg) {
      memset(last_grf_write, 0, sizeof(last_grf_write));
      memset(last_mrf_write, 0, sizeof(last_mrf_write));

      foreach_inst_in_block (vec4_instruction, inst, block) {
         /* If we read from a register that we were doing dependency control
          * on, don't do dependency control across the read.
          */
         for (int i = 0; i < 3; i++) {
            int reg = inst->src[i].reg + inst->src[i].reg_offset;
            if (inst->src[i].file == GRF) {
               last_grf_write[reg] = NULL;
            } else if (inst->src[i].file == HW_REG) {
               memset(last_grf_write, 0, sizeof(last_grf_write));
               break;
            }
            assert(inst->src[i].file != MRF);
         }

         if (is_dep_ctrl_unsafe(inst)) {
            memset(last_grf_write, 0, sizeof(last_grf_write));
            memset(last_mrf_write, 0, sizeof(last_mrf_write));
            continue;
         }

         /* Now, see if we can do dependency control for this instruction
          * against a previous one writing to its destination.
          */
         int reg = inst->dst.reg + inst->dst.reg_offset;
         if (inst->dst.file == GRF) {
            if (last_grf_write[reg] &&
                !(inst->dst.writemask & grf_channels_written[reg])) {
               last_grf_write[reg]->no_dd_clear = true;
               inst->no_dd_check = true;
            } else {
               grf_channels_written[reg] = 0;
            }

            last_grf_write[reg] = inst;
            grf_channels_written[reg] |= inst->dst.writemask;
         } else if (inst->dst.file == MRF) {
            if (last_mrf_write[reg] &&
                !(inst->dst.writemask & mrf_channels_written[reg])) {
               last_mrf_write[reg]->no_dd_clear = true;
               inst->no_dd_check = true;
            } else {
               mrf_channels_written[reg] = 0;
            }

            last_mrf_write[reg] = inst;
            mrf_channels_written[reg] |= inst->dst.writemask;
         } else if (inst->dst.file == HW_REG) {
            if (inst->dst.brw_reg::file == BRW_GENERAL_REGISTER_FILE)
               memset(last_grf_write, 0, sizeof(last_grf_write));
            if (inst->dst.brw_reg::file == BRW_MESSAGE_REGISTER_FILE)
               memset(last_mrf_write, 0, sizeof(last_mrf_write));
         }
      }
   }
}

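/* Whether this instruction's result can be reswizzled (and its writemask
 * reduced) so that a later MOV of the result can be coalesced into it.
 */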
bool
vec4_instruction::can_reswizzle(const struct brw_device_info *devinfo,
                                int dst_writemask,
                                int swizzle,
                                int swizzle_mask)
{
   /* Gen6 MATH instructions can not execute in align16 mode, so swizzles
    * or writemasking are not allowed.
    */
   if (devinfo->gen == 6 && is_math() &&
       (swizzle != BRW_SWIZZLE_XYZW || dst_writemask != WRITEMASK_XYZW))
      return false;

   /* If this instruction sets anything not referenced by swizzle, then we'd
    * totally break it when we reswizzle.
    */
   if (dst.writemask & ~swizzle_mask)
      return false;

   if (mlen > 0)
      return false;

   /* We can't use swizzles on the accumulator and that's really the only
    * HW_REG we would care to reswizzle so just disallow them all.
    */
   for (int i = 0; i < 3; i++) {
      if (src[i].file == HW_REG)
         return false;
   }

   return true;
}

/**
 * For any channels in the swizzle's source that were populated by this
 * instruction, rewrite the instruction to put the appropriate result directly
 * in those channels.
 *
 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy c.yy_x
 */
void
vec4_instruction::reswizzle(int dst_writemask, int swizzle)
{
   /* Destination write mask doesn't correspond to source swizzle for the dot
    * product and pack_bytes instructions.
    */
   if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
       opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
       opcode != VEC4_OPCODE_PACK_BYTES) {
      for (int i = 0; i < 3; i++) {
         if (src[i].file == BAD_FILE || src[i].file == IMM)
            continue;

         src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
      }
   }

   /* Apply the specified swizzle and writemask to the original mask of
    * written components.
    */
   dst.writemask = dst_writemask &
                   brw_apply_swizzle_to_mask(swizzle, dst.writemask);
}

/*
 * Tries to eliminate extra MOV instructions: when a temporary GRF is written
 * and then immediately MOVed into another register, rewrite the original
 * write to store directly to the final destination instead.
 */
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != GRF && inst->dst.file != MRF) ||
          inst->predicate ||
          inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      /* Remove no-op MOVs */
      if (inst->dst.file == inst->src[0].file &&
          inst->dst.reg == inst->src[0].reg &&
          inst->dst.reg_offset == inst->src[0].reg_offset) {
         bool is_nop_mov = true;

         for (unsigned c = 0; c < 4; c++) {
            if ((inst->dst.writemask & (1 << c)) == 0)
               continue;

            if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) {
               is_nop_mov = false;
               break;
            }
         }

         if (is_nop_mov) {
            inst->remove(block);
            continue;
         }
      }

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
         continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating.  To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      const unsigned chans_needed =
         brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
                                       inst->dst.writemask);
      unsigned chans_remaining = chans_needed;

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       */
      vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
                                                  inst) {
         _scan_inst = scan_inst;

         if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (devinfo->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * GRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* This doesn't handle saturation on the instruction we
             * want to coalesce away if the register types do not match.
             * But if scan_inst is a non type-converting 'mov', we can fix
             * the types later.
             */
            if (inst->saturate &&
                inst->dst.type != scan_inst->dst.type &&
                !(scan_inst->opcode == BRW_OPCODE_MOV &&
                  scan_inst->dst.type == scan_inst->src[0].type))
               break;

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
                                          inst->src[0].swizzle,
                                          chans_needed)) {
               break;
            }

            /* This doesn't handle coalescing of multiple registers. */
            if (scan_inst->regs_written > 1)
               break;

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate)
               chans_remaining &= ~scan_inst->dst.writemask;

            if (chans_remaining == 0)
               break;
         }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us.  If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (inst->src[0].in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
               interfered = true;
         }
         if (interfered)
            break;

         /* If somebody else writes the same channels of our destination here,
          * we can't coalesce before that.
          */
         if (inst->dst.in_range(scan_inst->dst, scan_inst->regs_written) &&
             (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
            break;
         }

         /* Check for reads of the register we're trying to coalesce into.
          * We can't go rewriting instructions above that to put some other
          * value in the register instead.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            if (inst->dst.reg >= scan_inst->base_mrf &&
                inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (inst->dst.in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
                  interfered = true;
            }
            if (interfered)
               break;
         }
      }

      if (chans_remaining == 0) {
         /* If we've made it here, we have a MOV we want to coalesce out, and
          * a scan_inst pointing to the earliest instruction involved in
          * computing the value.  Now go rewrite the instruction stream
          * between the two.
          */
         vec4_instruction *scan_inst = _scan_inst;
         while (scan_inst != inst) {
            if (scan_inst->dst.file == GRF &&
                scan_inst->dst.reg == inst->src[0].reg &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               scan_inst->reswizzle(inst->dst.writemask,
                                    inst->src[0].swizzle);
               scan_inst->dst.file = inst->dst.file;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->dst.reg_offset = inst->dst.reg_offset;
               if (inst->saturate &&
                   inst->dst.type != scan_inst->dst.type) {
                  /* If we have reached this point, scan_inst is a non
                   * type-converting 'mov' and we can modify its register
                   * types to match the ones in inst.  Otherwise, we could
                   * have an incorrect saturation result.
                   */
                  scan_inst->dst.type = inst->dst.type;
                  scan_inst->src[0].type = inst->src[0].type;
               }
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
bool
vec4_visitor::eliminate_find_live_channel()
{
   bool progress = false;
   unsigned depth = 0;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = src_reg(0);
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   return progress;
}

/**
 * Splits virtual GRFs requesting more than one contiguous physical register.
 *
 * We initially create large virtual GRFs for temporary structures, arrays,
 * and matrices, so that the dereference visitor functions can add reg_offsets
 * to work their way down to the actual member being accessed.  But when it
 * comes to optimization, we'd like to treat each register as individual
 * storage if possible.
 *
 * So far, the only thing that might prevent splitting is a send message from
 * a GRF on IVB.
 */
void
vec4_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;
   int new_virtual_grf[num_vars];
   bool split_grf[num_vars];

   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));

   /* Try to split anything > 1 sized. */
   for (int i = 0; i < num_vars; i++) {
      split_grf[i] = this->alloc.sizes[i] != 1;
   }

   /* Check that the instructions are compatible with the registers we're
    * trying to split.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == GRF && inst->regs_written > 1)
         split_grf[inst->dst.reg] = false;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && inst->regs_read(i) > 1)
            split_grf[inst->src[i].reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (!split_grf[i])
         continue;

      new_virtual_grf[i] = alloc.allocate(1);
      for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
         unsigned reg = alloc.allocate(1);
         assert(reg == new_virtual_grf[i] + j - 1);
         (void) reg;
      }
      this->alloc.sizes[i] = 1;
   }

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == GRF && split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}

void
vec4_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}

void
vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   vec4_instruction *inst = (vec4_instruction *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf0.%d%s) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg,
              pred_ctrl_align16[inst->predicate]);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, " ");

   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case HW_REG:
      if (inst->dst.brw_reg::file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.nr & 0xf,
                    inst->dst.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.nr & 0xf,
                    inst->dst.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.nr);
      }
      if (inst->dst.subnr)
         fprintf(file, "+%d", inst->dst.subnr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case IMM:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
   if (inst->dst.writemask != WRITEMASK_XYZW) {
      fprintf(file, ".");
      if (inst->dst.writemask & 1)
         fprintf(file, "x");
      if (inst->dst.writemask & 2)
         fprintf(file, "y");
      if (inst->dst.writemask & 4)
         fprintf(file, "z");
      if (inst->dst.writemask & 8)
         fprintf(file, "w");
   }
   fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));

   if (inst->src[0].file != BAD_FILE)
      fprintf(file, ", ");

   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].reg);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg);
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%fF", inst->src[i].f);
            break;
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dD", inst->src[i].d);
            break;
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uU", inst->src[i].ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].brw_reg::file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file, "f%d.%d", inst->src[i].nr & 0xf,
                       inst->src[i].subnr);
               break;
            default:
               fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf,
                       inst->src[i].subnr);
               break;
            }
         } else {
            fprintf(file, "hw_reg%d", inst->src[i].nr);
         }
         if (inst->src[i].subnr)
            fprintf(file, "+%d", inst->src[i].subnr);
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case MRF:
         unreachable("not reached");
      }

      /* Don't print .0; and only VGRFs have reg_offsets and sizes */
      if (inst->src[i].reg_offset != 0 &&
          inst->src[i].file == GRF &&
          alloc.sizes[inst->src[i].reg] != 1)
         fprintf(file, ".%d", inst->src[i].reg_offset);

      if (inst->src[i].file != IMM) {
         static const char *chans[4] = {"x", "y", "z", "w"};
         fprintf(file, ".");
         for (int c = 0; c < 4; c++) {
            fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
         }
      }

      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   if (inst->force_writemask_all)
      fprintf(file, " NoMask");

   fprintf(file, "\n");
}

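/* Map a payload attribute slot to its fixed hardware register.  Interleaved
 * attributes occupy half a GRF each (two per register); otherwise each
 * attribute gets a whole GRF.
 */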
static inline struct brw_reg
attribute_to_hw_reg(int attr, bool interleaved)
{
   if (interleaved)
      return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
   else
      return brw_vec8_grf(attr, 0);
}

/**
 * Replace each register of type ATTR in this->instructions with a reference
 * to a fixed HW register.
 *
 * If interleaved is true, then each attribute takes up half a register, with
 * register N containing attribute 2*N in its first half and attribute 2*N+1
 * in its second half (this corresponds to the payload setup used by geometry
 * shaders in "single" or "dual instanced" dispatch mode).  If interleaved is
 * false, then each attribute takes up a whole register, with register N
 * containing attribute N (this corresponds to the payload setup used by
 * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
 */
void
vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
                                          bool interleaved)
{
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* We have to support ATTR as a destination for GL_FIXED fixup. */
      if (inst->dst.file == ATTR) {
         int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         reg.type = inst->dst.type;
         reg.writemask = inst->dst.writemask;

         inst->dst = reg;
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         reg.swizzle = inst->src[i].swizzle;
         reg.type = inst->src[i].type;
         if (inst->src[i].abs)
            reg = brw_abs(reg);
         if (inst->src[i].negate)
            reg = negate(reg);

         inst->src[i] = reg;
      }
   }
}

int
vec4_vs_visitor::setup_attributes(int payload_reg)
{
   int nr_attributes;
   int attribute_map[VERT_ATTRIB_MAX + 1];
   memset(attribute_map, 0, sizeof(attribute_map));

   nr_attributes = 0;
   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
         attribute_map[i] = payload_reg + nr_attributes;
         nr_attributes++;
      }
   }

   /* VertexID is stored by the VF as the last vertex element, but we
    * don't represent it with a flag in inputs_read, so we call it
    * VERT_ATTRIB_MAX.
    */
   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
   }

   lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);

   return payload_reg + vs_prog_data->nr_attributes;
}

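/* Lay the push constants out in the payload starting at register @reg and
 * return the first register past them; they are packed two vec4 uniforms
 * per register.
 */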
int
vec4_visitor::setup_uniforms(int reg)
{
   prog_data->base.dispatch_grf_start_reg = reg;

   /* The pre-gen6 VS requires that some push constants get loaded no
    * matter what, or the GPU would hang.
    */
   if (devinfo->gen < 6 && this->uniforms == 0) {
      assert(this->uniforms < this->uniform_array_size);

      stage_prog_data->param =
         reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
      for (unsigned int i = 0; i < 4; i++) {
         unsigned int slot = this->uniforms * 4 + i;
         static gl_constant_value zero = { 0.0 };
         stage_prog_data->param[slot] = &zero;
      }

      this->uniforms++;
      reg++;
   } else {
      reg += ALIGN(uniforms, 2) / 2;
   }

   stage_prog_data->nr_params = this->uniforms * 4;

   prog_data->base.curb_read_length =
      reg - prog_data->base.dispatch_grf_start_reg;

   return reg;
}

void
vec4_vs_visitor::setup_payload(void)
{
   int reg = 0;

   /* The payload always contains important data in g0, which contains
    * the URB handles that are passed on to the URB write at the end
    * of the thread.  So, we always start push constants at g1.
    */
   reg++;

   reg = setup_uniforms(reg);

   reg = setup_attributes(reg);

   this->first_non_payload_grf = reg;
}

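/* Read the TIMESTAMP architecture register into a temporary, so the
 * shader-time code can compute the delta between two reads.
 */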
src_reg
vec4_visitor::get_timestamp()
{
   assert(devinfo->gen >= 7);

   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                BRW_ARF_TIMESTAMP,
                                0,
                                0,
                                0,
                                BRW_REGISTER_TYPE_UD,
                                BRW_VERTICAL_STRIDE_0,
                                BRW_WIDTH_4,
                                BRW_HORIZONTAL_STRIDE_4,
                                BRW_SWIZZLE_XYZW,
                                WRITEMASK_XYZW));

   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);

   vec4_instruction *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;

   return src_reg(dst);
}

void
vec4_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
vec4_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";
   src_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   src_reg reset_end = shader_end_time;
   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
   vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;

   emit(IF(BRW_PREDICATE_NORMAL));

   /* Take the current timestamp and get the delta. */
   shader_start_time.negate = true;
   dst_reg diff = dst_reg(this, glsl_type::uint_type);
   emit(ADD(diff, shader_start_time, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, src_reg(diff), src_reg(-2u)));

   emit_shader_time_write(0, src_reg(diff));
   emit_shader_time_write(1, src_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(2, src_reg(1u));
   emit(BRW_OPCODE_ENDIF);
}

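/* Emit one record into the shader-time buffer: the first vec4 of the payload
 * holds the buffer offset and the second the value, and the two are sent as
 * a single two-register (mlen 2) SHADER_TIME_ADD message.
 */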
void
vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
{
   dst_reg dst =
      dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));

   dst_reg offset = dst;
   dst_reg time = dst;
   time.reg_offset++;

   offset.type = BRW_REGISTER_TYPE_UD;
   int index = shader_time_index * 3 + shader_time_subindex;
   emit(MOV(offset, src_reg(index * SHADER_TIME_STRIDE)));

   time.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time, value));

   vec4_instruction *inst =
      emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
   inst->mlen = 2;
}

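/* Translate the remaining virtual register files (GRF, MRF, UNIFORM, IMM)
 * into fixed brw_reg hardware registers that the generator can emit
 * directly.
 */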
void
vec4_visitor::convert_to_hw_regs()
{
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         struct src_reg &src = inst->src[i];
         struct brw_reg reg;
         switch (src.file) {
         case GRF:
            reg = brw_vec8_grf(src.reg + src.reg_offset, 0);
            reg.type = src.type;
            reg.swizzle = src.swizzle;
            reg.abs = src.abs;
            reg.negate = src.negate;
            break;

         case IMM:
            reg = brw_imm_reg(src.type);
            reg.ud = src.ud;
            break;

         case UNIFORM:
            reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg +
                                      (src.reg + src.reg_offset) / 2,
                                      ((src.reg + src.reg_offset) % 2) * 4),
                         0, 4, 1);
            reg.type = src.type;
            reg.swizzle = src.swizzle;
            reg.abs = src.abs;
            reg.negate = src.negate;

            /* This should have been moved to pull constants. */
            assert(!src.reladdr);
            break;

         case HW_REG:
            continue;

         case BAD_FILE:
            /* Probably unused. */
            reg = brw_null_reg();
            break;

         case MRF:
         case ATTR:
            unreachable("not reached");
         }
         src = reg;
      }

      dst_reg &dst = inst->dst;
      struct brw_reg reg;

      switch (inst->dst.file) {
      case GRF:
         reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
         reg.type = dst.type;
         reg.writemask = dst.writemask;
         break;

      case MRF:
         assert(((dst.reg + dst.reg_offset) & ~(1 << 7)) <
                BRW_MAX_MRF(devinfo->gen));
         reg = brw_message_reg(dst.reg + dst.reg_offset);
         reg.type = dst.type;
         reg.writemask = dst.writemask;
         break;

      case HW_REG:
         reg = dst;
         break;

      case BAD_FILE:
         reg = brw_null_reg();
         break;

      case IMM:
      case ATTR:
      case UNIFORM:
         unreachable("not reached");
      }

      dst = reg;
   }
}

bool
vec4_visitor::run()
{
   if (shader_time_index >= 0)
      emit_shader_time_begin();

   emit_prolog();

   emit_nir_code();
   if (failed)
      return false;
   base_ir = NULL;

   emit_thread_end();

   calculate_cfg();

   /* Before any optimization, push array accesses out to scratch
    * space where we need them to be.  This pass may allocate new
    * virtual GRFs, so we want to do it early.  It also makes sure
    * that we have reladdr computations available for CSE, since we'll
    * often do repeated subexpressions for those.
    */
   move_grf_array_access_to_scratch();
   move_uniform_array_access_to_pull_constants();

   pack_uniform_registers();
   move_push_constants_to_pull_constants();
   split_virtual_grfs();

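/* OPT() runs a single optimization pass, accumulates whether it made
 * progress, and, when INTEL_DEBUG=optimizer is set, dumps the instruction
 * stream after each pass that changed anything.
 */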
#define OPT(pass, args...) ({                                          \
      pass_num++;                                                      \
      bool this_progress = pass(args);                                 \
                                                                       \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
         char filename[64];                                            \
         snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass,              \
                  stage_abbrev, nir->info.name, iteration, pass_num);  \
                                                                       \
         backend_shader::dump_instructions(filename);                  \
      }                                                                \
                                                                       \
      progress = progress || this_progress;                            \
      this_progress;                                                   \
   })

   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s-%s-00-start",
               stage_abbrev, nir->info.name);

      backend_shader::dump_instructions(filename);
   }

   bool progress;
   int iteration = 0;
   int pass_num = 0;
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(opt_predicated_break, this);
      OPT(opt_reduce_swizzle);
      OPT(dead_code_eliminate);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_copy_propagation);
      OPT(opt_cmod_propagation);
      OPT(opt_cse);
      OPT(opt_algebraic);
      OPT(opt_register_coalesce);
      OPT(eliminate_find_live_channel);
   } while (progress);

   pass_num = 0;

   if (OPT(opt_vector_float)) {
      OPT(opt_cse);
      OPT(opt_copy_propagation, false);
      OPT(opt_copy_propagation, true);
      OPT(dead_code_eliminate);
   }

   if (failed)
      return false;

   setup_payload();

   if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
      /* Debug of register spilling: Go spill everything. */
      const int grf_count = alloc.count;
      float spill_costs[alloc.count];
      bool no_spill[alloc.count];
      evaluate_spill_costs(spill_costs, no_spill);
      for (int i = 0; i < grf_count; i++) {
         if (no_spill[i])
            continue;
         spill_reg(i);
      }
   }

   bool allocated_without_spills = reg_allocate();

   if (!allocated_without_spills) {
      compiler->shader_perf_log(log_data,
                                "%s shader triggered register spilling.  "
                                "Try reducing the number of live vec4 values "
                                "to improve performance.\n",
                                stage_name);

      while (!reg_allocate()) {
         if (failed)
            return false;
      }
   }

   opt_schedule_instructions();

   opt_set_dependency_control();

   convert_to_hw_regs();

   if (last_scratch > 0) {
      prog_data->base.total_scratch =
         brw_get_scratch_size(last_scratch * REG_SIZE);
   }

   return !failed;
}

} /* namespace brw */

extern "C" {

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_vs_prog_key *key,
               struct brw_vs_prog_data *prog_data,
               const nir_shader *shader,
               gl_clip_plane *clip_planes,
               bool use_legacy_snorm_formula,
               int shader_time_index,
               unsigned *final_assembly_size,
               char **error_str)
{
   const unsigned *assembly = NULL;

   unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);

   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
    * incoming vertex attribute.  So, add an extra slot.
    */
   if (shader->info.system_values_read &
       (BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
        BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
      nr_attributes++;
   }

   /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
    * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
    * vec4 mode, the hardware appears to wedge unless we read something.
    */
   if (compiler->scalar_vs)
      prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2);
   else
      prog_data->base.urb_read_length =
         DIV_ROUND_UP(MAX2(nr_attributes, 1), 2);

   prog_data->nr_attributes = nr_attributes;

   /* Since vertex shaders reuse the same VUE entry for inputs and outputs
    * (overwriting the original contents), we need to make sure the size is
    * the larger of the two.
    */
   const unsigned vue_entries =
      MAX2(nr_attributes, (unsigned)prog_data->base.vue_map.num_slots);

   if (compiler->devinfo->gen == 6)
      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
   else
      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);

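   /* When the scalar backend is available, compile the VS as SIMD8 through
    * fs_visitor; otherwise use the vec4 dual-object path below.
    */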
   if (compiler->scalar_vs) {
      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;

      fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,
                   NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */
                   shader, 8, shader_time_index);
      if (!v.run_vs(clip_planes)) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);

         return NULL;
      }

      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
                     &prog_data->base.base, v.promoted_constants,
                     v.runtime_check_aads_emit, "VS");
      if (INTEL_DEBUG & DEBUG_VS) {
         const char *debug_name =
            ralloc_asprintf(mem_ctx, "%s vertex shader %s",
                            shader->info.label ? shader->info.label :
                                                 "unnamed",
                            shader->info.name);

         g.enable_debug(debug_name);
      }
      g.generate_code(v.cfg, 8);
      assembly = g.get_assembly(final_assembly_size);
   }

   if (!assembly) {
      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;

      vec4_vs_visitor v(compiler, log_data, key, prog_data,
                        shader, clip_planes, mem_ctx,
                        shader_time_index, use_legacy_snorm_formula);
      if (!v.run()) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);

         return NULL;
      }

      assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
                                            shader, &prog_data->base, v.cfg,
                                            final_assembly_size);
   }

   return assembly;
}

} /* extern "C" */