/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_vs.h"
#include "brw_nir.h"
#include "brw_vec4_live_variables.h"
#include "brw_dead_control_flow.h"

extern "C" {
#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
}
#include "main/context.h"

#define MAX_INSTRUCTION (1 << 30)

using namespace brw;

namespace brw {

void
src_reg::init()
{
   memset(this, 0, sizeof(*this));

   this->file = BAD_FILE;
}

src_reg::src_reg(register_file file, int reg, const glsl_type *type)
{
   init();

   this->file = file;
   this->reg = reg;
   if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   else
      this->swizzle = BRW_SWIZZLE_XYZW;
}

/** Generic unset register constructor. */
src_reg::src_reg()
{
   init();
}

src_reg::src_reg(float f)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->fixed_hw_reg.dw1.f = f;
}

src_reg::src_reg(uint32_t u)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->fixed_hw_reg.dw1.ud = u;
}

src_reg::src_reg(int32_t i)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->fixed_hw_reg.dw1.d = i;
}

src_reg::src_reg(uint8_t vf[4])
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
}

src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
                               (vf1 <<  8) |
                               (vf2 << 16) |
                               (vf3 << 24);
}
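
/* Example (informal, hand-computed): VF packs one 8-bit restricted float
 * per byte, so, assuming the encoding used by brw_float_to_vf() and
 * brw_vf_to_float(), src_reg(0x00, 0x30, 0x40, 0x48) builds the immediate
 * vector [0.0F, 1.0F, 2.0F, 3.0F].
 */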

src_reg::src_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
   this->type = reg.type;
}

src_reg::src_reg(const dst_reg &reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
   this->swizzle = brw_swizzle_for_mask(reg.writemask);
}

void
dst_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->file = BAD_FILE;
   this->writemask = WRITEMASK_XYZW;
}

dst_reg::dst_reg()
{
   init();
}

dst_reg::dst_reg(register_file file, int reg)
{
   init();

   this->file = file;
   this->reg = reg;
}

dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = brw_type_for_base_type(type);
   this->writemask = writemask;
}

dst_reg::dst_reg(register_file file, int reg, brw_reg_type type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = type;
   this->writemask = writemask;
}

dst_reg::dst_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
   this->type = reg.type;
}

dst_reg::dst_reg(const src_reg &reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->writemask = brw_mask_for_swizzle(reg.swizzle);
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
}

bool
dst_reg::equals(const dst_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           writemask == r.writemask &&
           (reladdr == r.reladdr ||
            (reladdr && r.reladdr && reladdr->equals(*r.reladdr))) &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0);
}

bool
vec4_instruction::is_send_from_grf()
{
   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
      return true;
   default:
      return false;
   }
}

unsigned
vec4_instruction::regs_read(unsigned arg) const
{
   if (src[arg].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
      return arg == 0 ? mlen : 1;

   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      return arg == 1 ? mlen : 1;

   default:
      return 1;
   }
}

bool
vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo)
{
   if (devinfo->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

/**
 * Returns how many MRFs an opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the generate_* functions generate additional MOVs
 * for setup.
 */
int
vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
{
   if (inst->mlen == 0 || inst->is_send_from_grf())
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1;
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_POW:
      return 2;
   case VS_OPCODE_URB_WRITE:
      return 1;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 3;
   case GS_OPCODE_URB_WRITE:
   case GS_OPCODE_URB_WRITE_ALLOCATE:
   case GS_OPCODE_THREAD_END:
      return 0;
   case GS_OPCODE_FF_SYNC:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
      return inst->header_size;
   default:
      unreachable("not reached");
   }
}

bool
src_reg::equals(const src_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           swizzle == r.swizzle &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0);
}

bool
vec4_visitor::opt_vector_float()
{
   bool progress = false;

   int last_reg = -1, last_reg_offset = -1;
   enum register_file last_reg_file = BAD_FILE;

   int remaining_channels = 0;
   uint8_t imm[4];
   int inst_count = 0;
   vec4_instruction *imm_inst[4];

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      if (last_reg != inst->dst.reg ||
          last_reg_offset != inst->dst.reg_offset ||
          last_reg_file != inst->dst.file) {
         last_reg = inst->dst.reg;
         last_reg_offset = inst->dst.reg_offset;
         last_reg_file = inst->dst.file;
         remaining_channels = WRITEMASK_XYZW;

         inst_count = 0;
      }

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->dst.writemask == WRITEMASK_XYZW ||
          inst->src[0].file != IMM)
         continue;

      int vf = brw_float_to_vf(inst->src[0].fixed_hw_reg.dw1.f);
      if (vf == -1)
         continue;

      if ((inst->dst.writemask & WRITEMASK_X) != 0)
         imm[0] = vf;
      if ((inst->dst.writemask & WRITEMASK_Y) != 0)
         imm[1] = vf;
      if ((inst->dst.writemask & WRITEMASK_Z) != 0)
         imm[2] = vf;
      if ((inst->dst.writemask & WRITEMASK_W) != 0)
         imm[3] = vf;

      imm_inst[inst_count++] = inst;

      remaining_channels &= ~inst->dst.writemask;
      if (remaining_channels == 0) {
         vec4_instruction *mov = MOV(inst->dst, imm);
         mov->dst.type = BRW_REGISTER_TYPE_F;
         mov->dst.writemask = WRITEMASK_XYZW;
         inst->insert_after(block, mov);
         last_reg = -1;

         for (int i = 0; i < inst_count; i++) {
            imm_inst[i]->remove(block);
         }
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
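
/* Illustrative before/after (hand-written, not compiler output): four
 * single-channel float-immediate MOVs to the same register,
 *
 *    mov vgrf1.x:F, 0.0F
 *    mov vgrf1.y:F, 1.0F
 *    mov vgrf1.z:F, 2.0F
 *    mov vgrf1.w:F, 3.0F
 *
 * collapse into one VF-immediate MOV, provided every value is exactly
 * representable in the 8-bit VF format:
 *
 *    mov vgrf1.xyzw:F, [0F, 1F, 2F, 3F]VF
 */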

/* Replaces unused channels of a swizzle with channels that are used.
 *
 * For instance, this pass transforms
 *
 *    mov vgrf4.yz, vgrf5.wxzy
 *
 * into
 *
 *    mov vgrf4.yz, vgrf5.xxzx
 *
 * This eliminates false uses of some channels, letting dead code elimination
 * remove the instructions that wrote them.
 */
bool
vec4_visitor::opt_reduce_swizzle()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == BAD_FILE || inst->dst.file == HW_REG ||
          inst->is_send_from_grf())
         continue;

      unsigned swizzle;

      /* Determine which channels of the sources are read. */
      switch (inst->opcode) {
      case VEC4_OPCODE_PACK_BYTES:
      case BRW_OPCODE_DP4:
      case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
                            *           but all four of src1.
                            */
         swizzle = brw_swizzle_for_size(4);
         break;
      case BRW_OPCODE_DP3:
         swizzle = brw_swizzle_for_size(3);
         break;
      case BRW_OPCODE_DP2:
         swizzle = brw_swizzle_for_size(2);
         break;
      default:
         swizzle = brw_swizzle_for_mask(inst->dst.writemask);
         break;
      }

      /* Update sources' swizzles. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != GRF &&
             inst->src[i].file != ATTR &&
             inst->src[i].file != UNIFORM)
            continue;

         const unsigned new_swizzle =
            brw_compose_swizzle(swizzle, inst->src[i].swizzle);
         if (inst->src[i].swizzle != new_swizzle) {
            inst->src[i].swizzle = new_swizzle;
            progress = true;
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

void
vec4_visitor::split_uniform_registers()
{
   /* Prior to this, uniforms have been in an array sized according to
    * the number of vector uniforms present, sparsely filled (so an
    * aggregate results in reg indices being skipped over). Now we're
    * going to cut those aggregates up so each .reg index is one
    * vector. The goal is to make elimination of unused uniform
    * components easier later.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         assert(!inst->src[i].reladdr);

         inst->src[i].reg += inst->src[i].reg_offset;
         inst->src[i].reg_offset = 0;
      }
   }

   /* Update that everything is now vector-sized. */
   for (int i = 0; i < this->uniforms; i++) {
      this->uniform_size[i] = 1;
   }
}
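
/* Illustrative sketch: if a mat2 uniform occupied reg 2 (reg_offset 0 for
 * column 0 and reg_offset 1 for column 1), its reads become regs 2 and 3
 * with reg_offset 0, so each .reg index now names exactly one vec4.
 */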

void
vec4_visitor::pack_uniform_registers()
{
   bool uniform_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(uniform_used, 0, sizeof(uniform_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program. We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like wine.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         uniform_used[inst->src[i].reg] = true;
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
   for (int src = 0; src < uniforms; src++) {
      assert(src < uniform_array_size);
      int size = this->uniform_vector_size[src];

      if (!uniform_used[src]) {
         this->uniform_vector_size[src] = 0;
         continue;
      }

      int dst;
      /* Find the lowest place we can slot this uniform in. */
      for (dst = 0; dst < src; dst++) {
         if (this->uniform_vector_size[dst] + size <= 4)
            break;
      }

      if (src == dst) {
         new_loc[src] = dst;
         new_chan[src] = 0;
      } else {
         new_loc[src] = dst;
         new_chan[src] = this->uniform_vector_size[dst];

         /* Move the references to the data */
         for (int j = 0; j < size; j++) {
            stage_prog_data->param[dst * 4 + new_chan[src] + j] =
               stage_prog_data->param[src * 4 + j];
         }

         this->uniform_vector_size[dst] += size;
         this->uniform_vector_size[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         int src = inst->src[i].reg;

         if (inst->src[i].file != UNIFORM)
            continue;

         inst->src[i].reg = new_loc[src];
         inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
                                              new_chan[src], new_chan[src]);
      }
   }
}
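
/* Illustrative sketch (hand-traced): with uniform_vector_size = {4, 0, 2}
 * -- slot 1 dead, slot 2 a live vec2 -- the vec2 cannot share slot 0
 * (4 + 2 > 4), so it lands in slot 1 at channel 0, and its swizzle is
 * rotated by new_chan so reads still hit the right components.
 */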

/**
 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
 *
 * While GLSL IR also performs this optimization, we end up with it in
 * our instruction stream for a couple of reasons.  One is that we
 * sometimes generate silly instructions, for example in array access
 * where we'll generate "ADD offset, index, base" even if base is 0.
 * The other is that GLSL IR's constant propagation doesn't track the
 * components of aggregates, so some VS patterns (initialize matrix to
 * 0, accumulate in vertex blending factors) end up breaking down to
 * instructions involving 0.
 */
bool
vec4_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (inst->src[0].file != IMM)
            break;

         if (inst->saturate) {
            if (inst->dst.type != inst->src[0].type)
               assert(!"unimplemented: saturate mixed types");

            if (brw_saturate_immediate(inst->dst.type,
                                       &inst->src[0].fixed_hw_reg)) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case VEC4_OPCODE_UNPACK_UNIFORM:
         if (inst->src[0].file != UNIFORM) {
            inst->opcode = BRW_OPCODE_MOV;
            progress = true;
         }
         break;

      case BRW_OPCODE_ADD:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            switch (inst->src[0].type) {
            case BRW_REGISTER_TYPE_F:
               inst->src[0] = src_reg(0.0f);
               break;
            case BRW_REGISTER_TYPE_D:
               inst->src[0] = src_reg(0);
               break;
            case BRW_REGISTER_TYPE_UD:
               inst->src[0] = src_reg(0u);
               break;
            default:
               unreachable("not reached");
            }
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_negative_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;
      case BRW_OPCODE_CMP:
         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
             inst->src[0].abs &&
             inst->src[0].negate &&
             inst->src[1].is_zero()) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            inst->conditional_mod = BRW_CONDITIONAL_Z;
            progress = true;
            break;
         }
         break;
      case SHADER_OPCODE_RCP: {
         vec4_instruction *prev = (vec4_instruction *)inst->prev;
         if (prev->opcode == SHADER_OPCODE_SQRT) {
            if (inst->src[0].equals(src_reg(prev->dst))) {
               inst->opcode = SHADER_OPCODE_RSQ;
               inst->src[0] = prev->src[0];
               progress = true;
            }
         }
         break;
      }
      case SHADER_OPCODE_BROADCAST:
         if (is_uniform(inst->src[0]) ||
             inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
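
/* Illustrative transforms (hand-written): "mul dst, a, 1.0F" becomes
 * "mov dst, a", "mul dst, a, 0.0F" becomes "mov dst, 0.0F", "add dst, a, 0"
 * drops the add, and an RCP that directly consumes the result of a SQRT
 * is rewritten as a single RSQ of the original operand.
 */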

/**
 * Only a limited number of hardware registers may be used for push
 * constants, so this turns access to the overflowed constants into
 * pull constants.
 */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants.  We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      pull_constant_loc[i / 4] = -1;

      if (i >= max_uniform_components) {
         const gl_constant_value **values = &stage_prog_data->param[i];

         /* Try to find an existing copy of this uniform in the pull
          * constants if it was part of an array access already.
          */
         for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
            int matches;

            for (matches = 0; matches < 4; matches++) {
               if (stage_prog_data->pull_param[j + matches] != values[matches])
                  break;
            }

            if (matches == 4) {
               pull_constant_loc[i / 4] = j / 4;
               break;
            }
         }

         if (pull_constant_loc[i / 4] == -1) {
            assert(stage_prog_data->nr_pull_params % 4 == 0);
            pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;

            for (int j = 0; j < 4; j++) {
               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM ||
             pull_constant_loc[inst->src[i].reg] == -1)
            continue;

         int uniform = inst->src[i].reg;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(block, inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}
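
/* Worked numbers for the limit above (hand-computed): 32 registers * 8
 * components = 256 push components, i.e. 64 vec4 slots.  A shader with 70
 * vec4 uniforms keeps slots 0-63 as push constants and demotes slots
 * 64-69 to pull constants.
 */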

/* Conditions for which we want to avoid setting the dependency control bits */
bool
vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
{
#define IS_DWORD(reg) \
   (reg.type == BRW_REGISTER_TYPE_UD || \
    reg.type == BRW_REGISTER_TYPE_D)

   /* "When source or destination datatype is 64b or operation is integer DWord
    * multiply, DepCtrl must not be used."
    * May apply to future SoCs as well.
    */
   if (devinfo->is_cherryview) {
      if (inst->opcode == BRW_OPCODE_MUL &&
          IS_DWORD(inst->src[0]) &&
          IS_DWORD(inst->src[1]))
         return true;
   }
#undef IS_DWORD

   if (devinfo->gen >= 8) {
      if (inst->opcode == BRW_OPCODE_F32TO16)
         return true;
   }

   /*
    * mlen:
    * In the presence of send messages, totally interrupt dependency
    * control.  They're long enough that the chance of dependency
    * control around them just doesn't matter.
    *
    * predicate:
    * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
    * When a sequence of NoDDChk and NoDDClr are used, the last instruction
    * that completes the scoreboard clear must have a non-zero execution
    * mask.  This means, if any kind of predication can change the execution
    * mask or channel enable of the last instruction, the optimization must
    * be avoided.  This is to avoid instructions being shot down the
    * pipeline when no writes are required.
    *
    * math:
    * Dependency control does not work well over math instructions.
    * NB: Discovered empirically
    */
   return (inst->mlen || inst->predicate || inst->is_math());
}

/**
 * Sets the dependency control fields on instructions after register
 * allocation and before the generator is run.
 *
 * When you have a sequence of instructions like:
 *
 *   DP4 temp.x vertex uniform[0]
 *   DP4 temp.y vertex uniform[0]
 *   DP4 temp.z vertex uniform[0]
 *   DP4 temp.w vertex uniform[0]
 *
 * The hardware doesn't know that it can actually run the later instructions
 * while the previous ones are in flight, producing stalls.  However, we have
 * manual fields we can set in the instructions that let it do so.
 */
void
vec4_visitor::opt_set_dependency_control()
{
   vec4_instruction *last_grf_write[BRW_MAX_GRF];
   uint8_t grf_channels_written[BRW_MAX_GRF];
   vec4_instruction *last_mrf_write[BRW_MAX_GRF];
   uint8_t mrf_channels_written[BRW_MAX_GRF];

   assert(prog_data->total_grf ||
          !"Must be called after register allocation");

   foreach_block (block, cfg) {
      memset(last_grf_write, 0, sizeof(last_grf_write));
      memset(last_mrf_write, 0, sizeof(last_mrf_write));

      foreach_inst_in_block (vec4_instruction, inst, block) {
         /* If we read from a register that we were doing dependency control
          * on, don't do dependency control across the read.
          */
         for (int i = 0; i < 3; i++) {
            int reg = inst->src[i].reg + inst->src[i].reg_offset;
            if (inst->src[i].file == GRF) {
               last_grf_write[reg] = NULL;
            } else if (inst->src[i].file == HW_REG) {
               memset(last_grf_write, 0, sizeof(last_grf_write));
               break;
            }
            assert(inst->src[i].file != MRF);
         }

         if (is_dep_ctrl_unsafe(inst)) {
            memset(last_grf_write, 0, sizeof(last_grf_write));
            memset(last_mrf_write, 0, sizeof(last_mrf_write));
            continue;
         }

         /* Now, see if we can do dependency control for this instruction
          * against a previous one writing to its destination.
          */
         int reg = inst->dst.reg + inst->dst.reg_offset;
         if (inst->dst.file == GRF) {
            if (last_grf_write[reg] &&
                !(inst->dst.writemask & grf_channels_written[reg])) {
               last_grf_write[reg]->no_dd_clear = true;
               inst->no_dd_check = true;
            } else {
               grf_channels_written[reg] = 0;
            }

            last_grf_write[reg] = inst;
            grf_channels_written[reg] |= inst->dst.writemask;
         } else if (inst->dst.file == MRF) {
            if (last_mrf_write[reg] &&
                !(inst->dst.writemask & mrf_channels_written[reg])) {
               last_mrf_write[reg]->no_dd_clear = true;
               inst->no_dd_check = true;
            } else {
               mrf_channels_written[reg] = 0;
            }

            last_mrf_write[reg] = inst;
            mrf_channels_written[reg] |= inst->dst.writemask;
         } else if (inst->dst.file == HW_REG) {
            if (inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE)
               memset(last_grf_write, 0, sizeof(last_grf_write));
            if (inst->dst.fixed_hw_reg.file == BRW_MESSAGE_REGISTER_FILE)
               memset(last_mrf_write, 0, sizeof(last_mrf_write));
         }
      }
   }
}
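
/* Illustrative result on the DP4 sequence in the comment above: NoDDClr on
 * every write except the last, NoDDChk on every write except the first:
 *
 *    DP4 temp.x vertex uniform[0]  { NoDDClr }
 *    DP4 temp.y vertex uniform[0]  { NoDDClr, NoDDChk }
 *    DP4 temp.z vertex uniform[0]  { NoDDClr, NoDDChk }
 *    DP4 temp.w vertex uniform[0]  { NoDDChk }
 */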

bool
vec4_instruction::can_reswizzle(int dst_writemask,
                                int swizzle,
                                int swizzle_mask)
{
   /* If this instruction sets anything not referenced by swizzle, then we'd
    * totally break it when we reswizzle.
    */
   if (dst.writemask & ~swizzle_mask)
      return false;

   if (mlen > 0)
      return false;

   return true;
}

/**
 * For any channels in the swizzle's source that were populated by this
 * instruction, rewrite the instruction to put the appropriate result directly
 * in those channels.
 *
 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy_x c.yy_x
 */
void
vec4_instruction::reswizzle(int dst_writemask, int swizzle)
{
   /* Destination write mask doesn't correspond to source swizzle for the dot
    * product and pack_bytes instructions.
    */
   if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
       opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
       opcode != VEC4_OPCODE_PACK_BYTES) {
      for (int i = 0; i < 3; i++) {
         if (src[i].file == BAD_FILE || src[i].file == IMM)
            continue;

         src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
      }
   }

   /* Apply the specified swizzle and writemask to the original mask of
    * written components.
    */
   dst.writemask = dst_writemask &
                   brw_apply_swizzle_to_mask(swizzle, dst.writemask);
}

/*
 * Tries to reduce extra MOV instructions by taking temporary GRFs that get
 * just written and then MOVed into another reg and making the original write
 * of the GRF write directly to the final destination instead.
 */
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != GRF && inst->dst.file != MRF) ||
          inst->predicate ||
          inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
         continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating.  To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      const unsigned chans_needed =
         brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
                                       inst->dst.writemask);
      unsigned chans_remaining = chans_needed;

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       */
      vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
                                                  inst, block) {
         _scan_inst = scan_inst;

         if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (devinfo->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * GRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle(inst->dst.writemask,
                                          inst->src[0].swizzle,
                                          chans_needed)) {
               break;
            }

            /* This doesn't handle coalescing of multiple registers. */
            if (scan_inst->regs_written > 1)
               break;

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate)
               chans_remaining &= ~scan_inst->dst.writemask;

            if (chans_remaining == 0)
               break;
         }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us.  If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (inst->src[0].in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
               interfered = true;
         }
         if (interfered)
            break;

         /* If somebody else writes our destination here, we can't coalesce
          * before that.
          */
         if (inst->dst.in_range(scan_inst->dst, scan_inst->regs_written))
            break;

         /* Check for reads of the register we're trying to coalesce into.  We
          * can't go rewriting instructions above that to put some other value
          * in the register instead.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            if (inst->dst.reg >= scan_inst->base_mrf &&
                inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (inst->dst.in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
                  interfered = true;
            }
            if (interfered)
               break;
         }
      }

      if (chans_remaining == 0) {
         /* If we've made it here, we have an MOV we want to coalesce out, and
          * a scan_inst pointing to the earliest instruction involved in
          * computing the value.  Now go rewrite the instruction stream
          * between the two.
          */
         vec4_instruction *scan_inst = _scan_inst;
         while (scan_inst != inst) {
            if (scan_inst->dst.file == GRF &&
                scan_inst->dst.reg == inst->src[0].reg &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               scan_inst->reswizzle(inst->dst.writemask,
                                    inst->src[0].swizzle);
               scan_inst->dst.file = inst->dst.file;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->dst.reg_offset = inst->dst.reg_offset;
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
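
/* Illustrative before/after (hand-written): a temporary written once and
 * then copied,
 *
 *    mul vgrf3.xyzw:F, vgrf1, vgrf2
 *    mov m4:F, vgrf3
 *
 * has the copy removed and the producer retargeted:
 *
 *    mul m4.xyzw:F, vgrf1, vgrf2
 */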

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
bool
vec4_visitor::eliminate_find_live_channel()
{
   bool progress = false;
   unsigned depth = 0;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = src_reg(0);
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   return progress;
}
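
/* Rationale sketch: outside all control flow the execution mask equals the
 * dispatch mask, so channel 0 is always live and the answer is statically
 * known; hence the replacement by "mov dst, 0" with force_writemask_all.
 */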

/**
 * Splits virtual GRFs requesting more than one contiguous physical register.
 *
 * We initially create large virtual GRFs for temporary structures, arrays,
 * and matrices, so that the dereference visitor functions can add reg_offsets
 * to work their way down to the actual member being accessed.  But when it
 * comes to optimization, we'd like to treat each register as individual
 * storage if possible.
 *
 * So far, the only thing that might prevent splitting is a send message from
 * a GRF on IVB.
 */
void
vec4_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;
   int new_virtual_grf[num_vars];
   bool split_grf[num_vars];

   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));

   /* Try to split anything > 1 sized. */
   for (int i = 0; i < num_vars; i++) {
      split_grf[i] = this->alloc.sizes[i] != 1;
   }

   /* Check that the instructions are compatible with the registers we're
    * trying to split.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == GRF && inst->regs_written > 1)
         split_grf[inst->dst.reg] = false;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && inst->regs_read(i) > 1)
            split_grf[inst->src[i].reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (!split_grf[i])
         continue;

      new_virtual_grf[i] = alloc.allocate(1);
      for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
         unsigned reg = alloc.allocate(1);
         assert(reg == new_virtual_grf[i] + j - 1);
         (void) reg;
      }
      this->alloc.sizes[i] = 1;
   }

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == GRF && split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}
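
/* Illustrative sketch: a size-3 vgrf5 keeps reg_offset 0 under its old
 * number and gets two freshly allocated, contiguous size-1 vgrfs for
 * reg_offset 1 and 2; every access is then rewritten to the matching
 * register with reg_offset 0.
 */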

void
vec4_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}

void
vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   vec4_instruction *inst = (vec4_instruction *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf0.%d) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, " ");

   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   default:
      fprintf(file, "???");
      break;
   }
   if (inst->dst.writemask != WRITEMASK_XYZW) {
      fprintf(file, ".");
      if (inst->dst.writemask & 1)
         fprintf(file, "x");
      if (inst->dst.writemask & 2)
         fprintf(file, "y");
      if (inst->dst.writemask & 4)
         fprintf(file, "z");
      if (inst->dst.writemask & 8)
         fprintf(file, "w");
   }
   fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));

   if (inst->src[0].file != BAD_FILE)
      fprintf(file, ", ");

   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].reg);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg);
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%fF", inst->src[i].fixed_hw_reg.dw1.f);
            break;
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dD", inst->src[i].fixed_hw_reg.dw1.d);
            break;
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uU", inst->src[i].fixed_hw_reg.dw1.ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(file, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      default:
         fprintf(file, "???");
         break;
      }

      /* Don't print .0; and only VGRFs have reg_offsets and sizes */
      if (inst->src[i].reg_offset != 0 &&
          inst->src[i].file == GRF &&
          alloc.sizes[inst->src[i].reg] != 1)
         fprintf(file, ".%d", inst->src[i].reg_offset);

      if (inst->src[i].file != IMM) {
         static const char *chans[4] = {"x", "y", "z", "w"};
         fprintf(file, ".");
         for (int c = 0; c < 4; c++) {
            fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
         }
      }

      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, "\n");
}

static inline struct brw_reg
attribute_to_hw_reg(int attr, bool interleaved)
{
   if (interleaved)
      return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
   else
      return brw_vec8_grf(attr, 0);
}
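
/* Example (hand-computed): interleaved attribute 3 maps to the second half
 * of g1, i.e. stride(brw_vec4_grf(1, 4), 0, 4, 1), while non-interleaved
 * attribute 3 simply maps to all of g3.
 */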

/**
 * Replace each register of type ATTR in this->instructions with a reference
 * to a fixed HW register.
 *
 * If interleaved is true, then each attribute takes up half a register, with
 * register N containing attribute 2*N in its first half and attribute 2*N+1
 * in its second half (this corresponds to the payload setup used by geometry
 * shaders in "single" or "dual instanced" dispatch mode).  If interleaved is
 * false, then each attribute takes up a whole register, with register N
 * containing attribute N (this corresponds to the payload setup used by
 * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
 */
void
vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
                                          bool interleaved)
{
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* We have to support ATTR as a destination for GL_FIXED fixup. */
      if (inst->dst.file == ATTR) {
         int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         reg.type = inst->dst.type;
         reg.dw1.bits.writemask = inst->dst.writemask;

         inst->dst.file = HW_REG;
         inst->dst.fixed_hw_reg = reg;
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         reg.dw1.bits.swizzle = inst->src[i].swizzle;
         reg.type = inst->src[i].type;
         if (inst->src[i].abs)
            reg = brw_abs(reg);
         if (inst->src[i].negate)
            reg = negate(reg);

         inst->src[i].file = HW_REG;
         inst->src[i].fixed_hw_reg = reg;
      }
   }
}

int
vec4_vs_visitor::setup_attributes(int payload_reg)
{
   int nr_attributes;
   int attribute_map[VERT_ATTRIB_MAX + 1];
   memset(attribute_map, 0, sizeof(attribute_map));

   nr_attributes = 0;
   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
         attribute_map[i] = payload_reg + nr_attributes;
         nr_attributes++;
      }
   }

   /* VertexID is stored by the VF as the last vertex element, but we
    * don't represent it with a flag in inputs_read, so we call it
    * VERT_ATTRIB_MAX.
    */
   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
      nr_attributes++;
   }

   lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);

   /* The BSpec says we always have to read at least one thing from
    * the VF, and it appears that the hardware wedges otherwise.
    */
   if (nr_attributes == 0)
      nr_attributes = 1;

   prog_data->urb_read_length = (nr_attributes + 1) / 2;

   unsigned vue_entries =
      MAX2(nr_attributes, prog_data->vue_map.num_slots);

   if (devinfo->gen == 6)
      prog_data->urb_entry_size = ALIGN(vue_entries, 8) / 8;
   else
      prog_data->urb_entry_size = ALIGN(vue_entries, 4) / 4;

   return payload_reg + nr_attributes;
}
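
/* Worked example (hand-computed, hypothetical counts): a VS reading 5
 * attributes and no VertexID gets urb_read_length = (5 + 1) / 2 = 3, and
 * with, say, 8 VUE slots the gen6 entry size is ALIGN(8, 8) / 8 = 1.
 */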

int
vec4_visitor::setup_uniforms(int reg)
{
   prog_data->base.dispatch_grf_start_reg = reg;

   /* The pre-gen6 VS requires that some push constants get loaded no
    * matter what, or the GPU would hang.
    */
   if (devinfo->gen < 6 && this->uniforms == 0) {
      assert(this->uniforms < this->uniform_array_size);
      this->uniform_vector_size[this->uniforms] = 1;

      stage_prog_data->param =
         reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
      for (unsigned int i = 0; i < 4; i++) {
         unsigned int slot = this->uniforms * 4 + i;
         static gl_constant_value zero = { 0.0 };
         stage_prog_data->param[slot] = &zero;
      }

      this->uniforms++;
      reg++;
   } else {
      reg += ALIGN(uniforms, 2) / 2;
   }

   stage_prog_data->nr_params = this->uniforms * 4;

   prog_data->base.curb_read_length =
      reg - prog_data->base.dispatch_grf_start_reg;

   return reg;
}

void
vec4_vs_visitor::setup_payload(void)
{
   int reg = 0;

   /* The payload always contains important data in g0, which contains
    * the URB handles that are passed on to the URB write at the end
    * of the thread.  So, we always start push constants at g1.
    */
   reg++;

   reg = setup_uniforms(reg);

   reg = setup_attributes(reg);

   this->first_non_payload_grf = reg;
}

void
vec4_visitor::assign_binding_table_offsets()
{
   assign_common_binding_table_offsets(0);
}

src_reg
vec4_visitor::get_timestamp()
{
   assert(devinfo->gen >= 7);

   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                BRW_ARF_TIMESTAMP,
                                0,
                                0,
                                0,
                                BRW_REGISTER_TYPE_UD,
                                BRW_VERTICAL_STRIDE_0,
                                BRW_WIDTH_4,
                                BRW_HORIZONTAL_STRIDE_4,
                                BRW_SWIZZLE_XYZW,
                                WRITEMASK_XYZW));

   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);

   vec4_instruction *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;

   return src_reg(dst);
}

void
vec4_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
vec4_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";
   src_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   src_reg reset_end = shader_end_time;
   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
   vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;

   emit(IF(BRW_PREDICATE_NORMAL));

   /* Take the current timestamp and get the delta. */
   shader_start_time.negate = true;
   dst_reg diff = dst_reg(this, glsl_type::uint_type);
   emit(ADD(diff, shader_start_time, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, src_reg(diff), src_reg(-2u)));

   emit_shader_time_write(0, src_reg(diff));
   emit_shader_time_write(1, src_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(2, src_reg(1u));
   emit(BRW_OPCODE_ENDIF);
}

void
vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
{
   dst_reg dst =
      dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));

   dst_reg offset = dst;
   dst_reg time = dst;
   time.reg_offset++;

   offset.type = BRW_REGISTER_TYPE_UD;
   int index = shader_time_index * 3 + shader_time_subindex;
   emit(MOV(offset, src_reg(index * SHADER_TIME_STRIDE)));

   time.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time, src_reg(value)));

   vec4_instruction *inst =
      emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
   inst->mlen = 2;
}
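
/* Message layout sketch: the two-vec4 scratch value holds the byte offset
 * (index * SHADER_TIME_STRIDE) in its first register and the value to
 * accumulate in its second, matching the mlen of 2 on the
 * SHADER_TIME_ADD send above.
 */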

bool
vec4_visitor::run(gl_clip_plane *clip_planes)
{
   bool use_vec4_nir =
      compiler->glsl_compiler_options[stage].NirOptions != NULL;

   sanity_param_count = prog->Parameters->NumParameters;

   if (shader_time_index >= 0)
      emit_shader_time_begin();

   assign_binding_table_offsets();

   emit_prolog();

   if (use_vec4_nir) {
      assert(prog->nir != NULL);
      emit_nir_code();
      if (failed)
         return false;
   } else if (shader) {
      /* Generate VS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      visit_instructions(shader->base.ir);
   } else {
      emit_program_code();
   }
   base_ir = NULL;

   if (key->userclip_active && !prog->UsesClipDistanceOut)
      setup_uniform_clipplane_values(clip_planes);

   emit_thread_end();

   calculate_cfg();

   /* Before any optimization, push array accesses out to scratch
    * space where we need them to be.  This pass may allocate new
    * virtual GRFs, so we want to do it early.  It also makes sure
    * that we have reladdr computations available for CSE, since we'll
    * often do repeated subexpressions for those.
    */
   if (shader || use_vec4_nir) {
      move_grf_array_access_to_scratch();
      move_uniform_array_access_to_pull_constants();
   } else {
      /* The ARB_vertex_program frontend emits pull constant loads directly
       * rather than using reladdr, so we don't need to walk through all the
       * instructions looking for things to move.  There isn't anything.
       *
       * We do still need to split things to vec4 size.
       */
      split_uniform_registers();
   }
   pack_uniform_registers();
   move_push_constants_to_pull_constants();
   split_virtual_grfs();

#define OPT(pass, args...) ({                                          \
      pass_num++;                                                      \
      bool this_progress = pass(args);                                 \
                                                                       \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
         char filename[64];                                            \
         snprintf(filename, 64, "%s-%04d-%02d-%02d-" #pass,            \
                  stage_abbrev, shader_prog ? shader_prog->Name : 0,   \
                  iteration, pass_num);                                \
                                                                       \
         backend_shader::dump_instructions(filename);                  \
      }                                                                \
                                                                       \
      progress = progress || this_progress;                            \
      this_progress;                                                   \
   })

   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s-%04d-00-start",
               stage_abbrev, shader_prog ? shader_prog->Name : 0);

      backend_shader::dump_instructions(filename);
   }

   bool progress;
   int iteration = 0;
   int pass_num = 0;
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(opt_reduce_swizzle);
      OPT(dead_code_eliminate);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_copy_propagation);
      OPT(opt_cse);
      OPT(opt_algebraic);
      OPT(opt_register_coalesce);
      OPT(eliminate_find_live_channel);
   } while (progress);

   pass_num = 0;

   if (OPT(opt_vector_float)) {
      OPT(opt_cse);
      OPT(opt_copy_propagation, false);
      OPT(opt_copy_propagation, true);
      OPT(dead_code_eliminate);
   }

   if (failed)
      return false;

   setup_payload();

   if (false) {
      /* Debug of register spilling: Go spill everything. */
      const int grf_count = alloc.count;
      float spill_costs[alloc.count];
      bool no_spill[alloc.count];
      evaluate_spill_costs(spill_costs, no_spill);
      for (int i = 0; i < grf_count; i++) {
         if (no_spill[i])
            continue;
         spill_reg(i);
      }
   }

   bool allocated_without_spills = reg_allocate();

   if (!allocated_without_spills) {
      compiler->shader_perf_log(log_data,
                                "%s shader triggered register spilling.  "
                                "Try reducing the number of live vec4 values "
                                "to improve performance.\n",
                                stage_name);

      while (!reg_allocate()) {
         if (failed)
            return false;
      }
   }

   opt_schedule_instructions();

   opt_set_dependency_control();

   if (last_scratch > 0) {
      prog_data->base.total_scratch =
         brw_get_scratch_size(last_scratch * REG_SIZE);
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == prog->Parameters->NumParameters);

   return !failed;
}

} /* namespace brw */

extern "C" {

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_vs_emit(struct brw_context *brw,
            void *mem_ctx,
            const struct brw_vs_prog_key *key,
            struct brw_vs_prog_data *prog_data,
            struct gl_vertex_program *vp,
            struct gl_shader_program *prog,
            unsigned *final_assembly_size)
{
   bool start_busy = false;
   double start_time = 0;
   const unsigned *assembly = NULL;

   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];

   int st_index = -1;
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      st_index = brw_get_shader_time_index(brw, prog, &vp->Base, ST_VS);

   if (unlikely(INTEL_DEBUG & DEBUG_VS) && shader && shader->base.ir)
      brw_dump_ir("vertex", prog, &shader->base, &vp->Base);

   if (!vp->Base.nir &&
       (brw->intelScreen->compiler->scalar_vs ||
        brw->intelScreen->compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions != NULL)) {
      /* Normally we generate NIR in LinkShader() or
       * ProgramStringNotify(), but Mesa's fixed-function vertex program
       * handling doesn't notify the driver at all.  Just do it here, at
       * the last minute, even though it's lame.
       */
      assert(vp->Base.Id == 0 && prog == NULL);
      vp->Base.nir =
         brw_create_nir(brw, NULL, &vp->Base, MESA_SHADER_VERTEX,
                        brw->intelScreen->compiler->scalar_vs);
   }

   if (brw->intelScreen->compiler->scalar_vs) {
      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;

      fs_visitor v(brw->intelScreen->compiler, brw,
                   mem_ctx, MESA_SHADER_VERTEX, key,
                   &prog_data->base.base, prog, &vp->Base,
                   8, st_index);
      if (!v.run_vs(brw_select_clip_planes(&brw->ctx))) {
         if (prog) {
            prog->LinkStatus = false;
            ralloc_strcat(&prog->InfoLog, v.fail_msg);
         }

         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
                       v.fail_msg);

         return NULL;
      }

      fs_generator g(brw->intelScreen->compiler, brw,
                     mem_ctx, (void *) key, &prog_data->base.base,
                     &vp->Base, v.promoted_constants,
                     v.runtime_check_aads_emit, "VS");
      if (INTEL_DEBUG & DEBUG_VS) {
         char *name;
         if (prog) {
            name = ralloc_asprintf(mem_ctx, "%s vertex shader %d",
                                   prog->Label ? prog->Label : "unnamed",
                                   prog->Name);
         } else {
            name = ralloc_asprintf(mem_ctx, "vertex program %d",
                                   vp->Base.Id);
         }
         g.enable_debug(name);
      }
      g.generate_code(v.cfg, 8);
      assembly = g.get_assembly(final_assembly_size);
   }

   if (!assembly) {
      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;

      vec4_vs_visitor v(brw->intelScreen->compiler, brw, key, prog_data,
                        vp, prog, mem_ctx, st_index,
                        !_mesa_is_gles3(&brw->ctx));
      if (!v.run(brw_select_clip_planes(&brw->ctx))) {
         if (prog) {
            prog->LinkStatus = false;
            ralloc_strcat(&prog->InfoLog, v.fail_msg);
         }

         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
                       v.fail_msg);

         return NULL;
      }

      vec4_generator g(brw->intelScreen->compiler, brw,
                       prog, &vp->Base, &prog_data->base,
                       mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS");
      assembly = g.generate_assembly(v.cfg, final_assembly_size);
   }

   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once) {
         brw_vs_debug_recompile(brw, prog, key);
      }
      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("VS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
      shader->compiled_once = true;
   }

   return assembly;
}

void
brw_vue_setup_prog_key_for_precompile(struct gl_context *ctx,
                                      struct brw_vue_prog_key *key,
                                      GLuint id, struct gl_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   key->program_string_id = id;

   brw_setup_tex_for_precompile(brw, &key->tex, prog);
}

} /* extern "C" */