i965/vec4: extend the DWORD multiply DepCtrl restriction to all gen8 platforms
mesa.git: src/mesa/drivers/dri/i965/brw_vec4.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_fs.h"
26 #include "brw_cfg.h"
27 #include "brw_vs.h"
28 #include "brw_nir.h"
29 #include "brw_vec4_builder.h"
30 #include "brw_vec4_live_variables.h"
31 #include "brw_dead_control_flow.h"
32 #include "program/prog_parameter.h"
33
34 #define MAX_INSTRUCTION (1 << 30)
35
36 using namespace brw;
37
38 namespace brw {
39
40 void
41 src_reg::init()
42 {
43 memset(this, 0, sizeof(*this));
44
45 this->file = BAD_FILE;
46 }
47
48 src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type)
49 {
50 init();
51
52 this->file = file;
53 this->nr = nr;
54 if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
55 this->swizzle = brw_swizzle_for_size(type->vector_elements);
56 else
57 this->swizzle = BRW_SWIZZLE_XYZW;
58 if (type)
59 this->type = brw_type_for_base_type(type);
60 }
61
62 /** Generic unset register constructor. */
63 src_reg::src_reg()
64 {
65 init();
66 }
67
68 src_reg::src_reg(struct ::brw_reg reg) :
69 backend_reg(reg)
70 {
71 this->offset = 0;
72 this->reladdr = NULL;
73 }
74
75 src_reg::src_reg(const dst_reg &reg) :
76 backend_reg(reg)
77 {
78 this->reladdr = reg.reladdr;
79 this->swizzle = brw_swizzle_for_mask(reg.writemask);
80 }
81
82 void
83 dst_reg::init()
84 {
85 memset(this, 0, sizeof(*this));
86 this->file = BAD_FILE;
87 this->writemask = WRITEMASK_XYZW;
88 }
89
90 dst_reg::dst_reg()
91 {
92 init();
93 }
94
95 dst_reg::dst_reg(enum brw_reg_file file, int nr)
96 {
97 init();
98
99 this->file = file;
100 this->nr = nr;
101 }
102
103 dst_reg::dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
104 unsigned writemask)
105 {
106 init();
107
108 this->file = file;
109 this->nr = nr;
110 this->type = brw_type_for_base_type(type);
111 this->writemask = writemask;
112 }
113
114 dst_reg::dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
115 unsigned writemask)
116 {
117 init();
118
119 this->file = file;
120 this->nr = nr;
121 this->type = type;
122 this->writemask = writemask;
123 }
124
125 dst_reg::dst_reg(struct ::brw_reg reg) :
126 backend_reg(reg)
127 {
128 this->offset = 0;
129 this->reladdr = NULL;
130 }
131
132 dst_reg::dst_reg(const src_reg &reg) :
133 backend_reg(reg)
134 {
135 this->writemask = brw_mask_for_swizzle(reg.swizzle);
136 this->reladdr = reg.reladdr;
137 }
138
139 bool
140 dst_reg::equals(const dst_reg &r) const
141 {
142 return (this->backend_reg::equals(r) &&
143 (reladdr == r.reladdr ||
144 (reladdr && r.reladdr && reladdr->equals(*r.reladdr))));
145 }
146
147 bool
148 vec4_instruction::is_send_from_grf()
149 {
150 switch (opcode) {
151 case SHADER_OPCODE_SHADER_TIME_ADD:
152 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
153 case SHADER_OPCODE_UNTYPED_ATOMIC:
154 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
155 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
156 case SHADER_OPCODE_TYPED_ATOMIC:
157 case SHADER_OPCODE_TYPED_SURFACE_READ:
158 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
159 case VEC4_OPCODE_URB_READ:
160 case TCS_OPCODE_URB_WRITE:
161 case TCS_OPCODE_RELEASE_INPUT:
162 case SHADER_OPCODE_BARRIER:
163 return true;
164 default:
165 return false;
166 }
167 }
168
169 /**
170 * Returns true if this instruction's sources and destinations cannot
171 * safely be the same register.
172 *
173 * In most cases, a register can be written over safely by the same
174 * instruction that is its last use. For a single instruction, the
175 * sources are dereferenced before writing of the destination starts
176 * (naturally).
177 *
178 * However, there are a few cases where this can be problematic:
179 *
180 * - Virtual opcodes that translate to multiple instructions in the
181 * code generator: if src == dst and one instruction writes the
182 * destination before a later instruction reads the source, then
183 * src will have been clobbered.
184 *
185 * The register allocator uses this information to set up conflicts between
186 * GRF sources and the destination.
187 */
188 bool
189 vec4_instruction::has_source_and_destination_hazard() const
190 {
191 switch (opcode) {
192 case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
193 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
194 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
195 return true;
196 default:
197 return false;
198 }
199 }
200
201 unsigned
202 vec4_instruction::size_read(unsigned arg) const
203 {
204 switch (opcode) {
205 case SHADER_OPCODE_SHADER_TIME_ADD:
206 case SHADER_OPCODE_UNTYPED_ATOMIC:
207 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
208 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
209 case SHADER_OPCODE_TYPED_ATOMIC:
210 case SHADER_OPCODE_TYPED_SURFACE_READ:
211 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
212 case TCS_OPCODE_URB_WRITE:
213 if (arg == 0)
214 return mlen * REG_SIZE;
215 break;
216 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
217 if (arg == 1)
218 return mlen * REG_SIZE;
219 break;
220 default:
221 break;
222 }
223
224 switch (src[arg].file) {
225 case BAD_FILE:
226 return 0;
227 case IMM:
228 case UNIFORM:
229 return 4 * type_sz(src[arg].type);
230 default:
231 /* XXX - Represent actual vertical stride. */
232 return exec_size * type_sz(src[arg].type);
233 }
234 }
235
236 bool
237 vec4_instruction::can_do_source_mods(const struct gen_device_info *devinfo)
238 {
239 if (devinfo->gen == 6 && is_math())
240 return false;
241
242 if (is_send_from_grf())
243 return false;
244
245 if (!backend_instruction::can_do_source_mods())
246 return false;
247
248 return true;
249 }
250
251 bool
252 vec4_instruction::can_do_writemask(const struct gen_device_info *devinfo)
253 {
254 switch (opcode) {
255 case SHADER_OPCODE_GEN4_SCRATCH_READ:
256 case VEC4_OPCODE_FROM_DOUBLE:
257 case VEC4_OPCODE_TO_DOUBLE:
258 case VEC4_OPCODE_PICK_LOW_32BIT:
259 case VEC4_OPCODE_PICK_HIGH_32BIT:
260 case VEC4_OPCODE_SET_LOW_32BIT:
261 case VEC4_OPCODE_SET_HIGH_32BIT:
262 case VS_OPCODE_PULL_CONSTANT_LOAD:
263 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
264 case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
265 case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
266 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
267 case TES_OPCODE_CREATE_INPUT_READ_HEADER:
268 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
269 case VEC4_OPCODE_URB_READ:
270 case SHADER_OPCODE_MOV_INDIRECT:
271 return false;
272 default:
273 /* The MATH instruction on Gen6 only executes in align1 mode, which does
274 * not support writemasking.
275 */
276 if (devinfo->gen == 6 && is_math())
277 return false;
278
279 if (is_tex())
280 return false;
281
282 return true;
283 }
284 }
285
286 bool
287 vec4_instruction::can_change_types() const
288 {
289 return dst.type == src[0].type &&
290 !src[0].abs && !src[0].negate && !saturate &&
291 (opcode == BRW_OPCODE_MOV ||
292 (opcode == BRW_OPCODE_SEL &&
293 dst.type == src[1].type &&
294 predicate != BRW_PREDICATE_NONE &&
295 !src[1].abs && !src[1].negate));
296 }
297
298 /**
299 * Returns how many MRFs an opcode will write over.
300 *
301 * Note that this is not the 0 or 1 implied writes in an actual gen
302 * instruction -- the generate_* functions generate additional MOVs
303 * for setup.
304 */
305 int
306 vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
307 {
308 if (inst->mlen == 0 || inst->is_send_from_grf())
309 return 0;
310
311 switch (inst->opcode) {
312 case SHADER_OPCODE_RCP:
313 case SHADER_OPCODE_RSQ:
314 case SHADER_OPCODE_SQRT:
315 case SHADER_OPCODE_EXP2:
316 case SHADER_OPCODE_LOG2:
317 case SHADER_OPCODE_SIN:
318 case SHADER_OPCODE_COS:
319 return 1;
320 case SHADER_OPCODE_INT_QUOTIENT:
321 case SHADER_OPCODE_INT_REMAINDER:
322 case SHADER_OPCODE_POW:
323 case TCS_OPCODE_THREAD_END:
324 return 2;
325 case VS_OPCODE_URB_WRITE:
326 return 1;
327 case VS_OPCODE_PULL_CONSTANT_LOAD:
328 return 2;
329 case SHADER_OPCODE_GEN4_SCRATCH_READ:
330 return 2;
331 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
332 return 3;
333 case GS_OPCODE_URB_WRITE:
334 case GS_OPCODE_URB_WRITE_ALLOCATE:
335 case GS_OPCODE_THREAD_END:
336 return 0;
337 case GS_OPCODE_FF_SYNC:
338 return 1;
339 case TCS_OPCODE_URB_WRITE:
340 return 0;
341 case SHADER_OPCODE_SHADER_TIME_ADD:
342 return 0;
343 case SHADER_OPCODE_TEX:
344 case SHADER_OPCODE_TXL:
345 case SHADER_OPCODE_TXD:
346 case SHADER_OPCODE_TXF:
347 case SHADER_OPCODE_TXF_CMS:
348 case SHADER_OPCODE_TXF_CMS_W:
349 case SHADER_OPCODE_TXF_MCS:
350 case SHADER_OPCODE_TXS:
351 case SHADER_OPCODE_TG4:
352 case SHADER_OPCODE_TG4_OFFSET:
353 case SHADER_OPCODE_SAMPLEINFO:
354 case VS_OPCODE_GET_BUFFER_SIZE:
355 return inst->header_size;
356 default:
357 unreachable("not reached");
358 }
359 }
360
361 bool
362 src_reg::equals(const src_reg &r) const
363 {
364 return (this->backend_reg::equals(r) &&
365 !reladdr && !r.reladdr);
366 }
367
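/**
 * Combines a run of scalar MOVs of immediates into different channels of the
 * same destination register into a single MOV of a packed vector-float (VF)
 * immediate, e.g.
 *
 *    mov vgrf3.x:F, 1.0F
 *    mov vgrf3.y:F, 2.0F
 *
 * becomes
 *
 *    mov vgrf3.xy:F, [1F, 2F, 0F, 0F]VF
 *
 * Only values exactly representable as a VF are combined.
 */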
368 bool
369 vec4_visitor::opt_vector_float()
370 {
371 bool progress = false;
372
373 foreach_block(block, cfg) {
374 int last_reg = -1, last_offset = -1;
375 enum brw_reg_file last_reg_file = BAD_FILE;
376
377 uint8_t imm[4] = { 0 };
378 int inst_count = 0;
379 vec4_instruction *imm_inst[4];
380 unsigned writemask = 0;
381 enum brw_reg_type dest_type = BRW_REGISTER_TYPE_F;
382
383 foreach_inst_in_block_safe(vec4_instruction, inst, block) {
384 int vf = -1;
385 enum brw_reg_type need_type;
386
387 /* Look for unconditional MOVs from an immediate with a partial
388 * writemask. Skip type-conversion MOVs other than integer 0,
389 * where the type doesn't matter. See if the immediate can be
390 * represented as a VF.
391 */
392 if (inst->opcode == BRW_OPCODE_MOV &&
393 inst->src[0].file == IMM &&
394 inst->predicate == BRW_PREDICATE_NONE &&
395 inst->dst.writemask != WRITEMASK_XYZW &&
396 type_sz(inst->src[0].type) < 8 &&
397 (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) {
398
399 vf = brw_float_to_vf(inst->src[0].d);
400 need_type = BRW_REGISTER_TYPE_D;
401
402 if (vf == -1) {
403 vf = brw_float_to_vf(inst->src[0].f);
404 need_type = BRW_REGISTER_TYPE_F;
405 }
406 } else {
407 last_reg = -1;
408 }
409
410 /* If this wasn't a MOV, or the destination register doesn't match,
411 * or we have to switch destination types, then this breaks our
412 * sequence. Combine anything we've accumulated so far.
413 */
414 if (last_reg != inst->dst.nr ||
415 last_offset != inst->dst.offset ||
416 last_reg_file != inst->dst.file ||
417 (vf > 0 && dest_type != need_type)) {
418
419 if (inst_count > 1) {
420 unsigned vf;
421 memcpy(&vf, imm, sizeof(vf));
422 vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf));
423 mov->dst.type = dest_type;
424 mov->dst.writemask = writemask;
425 inst->insert_before(block, mov);
426
427 for (int i = 0; i < inst_count; i++) {
428 imm_inst[i]->remove(block);
429 }
430
431 progress = true;
432 }
433
434 inst_count = 0;
435 last_reg = -1;
436 writemask = 0;
437 dest_type = BRW_REGISTER_TYPE_F;
438
439 for (int i = 0; i < 4; i++) {
440 imm[i] = 0;
441 }
442 }
443
444 /* Record this instruction's value (if it was representable). */
445 if (vf != -1) {
446 if ((inst->dst.writemask & WRITEMASK_X) != 0)
447 imm[0] = vf;
448 if ((inst->dst.writemask & WRITEMASK_Y) != 0)
449 imm[1] = vf;
450 if ((inst->dst.writemask & WRITEMASK_Z) != 0)
451 imm[2] = vf;
452 if ((inst->dst.writemask & WRITEMASK_W) != 0)
453 imm[3] = vf;
454
455 writemask |= inst->dst.writemask;
456 imm_inst[inst_count++] = inst;
457
458 last_reg = inst->dst.nr;
459 last_offset = inst->dst.offset;
460 last_reg_file = inst->dst.file;
461 if (vf > 0)
462 dest_type = need_type;
463 }
464 }
465 }
466
467 if (progress)
468 invalidate_live_intervals();
469
470 return progress;
471 }
472
473 /* Replaces unused channels of a swizzle with channels that are used.
474 *
475 * For instance, this pass transforms
476 *
477 * mov vgrf4.yz, vgrf5.wxzy
478 *
479 * into
480 *
481 * mov vgrf4.yz, vgrf5.xxzx
482 *
483 * This eliminates false uses of some channels, letting dead code elimination
484 * remove the instructions that wrote them.
485 */
486 bool
487 vec4_visitor::opt_reduce_swizzle()
488 {
489 bool progress = false;
490
491 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
492 if (inst->dst.file == BAD_FILE ||
493 inst->dst.file == ARF ||
494 inst->dst.file == FIXED_GRF ||
495 inst->is_send_from_grf())
496 continue;
497
498 unsigned swizzle;
499
500 /* Determine which channels of the sources are read. */
501 switch (inst->opcode) {
502 case VEC4_OPCODE_PACK_BYTES:
503 case BRW_OPCODE_DP4:
504 case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
505 * but all four of src1.
506 */
507 swizzle = brw_swizzle_for_size(4);
508 break;
509 case BRW_OPCODE_DP3:
510 swizzle = brw_swizzle_for_size(3);
511 break;
512 case BRW_OPCODE_DP2:
513 swizzle = brw_swizzle_for_size(2);
514 break;
515
516 case VEC4_OPCODE_TO_DOUBLE:
517 case VEC4_OPCODE_FROM_DOUBLE:
518 case VEC4_OPCODE_PICK_LOW_32BIT:
519 case VEC4_OPCODE_PICK_HIGH_32BIT:
520 case VEC4_OPCODE_SET_LOW_32BIT:
521 case VEC4_OPCODE_SET_HIGH_32BIT:
522 swizzle = brw_swizzle_for_size(4);
523 break;
524
525 default:
526 swizzle = brw_swizzle_for_mask(inst->dst.writemask);
527 break;
528 }
529
530 /* Update sources' swizzles. */
531 for (int i = 0; i < 3; i++) {
532 if (inst->src[i].file != VGRF &&
533 inst->src[i].file != ATTR &&
534 inst->src[i].file != UNIFORM)
535 continue;
536
537 const unsigned new_swizzle =
538 brw_compose_swizzle(swizzle, inst->src[i].swizzle);
539 if (inst->src[i].swizzle != new_swizzle) {
540 inst->src[i].swizzle = new_swizzle;
541 progress = true;
542 }
543 }
544 }
545
546 if (progress)
547 invalidate_live_intervals();
548
549 return progress;
550 }
551
552 void
553 vec4_visitor::split_uniform_registers()
554 {
555 /* Prior to this, uniforms have been in an array sized according to
556 * the number of vector uniforms present, sparsely filled (so an
557 * aggregate results in reg indices being skipped over). Now we're
558 * going to cut those aggregates up so each .nr index is one
559 * vector. The goal is to make elimination of unused uniform
560 * components easier later.
561 */
562 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
563 for (int i = 0 ; i < 3; i++) {
564 if (inst->src[i].file != UNIFORM)
565 continue;
566
567 assert(!inst->src[i].reladdr);
568
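         /* Each uniform vec4 slot is 16 bytes wide, so fold the byte offset
          * into the register index and keep only the offset within the vec4.
          */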
569 inst->src[i].nr += inst->src[i].offset / 16;
570 inst->src[i].offset %= 16;
571 }
572 }
573 }
574
575 void
576 vec4_visitor::pack_uniform_registers()
577 {
578 uint8_t chans_used[this->uniforms];
579 int new_loc[this->uniforms];
580 int new_chan[this->uniforms];
581
582 memset(chans_used, 0, sizeof(chans_used));
583 memset(new_loc, 0, sizeof(new_loc));
584 memset(new_chan, 0, sizeof(new_chan));
585
586 /* Find which uniform vectors are actually used by the program. We
587 * expect unused vector elements when we've moved array access out
588 * to pull constants, and from some GLSL code generators like wine.
589 */
590 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
591 unsigned readmask;
592 switch (inst->opcode) {
593 case VEC4_OPCODE_PACK_BYTES:
594 case BRW_OPCODE_DP4:
595 case BRW_OPCODE_DPH:
596 readmask = 0xf;
597 break;
598 case BRW_OPCODE_DP3:
599 readmask = 0x7;
600 break;
601 case BRW_OPCODE_DP2:
602 readmask = 0x3;
603 break;
604 default:
605 readmask = inst->dst.writemask;
606 break;
607 }
608
609 for (int i = 0 ; i < 3; i++) {
610 if (inst->src[i].file != UNIFORM)
611 continue;
612
613 assert(type_sz(inst->src[i].type) % 4 == 0);
614 unsigned channel_size = type_sz(inst->src[i].type) / 4;
615
616 int reg = inst->src[i].nr;
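         /* chans_used[] counts 32-bit channels, so a 64-bit component uses
          * two of them and a wide double vector can spill past four channels
          * into the following vec4 slot (reg + 1).
          */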
617 for (int c = 0; c < 4; c++) {
618 if (!(readmask & (1 << c)))
619 continue;
620
621 unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
622 unsigned used = MAX2(chans_used[reg], channel * channel_size);
623 if (used <= 4)
624 chans_used[reg] = used;
625 else
626 chans_used[reg + 1] = used - 4;
627 }
628 }
629
630 if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
631 inst->src[0].file == UNIFORM) {
632 assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
633 assert(inst->src[0].subnr == 0);
634
635 unsigned bytes_read = inst->src[2].ud;
636 assert(bytes_read % 4 == 0);
637 unsigned vec4s_read = DIV_ROUND_UP(bytes_read, 16);
638
639 /* We just mark every register touched by a MOV_INDIRECT as being
640            * fully used. This ensures that it doesn't get broken up piecewise by
641 * the next part of our packing algorithm.
642 */
643 int reg = inst->src[0].nr;
644 for (unsigned i = 0; i < vec4s_read; i++)
645 chans_used[reg + i] = 4;
646 }
647 }
648
649 int new_uniform_count = 0;
650
651 /* Now, figure out a packing of the live uniform vectors into our
652 * push constants.
653 */
654 for (int src = 0; src < uniforms; src++) {
655 int size = chans_used[src];
656
657 if (size == 0)
658 continue;
659
660 int dst;
661 /* Find the lowest place we can slot this uniform in. */
662 for (dst = 0; dst < src; dst++) {
663 if (chans_used[dst] + size <= 4)
664 break;
665 }
666
667 if (src == dst) {
668 new_loc[src] = dst;
669 new_chan[src] = 0;
670 } else {
671 new_loc[src] = dst;
672 new_chan[src] = chans_used[dst];
673
674 /* Move the references to the data */
675 for (int j = 0; j < size; j++) {
676 stage_prog_data->param[dst * 4 + new_chan[src] + j] =
677 stage_prog_data->param[src * 4 + j];
678 }
679
680 chans_used[dst] += size;
681 chans_used[src] = 0;
682 }
683
684 new_uniform_count = MAX2(new_uniform_count, dst + 1);
685 }
686
687 this->uniforms = new_uniform_count;
688
689 /* Now, update the instructions for our repacked uniforms. */
690 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
691 for (int i = 0 ; i < 3; i++) {
692 int src = inst->src[i].nr;
693
694 if (inst->src[i].file != UNIFORM)
695 continue;
696
697 inst->src[i].nr = new_loc[src];
698 inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
699 new_chan[src], new_chan[src]);
700 }
701 }
702 }
703
704 /**
705 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
706 *
707 * While GLSL IR also performs this optimization, we end up with it in
708 * our instruction stream for a couple of reasons. One is that we
709 * sometimes generate silly instructions, for example in array access
710 * where we'll generate "ADD offset, index, base" even if base is 0.
711 * The other is that GLSL IR's constant propagation doesn't track the
712 * components of aggregates, so some VS patterns (initialize matrix to
713 * 0, accumulate in vertex blending factors) end up breaking down to
714 * instructions involving 0.
715 */
716 bool
717 vec4_visitor::opt_algebraic()
718 {
719 bool progress = false;
720
721 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
722 switch (inst->opcode) {
723 case BRW_OPCODE_MOV:
724 if (inst->src[0].file != IMM)
725 break;
726
727 if (inst->saturate) {
728 if (inst->dst.type != inst->src[0].type)
729 assert(!"unimplemented: saturate mixed types");
730
731 if (brw_saturate_immediate(inst->dst.type,
732 &inst->src[0].as_brw_reg())) {
733 inst->saturate = false;
734 progress = true;
735 }
736 }
737 break;
738
739 case VEC4_OPCODE_UNPACK_UNIFORM:
740 if (inst->src[0].file != UNIFORM) {
741 inst->opcode = BRW_OPCODE_MOV;
742 progress = true;
743 }
744 break;
745
746 case BRW_OPCODE_ADD:
747 if (inst->src[1].is_zero()) {
748 inst->opcode = BRW_OPCODE_MOV;
749 inst->src[1] = src_reg();
750 progress = true;
751 }
752 break;
753
754 case BRW_OPCODE_MUL:
755 if (inst->src[1].is_zero()) {
756 inst->opcode = BRW_OPCODE_MOV;
757 switch (inst->src[0].type) {
758 case BRW_REGISTER_TYPE_F:
759 inst->src[0] = brw_imm_f(0.0f);
760 break;
761 case BRW_REGISTER_TYPE_D:
762 inst->src[0] = brw_imm_d(0);
763 break;
764 case BRW_REGISTER_TYPE_UD:
765 inst->src[0] = brw_imm_ud(0u);
766 break;
767 default:
768 unreachable("not reached");
769 }
770 inst->src[1] = src_reg();
771 progress = true;
772 } else if (inst->src[1].is_one()) {
773 inst->opcode = BRW_OPCODE_MOV;
774 inst->src[1] = src_reg();
775 progress = true;
776 } else if (inst->src[1].is_negative_one()) {
777 inst->opcode = BRW_OPCODE_MOV;
778 inst->src[0].negate = !inst->src[0].negate;
779 inst->src[1] = src_reg();
780 progress = true;
781 }
782 break;
783 case BRW_OPCODE_CMP:
784 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
785 inst->src[0].abs &&
786 inst->src[0].negate &&
787 inst->src[1].is_zero()) {
788 inst->src[0].abs = false;
789 inst->src[0].negate = false;
790 inst->conditional_mod = BRW_CONDITIONAL_Z;
791 progress = true;
792 break;
793 }
794 break;
795 case SHADER_OPCODE_BROADCAST:
796 if (is_uniform(inst->src[0]) ||
797 inst->src[1].is_zero()) {
798 inst->opcode = BRW_OPCODE_MOV;
799 inst->src[1] = src_reg();
800 inst->force_writemask_all = true;
801 progress = true;
802 }
803 break;
804
805 default:
806 break;
807 }
808 }
809
810 if (progress)
811 invalidate_live_intervals();
812
813 return progress;
814 }
815
816 /**
817 * Only a limited number of hardware registers may be used for push
818 * constants, so this turns access to the overflowed constants into
819 * pull constants.
820 */
821 void
822 vec4_visitor::move_push_constants_to_pull_constants()
823 {
824 int pull_constant_loc[this->uniforms];
825
826 /* Only allow 32 registers (256 uniform components) as push constants,
827 * which is the limit on gen6.
828 *
829 * If changing this value, note the limitation about total_regs in
830 * brw_curbe.c.
831 */
832 int max_uniform_components = 32 * 8;
833 if (this->uniforms * 4 <= max_uniform_components)
834 return;
835
836 /* Make some sort of choice as to which uniforms get sent to pull
837 * constants. We could potentially do something clever here like
838 * look for the most infrequently used uniform vec4s, but leave
839 * that for later.
840 */
841 for (int i = 0; i < this->uniforms * 4; i += 4) {
842 pull_constant_loc[i / 4] = -1;
843
844 if (i >= max_uniform_components) {
845 const gl_constant_value **values = &stage_prog_data->param[i];
846
847 /* Try to find an existing copy of this uniform in the pull
848 * constants if it was part of an array access already.
849 */
850 for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
851 int matches;
852
853 for (matches = 0; matches < 4; matches++) {
854 if (stage_prog_data->pull_param[j + matches] != values[matches])
855 break;
856 }
857
858 if (matches == 4) {
859 pull_constant_loc[i / 4] = j / 4;
860 break;
861 }
862 }
863
864 if (pull_constant_loc[i / 4] == -1) {
865 assert(stage_prog_data->nr_pull_params % 4 == 0);
866 pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;
867
868 for (int j = 0; j < 4; j++) {
869 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
870 values[j];
871 }
872 }
873 }
874 }
875
876 /* Now actually rewrite usage of the things we've moved to pull
877 * constants.
878 */
879 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
880 for (int i = 0 ; i < 3; i++) {
881 if (inst->src[i].file != UNIFORM ||
882 pull_constant_loc[inst->src[i].nr] == -1)
883 continue;
884
885 int uniform = inst->src[i].nr;
886
887 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
888
889 emit_pull_constant_load(block, inst, temp, inst->src[i],
890 pull_constant_loc[uniform], src_reg());
891
892 inst->src[i].file = temp.file;
893 inst->src[i].nr = temp.nr;
894 inst->src[i].offset %= 16;
895 inst->src[i].reladdr = NULL;
896 }
897 }
898
899 /* Repack push constants to remove the now-unused ones. */
900 pack_uniform_registers();
901 }
902
903 /* Conditions for which we want to avoid setting the dependency control bits */
904 bool
905 vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
906 {
907 #define IS_DWORD(reg) \
908 (reg.type == BRW_REGISTER_TYPE_UD || \
909 reg.type == BRW_REGISTER_TYPE_D)
910
911 /* From the Cherryview and Broadwell PRMs:
912 *
913 * "When source or destination datatype is 64b or operation is integer DWord
914 * multiply, DepCtrl must not be used."
915 *
916 * SKL PRMs don't include this restriction though.
917 */
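   /* The check below applies the restriction to all gen8 platforms (Broadwell
    * and Cherryview) as well as to Broxton (gen9lp).
    */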
918 if (devinfo->gen == 8 || devinfo->is_broxton) {
919 if (inst->opcode == BRW_OPCODE_MUL &&
920 IS_DWORD(inst->src[0]) &&
921 IS_DWORD(inst->src[1]))
922 return true;
923 }
924 #undef IS_DWORD
925
926 if (devinfo->gen >= 8) {
927 if (inst->opcode == BRW_OPCODE_F32TO16)
928 return true;
929 }
930
931 /*
932 * mlen:
933 * In the presence of send messages, totally interrupt dependency
934 * control. They're long enough that the chance of dependency
935 * control around them just doesn't matter.
936 *
937 * predicate:
938 * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
939 * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
940 * completes the scoreboard clear must have a non-zero execution mask. This
941 * means, if any kind of predication can change the execution mask or channel
942 * enable of the last instruction, the optimization must be avoided. This is
943 * to avoid instructions being shot down the pipeline when no writes are
944 * required.
945 *
946 * math:
947 * Dependency control does not work well over math instructions.
948 * NB: Discovered empirically
949 */
950 return (inst->mlen || inst->predicate || inst->is_math());
951 }
952
953 /**
954 * Sets the dependency control fields on instructions after register
955 * allocation and before the generator is run.
956 *
957 * When you have a sequence of instructions like:
958 *
959 * DP4 temp.x vertex uniform[0]
960 * DP4 temp.y vertex uniform[0]
961 * DP4 temp.z vertex uniform[0]
962 * DP4 temp.w vertex uniform[0]
963 *
964 * The hardware doesn't know that it can actually run the later instructions
965 * while the previous ones are in flight, producing stalls. However, we have
966 * manual fields we can set in the instructions that let it do so.
967 */
968 void
969 vec4_visitor::opt_set_dependency_control()
970 {
971 vec4_instruction *last_grf_write[BRW_MAX_GRF];
972 uint8_t grf_channels_written[BRW_MAX_GRF];
973 vec4_instruction *last_mrf_write[BRW_MAX_GRF];
974 uint8_t mrf_channels_written[BRW_MAX_GRF];
975
976 assert(prog_data->total_grf ||
977 !"Must be called after register allocation");
978
979 foreach_block (block, cfg) {
980 memset(last_grf_write, 0, sizeof(last_grf_write));
981 memset(last_mrf_write, 0, sizeof(last_mrf_write));
982
983 foreach_inst_in_block (vec4_instruction, inst, block) {
984 /* If we read from a register that we were doing dependency control
985 * on, don't do dependency control across the read.
986 */
987 for (int i = 0; i < 3; i++) {
988 int reg = inst->src[i].nr + inst->src[i].offset / REG_SIZE;
989 if (inst->src[i].file == VGRF) {
990 last_grf_write[reg] = NULL;
991 } else if (inst->src[i].file == FIXED_GRF) {
992 memset(last_grf_write, 0, sizeof(last_grf_write));
993 break;
994 }
995 assert(inst->src[i].file != MRF);
996 }
997
998 if (is_dep_ctrl_unsafe(inst)) {
999 memset(last_grf_write, 0, sizeof(last_grf_write));
1000 memset(last_mrf_write, 0, sizeof(last_mrf_write));
1001 continue;
1002 }
1003
1004 /* Now, see if we can do dependency control for this instruction
1005 * against a previous one writing to its destination.
1006 */
1007 int reg = inst->dst.nr + inst->dst.offset / REG_SIZE;
1008 if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
1009 if (last_grf_write[reg] &&
1010 last_grf_write[reg]->dst.offset == inst->dst.offset &&
1011 !(inst->dst.writemask & grf_channels_written[reg])) {
1012 last_grf_write[reg]->no_dd_clear = true;
1013 inst->no_dd_check = true;
1014 } else {
1015 grf_channels_written[reg] = 0;
1016 }
1017
1018 last_grf_write[reg] = inst;
1019 grf_channels_written[reg] |= inst->dst.writemask;
1020 } else if (inst->dst.file == MRF) {
1021 if (last_mrf_write[reg] &&
1022 last_mrf_write[reg]->dst.offset == inst->dst.offset &&
1023 !(inst->dst.writemask & mrf_channels_written[reg])) {
1024 last_mrf_write[reg]->no_dd_clear = true;
1025 inst->no_dd_check = true;
1026 } else {
1027 mrf_channels_written[reg] = 0;
1028 }
1029
1030 last_mrf_write[reg] = inst;
1031 mrf_channels_written[reg] |= inst->dst.writemask;
1032 }
1033 }
1034 }
1035 }
1036
1037 bool
1038 vec4_instruction::can_reswizzle(const struct gen_device_info *devinfo,
1039 int dst_writemask,
1040 int swizzle,
1041 int swizzle_mask)
1042 {
1043 /* Gen6 MATH instructions can not execute in align16 mode, so swizzles
1044 * are not allowed.
1045 */
1046 if (devinfo->gen == 6 && is_math() && swizzle != BRW_SWIZZLE_XYZW)
1047 return false;
1048
1049 if (!can_do_writemask(devinfo) && dst_writemask != WRITEMASK_XYZW)
1050 return false;
1051
1052 /* If this instruction sets anything not referenced by swizzle, then we'd
1053 * totally break it when we reswizzle.
1054 */
1055 if (dst.writemask & ~swizzle_mask)
1056 return false;
1057
1058 if (mlen > 0)
1059 return false;
1060
1061 for (int i = 0; i < 3; i++) {
1062 if (src[i].is_accumulator())
1063 return false;
1064 }
1065
1066 return true;
1067 }
1068
1069 /**
1070 * For any channels in the swizzle's source that were populated by this
1071 * instruction, rewrite the instruction to put the appropriate result directly
1072 * in those channels.
1073 *
1074  * e.g. for swizzle='yywx', MUL a.xy b c -> MUL a.yy_x b.yy_x c.yy_x
1075 */
1076 void
1077 vec4_instruction::reswizzle(int dst_writemask, int swizzle)
1078 {
1079 /* Destination write mask doesn't correspond to source swizzle for the dot
1080 * product and pack_bytes instructions.
1081 */
1082 if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
1083 opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
1084 opcode != VEC4_OPCODE_PACK_BYTES) {
1085 for (int i = 0; i < 3; i++) {
1086 if (src[i].file == BAD_FILE || src[i].file == IMM)
1087 continue;
1088
1089 src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
1090 }
1091 }
1092
1093 /* Apply the specified swizzle and writemask to the original mask of
1094 * written components.
1095 */
1096 dst.writemask = dst_writemask &
1097 brw_apply_swizzle_to_mask(swizzle, dst.writemask);
1098 }
1099
1100 /*
1101  * Tries to reduce extra MOV instructions by taking temporary GRFs that are
1102  * written and then immediately MOVed into another register, and making the
1103  * original write of the GRF go directly to the final destination instead.
1104 */
1105 bool
1106 vec4_visitor::opt_register_coalesce()
1107 {
1108 bool progress = false;
1109 int next_ip = 0;
1110
1111 calculate_live_intervals();
1112
1113 foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
1114 int ip = next_ip;
1115 next_ip++;
1116
1117 if (inst->opcode != BRW_OPCODE_MOV ||
1118 (inst->dst.file != VGRF && inst->dst.file != MRF) ||
1119 inst->predicate ||
1120 inst->src[0].file != VGRF ||
1121 inst->dst.type != inst->src[0].type ||
1122 inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
1123 continue;
1124
1125 /* Remove no-op MOVs */
1126 if (inst->dst.file == inst->src[0].file &&
1127 inst->dst.nr == inst->src[0].nr &&
1128 inst->dst.offset == inst->src[0].offset) {
1129 bool is_nop_mov = true;
1130
1131 for (unsigned c = 0; c < 4; c++) {
1132 if ((inst->dst.writemask & (1 << c)) == 0)
1133 continue;
1134
1135 if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) {
1136 is_nop_mov = false;
1137 break;
1138 }
1139 }
1140
1141 if (is_nop_mov) {
1142 inst->remove(block);
1143 progress = true;
1144 continue;
1145 }
1146 }
1147
1148 bool to_mrf = (inst->dst.file == MRF);
1149
1150 /* Can't coalesce this GRF if someone else was going to
1151 * read it later.
1152 */
1153 if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip)
1154 continue;
1155
1156 /* We need to check interference with the final destination between this
1157 * instruction and the earliest instruction involved in writing the GRF
1158 * we're eliminating. To do that, keep track of which of our source
1159 * channels we've seen initialized.
1160 */
1161 const unsigned chans_needed =
1162 brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
1163 inst->dst.writemask);
1164 unsigned chans_remaining = chans_needed;
1165
1166 /* Now walk up the instruction stream trying to see if we can rewrite
1167 * everything writing to the temporary to write into the destination
1168 * instead.
1169 */
1170 vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
1171 foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
1172 inst) {
1173 _scan_inst = scan_inst;
1174
1175 if (regions_overlap(inst->src[0], inst->size_read(0),
1176 scan_inst->dst, scan_inst->size_written)) {
1177 /* Found something writing to the reg we want to coalesce away. */
1178 if (to_mrf) {
1179 /* SEND instructions can't have MRF as a destination. */
1180 if (scan_inst->mlen)
1181 break;
1182
1183 if (devinfo->gen == 6) {
1184 /* gen6 math instructions must have the destination be
1185 * VGRF, so no compute-to-MRF for them.
1186 */
1187 if (scan_inst->is_math()) {
1188 break;
1189 }
1190 }
1191 }
1192
1193 /* This doesn't handle saturation on the instruction we
1194 * want to coalesce away if the register types do not match.
1195 * But if scan_inst is a non type-converting 'mov', we can fix
1196 * the types later.
1197 */
1198 if (inst->saturate &&
1199 inst->dst.type != scan_inst->dst.type &&
1200 !(scan_inst->opcode == BRW_OPCODE_MOV &&
1201 scan_inst->dst.type == scan_inst->src[0].type))
1202 break;
1203
1204 /* Only allow coalescing between registers of the same type size.
1205 * Otherwise we would need to make the pass aware of the fact that
1206 * channel sizes are different for single and double precision.
1207 */
1208 if (type_sz(inst->src[0].type) != type_sz(scan_inst->src[0].type))
1209 break;
1210
1211 /* Check that scan_inst writes the same amount of data as the
1212 * instruction, otherwise coalescing would lead to writing a
1213 * different (larger or smaller) region of the destination
1214 */
1215 if (scan_inst->size_written != inst->size_written)
1216 break;
1217
1218 /* If we can't handle the swizzle, bail. */
1219 if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
1220 inst->src[0].swizzle,
1221 chans_needed)) {
1222 break;
1223 }
1224
1225 /* This only handles coalescing writes of 8 channels (1 register
1226 * for single-precision and 2 registers for double-precision)
1227 * starting at the source offset of the copy instruction.
1228 */
1229 if (DIV_ROUND_UP(scan_inst->size_written,
1230 type_sz(scan_inst->dst.type)) > 8 ||
1231 scan_inst->dst.offset != inst->src[0].offset)
1232 break;
1233
1234 /* Mark which channels we found unconditional writes for. */
1235 if (!scan_inst->predicate)
1236 chans_remaining &= ~scan_inst->dst.writemask;
1237
1238 if (chans_remaining == 0)
1239 break;
1240 }
1241
1242 /* You can't read from an MRF, so if someone else reads our MRF's
1243 * source GRF that we wanted to rewrite, that stops us. If it's a
1244 * GRF we're trying to coalesce to, we don't actually handle
1245 * rewriting sources so bail in that case as well.
1246 */
1247 bool interfered = false;
1248 for (int i = 0; i < 3; i++) {
1249 if (regions_overlap(inst->src[0], inst->size_read(0),
1250 scan_inst->src[i], scan_inst->size_read(i)))
1251 interfered = true;
1252 }
1253 if (interfered)
1254 break;
1255
1256 /* If somebody else writes the same channels of our destination here,
1257 * we can't coalesce before that.
1258 */
1259 if (regions_overlap(inst->dst, inst->size_written,
1260 scan_inst->dst, scan_inst->size_written) &&
1261 (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
1262 break;
1263 }
1264
1265 /* Check for reads of the register we're trying to coalesce into. We
1266 * can't go rewriting instructions above that to put some other value
1267 * in the register instead.
1268 */
1269 if (to_mrf && scan_inst->mlen > 0) {
1270 if (inst->dst.nr >= scan_inst->base_mrf &&
1271 inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) {
1272 break;
1273 }
1274 } else {
1275 for (int i = 0; i < 3; i++) {
1276 if (regions_overlap(inst->dst, inst->size_written,
1277 scan_inst->src[i], scan_inst->size_read(i)))
1278 interfered = true;
1279 }
1280 if (interfered)
1281 break;
1282 }
1283 }
1284
1285 if (chans_remaining == 0) {
1286 /* If we've made it here, we have an MOV we want to coalesce out, and
1287 * a scan_inst pointing to the earliest instruction involved in
1288 * computing the value. Now go rewrite the instruction stream
1289 * between the two.
1290 */
1291 vec4_instruction *scan_inst = _scan_inst;
1292 while (scan_inst != inst) {
1293 if (scan_inst->dst.file == VGRF &&
1294 scan_inst->dst.nr == inst->src[0].nr &&
1295 scan_inst->dst.offset == inst->src[0].offset) {
1296 scan_inst->reswizzle(inst->dst.writemask,
1297 inst->src[0].swizzle);
1298 scan_inst->dst.file = inst->dst.file;
1299 scan_inst->dst.nr = inst->dst.nr;
1300 scan_inst->dst.offset = inst->dst.offset;
1301 if (inst->saturate &&
1302 inst->dst.type != scan_inst->dst.type) {
1303 /* If we have reached this point, scan_inst is a non
1304 * type-converting 'mov' and we can modify its register types
1305 * to match the ones in inst. Otherwise, we could have an
1306 * incorrect saturation result.
1307 */
1308 scan_inst->dst.type = inst->dst.type;
1309 scan_inst->src[0].type = inst->src[0].type;
1310 }
1311 scan_inst->saturate |= inst->saturate;
1312 }
1313 scan_inst = (vec4_instruction *)scan_inst->next;
1314 }
1315 inst->remove(block);
1316 progress = true;
1317 }
1318 }
1319
1320 if (progress)
1321 invalidate_live_intervals();
1322
1323 return progress;
1324 }
1325
1326 /**
1327 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
1328 * flow. We could probably do better here with some form of divergence
1329 * analysis.
1330 */
1331 bool
1332 vec4_visitor::eliminate_find_live_channel()
1333 {
1334 bool progress = false;
1335 unsigned depth = 0;
1336
1337 if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
1338 /* The optimization below assumes that channel zero is live on thread
1339 * dispatch, which may not be the case if the fixed function dispatches
1340 * threads sparsely.
1341 */
1342 return false;
1343 }
1344
1345 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1346 switch (inst->opcode) {
1347 case BRW_OPCODE_IF:
1348 case BRW_OPCODE_DO:
1349 depth++;
1350 break;
1351
1352 case BRW_OPCODE_ENDIF:
1353 case BRW_OPCODE_WHILE:
1354 depth--;
1355 break;
1356
1357 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
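      /* Outside of any control flow (depth == 0), channel 0 is known to be
       * live thanks to the packed-dispatch check above, so the result is
       * always 0 and the instruction can be turned into an immediate MOV.
       */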
1358 if (depth == 0) {
1359 inst->opcode = BRW_OPCODE_MOV;
1360 inst->src[0] = brw_imm_d(0);
1361 inst->force_writemask_all = true;
1362 progress = true;
1363 }
1364 break;
1365
1366 default:
1367 break;
1368 }
1369 }
1370
1371 return progress;
1372 }
1373
1374 /**
1375 * Splits virtual GRFs requesting more than one contiguous physical register.
1376 *
1377 * We initially create large virtual GRFs for temporary structures, arrays,
1378 * and matrices, so that the visitor functions can add offsets to work their
1379 * way down to the actual member being accessed. But when it comes to
1380 * optimization, we'd like to treat each register as individual storage if
1381 * possible.
1382 *
1383 * So far, the only thing that might prevent splitting is a send message from
1384 * a GRF on IVB.
1385 */
1386 void
1387 vec4_visitor::split_virtual_grfs()
1388 {
1389 int num_vars = this->alloc.count;
1390 int new_virtual_grf[num_vars];
1391 bool split_grf[num_vars];
1392
1393 memset(new_virtual_grf, 0, sizeof(new_virtual_grf));
1394
1395 /* Try to split anything > 0 sized. */
1396 for (int i = 0; i < num_vars; i++) {
1397 split_grf[i] = this->alloc.sizes[i] != 1;
1398 }
1399
1400 /* Check that the instructions are compatible with the registers we're trying
1401 * to split.
1402 */
1403 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1404 if (inst->dst.file == VGRF && regs_written(inst) > 1)
1405 split_grf[inst->dst.nr] = false;
1406
1407 for (int i = 0; i < 3; i++) {
1408 if (inst->src[i].file == VGRF && regs_read(inst, i) > 1)
1409 split_grf[inst->src[i].nr] = false;
1410 }
1411 }
1412
1413 /* Allocate new space for split regs. Note that the virtual
1414 * numbers will be contiguous.
1415 */
1416 for (int i = 0; i < num_vars; i++) {
1417 if (!split_grf[i])
1418 continue;
1419
1420 new_virtual_grf[i] = alloc.allocate(1);
1421 for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
1422 unsigned reg = alloc.allocate(1);
1423 assert(reg == new_virtual_grf[i] + j - 1);
1424 (void) reg;
1425 }
1426 this->alloc.sizes[i] = 1;
1427 }
1428
1429 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1430 if (inst->dst.file == VGRF && split_grf[inst->dst.nr] &&
1431 inst->dst.offset / REG_SIZE != 0) {
1432 inst->dst.nr = (new_virtual_grf[inst->dst.nr] +
1433 inst->dst.offset / REG_SIZE - 1);
1434 inst->dst.offset %= REG_SIZE;
1435 }
1436 for (int i = 0; i < 3; i++) {
1437 if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] &&
1438 inst->src[i].offset / REG_SIZE != 0) {
1439 inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] +
1440 inst->src[i].offset / REG_SIZE - 1);
1441 inst->src[i].offset %= REG_SIZE;
1442 }
1443 }
1444 }
1445 invalidate_live_intervals();
1446 }
1447
1448 void
1449 vec4_visitor::dump_instruction(backend_instruction *be_inst)
1450 {
1451 dump_instruction(be_inst, stderr);
1452 }
1453
1454 void
1455 vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
1456 {
1457 vec4_instruction *inst = (vec4_instruction *)be_inst;
1458
1459 if (inst->predicate) {
1460 fprintf(file, "(%cf0.%d%s) ",
1461 inst->predicate_inverse ? '-' : '+',
1462 inst->flag_subreg,
1463 pred_ctrl_align16[inst->predicate]);
1464 }
1465
1466 fprintf(file, "%s(%d)", brw_instruction_name(devinfo, inst->opcode),
1467 inst->exec_size);
1468 if (inst->saturate)
1469 fprintf(file, ".sat");
1470 if (inst->conditional_mod) {
1471 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
1472 if (!inst->predicate &&
1473 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
1474 inst->opcode != BRW_OPCODE_IF &&
1475 inst->opcode != BRW_OPCODE_WHILE))) {
1476 fprintf(file, ".f0.%d", inst->flag_subreg);
1477 }
1478 }
1479 fprintf(file, " ");
1480
1481 switch (inst->dst.file) {
1482 case VGRF:
1483 fprintf(file, "vgrf%d", inst->dst.nr);
1484 break;
1485 case FIXED_GRF:
1486 fprintf(file, "g%d", inst->dst.nr);
1487 break;
1488 case MRF:
1489 fprintf(file, "m%d", inst->dst.nr);
1490 break;
1491 case ARF:
1492 switch (inst->dst.nr) {
1493 case BRW_ARF_NULL:
1494 fprintf(file, "null");
1495 break;
1496 case BRW_ARF_ADDRESS:
1497 fprintf(file, "a0.%d", inst->dst.subnr);
1498 break;
1499 case BRW_ARF_ACCUMULATOR:
1500 fprintf(file, "acc%d", inst->dst.subnr);
1501 break;
1502 case BRW_ARF_FLAG:
1503 fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1504 break;
1505 default:
1506 fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1507 break;
1508 }
1509 break;
1510 case BAD_FILE:
1511 fprintf(file, "(null)");
1512 break;
1513 case IMM:
1514 case ATTR:
1515 case UNIFORM:
1516 unreachable("not reached");
1517 }
1518 if (inst->dst.offset ||
1519 (inst->dst.file == VGRF &&
1520 alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
1521 const unsigned reg_size = (inst->dst.file == UNIFORM ? 16 : REG_SIZE);
1522 fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
1523 inst->dst.offset % reg_size);
1524 }
1525 if (inst->dst.writemask != WRITEMASK_XYZW) {
1526 fprintf(file, ".");
1527 if (inst->dst.writemask & 1)
1528 fprintf(file, "x");
1529 if (inst->dst.writemask & 2)
1530 fprintf(file, "y");
1531 if (inst->dst.writemask & 4)
1532 fprintf(file, "z");
1533 if (inst->dst.writemask & 8)
1534 fprintf(file, "w");
1535 }
1536 fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));
1537
1538 if (inst->src[0].file != BAD_FILE)
1539 fprintf(file, ", ");
1540
1541 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
1542 if (inst->src[i].negate)
1543 fprintf(file, "-");
1544 if (inst->src[i].abs)
1545 fprintf(file, "|");
1546 switch (inst->src[i].file) {
1547 case VGRF:
1548 fprintf(file, "vgrf%d", inst->src[i].nr);
1549 break;
1550 case FIXED_GRF:
1551 fprintf(file, "g%d", inst->src[i].nr);
1552 break;
1553 case ATTR:
1554 fprintf(file, "attr%d", inst->src[i].nr);
1555 break;
1556 case UNIFORM:
1557 fprintf(file, "u%d", inst->src[i].nr);
1558 break;
1559 case IMM:
1560 switch (inst->src[i].type) {
1561 case BRW_REGISTER_TYPE_F:
1562 fprintf(file, "%fF", inst->src[i].f);
1563 break;
1564 case BRW_REGISTER_TYPE_DF:
1565 fprintf(file, "%fDF", inst->src[i].df);
1566 break;
1567 case BRW_REGISTER_TYPE_D:
1568 fprintf(file, "%dD", inst->src[i].d);
1569 break;
1570 case BRW_REGISTER_TYPE_UD:
1571 fprintf(file, "%uU", inst->src[i].ud);
1572 break;
1573 case BRW_REGISTER_TYPE_VF:
1574 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
1575 brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
1576 brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
1577 brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
1578 brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
1579 break;
1580 default:
1581 fprintf(file, "???");
1582 break;
1583 }
1584 break;
1585 case ARF:
1586 switch (inst->src[i].nr) {
1587 case BRW_ARF_NULL:
1588 fprintf(file, "null");
1589 break;
1590 case BRW_ARF_ADDRESS:
1591 fprintf(file, "a0.%d", inst->src[i].subnr);
1592 break;
1593 case BRW_ARF_ACCUMULATOR:
1594 fprintf(file, "acc%d", inst->src[i].subnr);
1595 break;
1596 case BRW_ARF_FLAG:
1597 fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1598 break;
1599 default:
1600 fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1601 break;
1602 }
1603 break;
1604 case BAD_FILE:
1605 fprintf(file, "(null)");
1606 break;
1607 case MRF:
1608 unreachable("not reached");
1609 }
1610
1611 if (inst->src[i].offset ||
1612 (inst->src[i].file == VGRF &&
1613 alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
1614 const unsigned reg_size = (inst->src[i].file == UNIFORM ? 16 : REG_SIZE);
1615 fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
1616 inst->src[i].offset % reg_size);
1617 }
1618
1619 if (inst->src[i].file != IMM) {
1620 static const char *chans[4] = {"x", "y", "z", "w"};
1621 fprintf(file, ".");
1622 for (int c = 0; c < 4; c++) {
1623 fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
1624 }
1625 }
1626
1627 if (inst->src[i].abs)
1628 fprintf(file, "|");
1629
1630 if (inst->src[i].file != IMM) {
1631 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
1632 }
1633
1634 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
1635 fprintf(file, ", ");
1636 }
1637
1638 if (inst->force_writemask_all)
1639 fprintf(file, " NoMask");
1640
1641 if (inst->exec_size != 8)
1642 fprintf(file, " group%d", inst->group);
1643
1644 fprintf(file, "\n");
1645 }
1646
1647
1648 static inline struct brw_reg
1649 attribute_to_hw_reg(int attr, bool interleaved)
1650 {
1651 if (interleaved)
1652 return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
1653 else
1654 return brw_vec8_grf(attr, 0);
1655 }
1656
1657
1658 /**
1659 * Replace each register of type ATTR in this->instructions with a reference
1660 * to a fixed HW register.
1661 *
1662 * If interleaved is true, then each attribute takes up half a register, with
1663 * register N containing attribute 2*N in its first half and attribute 2*N+1
1664 * in its second half (this corresponds to the payload setup used by geometry
1665 * shaders in "single" or "dual instanced" dispatch mode). If interleaved is
1666 * false, then each attribute takes up a whole register, with register N
1667 * containing attribute N (this corresponds to the payload setup used by
1668 * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
1669 */
1670 void
1671 vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
1672 bool interleaved)
1673 {
1674 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1675 for (int i = 0; i < 3; i++) {
1676 if (inst->src[i].file != ATTR)
1677 continue;
1678
1679 int grf = attribute_map[inst->src[i].nr +
1680 inst->src[i].offset / REG_SIZE];
1681 assert(inst->src[i].offset % REG_SIZE == 0);
1682
1683 /* All attributes used in the shader need to have been assigned a
1684 * hardware register by the caller
1685 */
1686 assert(grf != 0);
1687
1688 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
1689 reg.swizzle = inst->src[i].swizzle;
1690 reg.type = inst->src[i].type;
1691 if (inst->src[i].abs)
1692 reg = brw_abs(reg);
1693 if (inst->src[i].negate)
1694 reg = negate(reg);
1695
1696 inst->src[i] = reg;
1697 }
1698 }
1699 }
1700
1701 int
1702 vec4_vs_visitor::setup_attributes(int payload_reg)
1703 {
1704 int nr_attributes;
1705 int attribute_map[VERT_ATTRIB_MAX + 2];
1706 memset(attribute_map, 0, sizeof(attribute_map));
1707
1708 nr_attributes = 0;
1709 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
1710 if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
1711 attribute_map[i] = payload_reg + nr_attributes;
1712 nr_attributes++;
1713 }
1714 }
1715
1716 /* VertexID is stored by the VF as the last vertex element, but we
1717 * don't represent it with a flag in inputs_read, so we call it
1718 * VERT_ATTRIB_MAX.
1719 */
1720 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
1721 vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) {
1722 attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
1723 nr_attributes++;
1724 }
1725
1726 if (vs_prog_data->uses_drawid) {
1727 attribute_map[VERT_ATTRIB_MAX + 1] = payload_reg + nr_attributes;
1728 nr_attributes++;
1729 }
1730
1731 lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
1732
1733 return payload_reg + vs_prog_data->nr_attributes;
1734 }
1735
1736 int
1737 vec4_visitor::setup_uniforms(int reg)
1738 {
1739 prog_data->base.dispatch_grf_start_reg = reg;
1740
1741 /* The pre-gen6 VS requires that some push constants get loaded no
1742 * matter what, or the GPU would hang.
1743 */
1744 if (devinfo->gen < 6 && this->uniforms == 0) {
1745 stage_prog_data->param =
1746 reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
1747 for (unsigned int i = 0; i < 4; i++) {
1748 unsigned int slot = this->uniforms * 4 + i;
1749 static gl_constant_value zero = { 0.0 };
1750 stage_prog_data->param[slot] = &zero;
1751 }
1752
1753 this->uniforms++;
1754 reg++;
1755 } else {
1756 reg += ALIGN(uniforms, 2) / 2;
1757 }
1758
1759 stage_prog_data->nr_params = this->uniforms * 4;
1760
1761 prog_data->base.curb_read_length =
1762 reg - prog_data->base.dispatch_grf_start_reg;
1763
1764 return reg;
1765 }
1766
1767 void
1768 vec4_vs_visitor::setup_payload(void)
1769 {
1770 int reg = 0;
1771
1772 /* The payload always contains important data in g0, which contains
1773 * the URB handles that are passed on to the URB write at the end
1774 * of the thread. So, we always start push constants at g1.
1775 */
1776 reg++;
1777
1778 reg = setup_uniforms(reg);
1779
1780 reg = setup_attributes(reg);
1781
1782 this->first_non_payload_grf = reg;
1783 }
1784
1785 bool
1786 vec4_visitor::lower_minmax()
1787 {
1788 assert(devinfo->gen < 6);
1789
1790 bool progress = false;
1791
1792 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1793 const vec4_builder ibld(this, block, inst);
1794
1795 if (inst->opcode == BRW_OPCODE_SEL &&
1796 inst->predicate == BRW_PREDICATE_NONE) {
1797 /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
1798 * the original SEL.L/GE instruction
1799 */
1800 ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
1801 inst->conditional_mod);
1802 inst->predicate = BRW_PREDICATE_NORMAL;
1803 inst->conditional_mod = BRW_CONDITIONAL_NONE;
1804
1805 progress = true;
1806 }
1807 }
1808
1809 if (progress)
1810 invalidate_live_intervals();
1811
1812 return progress;
1813 }
1814
1815 src_reg
1816 vec4_visitor::get_timestamp()
1817 {
1818 assert(devinfo->gen >= 7);
1819
1820 src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1821 BRW_ARF_TIMESTAMP,
1822 0,
1823 0,
1824 0,
1825 BRW_REGISTER_TYPE_UD,
1826 BRW_VERTICAL_STRIDE_0,
1827 BRW_WIDTH_4,
1828 BRW_HORIZONTAL_STRIDE_4,
1829 BRW_SWIZZLE_XYZW,
1830 WRITEMASK_XYZW));
1831
1832 dst_reg dst = dst_reg(this, glsl_type::uvec4_type);
1833
1834 vec4_instruction *mov = emit(MOV(dst, ts));
1835 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
1836 * even if it's not enabled in the dispatch.
1837 */
1838 mov->force_writemask_all = true;
1839
1840 return src_reg(dst);
1841 }
1842
1843 void
1844 vec4_visitor::emit_shader_time_begin()
1845 {
1846 current_annotation = "shader time start";
1847 shader_start_time = get_timestamp();
1848 }
1849
1850 void
1851 vec4_visitor::emit_shader_time_end()
1852 {
1853 current_annotation = "shader time end";
1854 src_reg shader_end_time = get_timestamp();
1855
1856
1857 /* Check that there weren't any timestamp reset events (assuming these
1858 * were the only two timestamp reads that happened).
1859 */
1860 src_reg reset_end = shader_end_time;
1861 reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
1862 vec4_instruction *test = emit(AND(dst_null_ud(), reset_end, brw_imm_ud(1u)));
1863 test->conditional_mod = BRW_CONDITIONAL_Z;
1864
1865 emit(IF(BRW_PREDICATE_NORMAL));
1866
1867 /* Take the current timestamp and get the delta. */
1868 shader_start_time.negate = true;
1869 dst_reg diff = dst_reg(this, glsl_type::uint_type);
1870 emit(ADD(diff, shader_start_time, shader_end_time));
1871
1872 /* If there were no instructions between the two timestamp gets, the diff
1873 * is 2 cycles. Remove that overhead, so I can forget about that when
1874 * trying to determine the time taken for single instructions.
1875 */
1876 emit(ADD(diff, src_reg(diff), brw_imm_ud(-2u)));
1877
1878 emit_shader_time_write(0, src_reg(diff));
1879 emit_shader_time_write(1, brw_imm_ud(1u));
1880 emit(BRW_OPCODE_ELSE);
1881 emit_shader_time_write(2, brw_imm_ud(1u));
1882 emit(BRW_OPCODE_ENDIF);
1883 }
1884
1885 void
1886 vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
1887 {
1888 dst_reg dst =
1889 dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
1890
1891 dst_reg offset = dst;
1892 dst_reg time = dst;
1893 time.offset += REG_SIZE;
1894
1895 offset.type = BRW_REGISTER_TYPE_UD;
1896 int index = shader_time_index * 3 + shader_time_subindex;
1897 emit(MOV(offset, brw_imm_d(index * SHADER_TIME_STRIDE)));
1898
1899 time.type = BRW_REGISTER_TYPE_UD;
1900 emit(MOV(time, value));
1901
1902 vec4_instruction *inst =
1903 emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
1904 inst->mlen = 2;
1905 }
1906
1907 void
1908 vec4_visitor::convert_to_hw_regs()
1909 {
1910 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1911 for (int i = 0; i < 3; i++) {
1912 struct src_reg &src = inst->src[i];
1913 struct brw_reg reg;
1914 switch (src.file) {
1915 case VGRF: {
1916 const unsigned type_size = type_sz(src.type);
1917 const unsigned width = REG_SIZE / 2 / MAX2(4, type_size);
1918 reg = byte_offset(brw_vecn_grf(width, src.nr, 0), src.offset);
1919 reg.type = src.type;
1920 reg.abs = src.abs;
1921 reg.negate = src.negate;
1922 break;
1923 }
1924
1925 case UNIFORM: {
1926 const unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(src.type));
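         /* Push constants are packed two vec4 uniforms per GRF, so nr / 2
          * selects the hardware register and (nr % 2) * 4 the half of it.
          */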
1927 reg = stride(byte_offset(brw_vec4_grf(
1928 prog_data->base.dispatch_grf_start_reg +
1929 src.nr / 2, src.nr % 2 * 4),
1930 src.offset),
1931 0, width, 1);
1932 reg.type = src.type;
1933 reg.abs = src.abs;
1934 reg.negate = src.negate;
1935
1936 /* This should have been moved to pull constants. */
1937 assert(!src.reladdr);
1938 break;
1939 }
1940
1941 case FIXED_GRF:
1942 if (type_sz(src.type) == 8) {
1943 reg = src.as_brw_reg();
1944 break;
1945 }
1946 /* fallthrough */
1947 case ARF:
1948 case IMM:
1949 continue;
1950
1951 case BAD_FILE:
1952 /* Probably unused. */
1953 reg = brw_null_reg();
1954 break;
1955
1956 case MRF:
1957 case ATTR:
1958 unreachable("not reached");
1959 }
1960
1961 apply_logical_swizzle(&reg, inst, i);
1962 src = reg;
1963 }
1964
1965 if (inst->is_3src(devinfo)) {
1966 /* 3-src instructions with scalar sources support arbitrary subnr,
1967 * but don't actually use swizzles. Convert swizzle into subnr.
1968 * Skip this for double-precision instructions: RepCtrl=1 is not
1969 * allowed for them and needs special handling.
1970 */
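/* For illustration (hypothetical source): a 32-bit scalar source with swizzle
 * .zzzz has BRW_GET_SWZ(swizzle, 0) == 2, so subnr is advanced by 4 * 2 = 8
 * bytes and the source then points directly at the third channel of its
 * register.
 */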
1971 for (int i = 0; i < 3; i++) {
1972 if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0 &&
1973 type_sz(inst->src[i].type) < 8) {
1974 assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
1975 inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
1976 }
1977 }
1978 }
1979
1980 dst_reg &dst = inst->dst;
1981 struct brw_reg reg;
1982
1983 switch (inst->dst.file) {
1984 case VGRF:
1985 reg = byte_offset(brw_vec8_grf(dst.nr, 0), dst.offset);
1986 reg.type = dst.type;
1987 reg.writemask = dst.writemask;
1988 break;
1989
1990 case MRF:
1991 reg = byte_offset(brw_message_reg(dst.nr), dst.offset);
1992 assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
1993 reg.type = dst.type;
1994 reg.writemask = dst.writemask;
1995 break;
1996
1997 case ARF:
1998 case FIXED_GRF:
1999 reg = dst.as_brw_reg();
2000 break;
2001
2002 case BAD_FILE:
2003 reg = brw_null_reg();
2004 break;
2005
2006 case IMM:
2007 case ATTR:
2008 case UNIFORM:
2009 unreachable("not reached");
2010 }
2011
2012 dst = reg;
2013 }
2014 }
2015
2016 /**
2017 * Get the closest native SIMD width supported by the hardware for instruction
2018 * \p inst. The instruction will be left untouched by
2019 * vec4_visitor::lower_simd_width() if the returned value matches the
2020 * instruction's original execution size.
2021 */
2022 static unsigned
2023 get_lowered_simd_width(const struct gen_device_info *devinfo,
2024 const vec4_instruction *inst)
2025 {
2026 unsigned lowered_width = MIN2(16, inst->exec_size);
2027
2028 /* We need to split some cases of double-precision instructions that write
2029 * 2 registers. We only need to care about this in gen7 because that is the
2030 * only hardware that implements fp64 in Align16.
2031 */
2032 if (devinfo->gen == 7 && inst->size_written > REG_SIZE) {
2033 /* Align16 8-wide double-precision SEL does not work well. Verified
2034 * empirically.
2035 */
2036 if (inst->opcode == BRW_OPCODE_SEL && type_sz(inst->dst.type) == 8)
2037 lowered_width = MIN2(lowered_width, 4);
2038
2039 /* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct
2040 * Register Addressing:
2041 *
2042 * "When destination spans two registers, the source MUST span two
2043 * registers."
2044 */
2045 for (unsigned i = 0; i < 3; i++) {
2046 if (inst->src[i].file == BAD_FILE)
2047 continue;
2048 if (inst->size_read(i) <= REG_SIZE)
2049 lowered_width = MIN2(lowered_width, 4);
2050 }
2051 }
2052
2053 return lowered_width;
2054 }
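/* For illustration (hypothetical instruction): on gen7 an 8-wide
 * double-precision SEL writes more than one register, so the checks above
 * reduce its width from MIN2(16, 8) = 8 down to 4. The same happens for any
 * gen7 instruction whose destination spans two registers while one of its
 * sources fits in a single register.
 */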
2055
2056 static bool
2057 dst_src_regions_overlap(vec4_instruction *inst)
2058 {
2059 if (inst->size_written == 0)
2060 return false;
2061
2062 unsigned dst_start = inst->dst.offset;
2063 unsigned dst_end = dst_start + inst->size_written - 1;
2064 for (int i = 0; i < 3; i++) {
2065 if (inst->src[i].file == BAD_FILE)
2066 continue;
2067
2068 if (inst->dst.file != inst->src[i].file ||
2069 inst->dst.nr != inst->src[i].nr)
2070 continue;
2071
2072 unsigned src_start = inst->src[i].offset;
2073 unsigned src_end = src_start + inst->size_read(i) - 1;
2074
2075 if ((dst_start >= src_start && dst_start <= src_end) ||
2076 (dst_end >= src_start && dst_end <= src_end) ||
2077 (dst_start <= src_start && dst_end >= src_end)) {
2078 return true;
2079 }
2080 }
2081
2082 return false;
2083 }
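/* Worked example (hypothetical regions): a destination with offset 0 and
 * size_written 64 covers bytes [0, 63] of its VGRF; a source in the same VGRF
 * with offset 32 and size_read 32 covers bytes [32, 63]. The byte ranges
 * intersect, so the checks above report an overlap and lower_simd_width()
 * below will route the split writes through a temporary.
 */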
2084
2085 bool
2086 vec4_visitor::lower_simd_width()
2087 {
2088 bool progress = false;
2089
2090 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
2091 const unsigned lowered_width = get_lowered_simd_width(devinfo, inst);
2092 assert(lowered_width <= inst->exec_size);
2093 if (lowered_width == inst->exec_size)
2094 continue;
2095
2096 /* We need to deal with source / destination overlaps when splitting.
2097 * The hardware supports reading from and writing to the same register
2098 * in the same instruction, but we need to be careful that each split
2099 * instruction we produce does not corrupt the source of the next.
2100 *
2101 * The easiest way to handle this is to make the split instructions write
2102 * to temporaries if there is a src/dst overlap and then move from the
2103 * temporaries to the original destination. We also need to consider
2104 * instructions that do partial writes via align1 opcodes, in which case
2105 * we need to make sure that we initialize the temporary with the
2106 * value of the instruction's dst.
2107 */
2108 bool needs_temp = dst_src_regions_overlap(inst);
2109 for (unsigned n = 0; n < inst->exec_size / lowered_width; n++) {
2110 unsigned channel_offset = lowered_width * n;
2111
2112 unsigned size_written = lowered_width * type_sz(inst->dst.type);
2113
2114 /* Create the split instruction from the original so that we copy all
2115 * relevant instruction fields, then set the width and calculate the
2116 * new dst/src regions.
2117 */
2118 vec4_instruction *linst = new(mem_ctx) vec4_instruction(*inst);
2119 linst->exec_size = lowered_width;
2120 linst->group = channel_offset;
2121 linst->size_written = size_written;
2122
2123 /* Compute split dst region */
2124 dst_reg dst;
2125 if (needs_temp) {
2126 unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
2127 dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
2128 inst->dst.type);
2129 if (inst->is_align1_partial_write()) {
2130 vec4_instruction *copy = MOV(dst, src_reg(inst->dst));
2131 copy->exec_size = lowered_width;
2132 copy->group = channel_offset;
2133 copy->size_written = size_written;
2134 inst->insert_before(block, copy);
2135 }
2136 } else {
2137 dst = horiz_offset(inst->dst, channel_offset);
2138 }
2139 linst->dst = dst;
2140
2141 /* Compute split source regions */
2142 for (int i = 0; i < 3; i++) {
2143 if (linst->src[i].file == BAD_FILE)
2144 continue;
2145
2146 if (!is_uniform(linst->src[i]))
2147 linst->src[i] = horiz_offset(linst->src[i], channel_offset);
2148 }
2149
2150 inst->insert_before(block, linst);
2151
2152 /* If we used a temporary to store the result of the split
2153 * instruction, copy the result to the original destination
2154 */
2155 if (needs_temp) {
2156 vec4_instruction *mov =
2157 MOV(offset(inst->dst, lowered_width, n), src_reg(dst));
2158 mov->exec_size = lowered_width;
2159 mov->group = channel_offset;
2160 mov->size_written = size_written;
2161 mov->predicate = inst->predicate;
2162 inst->insert_before(block, mov);
2163 }
2164 }
2165
2166 inst->remove(block);
2167 progress = true;
2168 }
2169
2170 if (progress)
2171 invalidate_live_intervals();
2172
2173 return progress;
2174 }
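/* For illustration (hypothetical instruction): an 8-wide DF ADD whose lowered
 * width is 4 is emitted as two 4-wide copies of itself, with group 0 and
 * group 4, and with dst/src regions advanced by horiz_offset() for the second
 * half. If the destination overlapped a source, each half instead writes a
 * fresh VGRF and a MOV (inheriting the original predicate) copies that
 * temporary back into the corresponding half of the original destination.
 */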
2175
2176 static bool
2177 is_align1_df(vec4_instruction *inst)
2178 {
2179 switch (inst->opcode) {
2180 case VEC4_OPCODE_FROM_DOUBLE:
2181 case VEC4_OPCODE_TO_DOUBLE:
2182 case VEC4_OPCODE_PICK_LOW_32BIT:
2183 case VEC4_OPCODE_PICK_HIGH_32BIT:
2184 case VEC4_OPCODE_SET_LOW_32BIT:
2185 case VEC4_OPCODE_SET_HIGH_32BIT:
2186 return true;
2187 default:
2188 return false;
2189 }
2190 }
2191
2192 static brw_predicate
2193 scalarize_predicate(brw_predicate predicate, unsigned writemask)
2194 {
2195 if (predicate != BRW_PREDICATE_NORMAL)
2196 return predicate;
2197
2198 switch (writemask) {
2199 case WRITEMASK_X:
2200 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
2201 case WRITEMASK_Y:
2202 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
2203 case WRITEMASK_Z:
2204 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
2205 case WRITEMASK_W:
2206 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
2207 default:
2208 unreachable("invalid writemask");
2209 }
2210 }
2211
2212 bool
2213 vec4_visitor::scalarize_df()
2214 {
2215 bool progress = false;
2216
2217 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
2218 /* Skip DF instructions that operate in Align1 mode */
2219 if (is_align1_df(inst))
2220 continue;
2221
2222 /* Check if this is a double-precision instruction */
2223 bool is_double = type_sz(inst->dst.type) == 8;
2224 for (int arg = 0; !is_double && arg < 3; arg++) {
2225 is_double = inst->src[arg].file != BAD_FILE &&
2226 type_sz(inst->src[arg].type) == 8;
2227 }
2228
2229 if (!is_double)
2230 continue;
2231
2232 /* Generate scalar instructions for each enabled channel */
2233 for (unsigned chan = 0; chan < 4; chan++) {
2234 unsigned chan_mask = 1 << chan;
2235 if (!(inst->dst.writemask & chan_mask))
2236 continue;
2237
2238 vec4_instruction *scalar_inst = new(mem_ctx) vec4_instruction(*inst);
2239
2240 for (unsigned i = 0; i < 3; i++) {
2241 unsigned swz = BRW_GET_SWZ(inst->src[i].swizzle, chan);
2242 scalar_inst->src[i].swizzle = BRW_SWIZZLE4(swz, swz, swz, swz);
2243 }
2244
2245 scalar_inst->dst.writemask = chan_mask;
2246
2247 if (inst->predicate != BRW_PREDICATE_NONE) {
2248 scalar_inst->predicate =
2249 scalarize_predicate(inst->predicate, chan_mask);
2250 }
2251
2252 inst->insert_before(block, scalar_inst);
2253 }
2254
2255 inst->remove(block);
2256 progress = true;
2257 }
2258
2259 if (progress)
2260 invalidate_live_intervals();
2261
2262 return progress;
2263 }
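/* For illustration (hypothetical instruction): a predicated DF MUL with
 * writemask .xy is replaced by two copies of itself, one with writemask .x and
 * one with writemask .y. Each copy replicates the matching logical channel in
 * its source swizzles (e.g. .yyyy for the second copy of a .xyzw source) and,
 * via scalarize_predicate(), uses the corresponding
 * BRW_PREDICATE_ALIGN16_REPLICATE_* predicate.
 */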
2264
2265 bool
2266 vec4_visitor::lower_64bit_mad_to_mul_add()
2267 {
2268 bool progress = false;
2269
2270 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
2271 if (inst->opcode != BRW_OPCODE_MAD)
2272 continue;
2273
2274 if (type_sz(inst->dst.type) != 8)
2275 continue;
2276
2277 dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type);
2278
2279 /* Use the copy constructor so we copy all relevant instruction fields
2280 * from the original mad into the add and mul instructions
2281 */
2282 vec4_instruction *mul = new(mem_ctx) vec4_instruction(*inst);
2283 mul->opcode = BRW_OPCODE_MUL;
2284 mul->dst = mul_dst;
2285 mul->src[0] = inst->src[1];
2286 mul->src[1] = inst->src[2];
2287 mul->src[2].file = BAD_FILE;
2288
2289 vec4_instruction *add = new(mem_ctx) vec4_instruction(*inst);
2290 add->opcode = BRW_OPCODE_ADD;
2291 add->src[0] = src_reg(mul_dst);
2292 add->src[1] = inst->src[0];
2293 add->src[2].file = BAD_FILE;
2294
2295 inst->insert_before(block, mul);
2296 inst->insert_before(block, add);
2297 inst->remove(block);
2298
2299 progress = true;
2300 }
2301
2302 if (progress)
2303 invalidate_live_intervals();
2304
2305 return progress;
2306 }
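/* For illustration (hypothetical operands): a 64-bit
 *
 *    mad dst:DF  src0  src1  src2        (dst = src0 + src1 * src2)
 *
 * becomes
 *
 *    mul tmp:DF  src1  src2
 *    add dst:DF  tmp   src0
 *
 * where tmp is the freshly allocated dvec4 above, so the result is unchanged.
 */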
2307
2308 /* The align16 hardware can only do 32-bit swizzle channels, so we need to
2309 * translate the logical 64-bit swizzle channels that we use in the Vec4 IR
2310 * to 32-bit swizzle channels in hardware registers.
2311 *
2312 * @inst and @arg identify the original vec4 IR source operand whose
2313 * swizzle we need to translate, and @hw_reg is the hardware register
2314 * where the translated hardware swizzle will be written.
2315 *
2316 * This pass assumes that Align16/DF instructions have been fully scalarized
2317 * previously so there is just one 64-bit swizzle channel to deal with for any
2318 * given Vec4 IR source.
2319 */
2320 void
2321 vec4_visitor::apply_logical_swizzle(struct brw_reg *hw_reg,
2322 vec4_instruction *inst, int arg)
2323 {
2324 src_reg reg = inst->src[arg];
2325
2326 if (reg.file == BAD_FILE || reg.file == BRW_IMMEDIATE_VALUE)
2327 return;
2328
2329 /* If this is not a 64-bit operand or this is a scalar instruction we don't
2330 * need to do anything about the swizzles.
2331 */
2332 if (type_sz(reg.type) < 8 || is_align1_df(inst)) {
2333 hw_reg->swizzle = reg.swizzle;
2334 return;
2335 }
2336
2337 /* Otherwise we should have scalarized the instruction, so take the single
2338 * 64-bit logical swizzle channel and translate it to 32-bit
2339 */
2340 assert(brw_is_single_value_swizzle(reg.swizzle));
2341
2342 /* To gain access to Z/W components we need to select the second half
2343 * of the register and then use an X/Y swizzle to select Z/W respectively.
2344 */
2345 unsigned swizzle = BRW_GET_SWZ(reg.swizzle, 0);
2346
2347 if (swizzle >= 2) {
2348 *hw_reg = suboffset(*hw_reg, 2);
2349 swizzle -= 2;
2350 }
2351
2352 /* Any 64-bit source with an offset at 16B is intended to address the
2353 * second half of a register and needs a vertical stride of 0 so we:
2354 *
2355 * 1. Don't violate register region restrictions.
2356 * 2. Activate the gen7 instruction decompression bug exploit when
2357 * execsize > 4
2358 */
2359 if (hw_reg->subnr % REG_SIZE == 16) {
2360 assert(devinfo->gen == 7);
2361 hw_reg->vstride = BRW_VERTICAL_STRIDE_0;
2362 }
2363
2364 hw_reg->swizzle = BRW_SWIZZLE4(swizzle * 2, swizzle * 2 + 1,
2365 swizzle * 2, swizzle * 2 + 1);
2366 }
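/* Worked example (hypothetical source): a scalarized DF source with logical
 * swizzle .zzzz has BRW_GET_SWZ(swizzle, 0) == 2, so the code above applies
 * suboffset(reg, 2) to select the second 16B half of the register and then
 * emits the 32-bit swizzle XYXY (0, 1, 0, 1), addressing the two dwords of
 * the double now at the start of the selected half. A .yyyy source needs no
 * suboffset and gets the swizzle ZWZW instead.
 */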
2367
2368 bool
2369 vec4_visitor::run()
2370 {
2371 if (shader_time_index >= 0)
2372 emit_shader_time_begin();
2373
2374 emit_prolog();
2375
2376 emit_nir_code();
2377 if (failed)
2378 return false;
2379 base_ir = NULL;
2380
2381 emit_thread_end();
2382
2383 calculate_cfg();
2384
2385 /* Before any optimization, push array accesses out to scratch
2386 * space where we need them to be. This pass may allocate new
2387 * virtual GRFs, so we want to do it early. It also makes sure
2388 * that we have reladdr computations available for CSE, since we'll
2389 * often do repeated subexpressions for those.
2390 */
2391 move_grf_array_access_to_scratch();
2392 move_uniform_array_access_to_pull_constants();
2393
2394 pack_uniform_registers();
2395 move_push_constants_to_pull_constants();
2396 split_virtual_grfs();
2397
2398 #define OPT(pass, args...) ({ \
2399 pass_num++; \
2400 bool this_progress = pass(args); \
2401 \
2402 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
2403 char filename[64]; \
2404 snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass, \
2405 stage_abbrev, nir->info->name, iteration, pass_num); \
2406 \
2407 backend_shader::dump_instructions(filename); \
2408 } \
2409 \
2410 progress = progress || this_progress; \
2411 this_progress; \
2412 })
2413
2414
2415 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
2416 char filename[64];
2417 snprintf(filename, 64, "%s-%s-00-00-start",
2418 stage_abbrev, nir->info->name);
2419
2420 backend_shader::dump_instructions(filename);
2421 }
2422
2423 bool progress;
2424 int iteration = 0;
2425 int pass_num = 0;
2426 do {
2427 progress = false;
2428 pass_num = 0;
2429 iteration++;
2430
2431 OPT(opt_predicated_break, this);
2432 OPT(opt_reduce_swizzle);
2433 OPT(dead_code_eliminate);
2434 OPT(dead_control_flow_eliminate, this);
2435 OPT(opt_copy_propagation);
2436 OPT(opt_cmod_propagation);
2437 OPT(opt_cse);
2438 OPT(opt_algebraic);
2439 OPT(opt_register_coalesce);
2440 OPT(eliminate_find_live_channel);
2441 } while (progress);
2442
2443 pass_num = 0;
2444
2445 if (OPT(opt_vector_float)) {
2446 OPT(opt_cse);
2447 OPT(opt_copy_propagation, false);
2448 OPT(opt_copy_propagation, true);
2449 OPT(dead_code_eliminate);
2450 }
2451
2452 if (devinfo->gen <= 5 && OPT(lower_minmax)) {
2453 OPT(opt_cmod_propagation);
2454 OPT(opt_cse);
2455 OPT(opt_copy_propagation);
2456 OPT(dead_code_eliminate);
2457 }
2458
2459 if (OPT(lower_simd_width)) {
2460 OPT(opt_copy_propagation);
2461 OPT(dead_code_eliminate);
2462 }
2463
2464 if (failed)
2465 return false;
2466
2467 OPT(lower_64bit_mad_to_mul_add);
2468 OPT(scalarize_df);
2469
2470 setup_payload();
2471
2472 if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
2473 /* Debug of register spilling: Go spill everything. */
2474 const int grf_count = alloc.count;
2475 float spill_costs[alloc.count];
2476 bool no_spill[alloc.count];
2477 evaluate_spill_costs(spill_costs, no_spill);
2478 for (int i = 0; i < grf_count; i++) {
2479 if (no_spill[i])
2480 continue;
2481 spill_reg(i);
2482 }
2483 }
2484
2485 bool allocated_without_spills = reg_allocate();
2486
2487 if (!allocated_without_spills) {
2488 compiler->shader_perf_log(log_data,
2489 "%s shader triggered register spilling. "
2490 "Try reducing the number of live vec4 values "
2491 "to improve performance.\n",
2492 stage_name);
2493
2494 while (!reg_allocate()) {
2495 if (failed)
2496 return false;
2497 }
2498 }
2499
2500 opt_schedule_instructions();
2501
2502 opt_set_dependency_control();
2503
2504 convert_to_hw_regs();
2505
2506 if (last_scratch > 0) {
2507 prog_data->base.total_scratch =
2508 brw_get_scratch_size(last_scratch * REG_SIZE);
2509 }
2510
2511 return !failed;
2512 }
2513
2514 } /* namespace brw */
2515
2516 extern "C" {
2517
2518 /**
2519 * Compile a vertex shader.
2520 *
2521 * Returns the final assembly and the program's size.
2522 */
2523 const unsigned *
2524 brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
2525 void *mem_ctx,
2526 const struct brw_vs_prog_key *key,
2527 struct brw_vs_prog_data *prog_data,
2528 const nir_shader *src_shader,
2529 gl_clip_plane *clip_planes,
2530 bool use_legacy_snorm_formula,
2531 int shader_time_index,
2532 unsigned *final_assembly_size,
2533 char **error_str)
2534 {
2535 const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
2536 nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
2537 shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar);
2538 brw_nir_lower_vs_inputs(shader, is_scalar,
2539 use_legacy_snorm_formula, key->gl_attrib_wa_flags);
2540 brw_nir_lower_vue_outputs(shader, is_scalar);
2541 shader = brw_postprocess_nir(shader, compiler, is_scalar);
2542
2543 const unsigned *assembly = NULL;
2544
2545 prog_data->base.clip_distance_mask =
2546 ((1 << shader->info->clip_distance_array_size) - 1);
2547 prog_data->base.cull_distance_mask =
2548 ((1 << shader->info->cull_distance_array_size) - 1) <<
2549 shader->info->clip_distance_array_size;
2550
2551 unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);
2552
2553 /* gl_VertexID and gl_InstanceID are system values, but arrive via an
2554 * incoming vertex attribute. So, add an extra slot.
2555 */
2556 if (shader->info->system_values_read &
2557 (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
2558 BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
2559 BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
2560 BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
2561 nr_attributes++;
2562 }
2563
2564 /* gl_DrawID has its very own vec4 */
2565 if (shader->info->system_values_read &
2566 BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) {
2567 nr_attributes++;
2568 }
2569
2570 unsigned nr_attribute_slots =
2571 nr_attributes +
2572 _mesa_bitcount_64(shader->info->double_inputs_read);
2573
2574 /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
2575 * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in
2576 * vec4 mode, the hardware appears to wedge unless we read something.
2577 */
2578 if (is_scalar)
2579 prog_data->base.urb_read_length =
2580 DIV_ROUND_UP(nr_attribute_slots, 2);
2581 else
2582 prog_data->base.urb_read_length =
2583 DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2);
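/* For illustration (hypothetical counts): with nr_attribute_slots = 5 both
 * paths read DIV_ROUND_UP(5, 2) = 3 units (the read length is expressed in
 * units of two slots, hence the division by 2), while with no attributes at
 * all the SIMD8 path reads 0 and the vec4 path still reads
 * DIV_ROUND_UP(MAX2(0, 1), 2) = 1 to keep the hardware from wedging, as noted
 * above.
 */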
2584
2585 prog_data->nr_attributes = nr_attributes;
2586 prog_data->nr_attribute_slots = nr_attribute_slots;
2587
2588 /* Since vertex shaders reuse the same VUE entry for inputs and outputs
2589 * (overwriting the original contents), we need to make sure the size is
2590 * the larger of the two.
2591 */
2592 const unsigned vue_entries =
2593 MAX2(nr_attribute_slots, (unsigned)prog_data->base.vue_map.num_slots);
2594
2595 if (compiler->devinfo->gen == 6)
2596 prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
2597 else
2598 prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
2599
2600 if (is_scalar) {
2601 prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
2602
2603 fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,
2604 NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */
2605 shader, 8, shader_time_index);
2606 if (!v.run_vs(clip_planes)) {
2607 if (error_str)
2608 *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
2609
2610 return NULL;
2611 }
2612
2613 prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
2614
2615 fs_generator g(compiler, log_data, mem_ctx, (void *) key,
2616 &prog_data->base.base, v.promoted_constants,
2617 v.runtime_check_aads_emit, MESA_SHADER_VERTEX);
2618 if (INTEL_DEBUG & DEBUG_VS) {
2619 const char *debug_name =
2620 ralloc_asprintf(mem_ctx, "%s vertex shader %s",
2621 shader->info->label ? shader->info->label :
2622 "unnamed",
2623 shader->info->name);
2624
2625 g.enable_debug(debug_name);
2626 }
2627 g.generate_code(v.cfg, 8);
2628 assembly = g.get_assembly(final_assembly_size);
2629 }
2630
2631 if (!assembly) {
2632 prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
2633
2634 vec4_vs_visitor v(compiler, log_data, key, prog_data,
2635 shader, clip_planes, mem_ctx,
2636 shader_time_index, use_legacy_snorm_formula);
2637 if (!v.run()) {
2638 if (error_str)
2639 *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
2640
2641 return NULL;
2642 }
2643
2644 assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
2645 shader, &prog_data->base, v.cfg,
2646 final_assembly_size);
2647 }
2648
2649 return assembly;
2650 }
2651
2652 } /* extern "C" */